Exception handling
Big problem with the URL https://plus.google.com/+Python concerning robots.txt parsing. Didn't find the bug. @tobast, if you have some time to look at it :)
This commit is contained in:
parent
77ca7ebcb9
commit
0e02f22d08
1 changed file with 69 additions and 44 deletions
113
crawl/crawl.py
113
crawl/crawl.py
|
@ -1,7 +1,9 @@
|
|||
from threading import Thread
|
||||
from urllib.robotparser import RobotFileParser
|
||||
from urllib.error import URLError
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from ssl import CertificateError
|
||||
from random import sample, randrange
|
||||
import re
|
||||
from datetime import datetime, timedelta
|
||||
|
@ -24,7 +26,7 @@ MAX_PER_PAGE = 10
|
|||
FOOTER_URL = re.compile(".*footer.*")
|
||||
|
||||
class Settings:
|
||||
USER_AGENT = 'BlahBlah'
|
||||
USER_AGENT = 'Blah'
|
||||
|
||||
settings = Settings()
|
||||
startup_time = datetime.now()
|
||||
|
@ -35,6 +37,8 @@ def url_getter(html, current_page, root_url):
|
|||
soup = BeautifulSoup(html, "html.parser")
|
||||
# Get only the body
|
||||
body = soup.find('body')
|
||||
if not body:
|
||||
return links_list
|
||||
# remove the footer
|
||||
if body.footer:
|
||||
body.footer.decompose()
|
||||
|
@ -43,22 +47,22 @@ def url_getter(html, current_page, root_url):
|
|||
for comment in comments:
|
||||
comment.extract()
|
||||
|
||||
print("Retrieving footers")
|
||||
footers = soup.findAll(id=FOOTER_URL)
|
||||
for footer in footers:
|
||||
footer.extract()
|
||||
|
||||
# Remove all bookmark links pointing to the current html page.
|
||||
links = map(lambda link: link["href"], body.find_all("a"))
|
||||
links = map(lambda link: link.get("href", ""), body.find_all("a"))
|
||||
for link in links:
|
||||
if link.startswith("http"):
|
||||
links_list.append(link)
|
||||
elif link.startswith('/'): #Internal link, linking to page root url
|
||||
links_list.append(root_url + link)
|
||||
elif link.startswith("#"):
|
||||
print("Invalid link : internal bookmark")
|
||||
else:
|
||||
links_list.append(current_page + link)
|
||||
if link: #Edge case, if no href found.
|
||||
if link.startswith("http"):
|
||||
links_list.append(link)
|
||||
elif link.startswith('/'): #Internal link, linking to page root url
|
||||
links_list.append(root_url + link)
|
||||
elif link.startswith("#"):
|
||||
print("Invalid link : internal bookmark")
|
||||
else:
|
||||
links_list.append(current_page + "/" + link)
|
||||
|
||||
## uniqifier works with python <= 3.6
|
||||
#seen = set()
|
||||
|
@ -66,7 +70,6 @@ def url_getter(html, current_page, root_url):
|
|||
# uniqifier
|
||||
# Works only with python >= 3.6
|
||||
links_list = list(dict.fromkeys(links_list))
|
||||
print(links_list)
|
||||
|
||||
return links_list
|
||||
|
||||
|
@ -98,23 +101,39 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
|
|||
def __init__(self, name):
|
||||
self.name = name
|
||||
self.last_crawled = datetime.fromtimestamp(0)
|
||||
robots_url = self.urlroot() + 'robots.txt'
|
||||
self.robot_parser = RobotFileParser(robots_url)
|
||||
self.robot_parser.read() # TODO async?
|
||||
|
||||
delay = self.robot_parser.crawl_delay(settings.USER_AGENT)
|
||||
if delay is None:
|
||||
req_rate = self.robot_parser.request_rate(settings.USER_AGENT)
|
||||
if req_rate is None:
|
||||
delay = 5
|
||||
else:
|
||||
delay = req_rate.requests, req_rate.seconds
|
||||
self.crawl_delay = timedelta(seconds=delay)
|
||||
self.dead = False
|
||||
try:
|
||||
robots_url = self.urlroot() + 'robots.txt'
|
||||
self.robot_parser = RobotFileParser(robots_url)
|
||||
self.robot_parser.read() # TODO async?
|
||||
except (URLError, CertificateError):
|
||||
try:
|
||||
robots_url = self.unsafe_urlroot() + 'robots.txt'
|
||||
self.robot_parser = RobotFileParser(robots_url)
|
||||
self.robot_parser.read()
|
||||
except URLError: # Almost surely an offline website.
|
||||
self.dead = True
|
||||
self.crawl_delay = 0
|
||||
except Exception as e:
|
||||
print(e)
|
||||
raise e
|
||||
if not self.dead:
|
||||
delay = self.robot_parser.crawl_delay(settings.USER_AGENT)
|
||||
if delay is None:
|
||||
req_rate = self.robot_parser.request_rate(settings.USER_AGENT)
|
||||
if req_rate is None:
|
||||
delay = 5
|
||||
else:
|
||||
delay = req_rate.requests, req_rate.seconds
|
||||
self.crawl_delay = timedelta(seconds=delay)
|
||||
|
||||
def urlroot(self):
|
||||
''' Get the root url for this website '''
|
||||
return 'https://{}/'.format(self.name)
|
||||
|
||||
def unsafe_urlroot(self):
|
||||
return 'http://{}/'.format(self.name)
|
||||
|
||||
def fetch_delay(self):
|
||||
''' Get the delay needed before fetching a page is possible '''
|
||||
can_fetch_time = self.last_crawled + self.crawl_delay
|
||||
|
@ -124,7 +143,7 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
|
|||
|
||||
def can_fetch(self, url):
|
||||
''' Check whether this program can fetch a given page '''
|
||||
return self.robot_parser.can_fetch(settings.USER_AGENT, url)
|
||||
return (not self.dead) and self.robot_parser.can_fetch(settings.USER_AGENT, url)
|
||||
|
||||
def fetching(self):
|
||||
''' Tell the scheduler that a page is being fetched now '''
|
||||
|
@ -140,8 +159,8 @@ class CrawlingThread(Thread):
|
|||
|
||||
def run(self):
|
||||
tasks = []
|
||||
tasks.append(async_crawler("https://python.org/"))
|
||||
#tasks.append(async_print('https://python.org/about/gettingstarted'))
|
||||
#tasks.append(async_crawler("http://plus.google.com/+Python"))
|
||||
tasks.append(async_print('https://python.org/'))
|
||||
|
||||
loop = asyncio.new_event_loop()
|
||||
asyncio.set_event_loop(loop)
|
||||
|
@ -156,7 +175,7 @@ class PageGetter:
|
|||
self.url = url
|
||||
self.session = session
|
||||
|
||||
async def get(self):
|
||||
async def get(self, ssl=True):
|
||||
""" Actually retrieve the webpage """
|
||||
scheduler = WebsiteScheduler(self.url)
|
||||
if not scheduler.can_fetch(self.url):
|
||||
|
@ -168,14 +187,17 @@ class PageGetter:
|
|||
delay = scheduler.fetch_delay()
|
||||
scheduler.fetching()
|
||||
async with async_timeout.timeout(10):
|
||||
async with self.session.get(self.url) as resp:
|
||||
return await resp.text()
|
||||
async with self.session.get(self.url, ssl=ssl) as resp:
|
||||
try:
|
||||
return await resp.text()
|
||||
except UnicodeDecodeError:
|
||||
return None
|
||||
|
||||
|
||||
async def async_print(url):
|
||||
""" Debug function to follow what's actually happening """
|
||||
async with aiohttp.ClientSession() as session:
|
||||
html = await PageGetter(session, url).get()
|
||||
html = await PageGetter(session, url).get(ssl=False)
|
||||
|
||||
print('GOT {}HTML for {} at {}'.format(
|
||||
'None ' if html is None else '',
|
||||
|
@ -194,19 +216,22 @@ async def async_crawler(url):
|
|||
return crawled
|
||||
parsed_url = urlparse(url)
|
||||
print("Crawling {}".format(url))
|
||||
html = await PageGetter(session, url).get()
|
||||
new_urls = url_getter(
|
||||
html,
|
||||
url,
|
||||
parsed_url.scheme + "://" + parsed_url.netloc
|
||||
)
|
||||
crawled += [url]
|
||||
sampled = sample(
|
||||
new_urls,
|
||||
randrange(min(MAX_PER_PAGE, len(new_urls)))
|
||||
)
|
||||
queue += [sample_url for sample_url in sampled if sample_url not in
|
||||
queue and sample_url not in crawled]
|
||||
html = await PageGetter(session, url).get(ssl=False)
|
||||
if html:
|
||||
new_urls = url_getter(
|
||||
html,
|
||||
url,
|
||||
parsed_url.scheme + "://" + parsed_url.netloc
|
||||
)
|
||||
crawled += [url]
|
||||
if new_urls:
|
||||
sampled = sample(
|
||||
new_urls,
|
||||
randrange(min(MAX_PER_PAGE, len(new_urls)))
|
||||
)
|
||||
queue += [sample_url for sample_url in sampled if
|
||||
sample_url not in queue and sample_url not in
|
||||
crawled]
|
||||
print(crawled)
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
|
Loading…
Reference in a new issue