From 0e02f22d089df9274af91e863bc3872ae19c869a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Oudin?=
Date: Fri, 23 Feb 2018 00:37:36 +0100
Subject: [PATCH] Exception handling

Big problem with the URL https://plus.google.com/+Python concerning
robots.txt parsing; I didn't manage to find the bug. @tobast, if you have
some time, please take a look at it :)
---
 crawl/crawl.py | 113 ++++++++++++++++++++++++++++++-------------------
 1 file changed, 69 insertions(+), 44 deletions(-)

diff --git a/crawl/crawl.py b/crawl/crawl.py
index ee32971..46c7707 100644
--- a/crawl/crawl.py
+++ b/crawl/crawl.py
@@ -1,7 +1,9 @@
 from threading import Thread
 from urllib.robotparser import RobotFileParser
+from urllib.error import URLError
 from urllib.parse import urlparse
+from ssl import CertificateError
 from random import sample, randrange
 import re
 from datetime import datetime, timedelta
 
@@ -24,7 +26,7 @@ MAX_PER_PAGE = 10
 FOOTER_URL = re.compile(".*footer.*")
 
 class Settings:
-    USER_AGENT = 'BlahBlah'
+    USER_AGENT = 'Blah'
 
 settings = Settings()
 startup_time = datetime.now()
@@ -35,6 +37,8 @@ def url_getter(html, current_page, root_url):
     soup = BeautifulSoup(html, "html.parser")
     # Get only the body
     body = soup.find('body')
+    if not body:
+        return links_list
     # remove the body
     if body.footer:
         body.footer.decompose()
@@ -43,22 +47,22 @@ def url_getter(html, current_page, root_url):
     for comment in comments:
         comment.extract()
 
-    print("Retrieving footers")
     footers = soup.findAll(id=FOOTER_URL)
     for footer in footers:
         footer.extract()
 
     # Remove all bookmark links pointing to the current html page.
-    links = map(lambda link: link["href"], body.find_all("a"))
+    links = map(lambda link: link.get("href", ""), body.find_all("a"))
     for link in links:
-        if link.startswith("http"):
-            links_list.append(link)
-        elif link.startswith('/'): #Internal link, linking to page root url
-            links_list.append(root_url + link)
-        elif link.startswith("#"):
-            print("Invalid link : internal bookmark")
-        else:
-            links_list.append(current_page + link)
+        if link: #Edge case, if no href found.
+            if link.startswith("http"):
+                links_list.append(link)
+            elif link.startswith('/'): #Internal link, linking to page root url
+                links_list.append(root_url + link)
+            elif link.startswith("#"):
+                print("Invalid link : internal bookmark")
+            else:
+                links_list.append(current_page + "/" + link)
 
     ## uniqifier works with python <= 3.6
     #seen = set()
@@ -66,7 +70,6 @@ def url_getter(html, current_page, root_url):
     # uniqifier
     # Works only with python >= 3.6
     links_list = list(dict.fromkeys(links_list))
-    print(links_list)
 
     return links_list
 
@@ -98,23 +101,39 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
     def __init__(self, name):
         self.name = name
         self.last_crawled = datetime.fromtimestamp(0)
-        robots_url = self.urlroot() + 'robots.txt'
-        self.robot_parser = RobotFileParser(robots_url)
-        self.robot_parser.read() # TODO async?
-
-        delay = self.robot_parser.crawl_delay(settings.USER_AGENT)
-        if delay is None:
-            req_rate = self.robot_parser.request_rate(settings.USER_AGENT)
-            if req_rate is None:
-                delay = 5
-            else:
-                delay = req_rate.requests, req_rate.seconds
-        self.crawl_delay = timedelta(seconds=delay)
+        self.dead = False
+        try:
+            robots_url = self.urlroot() + 'robots.txt'
+            self.robot_parser = RobotFileParser(robots_url)
+            self.robot_parser.read() # TODO async?
+        except (URLError, CertificateError):
+            try:
+                robots_url = self.unsafe_urlroot() + 'robots.txt'
+                self.robot_parser = RobotFileParser(robots_url)
+                self.robot_parser.read()
+            except URLError: # Almost surely an offline website.
+                self.dead = True
+                self.crawl_delay = 0
+        except Exception as e:
+            print(e)
+            raise e
+        if not self.dead:
+            delay = self.robot_parser.crawl_delay(settings.USER_AGENT)
+            if delay is None:
+                req_rate = self.robot_parser.request_rate(settings.USER_AGENT)
+                if req_rate is None:
+                    delay = 5
+                else:
+                    delay = req_rate.requests, req_rate.seconds
+            self.crawl_delay = timedelta(seconds=delay)
 
     def urlroot(self):
         ''' Get the root url for this website '''
         return 'https://{}/'.format(self.name)
 
+    def unsafe_urlroot(self):
+        return 'http://{}/'.format(self.name)
+
     def fetch_delay(self):
         ''' Get the delay needed before fetching a page is possible '''
         can_fetch_time = self.last_crawled + self.crawl_delay
@@ -124,7 +143,7 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
 
     def can_fetch(self, url):
         ''' Check whether this program can fetch a given page '''
-        return self.robot_parser.can_fetch(settings.USER_AGENT, url)
+        return (not self.dead) and self.robot_parser.can_fetch(settings.USER_AGENT, url)
 
     def fetching(self):
         ''' Tell the scheduler that a page is being fetched now '''
@@ -140,8 +159,8 @@ class CrawlingThread(Thread):
 
     def run(self):
         tasks = []
-        tasks.append(async_crawler("https://python.org/"))
-        #tasks.append(async_print('https://python.org/about/gettingstarted'))
+        #tasks.append(async_crawler("http://plus.google.com/+Python"))
+        tasks.append(async_print('https://python.org/'))
 
         loop = asyncio.new_event_loop()
         asyncio.set_event_loop(loop)
@@ -156,7 +175,7 @@ class PageGetter:
         self.url = url
         self.session = session
 
-    async def get(self):
+    async def get(self, ssl=True):
         """ Actually retrieve the webpage """
         scheduler = WebsiteScheduler(self.url)
         if not scheduler.can_fetch(self.url):
@@ -168,14 +187,17 @@ class PageGetter:
         delay = scheduler.fetch_delay()
         scheduler.fetching()
         async with async_timeout.timeout(10):
-            async with self.session.get(self.url) as resp:
-                return await resp.text()
+            async with self.session.get(self.url, ssl=ssl) as resp:
+                try:
+                    return await resp.text()
+                except UnicodeDecodeError:
+                    return None
 
 
 async def async_print(url):
     """ Debug function to follow what's actually happening """
     async with aiohttp.ClientSession() as session:
-        html = await PageGetter(session, url).get()
+        html = await PageGetter(session, url).get(ssl=False)
 
         print('GOT {}HTML for {} at {}'.format(
             'None ' if html is None else '',
@@ -194,19 +216,22 @@ async def async_crawler(url):
                 return crawled
             parsed_url = urlparse(url)
            print("Crawling {}".format(url))
-            html = await PageGetter(session, url).get()
-            new_urls = url_getter(
-                html,
-                url,
-                parsed_url.scheme + "://" + parsed_url.netloc
-            )
-            crawled += [url]
-            sampled = sample(
-                new_urls,
-                randrange(min(MAX_PER_PAGE, len(new_urls)))
-            )
-            queue += [sample_url for sample_url in sampled if sample_url not in
-                    queue and sample_url not in crawled]
+            html = await PageGetter(session, url).get(ssl=False)
+            if html:
+                new_urls = url_getter(
+                    html,
+                    url,
+                    parsed_url.scheme + "://" + parsed_url.netloc
+                )
+                crawled += [url]
+                if new_urls:
+                    sampled = sample(
+                        new_urls,
+                        randrange(min(MAX_PER_PAGE, len(new_urls)))
+                    )
+                    queue += [sample_url for sample_url in sampled if
+                              sample_url not in queue and sample_url not in
+                              crawled]
     print(crawled)
 
 if __name__ == '__main__':
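
Note, not part of the patch itself: in the __init__ hunk above, the branch
`delay = req_rate.requests, req_rate.seconds` builds a tuple, so
`timedelta(seconds=delay)` raises a TypeError as soon as a robots.txt declares
a Request-rate; whether this is what actually breaks on
https://plus.google.com/+Python is unverified. Below is a minimal,
self-contained sketch of how the value could be reduced to a scalar number of
seconds. The helper name `robots_delay` and the 5-second fallback are
assumptions mirroring the existing code, not functions from crawl.py.

# Sketch only (assumes Python >= 3.6, where crawl_delay()/request_rate()
# exist on RobotFileParser). `robots_delay` is a hypothetical helper.
from urllib.robotparser import RobotFileParser


def robots_delay(parser, user_agent, default=5):
    """Seconds to wait between two fetches for this user agent."""
    delay = parser.crawl_delay(user_agent)
    if delay is not None:
        return float(delay)
    req_rate = parser.request_rate(user_agent)
    if req_rate is not None:
        # request_rate() returns RequestRate(requests, seconds):
        # spread the allowed requests evenly over the time window.
        return req_rate.seconds / req_rate.requests
    return default


if __name__ == '__main__':
    # Debugging aid for the URL from the commit message: print the failure
    # instead of letting it propagate, so the exception type is visible.
    rp = RobotFileParser('https://plus.google.com/robots.txt')
    try:
        rp.read()
        print(robots_delay(rp, 'Blah'))
    except Exception as exc:
        print(type(exc).__name__, exc)

If the Request-rate branch does turn out to be the culprit, computing the
delay this way keeps `self.crawl_delay = timedelta(seconds=delay)` valid.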