From 0e02f22d089df9274af91e863bc3872ae19c869a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Oudin?=
Date: Fri, 23 Feb 2018 00:37:36 +0100
Subject: [PATCH] Exception handling

Big problem with the URL https://plus.google.com/+Python concerning
robots.txt parsing; I didn't manage to find the bug. @tobast, if you have
some time, please take a look at it :)
---
 crawl/crawl.py | 113 ++++++++++++++++++++++++++++++-------------------
 1 file changed, 69 insertions(+), 44 deletions(-)

diff --git a/crawl/crawl.py b/crawl/crawl.py
index ee32971..46c7707 100644
--- a/crawl/crawl.py
+++ b/crawl/crawl.py
@@ -1,7 +1,9 @@
 from threading import Thread
 from urllib.robotparser import RobotFileParser
+from urllib.error import URLError
 from urllib.parse import urlparse
+from ssl import CertificateError
 from random import sample, randrange
 import re
 from datetime import datetime, timedelta
 
@@ -24,7 +26,7 @@ MAX_PER_PAGE = 10
 FOOTER_URL = re.compile(".*footer.*")
 
 class Settings:
-    USER_AGENT = 'BlahBlah'
+    USER_AGENT = 'Blah'
 
 settings = Settings()
 startup_time = datetime.now()
@@ -35,6 +37,8 @@ def url_getter(html, current_page, root_url):
     soup = BeautifulSoup(html, "html.parser")
     # Get only the body
     body = soup.find('body')
+    if not body:
+        return links_list
     # remove the body
     if body.footer:
         body.footer.decompose()
@@ -43,22 +47,22 @@ def url_getter(html, current_page, root_url):
     for comment in comments:
         comment.extract()
 
-    print("Retrieving footers")
     footers = soup.findAll(id=FOOTER_URL)
     for footer in footers:
         footer.extract()
 
     # Remove all bookmark links pointing to the current html page.
-    links = map(lambda link: link["href"], body.find_all("a"))
+    links = map(lambda link: link.get("href", ""), body.find_all("a"))
     for link in links:
-        if link.startswith("http"):
-            links_list.append(link)
-        elif link.startswith('/'): #Internal link, linking to page root url
-            links_list.append(root_url + link)
-        elif link.startswith("#"):
-            print("Invalid link : internal bookmark")
-        else:
-            links_list.append(current_page + link)
+        if link: #Edge case, if no href found.
+            if link.startswith("http"):
+                links_list.append(link)
+            elif link.startswith('/'): #Internal link, linking to page root url
+                links_list.append(root_url + link)
+            elif link.startswith("#"):
+                print("Invalid link : internal bookmark")
+            else:
+                links_list.append(current_page + "/" + link)
 
     ## uniqifier works with python <= 3.6
     #seen = set()
@@ -66,7 +70,6 @@ def url_getter(html, current_page, root_url):
     # uniqifier
     # Works only with python >= 3.6
     links_list = list(dict.fromkeys(links_list))
-    print(links_list)
 
     return links_list
 
@@ -98,23 +101,39 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
     def __init__(self, name):
         self.name = name
         self.last_crawled = datetime.fromtimestamp(0)
-        robots_url = self.urlroot() + 'robots.txt'
-        self.robot_parser = RobotFileParser(robots_url)
-        self.robot_parser.read() # TODO async?
-
-        delay = self.robot_parser.crawl_delay(settings.USER_AGENT)
-        if delay is None:
-            req_rate = self.robot_parser.request_rate(settings.USER_AGENT)
-            if req_rate is None:
-                delay = 5
-            else:
-                delay = req_rate.requests, req_rate.seconds
-        self.crawl_delay = timedelta(seconds=delay)
+        self.dead = False
+        try:
+            robots_url = self.urlroot() + 'robots.txt'
+            self.robot_parser = RobotFileParser(robots_url)
+            self.robot_parser.read() # TODO async?
+        except (URLError, CertificateError):
+            try:
+                robots_url = self.unsafe_urlroot() + 'robots.txt'
+                self.robot_parser = RobotFileParser(robots_url)
+                self.robot_parser.read()
+            except URLError: # Almost surely an offline website.
+                self.dead = True
+                self.crawl_delay = 0
+        except Exception as e:
+            print(e)
+            raise e
+        if not self.dead:
+            delay = self.robot_parser.crawl_delay(settings.USER_AGENT)
+            if delay is None:
+                req_rate = self.robot_parser.request_rate(settings.USER_AGENT)
+                if req_rate is None:
+                    delay = 5
+                else:
+                    delay = req_rate.requests, req_rate.seconds
+            self.crawl_delay = timedelta(seconds=delay)
 
     def urlroot(self):
         ''' Get the root url for this website '''
         return 'https://{}/'.format(self.name)
 
+    def unsafe_urlroot(self):
+        return 'http://{}/'.format(self.name)
+
     def fetch_delay(self):
         ''' Get the delay needed before fetching a page is possible '''
         can_fetch_time = self.last_crawled + self.crawl_delay
@@ -124,7 +143,7 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
 
     def can_fetch(self, url):
         ''' Check whether this program can fetch a given page '''
-        return self.robot_parser.can_fetch(settings.USER_AGENT, url)
+        return (not self.dead) and self.robot_parser.can_fetch(settings.USER_AGENT, url)
 
     def fetching(self):
         ''' Tell the scheduler that a page is being fetched now '''
@@ -140,8 +159,8 @@ class CrawlingThread(Thread):
 
     def run(self):
         tasks = []
-        tasks.append(async_crawler("https://python.org/"))
-        #tasks.append(async_print('https://python.org/about/gettingstarted'))
+        #tasks.append(async_crawler("http://plus.google.com/+Python"))
+        tasks.append(async_print('https://python.org/'))
 
         loop = asyncio.new_event_loop()
         asyncio.set_event_loop(loop)
@@ -156,7 +175,7 @@ class PageGetter:
         self.url = url
         self.session = session
 
-    async def get(self):
+    async def get(self, ssl=True):
         """ Actually retrieve the webpage """
         scheduler = WebsiteScheduler(self.url)
         if not scheduler.can_fetch(self.url):
@@ -168,14 +187,17 @@ class PageGetter:
         delay = scheduler.fetch_delay()
         scheduler.fetching()
         async with async_timeout.timeout(10):
-            async with self.session.get(self.url) as resp:
-                return await resp.text()
+            async with self.session.get(self.url, ssl=ssl) as resp:
+                try:
+                    return await resp.text()
+                except UnicodeDecodeError:
+                    return None
 
 
 async def async_print(url):
     """ Debug function to follow what's actually happening """
     async with aiohttp.ClientSession() as session:
-        html = await PageGetter(session, url).get()
+        html = await PageGetter(session, url).get(ssl=False)
 
         print('GOT {}HTML for {} at {}'.format(
             'None ' if html is None else '',
@@ -194,19 +216,22 @@ async def async_crawler(url):
                 return crawled
             parsed_url = urlparse(url)
            print("Crawling {}".format(url))
-            html = await PageGetter(session, url).get()
-            new_urls = url_getter(
-                html,
-                url,
-                parsed_url.scheme + "://" + parsed_url.netloc
-            )
-            crawled += [url]
-            sampled = sample(
-                new_urls,
-                randrange(min(MAX_PER_PAGE, len(new_urls)))
-            )
-            queue += [sample_url for sample_url in sampled if sample_url not in
-                    queue and sample_url not in crawled]
+            html = await PageGetter(session, url).get(ssl=False)
+            if html:
+                new_urls = url_getter(
+                    html,
+                    url,
+                    parsed_url.scheme + "://" + parsed_url.netloc
+                )
+                crawled += [url]
+                if new_urls:
+                    sampled = sample(
+                        new_urls,
+                        randrange(min(MAX_PER_PAGE, len(new_urls)))
+                    )
+                    queue += [sample_url for sample_url in sampled if
+                              sample_url not in queue and sample_url not in
+                              crawled]
     print(crawled)
 
 if __name__ == '__main__':
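
Note, not part of the patch itself: in the __init__ hunk above, the branch
`delay = req_rate.requests, req_rate.seconds` builds a tuple, so
`timedelta(seconds=delay)` raises a TypeError as soon as a robots.txt declares
a Request-rate; whether this is what actually breaks on
https://plus.google.com/+Python is unverified. Below is a minimal,
self-contained sketch of how the value could be reduced to a scalar number of
seconds. The helper name `robots_delay` and the 5-second fallback are
assumptions mirroring the existing code, not functions from crawl.py.

# Sketch only (assumes Python >= 3.6, where crawl_delay()/request_rate()
# exist on RobotFileParser). `robots_delay` is a hypothetical helper.
from urllib.robotparser import RobotFileParser


def robots_delay(parser, user_agent, default=5):
    """Seconds to wait between two fetches for this user agent."""
    delay = parser.crawl_delay(user_agent)
    if delay is not None:
        return float(delay)
    req_rate = parser.request_rate(user_agent)
    if req_rate is not None:
        # request_rate() returns RequestRate(requests, seconds):
        # spread the allowed requests evenly over the time window.
        return req_rate.seconds / req_rate.requests
    return default


if __name__ == '__main__':
    # Debugging aid for the URL from the commit message: print the failure
    # instead of letting it propagate, so the exception type is visible.
    rp = RobotFileParser('https://plus.google.com/robots.txt')
    try:
        rp.read()
        print(robots_delay(rp, 'Blah'))
    except Exception as exc:
        print(type(exc).__name__, exc)

If the Request-rate branch does turn out to be the culprit, computing the
delay this way keeps `self.crawl_delay = timedelta(seconds=delay)` valid.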