Exception handling

Big problem with the URL https://plus.google.com/+Python concerning robots.txt parsing. I didn't find the bug. @tobast, if you have some time to look at it :)
2018-02-23 00:37:36 +01:00 · 2018-02-23 00:37:36 +01:00 · 0e02f22d08
commit 0e02f22d08
parent 77ca7ebcb9
1 changed files with 69 additions and 44 deletions
--- a/crawl/crawl.py
+++ b/crawl/crawl.py
@ -1,7 +1,9 @@
 from threading import Thread
 from urllib.robotparser import RobotFileParser
+from urllib.error import URLError
 from urllib.parse import urlparse

+from ssl import CertificateError
 from random import sample, randrange
 import re
 from datetime import datetime, timedelta
@ -24,7 +26,7 @@ MAX_PER_PAGE = 10
 FOOTER_URL = re.compile(".*footer.*")

 class Settings:
-    USER_AGENT = 'BlahBlah'
+    USER_AGENT = 'Blah'

 settings = Settings()
 startup_time = datetime.now()
@ -35,6 +37,8 @@ def url_getter(html, current_page, root_url):
    soup = BeautifulSoup(html, "html.parser")
    # Get only the body
    body = soup.find('body')
+    if not body:
+        return links_list
    # remove the body
    if body.footer:
        body.footer.decompose()
@ -43,22 +47,22 @@ def url_getter(html, current_page, root_url):
    for comment in comments:
        comment.extract()

-    print("Retrieving footers")
    footers = soup.findAll(id=FOOTER_URL)
    for footer in footers:
        footer.extract()

    # Remove all bookmark links pointing to the current html page.
-    links = map(lambda link: link["href"], body.find_all("a"))
+    links = map(lambda link: link.get("href", ""), body.find_all("a"))
    for link in links:
-        if link.startswith("http"):
-            links_list.append(link)
-        elif link.startswith('/'): #Internal link, linking to page root url
-            links_list.append(root_url + link)
-        elif link.startswith("#"):
-            print("Invalid link : internal bookmark")
-        else:
-            links_list.append(current_page + link)
+        if link: #Edge case, if no href found.
+            if link.startswith("http"):
+                links_list.append(link)
+            elif link.startswith('/'): #Internal link, linking to page root url
+                links_list.append(root_url + link)
+            elif link.startswith("#"):
+                print("Invalid link : internal bookmark")
+            else:
+                links_list.append(current_page + "/" + link)

    ## uniqifier works with python <= 3.6
    #seen = set()
@ -66,7 +70,6 @@ def url_getter(html, current_page, root_url):
    # uniqifier
    # Works only with python >= 3.6
    links_list = list(dict.fromkeys(links_list))
-    print(links_list)

    return links_list

@ -98,23 +101,39 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
    def __init__(self, name):
        self.name = name
        self.last_crawled = datetime.fromtimestamp(0)
-        robots_url = self.urlroot() + 'robots.txt'
-        self.robot_parser = RobotFileParser(robots_url)
-        self.robot_parser.read()  # TODO async?
-
-        delay = self.robot_parser.crawl_delay(settings.USER_AGENT)
-        if delay is None:
-            req_rate = self.robot_parser.request_rate(settings.USER_AGENT)
-            if req_rate is None:
-                delay = 5
-            else:
-                delay = req_rate.requests, req_rate.seconds
-        self.crawl_delay = timedelta(seconds=delay)
+        self.dead = False
+        try:
+            robots_url = self.urlroot() + 'robots.txt'
+            self.robot_parser = RobotFileParser(robots_url)
+            self.robot_parser.read()  # TODO async?
+        except (URLError, CertificateError):
+            try:
+                robots_url = self.unsafe_urlroot() + 'robots.txt'
+                self.robot_parser = RobotFileParser(robots_url)
+                self.robot_parser.read()
+            except URLError: # Almost surely an offline website.
+                self.dead = True
+                self.crawl_delay = 0
+        except Exception as e:
+            print(e)
+            raise e
+        if not self.dead:
+            delay = self.robot_parser.crawl_delay(settings.USER_AGENT)
+            if delay is None:
+                req_rate = self.robot_parser.request_rate(settings.USER_AGENT)
+                if req_rate is None:
+                    delay = 5
+                else:
+                    delay = req_rate.requests, req_rate.seconds
+            self.crawl_delay = timedelta(seconds=delay)

    def urlroot(self):
        ''' Get the root url for this website '''
        return 'https://{}/'.format(self.name)

+    def unsafe_urlroot(self):
+        return 'http://{}/'.format(self.name)
+
    def fetch_delay(self):
        ''' Get the delay needed before fetching a page is possible '''
        can_fetch_time = self.last_crawled + self.crawl_delay
@ -124,7 +143,7 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):

    def can_fetch(self, url):
        ''' Check whether this program can fetch a given page '''
-        return self.robot_parser.can_fetch(settings.USER_AGENT, url)
+        return (not self.dead) and self.robot_parser.can_fetch(settings.USER_AGENT, url)

    def fetching(self):
        ''' Tell the scheduler that a page is being fetched now '''
@ -140,8 +159,8 @@ class CrawlingThread(Thread):

    def run(self):
        tasks = []
-        tasks.append(async_crawler("https://python.org/"))
-        #tasks.append(async_print('https://python.org/about/gettingstarted'))
+        #tasks.append(async_crawler("http://plus.google.com/+Python"))
+        tasks.append(async_print('https://python.org/'))

        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
@ -156,7 +175,7 @@ class PageGetter:
        self.url = url
        self.session = session

-    async def get(self):
+    async def get(self, ssl=True):
        """ Actually retrieve the webpage """
        scheduler = WebsiteScheduler(self.url)
        if not scheduler.can_fetch(self.url):
@ -168,14 +187,17 @@ class PageGetter:
            delay = scheduler.fetch_delay()
        scheduler.fetching()
        async with async_timeout.timeout(10):
-            async with self.session.get(self.url) as resp:
-                return await resp.text()
+            async with self.session.get(self.url, ssl=ssl) as resp:
+                try:
+                    return await resp.text()
+                except UnicodeDecodeError:
+                    return None


 async def async_print(url):
    """ Debug function to follow what's actually happening """
    async with aiohttp.ClientSession() as session:
-        html = await PageGetter(session, url).get()
+        html = await PageGetter(session, url).get(ssl=False)

        print('GOT {}HTML for {} at {}'.format(
            'None ' if html is None else '',
@ -194,19 +216,22 @@ async def async_crawler(url):
                return crawled
            parsed_url = urlparse(url)
            print("Crawling {}".format(url))
-            html = await PageGetter(session, url).get()
-            new_urls = url_getter(
-                html,
-                url,
-                parsed_url.scheme + "://" + parsed_url.netloc
-            )
-            crawled += [url]
-            sampled = sample(
-                new_urls,
-                randrange(min(MAX_PER_PAGE, len(new_urls)))
-            )
-            queue += [sample_url for sample_url in sampled if sample_url not in
-                      queue and sample_url not in crawled]
+            html = await PageGetter(session, url).get(ssl=False)
+            if html:
+                new_urls = url_getter(
+                    html,
+                    url,
+                    parsed_url.scheme + "://" + parsed_url.netloc
+                )
+                crawled += [url]
+                if new_urls:
+                    sampled = sample(
+                        new_urls,
+                        randrange(min(MAX_PER_PAGE, len(new_urls)))
+                    )
+                    queue += [sample_url for sample_url in sampled if
+                              sample_url not in queue and sample_url not in
+                              crawled]
    print(crawled)

 if __name__ == '__main__':