Exception handling

Big problem with the URL https://plus.google.com/+Python concerning robots.txt parsing. I couldn't find the bug. @tobast, could you take a look at it if you have some time? :)
2018-02-23 00:37:36 +01:00 · 2018-02-23 00:37:36 +01:00 · 0e02f22d08
commit 0e02f22d08
parent 77ca7ebcb9
1 changed files with 69 additions and 44 deletions
--- a/crawl/crawl.py
+++ b/crawl/crawl.py
@ -1,7 +1,9 @@
 from threading import Thread
 from urllib.robotparser import RobotFileParser
+from urllib.error import URLError
 from urllib.parse import urlparse

+from ssl import CertificateError
 from random import sample, randrange
 import re
 from datetime import datetime, timedelta
@ -24,7 +26,7 @@ MAX_PER_PAGE = 10
 FOOTER_URL = re.compile(".*footer.*")

 class Settings:
-    USER_AGENT = 'BlahBlah'
+    USER_AGENT = 'Blah'

 settings = Settings()
 startup_time = datetime.now()
@ -35,6 +37,8 @@ def url_getter(html, current_page, root_url):
    soup = BeautifulSoup(html, "html.parser")
    # Get only the body
    body = soup.find('body')
+    if not body:
+        return links_list
    # remove the body
    if body.footer:
        body.footer.decompose()
@ -43,14 +47,14 @@ def url_getter(html, current_page, root_url):
    for comment in comments:
        comment.extract()

-    print("Retrieving footers")
    footers = soup.findAll(id=FOOTER_URL)
    for footer in footers:
        footer.extract()

    # Remove all bookmark links pointing to the current html page.
-    links = map(lambda link: link["href"], body.find_all("a"))
+    links = map(lambda link: link.get("href", ""), body.find_all("a"))
    for link in links:
+        if link: #Edge case, if no href found.
            if link.startswith("http"):
                links_list.append(link)
            elif link.startswith('/'): #Internal link, linking to page root url
@ -58,7 +62,7 @@ def url_getter(html, current_page, root_url):
            elif link.startswith("#"):
                print("Invalid link : internal bookmark")
            else:
-            links_list.append(current_page + link)
+                links_list.append(current_page + "/" + link)

    ## uniqifier works with python <= 3.6
    #seen = set()
@ -66,7 +70,6 @@ def url_getter(html, current_page, root_url):
    # uniqifier
    # Works only with python >= 3.6
    links_list = list(dict.fromkeys(links_list))
-    print(links_list)

    return links_list

@ -98,10 +101,23 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
    def __init__(self, name):
        self.name = name
        self.last_crawled = datetime.fromtimestamp(0)
+        self.dead = False
+        try:
            robots_url = self.urlroot() + 'robots.txt'
            self.robot_parser = RobotFileParser(robots_url)
            self.robot_parser.read()  # TODO async?
-
+        except (URLError, CertificateError):
+            try:
+                robots_url = self.unsafe_urlroot() + 'robots.txt'
+                self.robot_parser = RobotFileParser(robots_url)
+                self.robot_parser.read()
+            except URLError: # Almost surely an offline website.
+                self.dead = True
+                self.crawl_delay = 0
+        except Exception as e:
+            print(e)
+            raise e
+        if not self.dead:
            delay = self.robot_parser.crawl_delay(settings.USER_AGENT)
            if delay is None:
                req_rate = self.robot_parser.request_rate(settings.USER_AGENT)
@ -115,6 +131,9 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
        ''' Get the root url for this website '''
        return 'https://{}/'.format(self.name)

+    def unsafe_urlroot(self):
+        return 'http://{}/'.format(self.name)
+
    def fetch_delay(self):
        ''' Get the delay needed before fetching a page is possible '''
        can_fetch_time = self.last_crawled + self.crawl_delay
@ -124,7 +143,7 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):

    def can_fetch(self, url):
        ''' Check whether this program can fetch a given page '''
-        return self.robot_parser.can_fetch(settings.USER_AGENT, url)
+        return (not self.dead) and self.robot_parser.can_fetch(settings.USER_AGENT, url)

    def fetching(self):
        ''' Tell the scheduler that a page is being fetched now '''
@ -140,8 +159,8 @@ class CrawlingThread(Thread):

    def run(self):
        tasks = []
-        tasks.append(async_crawler("https://python.org/"))
-        #tasks.append(async_print('https://python.org/about/gettingstarted'))
+        #tasks.append(async_crawler("http://plus.google.com/+Python"))
+        tasks.append(async_print('https://python.org/'))

        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
@ -156,7 +175,7 @@ class PageGetter:
        self.url = url
        self.session = session

-    async def get(self):
+    async def get(self, ssl=True):
        """ Actually retrieve the webpage """
        scheduler = WebsiteScheduler(self.url)
        if not scheduler.can_fetch(self.url):
@ -168,14 +187,17 @@ class PageGetter:
            delay = scheduler.fetch_delay()
        scheduler.fetching()
        async with async_timeout.timeout(10):
-            async with self.session.get(self.url) as resp:
+            async with self.session.get(self.url, ssl=ssl) as resp:
+                try:
                    return await resp.text()
+                except UnicodeDecodeError:
+                    return None


 async def async_print(url):
    """ Debug function to follow what's actually happening """
    async with aiohttp.ClientSession() as session:
-        html = await PageGetter(session, url).get()
+        html = await PageGetter(session, url).get(ssl=False)

        print('GOT {}HTML for {} at {}'.format(
            'None ' if html is None else '',
@ -194,19 +216,22 @@ async def async_crawler(url):
                return crawled
            parsed_url = urlparse(url)
            print("Crawling {}".format(url))
-            html = await PageGetter(session, url).get()
+            html = await PageGetter(session, url).get(ssl=False)
+            if html:
                new_urls = url_getter(
                    html,
                    url,
                    parsed_url.scheme + "://" + parsed_url.netloc
                )
                crawled += [url]
+                if new_urls:
                    sampled = sample(
                        new_urls,
                        randrange(min(MAX_PER_PAGE, len(new_urls)))
                    )
-            queue += [sample_url for sample_url in sampled if sample_url not in
-                      queue and sample_url not in crawled]
+                    queue += [sample_url for sample_url in sampled if
+                              sample_url not in queue and sample_url not in
+                              crawled]
    print(crawled)

 if __name__ == '__main__':