Exception handling
Big problem with the URL https://plus.google.com/+Python concerning robots.txt parsing. Didn't find the bug. @tobast, if you have some time to look at it :)
This commit is contained in:
parent
77ca7ebcb9
commit
0e02f22d08
1 changed files with 69 additions and 44 deletions
|
@ -1,7 +1,9 @@
|
||||||
from threading import Thread
|
from threading import Thread
|
||||||
from urllib.robotparser import RobotFileParser
|
from urllib.robotparser import RobotFileParser
|
||||||
|
from urllib.error import URLError
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
|
from ssl import CertificateError
|
||||||
from random import sample, randrange
|
from random import sample, randrange
|
||||||
import re
|
import re
|
||||||
from datetime import datetime, timedelta
|
from datetime import datetime, timedelta
|
||||||
|
@ -24,7 +26,7 @@ MAX_PER_PAGE = 10
|
||||||
FOOTER_URL = re.compile(".*footer.*")
|
FOOTER_URL = re.compile(".*footer.*")
|
||||||
|
|
||||||
class Settings:
|
class Settings:
|
||||||
USER_AGENT = 'BlahBlah'
|
USER_AGENT = 'Blah'
|
||||||
|
|
||||||
settings = Settings()
|
settings = Settings()
|
||||||
startup_time = datetime.now()
|
startup_time = datetime.now()
|
||||||
|
@ -35,6 +37,8 @@ def url_getter(html, current_page, root_url):
|
||||||
soup = BeautifulSoup(html, "html.parser")
|
soup = BeautifulSoup(html, "html.parser")
|
||||||
# Get only the body
|
# Get only the body
|
||||||
body = soup.find('body')
|
body = soup.find('body')
|
||||||
|
if not body:
|
||||||
|
return links_list
|
||||||
# remove the body
|
# remove the body
|
||||||
if body.footer:
|
if body.footer:
|
||||||
body.footer.decompose()
|
body.footer.decompose()
|
||||||
|
@ -43,14 +47,14 @@ def url_getter(html, current_page, root_url):
|
||||||
for comment in comments:
|
for comment in comments:
|
||||||
comment.extract()
|
comment.extract()
|
||||||
|
|
||||||
print("Retrieving footers")
|
|
||||||
footers = soup.findAll(id=FOOTER_URL)
|
footers = soup.findAll(id=FOOTER_URL)
|
||||||
for footer in footers:
|
for footer in footers:
|
||||||
footer.extract()
|
footer.extract()
|
||||||
|
|
||||||
# Remove all bookmark links pointing to the current html page.
|
# Remove all bookmark links pointing to the current html page.
|
||||||
links = map(lambda link: link["href"], body.find_all("a"))
|
links = map(lambda link: link.get("href", ""), body.find_all("a"))
|
||||||
for link in links:
|
for link in links:
|
||||||
|
if link: #Edge case, if no href found.
|
||||||
if link.startswith("http"):
|
if link.startswith("http"):
|
||||||
links_list.append(link)
|
links_list.append(link)
|
||||||
elif link.startswith('/'): #Internal link, linking to page root url
|
elif link.startswith('/'): #Internal link, linking to page root url
|
||||||
|
@ -58,7 +62,7 @@ def url_getter(html, current_page, root_url):
|
||||||
elif link.startswith("#"):
|
elif link.startswith("#"):
|
||||||
print("Invalid link : internal bookmark")
|
print("Invalid link : internal bookmark")
|
||||||
else:
|
else:
|
||||||
links_list.append(current_page + link)
|
links_list.append(current_page + "/" + link)
|
||||||
|
|
||||||
## uniqifier works with python <= 3.6
|
## uniqifier works with python <= 3.6
|
||||||
#seen = set()
|
#seen = set()
|
||||||
|
@ -66,7 +70,6 @@ def url_getter(html, current_page, root_url):
|
||||||
# uniqifier
|
# uniqifier
|
||||||
# Works only with python >= 3.6
|
# Works only with python >= 3.6
|
||||||
links_list = list(dict.fromkeys(links_list))
|
links_list = list(dict.fromkeys(links_list))
|
||||||
print(links_list)
|
|
||||||
|
|
||||||
return links_list
|
return links_list
|
||||||
|
|
||||||
|
@ -98,10 +101,23 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
|
||||||
def __init__(self, name):
|
def __init__(self, name):
|
||||||
self.name = name
|
self.name = name
|
||||||
self.last_crawled = datetime.fromtimestamp(0)
|
self.last_crawled = datetime.fromtimestamp(0)
|
||||||
|
self.dead = False
|
||||||
|
try:
|
||||||
robots_url = self.urlroot() + 'robots.txt'
|
robots_url = self.urlroot() + 'robots.txt'
|
||||||
self.robot_parser = RobotFileParser(robots_url)
|
self.robot_parser = RobotFileParser(robots_url)
|
||||||
self.robot_parser.read() # TODO async?
|
self.robot_parser.read() # TODO async?
|
||||||
|
except (URLError, CertificateError):
|
||||||
|
try:
|
||||||
|
robots_url = self.unsafe_urlroot() + 'robots.txt'
|
||||||
|
self.robot_parser = RobotFileParser(robots_url)
|
||||||
|
self.robot_parser.read()
|
||||||
|
except URLError: # Almost surely an offline website.
|
||||||
|
self.dead = True
|
||||||
|
self.crawl_delay = 0
|
||||||
|
except Exception as e:
|
||||||
|
print(e)
|
||||||
|
raise e
|
||||||
|
if not self.dead:
|
||||||
delay = self.robot_parser.crawl_delay(settings.USER_AGENT)
|
delay = self.robot_parser.crawl_delay(settings.USER_AGENT)
|
||||||
if delay is None:
|
if delay is None:
|
||||||
req_rate = self.robot_parser.request_rate(settings.USER_AGENT)
|
req_rate = self.robot_parser.request_rate(settings.USER_AGENT)
|
||||||
|
@ -115,6 +131,9 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
|
||||||
''' Get the root url for this website '''
|
''' Get the root url for this website '''
|
||||||
return 'https://{}/'.format(self.name)
|
return 'https://{}/'.format(self.name)
|
||||||
|
|
||||||
|
def unsafe_urlroot(self):
|
||||||
|
return 'http://{}/'.format(self.name)
|
||||||
|
|
||||||
def fetch_delay(self):
|
def fetch_delay(self):
|
||||||
''' Get the delay needed before fetching a page is possible '''
|
''' Get the delay needed before fetching a page is possible '''
|
||||||
can_fetch_time = self.last_crawled + self.crawl_delay
|
can_fetch_time = self.last_crawled + self.crawl_delay
|
||||||
|
@ -124,7 +143,7 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
|
||||||
|
|
||||||
def can_fetch(self, url):
|
def can_fetch(self, url):
|
||||||
''' Check whether this program can fetch a given page '''
|
''' Check whether this program can fetch a given page '''
|
||||||
return self.robot_parser.can_fetch(settings.USER_AGENT, url)
|
return (not self.dead) and self.robot_parser.can_fetch(settings.USER_AGENT, url)
|
||||||
|
|
||||||
def fetching(self):
|
def fetching(self):
|
||||||
''' Tell the scheduler that a page is being fetched now '''
|
''' Tell the scheduler that a page is being fetched now '''
|
||||||
|
@ -140,8 +159,8 @@ class CrawlingThread(Thread):
|
||||||
|
|
||||||
def run(self):
|
def run(self):
|
||||||
tasks = []
|
tasks = []
|
||||||
tasks.append(async_crawler("https://python.org/"))
|
#tasks.append(async_crawler("http://plus.google.com/+Python"))
|
||||||
#tasks.append(async_print('https://python.org/about/gettingstarted'))
|
tasks.append(async_print('https://python.org/'))
|
||||||
|
|
||||||
loop = asyncio.new_event_loop()
|
loop = asyncio.new_event_loop()
|
||||||
asyncio.set_event_loop(loop)
|
asyncio.set_event_loop(loop)
|
||||||
|
@ -156,7 +175,7 @@ class PageGetter:
|
||||||
self.url = url
|
self.url = url
|
||||||
self.session = session
|
self.session = session
|
||||||
|
|
||||||
async def get(self):
|
async def get(self, ssl=True):
|
||||||
""" Actually retrieve the webpage """
|
""" Actually retrieve the webpage """
|
||||||
scheduler = WebsiteScheduler(self.url)
|
scheduler = WebsiteScheduler(self.url)
|
||||||
if not scheduler.can_fetch(self.url):
|
if not scheduler.can_fetch(self.url):
|
||||||
|
@ -168,14 +187,17 @@ class PageGetter:
|
||||||
delay = scheduler.fetch_delay()
|
delay = scheduler.fetch_delay()
|
||||||
scheduler.fetching()
|
scheduler.fetching()
|
||||||
async with async_timeout.timeout(10):
|
async with async_timeout.timeout(10):
|
||||||
async with self.session.get(self.url) as resp:
|
async with self.session.get(self.url, ssl=ssl) as resp:
|
||||||
|
try:
|
||||||
return await resp.text()
|
return await resp.text()
|
||||||
|
except UnicodeDecodeError:
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
async def async_print(url):
|
async def async_print(url):
|
||||||
""" Debug function to follow what's actually happening """
|
""" Debug function to follow what's actually happening """
|
||||||
async with aiohttp.ClientSession() as session:
|
async with aiohttp.ClientSession() as session:
|
||||||
html = await PageGetter(session, url).get()
|
html = await PageGetter(session, url).get(ssl=False)
|
||||||
|
|
||||||
print('GOT {}HTML for {} at {}'.format(
|
print('GOT {}HTML for {} at {}'.format(
|
||||||
'None ' if html is None else '',
|
'None ' if html is None else '',
|
||||||
|
@ -194,19 +216,22 @@ async def async_crawler(url):
|
||||||
return crawled
|
return crawled
|
||||||
parsed_url = urlparse(url)
|
parsed_url = urlparse(url)
|
||||||
print("Crawling {}".format(url))
|
print("Crawling {}".format(url))
|
||||||
html = await PageGetter(session, url).get()
|
html = await PageGetter(session, url).get(ssl=False)
|
||||||
|
if html:
|
||||||
new_urls = url_getter(
|
new_urls = url_getter(
|
||||||
html,
|
html,
|
||||||
url,
|
url,
|
||||||
parsed_url.scheme + "://" + parsed_url.netloc
|
parsed_url.scheme + "://" + parsed_url.netloc
|
||||||
)
|
)
|
||||||
crawled += [url]
|
crawled += [url]
|
||||||
|
if new_urls:
|
||||||
sampled = sample(
|
sampled = sample(
|
||||||
new_urls,
|
new_urls,
|
||||||
randrange(min(MAX_PER_PAGE, len(new_urls)))
|
randrange(min(MAX_PER_PAGE, len(new_urls)))
|
||||||
)
|
)
|
||||||
queue += [sample_url for sample_url in sampled if sample_url not in
|
queue += [sample_url for sample_url in sampled if
|
||||||
queue and sample_url not in crawled]
|
sample_url not in queue and sample_url not in
|
||||||
|
crawled]
|
||||||
print(crawled)
|
print(crawled)
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|
Loading…
Reference in a new issue