Make the code somewhat readable

This commit is contained in:
Théophile Bastian 2018-02-21 11:54:41 +01:00
parent c97acb22b5
commit b05e642c79

View file

@@ -1,8 +1,6 @@
 from threading import Thread
 from urllib.robotparser import RobotFileParser
-import random
 import re
 from datetime import datetime, timedelta
@@ -10,8 +8,8 @@ import asyncio
 import aiohttp
 import async_timeout
-#from django.conf import settings
+# Ugly hack to use this module alone instead of integrating it with Django
+# from django.conf import settings

 class Settings:
     USER_AGENT = 'Blah'
@@ -21,10 +19,14 @@ startup_time = datetime.now()

 class WebsiteSchedulerMeta(type):
+    """ Meta-class for WebsiteScheduler, allowing a singleton class-like
+    interface, but spawning one instance per canonical website URL """
     _instances = {}
     _canonicalize = re.compile(r'(https?://)?([^/]+)(/?|$)')

     def canonical_url(cls, url):
+        """ Canonicalize a url """
         return cls._canonicalize.search(url).groups()[1]

     def __call__(cls, url, *args, **kwargs):
@@ -76,14 +78,15 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):

 class CrawlingThread(Thread):
+    """ A separate thread for the crawling task. This is needed to use asyncio,
+    since the thread will need its own event loop. """
     def __init__(self):
         super(CrawlingThread, self).__init__()

     def run(self):
         tasks = []
         tasks.append(async_print('https://python.org'))
-        tasks.append(async_print('https://python.org/webstats/'))
-        tasks.append(async_print('https://python.org/3.5/'))

         loop = asyncio.new_event_loop()
         asyncio.set_event_loop(loop)
@@ -92,11 +95,14 @@ class CrawlingThread(Thread):

 class PageGetter:
+    """ Asynchronously get a webpage, abiding by robots.txt """
     def __init__(self, session, url):
         self.url = url
         self.session = session

     async def get(self):
+        """ Actually retrieve the webpage """
         scheduler = WebsiteScheduler(self.url)
         if not scheduler.can_fetch(self.url):
             return None
@@ -112,6 +118,7 @@ class PageGetter:

 async def async_print(url):
+    """ Debug function to follow what's actually happening """
     async with aiohttp.ClientSession() as session:
         html = await PageGetter(session, url).get()
         print('GOT {}HTML for {} at {}'.format(