Make the code somewhat readable
This commit is contained in:
parent
c97acb22b5
commit
b05e642c79
1 changed file with 13 additions and 6 deletions
|
@ -1,8 +1,6 @@
|
|||
from threading import Thread
|
||||
from urllib.robotparser import RobotFileParser
|
||||
|
||||
import random
|
||||
|
||||
import re
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
|
@ -10,8 +8,8 @@ import asyncio
|
|||
import aiohttp
|
||||
import async_timeout
|
||||
|
||||
#from django.conf import settings
|
||||
|
||||
# Ugly hack to use this module alone instead of integrating it with Django
|
||||
# from django.conf import settings
|
||||
|
||||
class Settings:
    # Minimal stand-in for django.conf.settings so this module can run
    # standalone (see the "ugly hack" note above); only USER_AGENT is needed.
    USER_AGENT = 'Blah'
|
||||
|
@ -21,10 +19,14 @@ startup_time = datetime.now()
|
|||
|
||||
|
||||
class WebsiteSchedulerMeta(type):
|
||||
""" Meta-class for WebsiteScheduler, allowing a singleton class-like
|
||||
interface, but spawning one instance per canonical website URL """
|
||||
|
||||
_instances = {}
|
||||
_canonicalize = re.compile(r'(https?://)?([^/]+)(/?|$)')
|
||||
|
||||
def canonical_url(cls, url):
    """ Canonicalize a url: strip the scheme and any path, keeping only
    the host part matched by ``_canonicalize``. """
    match = cls._canonicalize.search(url)
    # group(2) is the host portion between the optional scheme and '/'
    return match.group(2)
|
||||
|
||||
def __call__(cls, url, *args, **kwargs):
|
||||
|
@ -76,14 +78,15 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
|
|||
|
||||
|
||||
class CrawlingThread(Thread):
|
||||
""" A separate thread for the crawling task. This is needed to use asyncio,
|
||||
since the thread will need its own event loop. """
|
||||
|
||||
def __init__(self):
    """ Initialise the crawling thread; no extra state beyond Thread's. """
    super(CrawlingThread, self).__init__()
|
||||
|
||||
def run(self):
|
||||
tasks = []
|
||||
tasks.append(async_print('https://python.org'))
|
||||
tasks.append(async_print('https://python.org/webstats/'))
|
||||
tasks.append(async_print('https://python.org/3.5/'))
|
||||
|
||||
loop = asyncio.new_event_loop()
|
||||
asyncio.set_event_loop(loop)
|
||||
|
@ -92,11 +95,14 @@ class CrawlingThread(Thread):
|
|||
|
||||
|
||||
class PageGetter:
|
||||
""" Asynchronously get a webpage, abiding by robots.txt """
|
||||
|
||||
def __init__(self, session, url):
    """ Remember the aiohttp session and the target url to fetch. """
    self.session = session
    self.url = url
|
||||
|
||||
async def get(self):
|
||||
""" Actually retrieve the webpage """
|
||||
scheduler = WebsiteScheduler(self.url)
|
||||
if not scheduler.can_fetch(self.url):
|
||||
return None
|
||||
|
@ -112,6 +118,7 @@ class PageGetter:
|
|||
|
||||
|
||||
async def async_print(url):
|
||||
""" Debug function to follow what's actually happening """
|
||||
async with aiohttp.ClientSession() as session:
|
||||
html = await PageGetter(session, url).get()
|
||||
print('GOT {}HTML for {} at {}'.format(
|
||||
|
|
Loading…
Reference in a new issue