From b05e642c79fabd33105c72c9a19fcd7e797a43d8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Th=C3=A9ophile=20Bastian?=
Date: Wed, 21 Feb 2018 11:54:41 +0100
Subject: [PATCH] Make the code somewhat readable

---
 crawl/crawl.py | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/crawl/crawl.py b/crawl/crawl.py
index 7d22422..6baafad 100644
--- a/crawl/crawl.py
+++ b/crawl/crawl.py
@@ -1,8 +1,6 @@
 from threading import Thread
 from urllib.robotparser import RobotFileParser
 
-import random
-
 import re
 from datetime import datetime, timedelta
 
@@ -10,8 +8,8 @@ import asyncio
 import aiohttp
 import async_timeout
 
-#from django.conf import settings
-
+# Ugly hack to use this module alone instead of integrating it with Django
+# from django.conf import settings
 
 class Settings:
     USER_AGENT = 'Blah'
@@ -21,10 +19,14 @@ startup_time = datetime.now()
 
 
 class WebsiteSchedulerMeta(type):
+    """ Meta-class for WebsiteScheduler, allowing a singleton class-like
+    interface, but spawning one instance per canonical website URL """
+
     _instances = {}
     _canonicalize = re.compile(r'(https?://)?([^/]+)(/?|$)')
 
     def canonical_url(cls, url):
+        """ Canonicalize a url """
         return cls._canonicalize.search(url).groups()[1]
 
     def __call__(cls, url, *args, **kwargs):
@@ -76,14 +78,15 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
 
 
 class CrawlingThread(Thread):
+    """ A separate thread for the crawling task. This is needed to use asyncio,
+    since the thread will need its own event loop. """
+
     def __init__(self):
         super(CrawlingThread, self).__init__()
 
     def run(self):
         tasks = []
         tasks.append(async_print('https://python.org'))
-        tasks.append(async_print('https://python.org/webstats/'))
-        tasks.append(async_print('https://python.org/3.5/'))
 
         loop = asyncio.new_event_loop()
         asyncio.set_event_loop(loop)
@@ -92,11 +95,14 @@ class CrawlingThread(Thread):
 
 
 class PageGetter:
+    """ Asynchronously get a webpage, abiding by robots.txt """
+
     def __init__(self, session, url):
         self.url = url
         self.session = session
 
     async def get(self):
+        """ Actually retrieve the webpage """
         scheduler = WebsiteScheduler(self.url)
         if not scheduler.can_fetch(self.url):
             return None
@@ -112,6 +118,7 @@
 
 
 async def async_print(url):
+    """ Debug function to follow what's actually happening """
     async with aiohttp.ClientSession() as session:
         html = await PageGetter(session, url).get()
         print('GOT {}HTML for {} at {}'.format(
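
A few sketches of the patterns the new docstrings describe, for readers of
this patch. The WebsiteSchedulerMeta docstring documents a per-website
singleton: the metaclass caches one WebsiteScheduler instance per canonical
URL. The __call__ body falls outside the hunk above, so the caching logic
below is a minimal sketch of that pattern, assuming _instances is keyed by
the canonical host:

    import re

    class WebsiteSchedulerMeta(type):
        """ One cached instance per canonical website URL. """
        _instances = {}
        _canonicalize = re.compile(r'(https?://)?([^/]+)(/?|$)')

        def canonical_url(cls, url):
            # Extract the host, e.g. 'python.org' from 'https://python.org/3.5/'
            return cls._canonicalize.search(url).groups()[1]

        def __call__(cls, url, *args, **kwargs):
            # Assumed implementation: create on first use, then reuse.
            canonical = cls.canonical_url(url)
            if canonical not in cls._instances:
                cls._instances[canonical] = \
                    super().__call__(canonical, *args, **kwargs)
            return cls._instances[canonical]

With this in place, WebsiteScheduler('https://python.org/3.5/') and
WebsiteScheduler('python.org') return the same object, so per-site state
such as robots.txt rules is shared across all fetches of that site.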
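
The CrawlingThread docstring explains why crawling runs in a dedicated
thread with its own loop: asyncio only provides a default event loop in the
main thread, so a worker thread must create and register one explicitly. The
hunk ends right after asyncio.set_event_loop(loop); the run_until_complete
and close calls below are an assumed continuation, not part of the patch:

    import asyncio
    from threading import Thread

    class CrawlingThread(Thread):
        """ A separate thread owning its own asyncio event loop. """
        def run(self):
            tasks = [async_print('https://python.org')]
            # A non-main thread has no default loop; make one explicitly.
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
            # Assumed continuation of run(), beyond the hunk shown:
            loop.run_until_complete(asyncio.wait(tasks))
            loop.close()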
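
Finally, PageGetter delegates the robots.txt check to
WebsiteScheduler.can_fetch before fetching. WebsiteScheduler's body is
outside the hunks shown, so the sketch below only illustrates how can_fetch
could wrap urllib.robotparser (already imported at the top of crawl.py)
together with the Settings.USER_AGENT defined in this file; the constructor
here is hypothetical:

    from urllib.robotparser import RobotFileParser

    class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
        def __init__(self, name):
            # 'name' is the canonical host handed over by the metaclass.
            self.name = name
            self.robots = RobotFileParser('http://{}/robots.txt'.format(name))
            self.robots.read()  # blocking fetch of robots.txt

        def can_fetch(self, url):
            return self.robots.can_fetch(Settings.USER_AGENT, url)

Note that RobotFileParser allows everything when the site has no robots.txt,
which is the usual default a polite crawler wants here.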