From b05e642c79fabd33105c72c9a19fcd7e797a43d8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Th=C3=A9ophile=20Bastian?=
Date: Wed, 21 Feb 2018 11:54:41 +0100
Subject: [PATCH] Make the code somewhat readable

---
 crawl/crawl.py | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/crawl/crawl.py b/crawl/crawl.py
index 7d22422..6baafad 100644
--- a/crawl/crawl.py
+++ b/crawl/crawl.py
@@ -1,8 +1,6 @@
 from threading import Thread
 from urllib.robotparser import RobotFileParser
 
-import random
-
 import re
 from datetime import datetime, timedelta
 
@@ -10,8 +8,8 @@ import asyncio
 import aiohttp
 import async_timeout
 
-#from django.conf import settings
-
+# Ugly hack to use this module alone instead of integrating it with Django
+# from django.conf import settings
 
 class Settings:
     USER_AGENT = 'Blah'
@@ -21,10 +19,14 @@ startup_time = datetime.now()
 
 
 class WebsiteSchedulerMeta(type):
+    """ Meta-class for WebsiteScheduler, allowing a singleton class-like
+    interface, but spawning one instance per canonical website URL """
+
     _instances = {}
     _canonicalize = re.compile(r'(https?://)?([^/]+)(/?|$)')
 
     def canonical_url(cls, url):
+        """ Canonicalize a url """
         return cls._canonicalize.search(url).groups()[1]
 
     def __call__(cls, url, *args, **kwargs):
@@ -76,14 +78,15 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
 
 
 class CrawlingThread(Thread):
+    """ A separate thread for the crawling task. This is needed to use asyncio,
+    since the thread will need its own event loop. """
+
     def __init__(self):
         super(CrawlingThread, self).__init__()
 
     def run(self):
         tasks = []
         tasks.append(async_print('https://python.org'))
-        tasks.append(async_print('https://python.org/webstats/'))
-        tasks.append(async_print('https://python.org/3.5/'))
 
         loop = asyncio.new_event_loop()
         asyncio.set_event_loop(loop)
@@ -92,11 +95,14 @@ class CrawlingThread(Thread):
 
 
 class PageGetter:
+    """ Asynchronously get a webpage, abiding by robots.txt """
+
     def __init__(self, session, url):
         self.url = url
         self.session = session
 
     async def get(self):
+        """ Actually retrieve the webpage """
         scheduler = WebsiteScheduler(self.url)
         if not scheduler.can_fetch(self.url):
             return None
@@ -112,6 +118,7 @@
 
 
 async def async_print(url):
+    """ Debug function to follow what's actually happening """
     async with aiohttp.ClientSession() as session:
         html = await PageGetter(session, url).get()
         print('GOT {}HTML for {} at {}'.format(
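
A few sketches of the patterns the new docstrings describe, for readers of
this patch. The WebsiteSchedulerMeta docstring documents a per-website
singleton: the metaclass caches one WebsiteScheduler instance per canonical
URL. The __call__ body falls outside the hunk above, so the caching logic
below is a minimal sketch of that pattern, assuming _instances is keyed by
the canonical host:

    import re

    class WebsiteSchedulerMeta(type):
        """ One cached instance per canonical website URL. """
        _instances = {}
        _canonicalize = re.compile(r'(https?://)?([^/]+)(/?|$)')

        def canonical_url(cls, url):
            # Extract the host, e.g. 'python.org' from 'https://python.org/3.5/'
            return cls._canonicalize.search(url).groups()[1]

        def __call__(cls, url, *args, **kwargs):
            # Assumed implementation: create on first use, then reuse.
            canonical = cls.canonical_url(url)
            if canonical not in cls._instances:
                cls._instances[canonical] = \
                    super().__call__(canonical, *args, **kwargs)
            return cls._instances[canonical]

With this in place, WebsiteScheduler('https://python.org/3.5/') and
WebsiteScheduler('python.org') return the same object, so per-site state
such as robots.txt rules is shared across all fetches of that site.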
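
The CrawlingThread docstring explains why crawling runs in a dedicated
thread with its own loop: asyncio only provides a default event loop in the
main thread, so a worker thread must create and register one explicitly. The
hunk ends right after asyncio.set_event_loop(loop); the run_until_complete
and close calls below are an assumed continuation, not part of the patch:

    import asyncio
    from threading import Thread

    class CrawlingThread(Thread):
        """ A separate thread owning its own asyncio event loop. """
        def run(self):
            tasks = [async_print('https://python.org')]
            # A non-main thread has no default loop; make one explicitly.
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
            # Assumed continuation of run(), beyond the hunk shown:
            loop.run_until_complete(asyncio.wait(tasks))
            loop.close()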
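
Finally, PageGetter delegates the robots.txt check to
WebsiteScheduler.can_fetch before fetching. WebsiteScheduler's body is
outside the hunks shown, so the sketch below only illustrates how can_fetch
could wrap urllib.robotparser (already imported at the top of crawl.py)
together with the Settings.USER_AGENT defined in this file; the constructor
here is hypothetical:

    from urllib.robotparser import RobotFileParser

    class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
        def __init__(self, name):
            # 'name' is the canonical host handed over by the metaclass.
            self.name = name
            self.robots = RobotFileParser('http://{}/robots.txt'.format(name))
            self.robots.read()  # blocking fetch of robots.txt

        def can_fetch(self, url):
            return self.robots.can_fetch(Settings.USER_AGENT, url)

Note that RobotFileParser allows everything when the site has no robots.txt,
which is the usual default a polite crawler wants here.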