Make the code somewhat readable

This commit is contained in:
Théophile Bastian 2018-02-21 11:54:41 +01:00
parent c97acb22b5
commit b05e642c79

View file

@ -1,8 +1,6 @@
from threading import Thread
from urllib.robotparser import RobotFileParser
import random
import re
from datetime import datetime, timedelta
@ -10,8 +8,8 @@ import asyncio
import aiohttp
import async_timeout
#from django.conf import settings
# Ugly hack to use this module alone instead of integrating it with Django
# from django.conf import settings
class Settings:
USER_AGENT = 'Blah'
@ -21,10 +19,14 @@ startup_time = datetime.now()
class WebsiteSchedulerMeta(type):
""" Meta-class for WebsiteScheduler, allowing a singleton class-like
interface, but spawning one instance per canonical website URL """
_instances = {}
_canonicalize = re.compile(r'(https?://)?([^/]+)(/?|$)')
def canonical_url(cls, url):
    """ Canonicalize a url.

    Applies the class-level `_canonicalize` pattern to `url` and returns
    its second group: the first run of non-slash characters, after an
    optional `http://` or `https://` prefix has been consumed.

    Raises ValueError when the pattern finds nothing to extract (for
    instance an empty string, or a string made only of slashes), instead
    of failing with an AttributeError on a `None` match. """
    match = cls._canonicalize.search(url)
    if match is None:
        raise ValueError('Cannot canonicalize URL: {!r}'.format(url))
    return match.group(2)
def __call__(cls, url, *args, **kwargs):
@ -76,14 +78,15 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
class CrawlingThread(Thread):
""" A separate thread for the crawling task. This is needed to use asyncio,
since the thread will need its own event loop. """
def __init__(self):
    """ Set up the thread by delegating to the parent initializer; no
    extra state is stored here. """
    super().__init__()
def run(self):
tasks = []
tasks.append(async_print('https://python.org'))
tasks.append(async_print('https://python.org/webstats/'))
tasks.append(async_print('https://python.org/3.5/'))
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
@ -92,11 +95,14 @@ class CrawlingThread(Thread):
class PageGetter:
""" Asynchronously get a webpage, abiding by robots.txt """
def __init__(self, session, url):
    """ Remember the session to fetch with and the url to fetch; nothing
    is retrieved at construction time. """
    self.url, self.session = url, session
async def get(self):
""" Actually retrieve the webpage """
scheduler = WebsiteScheduler(self.url)
if not scheduler.can_fetch(self.url):
return None
@ -112,6 +118,7 @@ class PageGetter:
async def async_print(url):
""" Debug function to follow what's actually happening """
async with aiohttp.ClientSession() as session:
html = await PageGetter(session, url).get()
print('GOT {}HTML for {} at {}'.format(