Make the code somewhat readable

This commit is contained in:
Théophile Bastian 2018-02-21 11:54:41 +01:00
parent c97acb22b5
commit b05e642c79

View file

@@ -1,8 +1,6 @@
 from threading import Thread
 from urllib.robotparser import RobotFileParser
-import random
 import re
 from datetime import datetime, timedelta
@@ -10,8 +8,8 @@ import asyncio
 import aiohttp
 import async_timeout
-#from django.conf import settings
+# Ugly hack to use this module alone instead of integrating it with Django
+# from django.conf import settings

 class Settings:
     USER_AGENT = 'Blah'
@@ -21,10 +19,14 @@ startup_time = datetime.now()

 class WebsiteSchedulerMeta(type):
+    """ Meta-class for WebsiteScheduler, allowing a singleton class-like
+    interface, but spawning one instance per canonical website URL """
     _instances = {}
     _canonicalize = re.compile(r'(https?://)?([^/]+)(/?|$)')

     def canonical_url(cls, url):
+        """ Canonicalize a url """
         return cls._canonicalize.search(url).groups()[1]

     def __call__(cls, url, *args, **kwargs):
@@ -76,14 +78,15 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):

 class CrawlingThread(Thread):
+    """ A separate thread for the crawling task. This is needed to use asyncio,
+    since the thread will need its own event loop. """
     def __init__(self):
         super(CrawlingThread, self).__init__()

     def run(self):
         tasks = []
         tasks.append(async_print('https://python.org'))
-        tasks.append(async_print('https://python.org/webstats/'))
-        tasks.append(async_print('https://python.org/3.5/'))

         loop = asyncio.new_event_loop()
         asyncio.set_event_loop(loop)
@@ -92,11 +95,14 @@ class CrawlingThread(Thread):

 class PageGetter:
+    """ Asynchronously get a webpage, abiding by robots.txt """
     def __init__(self, session, url):
         self.url = url
         self.session = session

     async def get(self):
+        """ Actually retrieve the webpage """
         scheduler = WebsiteScheduler(self.url)
         if not scheduler.can_fetch(self.url):
             return None
@@ -112,6 +118,7 @@ class PageGetter:

 async def async_print(url):
+    """ Debug function to follow what's actually happening """
     async with aiohttp.ClientSession() as session:
         html = await PageGetter(session, url).get()
         print('GOT {}HTML for {} at {}'.format(