Make the code somewhat readable
This commit is contained in:
parent
c97acb22b5
commit
b05e642c79
1 changed file with 13 additions and 6 deletions
|
@ -1,8 +1,6 @@
|
|||
from threading import Thread
|
||||
from urllib.robotparser import RobotFileParser
|
||||
|
||||
import random
|
||||
|
||||
import re
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
|
@ -10,8 +8,8 @@ import asyncio
|
|||
import aiohttp
|
||||
import async_timeout
|
||||
|
||||
#from django.conf import settings
|
||||
|
||||
# Ugly hack to use this module alone instead of integrating it with Django
|
||||
# from django.conf import settings
|
||||
|
||||
class Settings:
    # Minimal stand-in for django.conf.settings so this module can run
    # standalone (see the "ugly hack" note above); only USER_AGENT is needed.
    USER_AGENT = 'Blah'
|
||||
|
@ -21,10 +19,14 @@ startup_time = datetime.now()
|
|||
|
||||
|
||||
class WebsiteSchedulerMeta(type):
|
||||
""" Meta-class for WebsiteScheduler, allowing a singleton class-like
|
||||
interface, but spawning one instance per canonical website URL """
|
||||
|
||||
_instances = {}
|
||||
_canonicalize = re.compile(r'(https?://)?([^/]+)(/?|$)')
|
||||
|
||||
def canonical_url(cls, url):
    """ Canonicalize a url: strip the scheme and any path, keeping only
    the host part matched by ``_canonicalize``. """
    match = cls._canonicalize.search(url)
    # group(2) is the host portion between the optional scheme and '/'
    return match.group(2)
|
||||
|
||||
def __call__(cls, url, *args, **kwargs):
|
||||
|
@ -76,14 +78,15 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
|
|||
|
||||
|
||||
class CrawlingThread(Thread):
|
||||
""" A separate thread for the crawling task. This is needed to use asyncio,
|
||||
since the thread will need its own event loop. """
|
||||
|
||||
def __init__(self):
    """ Initialise the crawling thread; no extra state beyond Thread's. """
    super(CrawlingThread, self).__init__()
|
||||
|
||||
def run(self):
|
||||
tasks = []
|
||||
tasks.append(async_print('https://python.org'))
|
||||
tasks.append(async_print('https://python.org/webstats/'))
|
||||
tasks.append(async_print('https://python.org/3.5/'))
|
||||
|
||||
loop = asyncio.new_event_loop()
|
||||
asyncio.set_event_loop(loop)
|
||||
|
@ -92,11 +95,14 @@ class CrawlingThread(Thread):
|
|||
|
||||
|
||||
class PageGetter:
|
||||
""" Asynchronously get a webpage, abiding by robots.txt """
|
||||
|
||||
def __init__(self, session, url):
    """ Remember the aiohttp session and the target url to fetch. """
    self.session = session
    self.url = url
|
||||
|
||||
async def get(self):
|
||||
""" Actually retrieve the webpage """
|
||||
scheduler = WebsiteScheduler(self.url)
|
||||
if not scheduler.can_fetch(self.url):
|
||||
return None
|
||||
|
@ -112,6 +118,7 @@ class PageGetter:
|
|||
|
||||
|
||||
async def async_print(url):
|
||||
""" Debug function to follow what's actually happening """
|
||||
async with aiohttp.ClientSession() as session:
|
||||
html = await PageGetter(session, url).get()
|
||||
print('GOT {}HTML for {} at {}'.format(
|
||||
|
|
Loading…
Reference in a new issue