Make the code somewhat readable
parent c97acb22b5
commit b05e642c79
1 changed file with 13 additions and 6 deletions
@@ -1,8 +1,6 @@
 from threading import Thread
 from urllib.robotparser import RobotFileParser
 
-import random
-
 import re
 from datetime import datetime, timedelta
@@ -10,8 +8,8 @@ import asyncio
 import aiohttp
 import async_timeout
 
-#from django.conf import settings
-
+# Ugly hack to use this module alone instead of integrating it with Django
+# from django.conf import settings
 class Settings:
     USER_AGENT = 'Blah'
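The Settings class is a stand-in for django.conf.settings, so the module only needs plain attribute access. A minimal sketch of how it is presumably consumed (the instantiation site is outside this diff, so the name `settings` is an assumption):

    settings = Settings()               # hypothetical; the real call site is not in this hunk
    user_agent = settings.USER_AGENT    # 'Blah', presumably handed to the robots.txt check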
@@ -21,10 +19,14 @@ startup_time = datetime.now()
 
 
 class WebsiteSchedulerMeta(type):
+    """ Meta-class for WebsiteScheduler, allowing a singleton class-like
+    interface, but spawning one instance per canonical website URL """
+
     _instances = {}
     _canonicalize = re.compile(r'(https?://)?([^/]+)(/?|$)')
 
     def canonical_url(cls, url):
+        """ Canonicalize a url """
         return cls._canonicalize.search(url).groups()[1]
 
     def __call__(cls, url, *args, **kwargs):
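The __call__ body that actually implements the per-website singleton falls outside this hunk, so the following is a sketch of the pattern the new docstring describes rather than the file's real code; the caching logic inside __call__ is an assumption:

    import re

    class WebsiteSchedulerMeta(type):
        _instances = {}
        _canonicalize = re.compile(r'(https?://)?([^/]+)(/?|$)')

        def canonical_url(cls, url):
            # 'https://python.org/3.5/' and 'python.org' both reduce to 'python.org'
            return cls._canonicalize.search(url).groups()[1]

        def __call__(cls, url, *args, **kwargs):
            # Assumed body: build one instance per canonical host, then reuse it.
            canonical = cls.canonical_url(url)
            if canonical not in cls._instances:
                cls._instances[canonical] = super().__call__(url, *args, **kwargs)
            return cls._instances[canonical]

    class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
        def __init__(self, url):
            self.url = url

    assert WebsiteScheduler('https://python.org/3.5/') is WebsiteScheduler('python.org')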
@@ -76,14 +78,15 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
 
 
 class CrawlingThread(Thread):
+    """ A separate thread for the crawling task. This is needed to use asyncio,
+    since the thread will need its own event loop. """
+
     def __init__(self):
         super(CrawlingThread, self).__init__()
 
     def run(self):
         tasks = []
         tasks.append(async_print('https://python.org'))
-        tasks.append(async_print('https://python.org/webstats/'))
+        tasks.append(async_print('https://python.org/3.5/'))
 
         loop = asyncio.new_event_loop()
         asyncio.set_event_loop(loop)
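The hunk ends right after asyncio.set_event_loop(loop), so what run() does with the loop is not visible in this diff. A self-contained sketch of the pattern the docstring names (only the main thread gets an event loop implicitly, so a worker thread must create its own); the run_until_complete call and the cleanup are assumptions:

    import asyncio
    from threading import Thread

    async def async_print(url):        # stand-in for the module's coroutine
        await asyncio.sleep(0)
        print(url)

    class CrawlingThread(Thread):
        def run(self):
            tasks = [async_print('https://python.org'),
                     async_print('https://python.org/3.5/')]
            loop = asyncio.new_event_loop()   # this worker thread has no loop yet
            asyncio.set_event_loop(loop)

            async def main():
                # Drive all the crawl coroutines to completion.
                await asyncio.gather(*tasks)

            try:
                loop.run_until_complete(main())   # assumed continuation of run()
            finally:
                loop.close()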
@@ -92,11 +95,14 @@ class CrawlingThread(Thread):
 
 
 class PageGetter:
+    """ Asynchronously get a webpage, abiding by robots.txt """
+
     def __init__(self, session, url):
         self.url = url
         self.session = session
 
     async def get(self):
+        """ Actually retrieve the webpage """
         scheduler = WebsiteScheduler(self.url)
         if not scheduler.can_fetch(self.url):
             return None
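can_fetch lives on WebsiteScheduler, whose body is outside this diff, but given the urllib.robotparser import at the top of the file it presumably wraps RobotFileParser along these lines (the robots.txt URL and user agent here are assumptions):

    from urllib.robotparser import RobotFileParser

    robots = RobotFileParser('https://python.org/robots.txt')
    robots.read()    # fetch and parse robots.txt (synchronously)
    # What a crawler identifying itself as 'Blah' may retrieve:
    print(robots.can_fetch('Blah', 'https://python.org/3.5/'))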
@@ -112,6 +118,7 @@ class PageGetter:
 
 
 async def async_print(url):
+    """ Debug function to follow what's actually happening """
     async with aiohttp.ClientSession() as session:
         html = await PageGetter(session, url).get()
         print('GOT {}HTML for {} at {}'.format(
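From the synchronous side, the crawl presumably gets kicked off like this (the call site is not part of this commit, so the wiring is an assumption):

    crawler = CrawlingThread()
    crawler.start()    # run() creates its own event loop on the new thread
    crawler.join()     # block until every async_print task has finished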