from threading import Thread
from urllib.robotparser import RobotFileParser

import re
from datetime import datetime, timedelta

import asyncio
import aiohttp
import async_timeout

# Ugly hack to use this module alone instead of integrating it with Django
# from django.conf import settings


class Settings:
    USER_AGENT = 'Blah'

settings = Settings()
startup_time = datetime.now()


class WebsiteSchedulerMeta(type):
    """ Meta-class for WebsiteScheduler, allowing a singleton-like
    interface, but spawning one instance per canonical website URL """

    _instances = {}
    _canonicalize = re.compile(r'(https?://)?([^/]+)(/?|$)')

    def canonical_url(cls, url):
        """ Canonicalize a url to its bare host name """
        return cls._canonicalize.search(url).groups()[1]
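
    # e.g. both 'https://python.org/about/' and 'python.org' canonicalize
    # to 'python.org' (the regex's second capture group is the bare host)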

    def __call__(cls, url, *args, **kwargs):
        canonical = cls.canonical_url(url)
        if canonical not in cls._instances:
            cls._instances[canonical] = \
                super().__call__(canonical, *args, **kwargs)
        return cls._instances[canonical]
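
    # Illustrative consequence: WebsiteScheduler('https://python.org/about/')
    # and WebsiteScheduler('python.org') return the same instance.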


class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
    """ Schedule the accesses to a website as per robots.txt """

    def __init__(self, name):
        self.name = name
        self.last_crawled = datetime.fromtimestamp(0)
        robots_url = self.urlroot() + 'robots.txt'
        self.robot_parser = RobotFileParser(robots_url)
        self.robot_parser.read()  # TODO async?

        delay = self.robot_parser.crawl_delay(settings.USER_AGENT)
        if delay is None:
            req_rate = self.robot_parser.request_rate(settings.USER_AGENT)
            if req_rate is None:
                delay = 5
            else:
                # request_rate is a (requests, seconds) named tuple; derive
                # the per-request delay from it instead of building a tuple,
                # which timedelta(seconds=...) would reject
                delay = req_rate.seconds / req_rate.requests

        self.crawl_delay = timedelta(seconds=delay)
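        # Illustrative (the "Request-rate" directive is a non-standard
        # robots.txt extension): "Request-rate: 3/20" parses to
        # RequestRate(requests=3, seconds=20), i.e. one request every
        # 20 / 3 ≈ 6.7 seconds.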

    def urlroot(self):
        """ Get the root url for this website """
        return 'https://{}/'.format(self.name)

    def fetch_delay(self):
        """ Get the delay needed before fetching a page is possible """
        can_fetch_time = self.last_crawled + self.crawl_delay
        if can_fetch_time < datetime.now():
            return timedelta(0)
        return can_fetch_time - datetime.now()
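
    # e.g. with a 5-second crawl delay and a page fetched 2 seconds ago,
    # fetch_delay() returns roughly timedelta(seconds=3)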

    def can_fetch(self, url):
        """ Check whether this program can fetch a given page """
        return self.robot_parser.can_fetch(settings.USER_AGENT, url)

    def fetching(self):
        """ Tell the scheduler that a page is being fetched now """
        self.last_crawled = datetime.now()
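
    # Illustrative flow for one polite request (names as defined above):
    #   scheduler = WebsiteScheduler('https://python.org/')
    #   if scheduler.can_fetch(url):
    #       await asyncio.sleep(scheduler.fetch_delay().total_seconds())
    #       scheduler.fetching()  # then perform the actual HTTP request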


class CrawlingThread(Thread):
    """ A separate thread for the crawling task. This is needed to use asyncio,
    since the thread will need its own event loop. """

    def __init__(self):
        super().__init__()

    def run(self):
        tasks = []
        tasks.append(async_print('https://python.org'))

        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        # gather (unlike asyncio.wait on recent Python versions) accepts
        # bare coroutines, so the tasks need no explicit wrapping
        loop.run_until_complete(asyncio.gather(*tasks))
        loop.close()


class PageGetter:
    """ Asynchronously get a webpage, abiding by robots.txt """

    def __init__(self, session, url):
        self.url = url
        self.session = session

    async def get(self):
        """ Actually retrieve the webpage """
        scheduler = WebsiteScheduler(self.url)
        if not scheduler.can_fetch(self.url):
            return None

        delay = scheduler.fetch_delay()
        while delay > timedelta(0):
            await asyncio.sleep(delay.total_seconds())
            delay = scheduler.fetch_delay()
        scheduler.fetching()
        async with async_timeout.timeout(10):
            async with self.session.get(self.url) as resp:
                return await resp.text()
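
    # Note: the 10-second timeout above covers only the HTTP exchange;
    # the robots.txt-mandated waiting is not bounded by it.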


async def async_print(url):
    """ Debug function to follow what's actually happening """
    async with aiohttp.ClientSession() as session:
        html = await PageGetter(session, url).get()
        print('GOT {}HTML for {} at {}'.format(
            'None ' if html is None else '',
            url,
            datetime.now() - startup_time))


if __name__ == '__main__':
    crawl = CrawlingThread()
    crawl.start()
    crawl.join()
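
# To crawl more pages, append further async_print(...) coroutines to
# `tasks` in CrawlingThread.run(); URLs on the same site share one
# WebsiteScheduler, so their fetches are spaced by that site's crawl delay.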