mpri-webdam/crawl/crawl.py


from threading import Thread
from urllib.robotparser import RobotFileParser
import random
import re
from datetime import datetime, timedelta

import asyncio
import aiohttp
import async_timeout

#from django.conf import settings
class Settings:
    """ Stand-in for django.conf.settings while running outside Django """
    USER_AGENT = 'Blah'

settings = Settings()

startup_time = datetime.now()
class WebsiteSchedulerMeta(type):
    """ Metaclass keeping a single WebsiteScheduler instance per website host """
    _instances = {}
    _canonicalize = re.compile(r'(https?://)?([^/]+)(/?|$)')

    def canonical_url(cls, url):
        """ Extract the host part of `url`, used as the cache key """
        return cls._canonicalize.search(url).groups()[1]

    def __call__(cls, url, *args, **kwargs):
        canonical = cls.canonical_url(url)
        if canonical not in cls._instances:
            cls._instances[canonical] = \
                super(WebsiteSchedulerMeta, cls) \
                .__call__(canonical, *args, **kwargs)
        return cls._instances[canonical]
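# Example sketch: WebsiteSchedulerMeta caches instances per host, so two URLs
# on the same host share one scheduler while another host gets its own
# (kept as a comment because constructing a scheduler fetches robots.txt over
# the network):
#
#     a = WebsiteScheduler('https://python.org/about/')
#     b = WebsiteScheduler('https://python.org/downloads/')
#     c = WebsiteScheduler('https://docs.python.org/3/')
#     assert a is b        # same host -> same WebsiteScheduler instance
#     assert a is not c    # different host -> separate instance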
class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
    """ Schedule the accesses to a website according to its robots.txt """
    def __init__(self, name):
        self.name = name
        self.last_crawled = datetime.fromtimestamp(0)
        robots_url = self.urlroot() + 'robots.txt'
        self.robot_parser = RobotFileParser(robots_url)
        self.robot_parser.read()  # TODO async?

        delay = self.robot_parser.crawl_delay(settings.USER_AGENT)
        if delay is None:
            req_rate = self.robot_parser.request_rate(settings.USER_AGENT)
            if req_rate is None:
                delay = 5  # default politeness delay, in seconds
            else:
                # e.g. 2 requests per 10 seconds -> wait 5 seconds between requests
                delay = req_rate.seconds / req_rate.requests
        self.crawl_delay = timedelta(seconds=delay)
    def urlroot(self):
        ''' Get the root url for this website '''
        return 'https://{}/'.format(self.name)

    def fetch_delay(self):
        ''' Get the delay needed before fetching a page is possible '''
        can_fetch_time = self.last_crawled + self.crawl_delay
        if can_fetch_time < datetime.now():
            return timedelta(0)
        return can_fetch_time - datetime.now()

    def can_fetch(self, url):
        ''' Check whether this program can fetch a given page '''
        return self.robot_parser.can_fetch(settings.USER_AGENT, url)

    def fetching(self):
        ''' Tell the scheduler that a page is being fetched now '''
        self.last_crawled = datetime.now()
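# Example sketch of the intended calling pattern (synchronous for clarity;
# PageGetter below does the same asynchronously, and constructing a scheduler
# fetches the site's robots.txt over the network, hence the comment form):
#
#     scheduler = WebsiteScheduler('https://python.org/')
#     if scheduler.can_fetch('https://python.org/about/'):
#         wait = scheduler.fetch_delay()    # timedelta(0) once the crawl delay has passed
#         time.sleep(wait.total_seconds())  # would need `import time` if uncommented
#         scheduler.fetching()              # record the access before downloading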
class CrawlingThread(Thread):
    """ Thread running its own asyncio event loop to fetch a batch of pages """
    def __init__(self):
        super(CrawlingThread, self).__init__()

    def run(self):
        tasks = []
        tasks.append(async_print('https://python.org'))
        tasks.append(async_print('https://python.org/webstats/'))
        tasks.append(async_print('https://python.org/3.5/'))

        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        # gather() still accepts bare coroutines; recent Python versions reject
        # passing them directly to asyncio.wait()
        loop.run_until_complete(asyncio.gather(*tasks))
        loop.close()
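# Design note: each CrawlingThread owns a private event loop, so several
# threads can crawl independent batches in parallel, e.g. (sketch):
#
#     threads = [CrawlingThread() for _ in range(3)]
#     for thread in threads:
#         thread.start()
#     for thread in threads:
#         thread.join()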
class PageGetter:
    """ Asynchronously fetch a single page, respecting its WebsiteScheduler """
    def __init__(self, session, url):
        self.url = url
        self.session = session

    async def get(self):
        scheduler = WebsiteScheduler(self.url)
        if not scheduler.can_fetch(self.url):
            return None  # disallowed by robots.txt

        # Wait until the per-website crawl delay has elapsed, re-checking in
        # case another coroutine fetched from the same site in the meantime.
        delay = scheduler.fetch_delay()
        while delay > timedelta(0):
            await asyncio.sleep(delay.total_seconds())
            delay = scheduler.fetch_delay()
        scheduler.fetching()

        async with async_timeout.timeout(10):
            async with self.session.get(self.url) as resp:
                return await resp.text()
async def async_print(url):
    async with aiohttp.ClientSession() as session:
        html = await PageGetter(session, url).get()
        print('GOT {}HTML for {} at {}'.format(
            'None ' if html is None else '',
            url,
            datetime.now() - startup_time))


if __name__ == '__main__':
    crawl = CrawlingThread()
    crawl.start()
    crawl.join()