from threading import Thread
from urllib.robotparser import RobotFileParser

import random
import re
from datetime import datetime, timedelta

import asyncio
import aiohttp
import async_timeout

#from django.conf import settings
class Settings:
    USER_AGENT = 'Blah'

settings = Settings()
startup_time = datetime.now()


class WebsiteSchedulerMeta(type):
    """ Metaclass that keeps one scheduler instance per website domain """
    _instances = {}
    _canonicalize = re.compile(r'(https?://)?([^/]+)(/?|$)')

    def canonical_url(cls, url):
        # Reduce a URL to its domain, e.g. 'https://python.org/3.5/' -> 'python.org'
        return cls._canonicalize.search(url).groups()[1]

    def __call__(cls, url, *args, **kwargs):
        canonical = cls.canonical_url(url)
        if canonical not in cls._instances:
            cls._instances[canonical] = \
                super(WebsiteSchedulerMeta, cls) \
                .__call__(canonical, *args, **kwargs)
        return cls._instances[canonical]


class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
    """ Schedule the accesses to a website as per its robots.txt """

    def __init__(self, name):
        self.name = name
        self.last_crawled = datetime.fromtimestamp(0)
        robots_url = self.urlroot() + 'robots.txt'
        self.robot_parser = RobotFileParser(robots_url)
        self.robot_parser.read()  # TODO async?

        delay = self.robot_parser.crawl_delay(settings.USER_AGENT)
        if delay is None:
            req_rate = self.robot_parser.request_rate(settings.USER_AGENT)
            if req_rate is None:
                delay = 5
            else:
                # Spread the allowed requests evenly over the request-rate window
                delay = req_rate.seconds / req_rate.requests
        self.crawl_delay = timedelta(seconds=delay)

    def urlroot(self):
        ''' Get the root url for this website '''
        return 'https://{}/'.format(self.name)

    def fetch_delay(self):
        ''' Get the delay needed before fetching a page is possible '''
        can_fetch_time = self.last_crawled + self.crawl_delay
        if can_fetch_time < datetime.now():
            return timedelta(0)
        return can_fetch_time - datetime.now()

    def can_fetch(self, url):
        ''' Check whether this program can fetch a given page '''
        return self.robot_parser.can_fetch(settings.USER_AGENT, url)

    def fetching(self):
        ''' Tell the scheduler that a page is being fetched now '''
        self.last_crawled = datetime.now()


class CrawlingThread(Thread):
    """ Thread that runs its own asyncio event loop for the fetch tasks """

    def __init__(self):
        super(CrawlingThread, self).__init__()

    def run(self):
        tasks = []
        tasks.append(async_print('https://python.org'))
        tasks.append(async_print('https://python.org/webstats/'))
        tasks.append(async_print('https://python.org/3.5/'))

        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        # gather() accepts bare coroutines on all Python versions,
        # unlike asyncio.wait(), which requires Tasks on recent ones
        loop.run_until_complete(asyncio.gather(*tasks))
        loop.close()


class PageGetter:
    """ Fetch a single page, respecting the per-website scheduler """

    def __init__(self, session, url):
        self.url = url
        self.session = session

    async def get(self):
        scheduler = WebsiteScheduler(self.url)
        if not scheduler.can_fetch(self.url):
            return None

        delay = scheduler.fetch_delay()
        while delay > timedelta(0):
            await asyncio.sleep(delay.total_seconds())
            delay = scheduler.fetch_delay()
        scheduler.fetching()
        async with async_timeout.timeout(10):
            async with self.session.get(self.url) as resp:
                return await resp.text()


async def async_print(url):
    async with aiohttp.ClientSession() as session:
        html = await PageGetter(session, url).get()

    print('GOT {}HTML for {} at {}'.format(
        'None ' if html is None else '',
        url,
        datetime.now() - startup_time))


if __name__ == '__main__':
    crawl = CrawlingThread()
    crawl.start()
    crawl.join()