From c97acb22b585ecda2e67507a158a7ebe76aaed43 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Th=C3=A9ophile=20Bastian?=
Date: Tue, 20 Feb 2018 12:48:53 +0100
Subject: [PATCH] Add tentative crawl file

Nothing functional, just tests
---
 crawl/crawl.py | 126 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 126 insertions(+)
 create mode 100644 crawl/crawl.py

diff --git a/crawl/crawl.py b/crawl/crawl.py
new file mode 100644
index 0000000..7d22422
--- /dev/null
+++ b/crawl/crawl.py
@@ -0,0 +1,126 @@
+from threading import Thread
+from urllib.robotparser import RobotFileParser
+
+import random
+
+import re
+from datetime import datetime, timedelta
+
+import asyncio
+import aiohttp
+import async_timeout
+
+#from django.conf import settings
+
+
+class Settings:
+    USER_AGENT = 'Blah'
+
+settings = Settings()
+startup_time = datetime.now()
+
+
+class WebsiteSchedulerMeta(type):
+    _instances = {}  # one scheduler per canonical website name
+    _canonicalize = re.compile(r'(https?://)?([^/]+)(/?|$)')
+
+    def canonical_url(cls, url):
+        return cls._canonicalize.search(url).groups()[1]
+
+    def __call__(cls, url, *args, **kwargs):
+        canonical = cls.canonical_url(url)
+        if canonical not in cls._instances:
+            cls._instances[canonical] = \
+                super(WebsiteSchedulerMeta, cls) \
+                .__call__(canonical, *args, **kwargs)
+        return cls._instances[canonical]
+
+
+class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
+    """ Schedule the accesses to a website as of robots.txt """
+    def __init__(self, name):
+        self.name = name
+        self.last_crawled = datetime.fromtimestamp(0)
+        robots_url = self.urlroot() + 'robots.txt'
+        self.robot_parser = RobotFileParser(robots_url)
+        self.robot_parser.read()  # TODO async?
+
+        delay = self.robot_parser.crawl_delay(settings.USER_AGENT)
+        if delay is None:
+            req_rate = self.robot_parser.request_rate(settings.USER_AGENT)
+            if req_rate is None:
+                delay = 5
+            else:
+                delay = req_rate.seconds / req_rate.requests  # seconds per request
+
+        self.crawl_delay = timedelta(seconds=delay)
+
+    def urlroot(self):
+        ''' Get the root url for this website '''
+        return 'https://{}/'.format(self.name)
+
+    def fetch_delay(self):
+        ''' Get the delay needed before fetching a page is possible '''
+        can_fetch_time = self.last_crawled + self.crawl_delay
+        if can_fetch_time < datetime.now():
+            return timedelta(0)
+        return can_fetch_time - datetime.now()
+
+    def can_fetch(self, url):
+        ''' Check whether this program can fetch a given page '''
+        return self.robot_parser.can_fetch(settings.USER_AGENT, url)
+
+    def fetching(self):
+        ''' Tell the scheduler that a page is being fetched now '''
+        self.last_crawled = datetime.now()
+
+
+class CrawlingThread(Thread):
+    def __init__(self):
+        super(CrawlingThread, self).__init__()
+
+    def run(self):
+        tasks = []
+        tasks.append(async_print('https://python.org'))
+        tasks.append(async_print('https://python.org/webstats/'))
+        tasks.append(async_print('https://python.org/3.5/'))
+
+        loop = asyncio.new_event_loop()
+        asyncio.set_event_loop(loop)
+        loop.run_until_complete(asyncio.wait(tasks))
+        loop.close()
+
+
+class PageGetter:
+    def __init__(self, session, url):
+        self.url = url
+        self.session = session
+
+    async def get(self):
+        scheduler = WebsiteScheduler(self.url)
+        if not scheduler.can_fetch(self.url):
+            return None
+
+        delay = scheduler.fetch_delay()
+        while delay > timedelta(0):
+            await asyncio.sleep(delay.total_seconds())
+            delay = scheduler.fetch_delay()
+        scheduler.fetching()
+        async with async_timeout.timeout(10):
+            async with self.session.get(self.url) as resp:
+                return await resp.text()
+
+
+async def async_print(url):
+    async with aiohttp.ClientSession() as session:
+        html = await PageGetter(session, url).get()
+        print('GOT {}HTML for {} at {}'.format(
            'None ' if html is None else '',
+            url,
+            datetime.now() - startup_time))
+
+
+if __name__ == '__main__':
+    crawl = CrawlingThread()
+    crawl.start()
+    crawl.join()
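
A minimal usage sketch (not part of the patch itself) of how the new module
could be exercised without going through CrawlingThread, assuming
crawl/crawl.py is importable as a module named `crawl` and that the target
URL is reachable:

    import asyncio
    import aiohttp

    from crawl import PageGetter  # assumed import path for crawl/crawl.py


    async def fetch_one(url):
        # One session per fetch, mirroring async_print() in the patch.
        async with aiohttp.ClientSession() as session:
            return await PageGetter(session, url).get()


    if __name__ == '__main__':
        # Same event-loop handling as CrawlingThread.run().
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        html = loop.run_until_complete(fetch_one('https://python.org'))
        loop.close()
        print(len(html) if html is not None else 'disallowed by robots.txt')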