from threading import Thread
from urllib.robotparser import RobotFileParser
from bs4 import BeautifulSoup, Comment
import re
from datetime import datetime, timedelta
import asyncio
import aiohttp
import async_timeout

# Ugly hack to use this module alone instead of integrating it with Django
# from django.conf import settings

# Matches in-page bookmark links ("#section").
# We want this to avoid following this kind of bookmark.
BOOKMARK_URL = "#.*"


class Settings:
    USER_AGENT = 'Blah'

settings = Settings()
startup_time = datetime.now()


def url_getter(html):
    soup = BeautifulSoup(html, "html.parser")
    # Get only the body
    body = soup.find('body')
    # Remove the footer, if any
    if body.footer is not None:
        body.footer.decompose()
    # Remove all comments
    comments = soup.find_all(text=lambda text: isinstance(text, Comment))
    for comment in comments:
        comment.extract()

    # Remove all bookmark links pointing to the current html page.
    links = body.find_all("a")
    for link in links:
        if re.match(BOOKMARK_URL, link.get("href", "")):
            link.extract()


class WebsiteSchedulerMeta(type):
    """ Meta-class for WebsiteScheduler, allowing a singleton class-like
    interface, but spawning one instance per canonical website URL """

    _instances = {}
    _canonicalize = re.compile(r'(https?://)?([^/]+)(/?|$)')

    def canonical_url(cls, url):
        """ Canonicalize a url """
        return cls._canonicalize.search(url).groups()[1]

    def __call__(cls, url, *args, **kwargs):
        canonical = cls.canonical_url(url)
        if canonical not in cls._instances:
            cls._instances[canonical] = \
                super(WebsiteSchedulerMeta, cls) \
                .__call__(canonical, *args, **kwargs)
        return cls._instances[canonical]


class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
    """ Schedule the accesses to a website as of robots.txt """

    def __init__(self, name):
        self.name = name
        self.last_crawled = datetime.fromtimestamp(0)
        robots_url = self.urlroot() + 'robots.txt'
        self.robot_parser = RobotFileParser(robots_url)
        self.robot_parser.read()  # TODO async?

        delay = self.robot_parser.crawl_delay(settings.USER_AGENT)
        if delay is None:
            req_rate = self.robot_parser.request_rate(settings.USER_AGENT)
            if req_rate is None:
                delay = 5
            else:
                # request_rate is "requests per seconds"; the delay between
                # two fetches is therefore seconds / requests.
                delay = req_rate.seconds / req_rate.requests
        self.crawl_delay = timedelta(seconds=delay)

    def urlroot(self):
        ''' Get the root url for this website '''
        return 'https://{}/'.format(self.name)

    def fetch_delay(self):
        ''' Get the delay needed before fetching a page is possible '''
        can_fetch_time = self.last_crawled + self.crawl_delay
        if can_fetch_time < datetime.now():
            return timedelta(0)
        return can_fetch_time - datetime.now()

    def can_fetch(self, url):
        ''' Check whether this program can fetch a given page '''
        return self.robot_parser.can_fetch(settings.USER_AGENT, url)

    def fetching(self):
        ''' Tell the scheduler that a page is being fetched now '''
        self.last_crawled = datetime.now()


class CrawlingThread(Thread):
    """ A separate thread for the crawling task. This is needed to use asyncio,
    since the thread will need its own event loop. """

    def __init__(self):
        super(CrawlingThread, self).__init__()

    def run(self):
        tasks = []
        tasks.append(async_print('https://python.org'))
        tasks.append(async_print('https://python.org/about/gettingstarted'))

        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        loop.run_until_complete(asyncio.gather(*tasks))
        loop.close()


class PageGetter:
    """ Asynchronously get a webpage, abiding by robots.txt """

    def __init__(self, session, url):
        self.url = url
        self.session = session

    async def get(self):
        """ Actually retrieve the webpage """
        scheduler = WebsiteScheduler(self.url)
        if not scheduler.can_fetch(self.url):
            return None

        # Wait until the site's crawl delay has elapsed before fetching.
        delay = scheduler.fetch_delay()
        while delay > timedelta(0):
            await asyncio.sleep(delay.total_seconds())
            delay = scheduler.fetch_delay()
        scheduler.fetching()
        async with async_timeout.timeout(10):
            async with self.session.get(self.url) as resp:
                return await resp.text()


async def async_parser(html_text):
    """ Parse a fetched page. Not implemented yet; presumably meant to feed
    the html to url_getter(). """
    pass


async def async_print(url):
    """ Debug function to follow what's actually happening """
    async with aiohttp.ClientSession() as session:
        html = await PageGetter(session, url).get()

    print('GOT {}HTML for {} at {}'.format(
        'None ' if html is None else '',
        url,
        datetime.now() - startup_time))


if __name__ == '__main__':
    crawl = CrawlingThread()
    crawl.start()
    crawl.join()