mpri-webdam/crawl/crawl.py


from threading import Thread
from urllib.robotparser import RobotFileParser
from bs4 import BeautifulSoup, Comment
import re
from datetime import datetime, timedelta
import asyncio
import aiohttp
import async_timeout
# Matches in-page bookmark links ("#..."): we want to avoid following
# this kind of link, which points back into the current page.
BOOKMARK_URL = "#.*"


# Ugly hack to use this module alone instead of integrating it with Django:
# a stub Settings class stands in for
# from django.conf import settings
class Settings:
    USER_AGENT = 'Blah'


settings = Settings()
startup_time = datetime.now()
def url_getter(html):
    soup = BeautifulSoup(html, "html.parser")
    # Only work on the body
    body = soup.find('body')
    # Remove the footer, if any
    if body.footer:
        body.footer.decompose()
    # Remove all comments
    comments = soup.findAll(text=lambda text: isinstance(text, Comment))
    for comment in comments:
        comment.extract()
    # Remove all bookmark links pointing to the current html page.
    links = body.find_all("a")
    for link in links:
        if re.match(BOOKMARK_URL, link.get("href", "")):
            link.extract()
    # Return the targets of the remaining links
    return [link.get("href") for link in body.find_all("a") if link.get("href")]
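
# Example (illustrative, not part of the original module): url_getter drops the
# footer, HTML comments and in-page bookmark links, then collects the remaining
# link targets:
#
#     >>> url_getter('<body><a href="#top">Top</a>'
#     ...            '<a href="/about">About</a>'
#     ...            '<footer><a href="/imprint">Imprint</a></footer></body>')
#     ['/about']
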
class WebsiteSchedulerMeta(type):
    """ Meta-class for WebsiteScheduler, allowing a singleton class-like
    interface, but spawning one instance per canonical website URL """

    _instances = {}
    _canonicalize = re.compile(r'(https?://)?([^/]+)(/?|$)')

    def canonical_url(cls, url):
        """ Canonicalize a url """
        return cls._canonicalize.search(url).groups()[1]

    def __call__(cls, url, *args, **kwargs):
        canonical = cls.canonical_url(url)
        if canonical not in cls._instances:
            cls._instances[canonical] = \
                super(WebsiteSchedulerMeta, cls) \
                .__call__(canonical, *args, **kwargs)
        return cls._instances[canonical]
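
# Example (illustrative): the meta-class keys instances on the canonical host,
# so two URLs on the same website share one scheduler, and thus one crawl
# delay. Note that constructing a scheduler fetches the site's robots.txt.
#
#     >>> WebsiteScheduler('https://python.org/about/') \
#     ...     is WebsiteScheduler('python.org')
#     True
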
class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
    """ Schedule the accesses to a website as of robots.txt """

    def __init__(self, name):
        self.name = name
        self.last_crawled = datetime.fromtimestamp(0)
        robots_url = self.urlroot() + 'robots.txt'
        self.robot_parser = RobotFileParser(robots_url)
        self.robot_parser.read()  # TODO async?

        delay = self.robot_parser.crawl_delay(settings.USER_AGENT)
        if delay is None:
            req_rate = self.robot_parser.request_rate(settings.USER_AGENT)
            if req_rate is None:
                delay = 5
            else:
                # request_rate is e.g. "N requests per M seconds":
                # spread the requests evenly over the window.
                delay = req_rate.seconds / req_rate.requests
        self.crawl_delay = timedelta(seconds=delay)

    def urlroot(self):
        ''' Get the root url for this website '''
        return 'https://{}/'.format(self.name)

    def fetch_delay(self):
        ''' Get the delay needed before fetching a page is possible '''
        can_fetch_time = self.last_crawled + self.crawl_delay
        if can_fetch_time < datetime.now():
            return timedelta(0)
        return can_fetch_time - datetime.now()

    def can_fetch(self, url):
        ''' Check whether this program can fetch a given page '''
        return self.robot_parser.can_fetch(settings.USER_AGENT, url)

    def fetching(self):
        ''' Tell the scheduler that a page is being fetched now '''
        self.last_crawled = datetime.now()
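
# Example (illustrative): the per-site flow that PageGetter.get() implements
# below, sketched for a single hypothetical URL:
#
#     url = 'https://example.com/some/page'
#     scheduler = WebsiteScheduler(url)          # one instance per host
#     if scheduler.can_fetch(url):
#         await asyncio.sleep(scheduler.fetch_delay().total_seconds())
#         scheduler.fetching()                   # record the access time
#         ...                                    # actually download the page
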
class CrawlingThread(Thread):
    """ A separate thread for the crawling task. This is needed to use asyncio,
    since the thread will need its own event loop. """

    def __init__(self):
        super(CrawlingThread, self).__init__()

    def run(self):
        tasks = []
        tasks.append(async_print('https://python.org'))
        tasks.append(async_print('https://python.org/about/gettingstarted'))

        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        # gather() accepts bare coroutines, unlike asyncio.wait() on recent
        # Python versions.
        loop.run_until_complete(asyncio.gather(*tasks))
        loop.close()
class PageGetter:
    """ Asynchronously get a webpage, abiding by robots.txt """

    def __init__(self, session, url):
        self.url = url
        self.session = session

    async def get(self):
        """ Actually retrieve the webpage """
        scheduler = WebsiteScheduler(self.url)
        if not scheduler.can_fetch(self.url):
            return None

        delay = scheduler.fetch_delay()
        while delay > timedelta(0):
            await asyncio.sleep(delay.total_seconds())
            delay = scheduler.fetch_delay()
        scheduler.fetching()

        async with async_timeout.timeout(10):
            async with self.session.get(self.url) as resp:
                return await resp.text()
async def async_parser(html_text):
    """ Parse the html of a fetched page (left unimplemented here) """
    raise NotImplementedError
async def async_print(url):
    """ Debug function to follow what's actually happening """
    async with aiohttp.ClientSession() as session:
        html = await PageGetter(session, url).get()
        print('GOT {}HTML for {} at {}'.format(
            'None ' if html is None else '',
            url,
            datetime.now() - startup_time))


if __name__ == '__main__':
    crawl = CrawlingThread()
    crawl.start()
    crawl.join()