mpri-webdam/crawl/crawl.py


from threading import Thread
from urllib.robotparser import RobotFileParser
from bs4 import BeautifulSoup, Comment
import re
from datetime import datetime, timedelta
import asyncio
import aiohttp
import async_timeout
# Ugly hack to use this module alone instead of integrating it with Django
# from django.conf import settings
# Matches direct bookmark links in the html (e.g. "#section").
# We use this to avoid following that kind of link.
BOOKMARK_URL = "#.*"


class Settings:
    USER_AGENT = 'Blah'


settings = Settings()
startup_time = datetime.now()


def url_getter(html, current_page, root_url):
    links_list = []  # The final result
    soup = BeautifulSoup(html, "html.parser")
    # Get only the body
    body = soup.find('body')
    # Remove the footer
    if body.footer is not None:
        body.footer.decompose()
    # Remove all comments
    comments = soup.findAll(text=lambda text: isinstance(text, Comment))
    for comment in comments:
        comment.extract()
    # Collect the href targets, skipping bookmark links pointing to the
    # current html page.
    links = body.find_all("a")
    for link in links:
        href = link.get("href")
        if href is None:
            continue
        if href.startswith("http"):
            links_list.append(href)
        elif href.startswith('/'):  # Internal link, relative to the site root url
            links_list.append(root_url + href)
        elif href.startswith("#"):
            print("Invalid link: internal bookmark")
        else:
            links_list.append(current_page + href)

    ## uniqifier, works with python <= 3.6
    #seen = set()
    #links_list = [x for x in links_list if x not in seen and not seen.add(x)]

    # uniqifier, works only with python >= 3.6 (dicts preserve insertion order)
    links_list = list(dict.fromkeys(links_list))

    return links_list
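
# Illustration only (not part of the original file): a minimal sketch of how
# url_getter could be exercised. The HTML snippet and example.com URLs below
# are hypothetical; the expected result is the absolute link kept as-is, the
# root-relative link resolved against root_url, and the bookmark link skipped.
def _demo_url_getter():
    html = (
        '<html><body>'
        '<a href="https://example.com/docs">absolute</a>'
        '<a href="/about">root-relative</a>'
        '<a href="#top">bookmark</a>'
        '</body></html>'
    )
    return url_getter(html, 'https://example.com/start/', 'https://example.com')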


class WebsiteSchedulerMeta(type):
    """ Meta-class for WebsiteScheduler, allowing a singleton class-like
    interface, but spawning one instance per canonical website URL """

    _instances = {}
    _canonicalize = re.compile(r'(https?://)?([^/]+)(/?|$)')

    def canonical_url(cls, url):
        """ Canonicalize a url """
        return cls._canonicalize.search(url).groups()[1]

    def __call__(cls, url, *args, **kwargs):
        canonical = cls.canonical_url(url)
        if canonical not in cls._instances:
            cls._instances[canonical] = \
                super(WebsiteSchedulerMeta, cls) \
                .__call__(canonical, *args, **kwargs)
        return cls._instances[canonical]


class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
    """ Schedule the accesses to a website according to its robots.txt """

    def __init__(self, name):
        self.name = name
        self.last_crawled = datetime.fromtimestamp(0)
        robots_url = self.urlroot() + 'robots.txt'
        self.robot_parser = RobotFileParser(robots_url)
        self.robot_parser.read()  # TODO async?

        delay = self.robot_parser.crawl_delay(settings.USER_AGENT)
        if delay is None:
            req_rate = self.robot_parser.request_rate(settings.USER_AGENT)
            if req_rate is None:
                delay = 5
            else:
                # Spread the allowed requests evenly over the rate window
                delay = req_rate.seconds / req_rate.requests
        self.crawl_delay = timedelta(seconds=delay)

    def urlroot(self):
        ''' Get the root url for this website '''
        return 'https://{}/'.format(self.name)

    def fetch_delay(self):
        ''' Get the delay needed before fetching a page is possible '''
        can_fetch_time = self.last_crawled + self.crawl_delay
        if can_fetch_time < datetime.now():
            return timedelta(0)
        return can_fetch_time - datetime.now()

    def can_fetch(self, url):
        ''' Check whether this program can fetch a given page '''
        return self.robot_parser.can_fetch(settings.USER_AGENT, url)

    def fetching(self):
        ''' Tell the scheduler that a page is being fetched now '''
        self.last_crawled = datetime.now()
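
# Illustration only (not part of the original file): a quick sanity check of
# the per-website singleton behaviour. canonical_url() strips the scheme and
# path, so every URL on one host maps to a single WebsiteScheduler instance.
# The example.com URLs are hypothetical; no network access happens here.
def _demo_canonicalization():
    a = WebsiteScheduler.canonical_url('https://example.com/some/page')
    b = WebsiteScheduler.canonical_url('http://example.com/other')
    assert a == b == 'example.com'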


class CrawlingThread(Thread):
    """ A separate thread for the crawling task. This is needed to use asyncio,
    since the thread will need its own event loop. """

    def __init__(self):
        super(CrawlingThread, self).__init__()

    def run(self):
        tasks = []
        tasks.append(async_print('https://python.org'))
        tasks.append(async_print('https://python.org/about/gettingstarted'))

        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        loop.run_until_complete(asyncio.wait(tasks))
        loop.close()
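
# Illustration only (not part of the original file): the same "one event loop
# per thread" pattern that CrawlingThread.run() uses, factored into a
# standalone helper. `coros` is any iterable of coroutine objects;
# asyncio.gather is used here instead of asyncio.wait purely for the sketch.
def _run_in_fresh_loop(coros):
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        loop.run_until_complete(asyncio.gather(*coros))
    finally:
        loop.close()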


class PageGetter:
    """ Asynchronously get a webpage, abiding by robots.txt """

    def __init__(self, session, url):
        self.url = url
        self.session = session

    async def get(self):
        """ Actually retrieve the webpage """
        scheduler = WebsiteScheduler(self.url)
        if not scheduler.can_fetch(self.url):
            return None

        delay = scheduler.fetch_delay()
        while delay > timedelta(0):
            await asyncio.sleep(delay.total_seconds())
            delay = scheduler.fetch_delay()
        scheduler.fetching()

        async with async_timeout.timeout(10):
            async with self.session.get(self.url) as resp:
                return await resp.text()


async def async_print(url):
    """ Debug function to follow what's actually happening """
    async with aiohttp.ClientSession() as session:
        html = await PageGetter(session, url).get()

        print('GOT {}HTML for {} at {}'.format(
            'None ' if html is None else '',
            url,
            datetime.now() - startup_time))


if __name__ == '__main__':
    crawl = CrawlingThread()
    crawl.start()
    crawl.join()