# mpri-webdam/crawl/crawl.py


from threading import Thread
from urllib.robotparser import RobotFileParser
from urllib.error import URLError
from urllib.parse import urlparse
from ssl import CertificateError
from random import sample, randrange, randint
import re
from datetime import datetime, timedelta
import asyncio
import aiohttp
import async_timeout
from bs4 import BeautifulSoup, Comment
from profiles.models import BrowserFingerprint, SearchEngine
# Ugly hack to use this module alone instead of integrating it with Django
# from django.conf import settings

# Crawling limits: stop after HARD_LIMIT pages in total, and follow at most
# MAX_PER_PAGE links from any single page.
HARD_LIMIT = 20
MAX_PER_PAGE = 10

FOOTER_URL = re.compile(".*footer.*")


class Settings:
    USER_AGENT = 'Default User'


settings = Settings()


def url_getter(html, current_page, root_url):
    """ Extract from `html` the list of links worth following """
    links_list = []  # The final result

    soup = BeautifulSoup(html, "html.parser")
    # Get only the body
    body = soup.find('body')
    if not body:
        return links_list

    # Remove the footer, if any
    if body.footer:
        body.footer.decompose()

    # Remove all comments
    comments = soup.findAll(text=lambda text: isinstance(text, Comment))
    for comment in comments:
        comment.extract()

    # Remove every element whose id looks like a footer
    footers = soup.findAll(id=FOOTER_URL)
    for footer in footers:
        footer.extract()

    # Collect the href of every remaining link
    links = map(lambda link: link.get("href", ""), body.find_all("a"))

    for link in links:
        if link:  # Edge case: no href found.
            if link.startswith("http"):
                links_list.append(link)
            elif link.startswith('/'):  # Internal link, relative to the site root
                links_list.append(root_url + link)
            elif link.startswith("#"):
                # Bookmark link pointing inside the current page: do not follow it.
                continue
            else:
                links_list.append(current_page + "/" + link)

    # Uniquifier: dict.fromkeys keeps insertion order (Python >= 3.6 only).
    # For older Pythons, use instead:
    #   seen = set()
    #   links_list = [x for x in links_list if x not in seen and not seen.add(x)]
    links_list = list(dict.fromkeys(links_list))

    # Drop links that are almost surely not worth crawling
    forbidden_words = ['login', 'agreement', 'mailto', 'settings']
    links_list = [link for link in links_list
                  if not any(word in link.lower()
                             for word in forbidden_words)]

    return links_list
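
# A small sketch of what url_getter() produces; the page url and markup below
# are illustrative only:
#
#   html = ('<body><a href="/about">a</a><a href="#top">b</a>'
#           '<a href="http://other.org/">c</a></body>')
#   url_getter(html, "http://example.com/page", "http://example.com")
#   # -> ['http://example.com/about', 'http://other.org/']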


class WebsiteSchedulerMeta(type):
    """ Meta-class for WebsiteScheduler, allowing a singleton class-like
    interface, but spawning one instance per canonical website URL """

    _instances = {}
    _canonicalize = re.compile(r'(https?://)?([^/]+)(/?|$)')

    def canonical_url(cls, url):
        """ Canonicalize a url """
        return cls._canonicalize.search(url).groups()[1]

    def __call__(cls, url, *args, **kwargs):
        canonical = cls.canonical_url(url)
        if canonical not in cls._instances:
            cls._instances[canonical] = \
                super(WebsiteSchedulerMeta, cls) \
                .__call__(canonical, *args, **kwargs)
        return cls._instances[canonical]
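
# Illustration of the metaclass behaviour (hypothetical urls): two pages on
# the same host share a single scheduler instance, so robots.txt rules and
# crawl delays are enforced per website rather than per page.
#
#   a = WebsiteScheduler('https://example.com/foo', 'SomeBot')
#   b = WebsiteScheduler('https://example.com/bar', 'SomeBot')
#   assert a is b   # both canonicalize to 'example.com'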


class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
    """ Schedule the accesses to a website as of robots.txt """

    search_engines = []  # Must be set by CrawlingThread.__init__

    def __init__(self, name, user_agent):
        self.name = name
        self.last_crawled = datetime.fromtimestamp(0)
        self.dead = False
        self.can_fetch_b = False
        self.user_agent = (user_agent if user_agent is not None
                           else settings.USER_AGENT)

        if any(self.urlroot() in item for item in self.search_engines):
            print("found a search engine for %s" % self.urlroot())
            self.crawl_delay = timedelta(seconds=5)
            self.can_fetch_b = True
        else:
            try:
                robots_url = self.urlroot() + 'robots.txt'
                self.robot_parser = RobotFileParser(robots_url)
                self.robot_parser.read()  # TODO async?
            except (URLError, CertificateError):
                try:
                    robots_url = self.unsafe_urlroot() + 'robots.txt'
                    self.robot_parser = RobotFileParser(robots_url)
                    self.robot_parser.read()
                except URLError:  # Almost surely an offline website.
                    self.dead = True
                    self.crawl_delay = timedelta(0)
            except Exception as e:
                print(e)
                raise e

            if not self.robot_parser.default_entry:
                self.dead = True
            if not self.dead:
                delay = self.robot_parser.crawl_delay(self.user_agent)
                if delay is None:
                    req_rate = self.robot_parser.request_rate(self.user_agent)
                    if req_rate is None:
                        delay = 5
                    else:
                        # One request every (seconds / requests) seconds,
                        # not a (requests, seconds) tuple.
                        delay = req_rate.seconds / req_rate.requests
                self.crawl_delay = timedelta(seconds=delay)

    def urlroot(self):
        ''' Get the root url for this website '''
        return 'https://{}/'.format(self.name)

    def unsafe_urlroot(self):
        ''' Get the root url for this website, over plain http '''
        return 'http://{}/'.format(self.name)

    def fetch_delay(self):
        ''' Get the delay needed before fetching a page is possible '''
        can_fetch_time = self.last_crawled + self.crawl_delay
        if can_fetch_time < datetime.now():
            return timedelta(0)
        return can_fetch_time - datetime.now()

    def can_fetch(self, url):
        ''' Check whether this program can fetch a given page '''
        return ((self.can_fetch_b)
                or ((not self.dead) and
                    self.robot_parser.can_fetch(self.user_agent, url)))

    def fetching(self):
        ''' Tell the scheduler that a page is being fetched now '''
        self.last_crawled = datetime.now()
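
# Worked example of the delay computation in __init__ above: if robots.txt
# declares "Request-rate: 2/10" and no Crawl-delay, request_rate() returns a
# named tuple with requests=2 and seconds=10, so the per-request delay is
# 10 / 2 = 5 seconds and crawl_delay becomes timedelta(seconds=5).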


class CrawlingThread(Thread):
    """ A separate thread for the crawling task. This is needed to use asyncio,
    since the thread will need its own event loop. """

    def __init__(self, url):
        engine_list = [engine.url for engine in SearchEngine.objects.all()]
        WebsiteScheduler.search_engines = engine_list

        nb_fingerprint = len(BrowserFingerprint.objects.all())
        fingerprint = BrowserFingerprint.objects.all()[
            randint(0, nb_fingerprint - 1)]
        self.headers = fingerprint.serialize_headers()

        self.output_tree = []

        super(CrawlingThread, self).__init__()
        self.url = url

    def run(self):
        tasks = []
        #tasks.append(async_crawler("http://plus.google.com/+Python"))
        #tasks.append(async_crawler('https://python.org/'))
        tasks.append(run_crawl(self.url, self.output_tree, self.headers))

        # Create the loop outside the try block, so that `loop` is always
        # defined when the finally clause runs.
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        try:
            loop.run_until_complete(asyncio.wait(tasks))
        finally:
            loop.close()
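
# Typical usage sketch, assuming the Django ORM is configured and the
# SearchEngine and BrowserFingerprint tables each contain at least one row
# (the url is illustrative):
#
#   crawler = CrawlingThread('https://python.org/')
#   crawler.start()
#   crawler.join()
#   tree = crawler.output_tree   # flat list of CrawlElem, linked via .parent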


class PageGetter:
    """ Asynchronously get a webpage, abiding by robots.txt """

    headers = None

    def __init__(self, session, url, user_agent):
        self.url = url
        self.session = session
        self.user_agent = user_agent

    async def get(self, ssl=True):
        """ Actually retrieve the webpage """
        scheduler = WebsiteScheduler(self.url, self.user_agent)
        if not scheduler.can_fetch(self.url):
            return None

        delay = scheduler.fetch_delay()
        while delay > timedelta(0):
            await asyncio.sleep(delay.total_seconds())
            delay = scheduler.fetch_delay()
        scheduler.fetching()

        async with async_timeout.timeout(10):
            async with self.session.get(self.url, verify_ssl=ssl) as resp:
                try:
                    return await resp.text()
                except UnicodeDecodeError:
                    return None


async def async_print(url):
    """ Debug function to follow what's actually happening """
    async with aiohttp.ClientSession() as session:
        html = await PageGetter(session, url,
                                settings.USER_AGENT).get(ssl=False)

    print('GOT {}HTML for {}'.format(
        'None ' if html is None else '',
        url,
    ))


class CrawlElem:
    ''' Describes a crawled element, to be assembled into a tree '''

    def __init__(self, url, parent):
        self.url = url
        self.parent = parent
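

# A minimal sketch (not used elsewhere in this module, purely an illustration)
# of how the flat output_tree list of CrawlElem can be assembled into the tree
# promised by the docstring above: children are grouped under their parent.
def build_tree(crawl_elems):
    ''' Group CrawlElem instances by parent; the None key holds the roots '''
    children = {}
    for elem in crawl_elems:
        children.setdefault(elem.parent, []).append(elem)
    return children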


async def run_crawl(url, output_tree, headers=None):
    ''' Starts a crawling session '''
    if headers is None:
        headers = {}
    if 'User-Agent' not in headers:
        headers['User-Agent'] = settings.USER_AGENT
    user_agent = headers['User-Agent']

    crawled = set()

    async with aiohttp.ClientSession(headers=headers) as session:
        await async_crawler(
            url, output_tree, crawled, user_agent, session, None)


def simplify_url(url):
    ''' Strip the anchor, the scheme and a leading www. from a url, so that
    trivially equivalent urls compare equal when deduplicating '''
    anchor = url.find('#')
    if anchor >= 0:
        url = url[:anchor]
    prot = url.find('://')
    if prot >= 0:
        url = url[prot+3:]
    if url.startswith('www.'):
        url = url[4:]
    return url
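
# For instance (an illustrative url, not one taken from a crawl):
#
#   simplify_url('https://www.example.com/page#section')
#   # -> 'example.com/page'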


async def async_crawler(url, out_tree, crawled, user_agent, session, parent):
    if len(crawled) >= HARD_LIMIT:
        return
    crawled.add(simplify_url(url))
    parsed_url = urlparse(url)
    print("Crawling {}".format(url))
    try:
        async with async_timeout.timeout(3):
            html = await PageGetter(session, url, user_agent).get(ssl=False)
    except asyncio.TimeoutError:
        return

    new_tasks = []
    if html:
        this_elem = CrawlElem(url, parent)
        out_tree.append(this_elem)
        new_urls = url_getter(
            html,
            url,
            parsed_url.scheme + "://" + parsed_url.netloc
        )
        if new_urls:
            # Follow a random subset of the links found on this page
            sampled = sample(
                new_urls,
                randrange(min(MAX_PER_PAGE, len(new_urls)))
            )
            for sample_url in sampled:
                if simplify_url(sample_url) not in crawled:
                    new_tasks.append(async_crawler(
                        sample_url, out_tree, crawled, user_agent, session,
                        this_elem))
    else:
        print("No HTML received")

    if len(crawled) >= HARD_LIMIT:
        return
    if new_tasks:
        await asyncio.wait(new_tasks)
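

# Minimal standalone sketch for manual testing, assuming network access is
# available; it only exercises async_print(), which does not depend on the
# Django models (the url below is just an example).
if __name__ == '__main__':
    demo_loop = asyncio.new_event_loop()
    asyncio.set_event_loop(demo_loop)
    try:
        demo_loop.run_until_complete(async_print('https://python.org/'))
    finally:
        demo_loop.close()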