Merge branch 'crawl' into histories_models

2018-02-24 18:44:27 +01:00 · 2018-02-24 18:44:27 +01:00 · 60bfc8cb77
commit 60bfc8cb77
parent 12c8c652d7 d19c2e8216
9 changed files with 278 additions and 1 deletions
--- a/crawl/init.py
+++ b/crawl/init.py
--- a/crawl/admin.py
+++ b/crawl/admin.py
@ -0,0 +1,3 @@
 from django.contrib import admin
 # Register your models here.
--- a/crawl/apps.py
+++ b/crawl/apps.py
@ -0,0 +1,5 @@
 from django.apps import AppConfig
 class CrawlConfig(AppConfig):
    """ Django application configuration for the `crawl` app. """
    # Name under which the application is referenced.
    name = 'crawl'
--- a/crawl/crawl.py
+++ b/crawl/crawl.py
@ -0,0 +1,248 @@
 from threading import Thread
 from urllib.robotparser import RobotFileParser
 from urllib.error import URLError
 from urllib.parse import urlparse
 from ssl import CertificateError
 from random import sample, randrange
 import re
 from datetime import datetime, timedelta
 import asyncio
 import aiohttp
 import async_timeout
 from bs4 import BeautifulSoup, Comment
 # Ugly hack to use this module alone instead of integrating it with Django
 # from django.conf import settings
 # Gets all the direct bookmarks in the html.
 # We want this to avoid following this kind of bookmark
 # A crawl (`async_crawler`) stops once this many pages have been crawled.
 HARD_LIMIT = 20
 # Bound on the number of links randomly picked from one page by
 # `async_crawler`.
 MAX_PER_PAGE = 10
 # Matches any string containing "footer"; `url_getter` uses it to find, by
 # id, the elements to strip from a page before collecting its links.
 FOOTER_URL = re.compile(".*footer.*")
 class Settings:
    """ Stand-in for the Django settings object, exposing only what this
    module reads, so that the module can be used without Django. """
    # User agent name looked up in robots.txt rules.
    USER_AGENT = 'Blah'
 settings = Settings()
 # Instant at which the module was loaded; `async_print` reports times
 # relative to it.
 startup_time = datetime.now()
 def url_getter(html, current_page, root_url):
    """ Extract the links worth following from an HTML page.

    The `<footer>` of the body, every HTML comment and every element whose
    id contains "footer" are removed first. The `href` of each remaining
    `<a>` of the body is then turned into an absolute URL.

    Parameters:
        html: markup of the page.
        current_page: URL the page was fetched from, used as the base for
            relative links.
        root_url: scheme and host of the website, without trailing slash,
            used as the prefix of links starting with a single "/".

    Returns the list of absolute URLs, without duplicates, in document
    order, excluding those containing one of the forbidden words. The list
    is empty when the page has no `<body>`.
    """
    # Local import: the module only imports `urlparse` from urllib.parse.
    from urllib.parse import urljoin

    soup = BeautifulSoup(html, "html.parser")
    # Get only the body
    body = soup.find('body')
    if not body:
        return []

    # Remove the footer
    if body.footer:
        body.footer.decompose()
    # Remove all comments
    for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
        comment.extract()
    # Remove everything identified as a footer through its id
    for footer in soup.findAll(id=FOOTER_URL):
        footer.extract()

    links_list = []
    for anchor in body.find_all("a"):
        link = anchor.get("href", "")
        # Skip anchors without href and bookmarks into the current page.
        if not link or link.startswith("#"):
            continue
        if link.startswith(("http://", "https://")):
            links_list.append(link)
        elif link.startswith("/") and not link.startswith("//"):
            # Internal link, relative to the root of the website
            links_list.append(root_url + link)
        else:
            # Relative ("page.html", "../x") or protocol-relative ("//host/x")
            # link: resolve it against the current page instead of gluing it
            # after the page URL.
            resolved = urljoin(current_page, link)
            # Drop what cannot be crawled (javascript:, tel:, ...).
            if resolved.lower().startswith(("http://", "https://")):
                links_list.append(resolved)

    # Uniqifier keeping the first occurrence of each link. It relies on
    # dicts preserving insertion order (python >= 3.6).
    links_list = list(dict.fromkeys(links_list))

    forbidden_words = ['login', 'agreement', 'mailto']
    return [link for link in links_list
            if not any(word in link.lower() for word in forbidden_words)]
 class WebsiteSchedulerMeta(type):
    """ Metaclass behind WebsiteScheduler: instantiating the class looks like
    a plain constructor call, but a single instance is kept per canonical
    website URL and handed back on every later call for that website. """
    # Instances created so far, indexed by canonical URL.
    _instances = {}
    # Optional http(s) scheme, then everything up to the first slash.
    _canonicalize = re.compile(r'(https?://)?([^/]+)(/?|$)')

    def canonical_url(cls, url):
        """ Return the canonical form of `url`: what follows the optional
        scheme, up to the first slash """
        match = cls._canonicalize.search(url)
        return match.group(2)

    def __call__(cls, url, *args, **kwargs):
        """ Return the instance registered for the website of `url`, building
        it (with the canonical URL as first argument) on first use """
        key = cls.canonical_url(url)
        registry = cls._instances
        if key in registry:
            return registry[key]
        instance = super().__call__(key, *args, **kwargs)
        registry[key] = instance
        return instance
 class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
    """ Schedule the accesses to a website as of robots.txt

    Attributes:
        name: website name the instance was built with.
        last_crawled: time of the last fetch announced through `fetching`.
        dead: True when the website must not be crawled at all.
        crawl_delay: minimal timedelta between two fetches.
        robot_parser: RobotFileParser loaded with the site's robots.txt.
    """
    def __init__(self, name):
        """ Download robots.txt over https, falling back to http, and derive
        the crawl delay from it. Unexpected errors are printed and
        re-raised. """
        self.name = name
        self.last_crawled = datetime.fromtimestamp(0)
        self.dead = False
        # Always a timedelta, even for a dead website, so that `fetch_delay`
        # never fails on a missing or ill-typed attribute.
        self.crawl_delay = timedelta(0)
        try:
            robots_url = self.urlroot() + 'robots.txt'
            self.robot_parser = RobotFileParser(robots_url)
            self.robot_parser.read()  # TODO async?
        except (URLError, CertificateError):
            try:
                robots_url = self.unsafe_urlroot() + 'robots.txt'
                self.robot_parser = RobotFileParser(robots_url)
                self.robot_parser.read()
            except URLError:  # Almost surely an offline website.
                self.dead = True
        except Exception as e:
            print(e)
            raise
        # A robots.txt without default entry makes the website dead.
        # NOTE(review): presumably this also covers a website without any
        # robots.txt -- confirm this is intended.
        if not self.robot_parser.default_entry:
            self.dead = True
        if not self.dead:
            self.crawl_delay = timedelta(seconds=self._robots_delay())

    def _robots_delay(self):
        """ Number of seconds to wait between two fetches, as of robots.txt """
        delay = self.robot_parser.crawl_delay(settings.USER_AGENT)
        if delay is not None:
            return delay
        req_rate = self.robot_parser.request_rate(settings.USER_AGENT)
        if req_rate is not None and req_rate.requests > 0:
            # `requests` fetches allowed every `seconds` seconds.
            return req_rate.seconds / req_rate.requests
        # Nothing usable in robots.txt: default delay.
        return 5

    def urlroot(self):
        ''' Get the root url for this website '''
        return 'https://{}/'.format(self.name)

    def unsafe_urlroot(self):
        ''' Get the root url for this website, over plain http '''
        return 'http://{}/'.format(self.name)

    def fetch_delay(self):
        ''' Get the delay needed before fetching a page is possible '''
        # Read the clock once, so that the test and the result agree.
        now = datetime.now()
        can_fetch_time = self.last_crawled + self.crawl_delay
        return max(timedelta(0), can_fetch_time - now)

    def can_fetch(self, url):
        ''' Check whether this program can fetch a given page '''
        return (not self.dead) and self.robot_parser.can_fetch(settings.USER_AGENT, url)

    def fetching(self):
        ''' Tell the scheduler that a page is being fetched now '''
        self.last_crawled = datetime.now()
 class CrawlingThread(Thread):
    """ A separate thread for the crawling task. This is needed to use asyncio,
    since the thread will need its own event loop. """
    def __init__(self):
        super(CrawlingThread, self).__init__()

    def run(self):
        """ Crawl from https://python.org/ on an event loop owned by this
        thread, and close the loop whatever happens """
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        try:
            # Wrap the coroutines into tasks of this loop: `asyncio.wait` no
            # longer accepts bare coroutines on recent python versions.
            tasks = [loop.create_task(async_crawler('https://python.org/'))]
            loop.run_until_complete(asyncio.wait(tasks))
        finally:
            loop.close()
 class PageGetter:
    """ Asynchronously get a webpage, abiding by robots.txt """
    def __init__(self, session, url):
        """ `session` is the aiohttp session used to fetch `url` """
        self.url = url
        self.session = session

    async def get(self, ssl=True):
        """ Actually retrieve the webpage

        Waits as long as the website scheduler requires, then fetches the
        page with a 10 seconds timeout; `ssl` is forwarded to the session.

        Returns the text of the page, or None when robots.txt forbids the
        fetch, when the page cannot be decoded, or when the request fails
        or times out.
        """
        scheduler = WebsiteScheduler(self.url)
        if not scheduler.can_fetch(self.url):
            return None
        # Another fetch on the same website may happen while sleeping, hence
        # the delay is asked again until it reaches zero.
        delay = scheduler.fetch_delay()
        while delay > timedelta(0):
            await asyncio.sleep(delay.total_seconds())
            delay = scheduler.fetch_delay()
        scheduler.fetching()
        try:
            async with async_timeout.timeout(10):
                async with self.session.get(self.url, ssl=ssl) as resp:
                    return await resp.text()
        except UnicodeDecodeError:
            return None
        except (asyncio.TimeoutError, aiohttp.ClientError) as exc:
            # One unreachable page must not abort the whole crawl.
            print("Could not fetch {}: {!r}".format(self.url, exc))
            return None
 async def async_print(url):
    """ Debugging aid: fetch `url` and print whether some HTML came back,
    along with the time elapsed since the module was loaded """
    async with aiohttp.ClientSession() as session:
        getter = PageGetter(session, url)
        html = await getter.get(ssl=False)
        if html is None:
            marker = 'None '
        else:
            marker = ''
        elapsed = datetime.now() - startup_time
        print('GOT {}HTML for {} at {}'.format(marker, url, elapsed))
 async def async_crawler(url):
    """ Crawl the web from `url`, following randomly chosen links.

    Pages are taken from a queue, first in first out. Each page that yields
    some HTML is recorded as crawled, and a random sample of at most
    MAX_PER_PAGE of its links, not already queued nor crawled, is appended
    to the queue. The crawl ends when the queue is empty or when HARD_LIMIT
    pages have been crawled.

    Returns the list of crawled URLs, in crawling order; it is also printed.
    """
    queue = [url]
    crawled = []
    # A single session serves the whole crawl.
    async with aiohttp.ClientSession() as session:
        while queue and (len(crawled) < HARD_LIMIT):
            url = queue.pop(0)
            parsed_url = urlparse(url)
            print("Crawling {}".format(url))
            html = await PageGetter(session, url).get(ssl=False)
            if not html:
                continue
            new_urls = url_getter(
                html,
                url,
                parsed_url.scheme + "://" + parsed_url.netloc
            )
            crawled.append(url)
            if not new_urls:
                continue
            # Pick between 0 and `limit` links, both included: without the
            # `+ 1`, MAX_PER_PAGE is never reached and the only link of a
            # page is never followed.
            limit = min(MAX_PER_PAGE, len(new_urls))
            sampled = sample(new_urls, randrange(limit + 1))
            queue += [sample_url for sample_url in sampled
                      if sample_url not in queue
                      and sample_url not in crawled]
    print(crawled)
    return crawled
 # Script entry point: start one crawling thread and wait for it to finish.
 if __name__ == '__main__':
    crawl = CrawlingThread()
    crawl.start()
    crawl.join()
--- a/crawl/migrations/init.py
+++ b/crawl/migrations/init.py
--- a/crawl/models.py
+++ b/crawl/models.py
@ -0,0 +1,3 @@
 from django.db import models
 # Create your models here.
--- a/crawl/views.py
+++ b/crawl/views.py
@ -0,0 +1,3 @@
 from django.shortcuts import render
 # Create your views here.
--- a/pinocchio/settings.py
+++ b/pinocchio/settings.py
@ -29,7 +29,8 @@ INSTALLED_APPS = [
    'django.contrib.messages',
    'django.contrib.staticfiles',
    'profiles',
-    'histories'
+    'histories',
    'crawl',
 ]
 MIDDLEWARE = [
@ -103,3 +104,5 @@ USE_TZ = True
 # https://docs.djangoproject.com/en/2.0/howto/static-files/
 STATIC_URL = '/static/'
 USER_AGENT = 'UnaffiliatedBot/0.1'
--- a/requirements.txt
+++ b/requirements.txt
@ -1,2 +1,14 @@
 aiodns==1.1.1
 aiohttp==3.0.1
 async-timeout==2.0.0
 attrs==17.4.0
 cchardet==2.1.1
 chardet==3.0.4
 Django==2.0.1
 idna==2.6
 idna-ssl==1.0.0
 multidict==4.1.0
 pycares==2.3.0
 pytz==2017.3
 yarl==1.1.1
 beautifulsoup4==4.6.0
		`@ -0,0 +1,3 @@`
							`from django.contrib import admin`

							`# Register your models here.`
		`@ -0,0 +1,3 @@`
							`from django.db import models`

							`# Create your models here.`
		`@ -0,0 +1,3 @@`
							`from django.shortcuts import render`

							`# Create your views here.`