Merge branch 'crawl' into histories_models

2018-02-24 18:44:27 +01:00 · 2018-02-24 18:44:27 +01:00 · 60bfc8cb77
commit 60bfc8cb77
parent 12c8c652d7 d19c2e8216
9 changed files with 278 additions and 1 deletions
--- a/crawl/init.py
+++ b/crawl/init.py
--- a/crawl/admin.py
+++ b/crawl/admin.py
@ -0,0 +1,3 @@
+from django.contrib import admin
+
+# Register your models here.
--- a/crawl/apps.py
+++ b/crawl/apps.py
@ -0,0 +1,5 @@
+from django.apps import AppConfig
+
+
+class CrawlConfig(AppConfig):
+    """Django application configuration for the ``crawl`` app."""
+    name = 'crawl'
--- a/crawl/crawl.py
+++ b/crawl/crawl.py
@ -0,0 +1,248 @@
+from threading import Thread
+from urllib.robotparser import RobotFileParser
+from urllib.error import URLError
+from urllib.parse import urlparse
+
+from ssl import CertificateError
+from random import sample, randrange
+import re
+from datetime import datetime, timedelta
+
+import asyncio
+import aiohttp
+import async_timeout
+
+from bs4 import BeautifulSoup, Comment
+
+# Ugly hack to use this module alone instead of integrating it with Django
+# from django.conf import settings
+
+# url_getter (below) extracts the links found in a page's HTML. Direct
+# bookmarks (hrefs starting with "#") are skipped so they are not followed.
+
+# async_crawler stops once this many pages have been crawled.
+HARD_LIMIT = 20
+# Exclusive upper bound on the number of links sampled from one page.
+MAX_PER_PAGE = 10
+
+# Matches any element id containing "footer"; url_getter uses it to drop
+# footer-like blocks before collecting links.
+FOOTER_URL = re.compile(".*footer.*")
+
+class Settings:
+    """Stand-in for ``django.conf.settings`` while this module runs alone."""
+    USER_AGENT = 'Blah'
+
+settings = Settings()
+# Reference instant; async_print reports elapsed time relative to it.
+startup_time = datetime.now()
+
+
+def url_getter(html, current_page, root_url):
+    """Extract the crawlable links of an HTML page.
+
+    The body's footer element, every HTML comment and every element whose
+    id contains "footer" are removed first. The ``href`` of each remaining
+    ``<a>`` of the body is then turned into an absolute URL.
+
+    Args:
+        html: markup of the page.
+        current_page: URL the page was fetched from; relative links are
+            resolved against it.
+        root_url: ``scheme://host`` of the page, prepended to links that
+            start with a single "/".
+
+    Returns:
+        The list of unique absolute http(s) URLs, in document order, minus
+        those containing a forbidden word ('login', 'agreement', 'mailto').
+        The list is empty when the page has no ``<body>``.
+    """
+    # Imported locally: the module only imports urlparse at top level.
+    from urllib.parse import urljoin
+
+    soup = BeautifulSoup(html, "html.parser")
+    # Only the body is searched for links
+    body = soup.find('body')
+    if not body:
+        return []
+    # Remove the footer
+    if body.footer:
+        body.footer.decompose()
+    # Remove all comments
+    for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
+        comment.extract()
+    # Remove every element whose id mentions "footer"
+    for footer in soup.findAll(id=FOOTER_URL):
+        footer.extract()
+
+    links_list = []
+    for anchor in body.find_all("a"):
+        link = anchor.get("href", "").strip()
+        # Skip anchors without href and bookmarks into the current page.
+        if not link or link.startswith("#"):
+            continue
+        if link.startswith("http"):
+            absolute = link
+        elif link.startswith("/") and not link.startswith("//"):
+            # Internal link, relative to the root of the website
+            absolute = root_url + link
+        else:
+            # Page-relative ("a.html", "../a") and protocol-relative
+            # ("//host/a") links: plain concatenation would yield invalid
+            # URLs such as "https://host/page.html/a.html".
+            absolute = urljoin(current_page, link)
+        # Drop anything that is not a web page (javascript:, tel:, ...).
+        if absolute.startswith("http"):
+            links_list.append(absolute)
+
+    # Order-preserving uniqifier. It relies on dicts keeping insertion order
+    # (Python >= 3.6); on older versions use a "seen" set instead.
+    links_list = list(dict.fromkeys(links_list))
+
+    forbidden_words = ['login', 'agreement', 'mailto']
+    return [link for link in links_list
+            if not any(word in link.lower() for word in forbidden_words)]
+
+
+
+
+class WebsiteSchedulerMeta(type):
+    """ Metaclass giving WebsiteScheduler a singleton-per-website behaviour:
+    calling the class with any URL of a website returns the single instance
+    tied to that website's canonical URL, creating it on first use """
+
+    _instances = {}
+    _canonicalize = re.compile(r'(https?://)?([^/]+)(/?|$)')
+
+    def canonical_url(cls, url):
+        """ Reduce a url to the part found between the optional http(s)://
+        prefix and the first following slash """
+        match = cls._canonicalize.search(url)
+        return match.group(2)
+
+    def __call__(cls, url, *args, **kwargs):
+        key = cls.canonical_url(url)
+        try:
+            return cls._instances[key]
+        except KeyError:
+            # First access to this website: build and remember its instance
+            instance = super().__call__(key, *args, **kwargs)
+            cls._instances[key] = instance
+            return instance
+
+
+class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
+    """ Schedule the accesses to a website as of robots.txt
+
+    The metaclass hands out one instance per canonical website name. On
+    creation, robots.txt is read over https, then over http if that fails.
+    A website whose robots.txt cannot be reached, or for which the parser
+    holds no default entry, is flagged dead and is never fetched. """
+
+    def __init__(self, name):
+        self.name = name
+        self.last_crawled = datetime.fromtimestamp(0)
+        self.dead = False
+        # Always a timedelta, so that fetch_delay() works on dead sites too
+        self.crawl_delay = timedelta(0)
+        try:
+            robots_url = self.urlroot() + 'robots.txt'
+            self.robot_parser = RobotFileParser(robots_url)
+            self.robot_parser.read()  # TODO async?
+        except (URLError, CertificateError):
+            try:
+                robots_url = self.unsafe_urlroot() + 'robots.txt'
+                self.robot_parser = RobotFileParser(robots_url)
+                self.robot_parser.read()
+            except URLError:  # Almost surely an offline website.
+                self.dead = True
+        except Exception as e:
+            print(e)
+            raise  # bare raise keeps the original traceback
+        if not self.robot_parser.default_entry:
+            self.dead = True
+        if not self.dead:
+            self.crawl_delay = timedelta(seconds=self._robots_delay())
+
+    def _robots_delay(self):
+        ''' Seconds to wait between two requests according to robots.txt,
+        defaulting to 5 when it states neither a delay nor a rate '''
+        delay = self.robot_parser.crawl_delay(settings.USER_AGENT)
+        if delay is not None:
+            return delay
+        req_rate = self.robot_parser.request_rate(settings.USER_AGENT)
+        if req_rate is None or not req_rate.requests:
+            return 5
+        # "requests per seconds" rate -> seconds between two requests
+        return req_rate.seconds / req_rate.requests
+
+    def urlroot(self):
+        ''' Get the root url for this website '''
+        return 'https://{}/'.format(self.name)
+
+    def unsafe_urlroot(self):
+        ''' Get the plain http root url for this website '''
+        return 'http://{}/'.format(self.name)
+
+    def fetch_delay(self):
+        ''' Get the delay needed before fetching a page is possible '''
+        # Read the clock once so the result can never be negative
+        remaining = self.last_crawled + self.crawl_delay - datetime.now()
+        return max(remaining, timedelta(0))
+
+    def can_fetch(self, url):
+        ''' Check whether this program can fetch a given page '''
+        return (not self.dead) and self.robot_parser.can_fetch(settings.USER_AGENT, url)
+
+    def fetching(self):
+        ''' Tell the scheduler that a page is being fetched now '''
+        self.last_crawled = datetime.now()
+
+
+class CrawlingThread(Thread):
+    """ A separate thread for the crawling task. This is needed to use asyncio,
+    since the thread will need its own event loop. """
+
+    def __init__(self):
+        super().__init__()
+
+    def run(self):
+        """ Crawl from the hard-coded start page on this thread's own event
+        loop, and close the loop whatever the outcome """
+        loop = asyncio.new_event_loop()
+        asyncio.set_event_loop(loop)
+        try:
+            # Wrap the coroutines into tasks of this loop: asyncio.wait() no
+            # longer accepts bare coroutines on recent Python versions.
+            tasks = [
+                loop.create_task(async_crawler('https://python.org/')),
+            ]
+            loop.run_until_complete(asyncio.wait(tasks))
+        finally:
+            loop.close()
+
+
+class PageGetter:
+    """ Fetch a single webpage asynchronously while honouring robots.txt """
+
+    def __init__(self, session, url):
+        self.session = session
+        self.url = url
+
+    async def get(self, ssl=True):
+        """ Download the page and return its text, or None when the page may
+        not be fetched or when its content cannot be decoded """
+        site = WebsiteScheduler(self.url)
+        if not site.can_fetch(self.url):
+            return None
+
+        # Sleep until the website's crawl delay has elapsed
+        while True:
+            remaining = site.fetch_delay()
+            if remaining <= timedelta(0):
+                break
+            await asyncio.sleep(remaining.total_seconds())
+        site.fetching()
+
+        async with async_timeout.timeout(10):
+            async with self.session.get(self.url, ssl=ssl) as response:
+                try:
+                    text = await response.text()
+                except UnicodeDecodeError:
+                    text = None
+                return text
+
+
+async def async_print(url):
+    """ Debug helper: fetch url, then print whether some HTML came back and
+    how long after startup this happened """
+    async with aiohttp.ClientSession() as session:
+        getter = PageGetter(session, url)
+        html = await getter.get(ssl=False)
+
+        marker = '' if html is not None else 'None '
+        elapsed = datetime.now() - startup_time
+        print('GOT {}HTML for {} at {}'.format(marker, url, elapsed))
+
+async def async_crawler(url):
+    """ Crawl the web starting from url.
+
+    Pages are taken from a FIFO queue. Each page that yields HTML is
+    recorded as crawled and a random sample of its links is queued. The
+    crawl ends when the queue is empty or HARD_LIMIT pages were crawled.
+
+    Returns the list of crawled urls, in crawling order. """
+    queue = [url]
+    crawled = []
+    # A single session for the whole crawl, so that connections are pooled
+    # instead of being re-established for every page.
+    async with aiohttp.ClientSession() as session:
+        while queue and (len(crawled) < HARD_LIMIT):
+            url = queue.pop(0)
+            parsed_url = urlparse(url)
+            print("Crawling {}".format(url))
+            html = await PageGetter(session, url).get(ssl=False)
+            if not html:
+                continue
+            new_urls = url_getter(
+                html,
+                url,
+                parsed_url.scheme + "://" + parsed_url.netloc
+            )
+            crawled.append(url)
+            if not new_urls:
+                continue
+            # randrange(k) yields 0..k-1: possibly no link at all is kept
+            sampled = sample(
+                new_urls,
+                randrange(min(MAX_PER_PAGE, len(new_urls)))
+            )
+            queue += [sample_url for sample_url in sampled
+                      if sample_url not in queue
+                      and sample_url not in crawled]
+    print(crawled)
+    return crawled
+
+if __name__ == '__main__':
+    # Standalone run: crawl in a dedicated thread and wait for it to finish
+    crawler_thread = CrawlingThread()
+    crawler_thread.start()
+    crawler_thread.join()
--- a/crawl/migrations/init.py
+++ b/crawl/migrations/init.py
--- a/crawl/models.py
+++ b/crawl/models.py
@ -0,0 +1,3 @@
+from django.db import models
+
+# Create your models here.
--- a/crawl/views.py
+++ b/crawl/views.py
@ -0,0 +1,3 @@
+from django.shortcuts import render
+
+# Create your views here.
--- a/pinocchio/settings.py
+++ b/pinocchio/settings.py
@ -29,7 +29,8 @@ INSTALLED_APPS = [
    'django.contrib.messages',
    'django.contrib.staticfiles',
    'profiles',
-    'histories'
+    'histories',
+    'crawl',
 ]

 MIDDLEWARE = [
@ -103,3 +104,5 @@ USE_TZ = True
 # https://docs.djangoproject.com/en/2.0/howto/static-files/

 STATIC_URL = '/static/'
+
+USER_AGENT = 'UnaffiliatedBot/0.1'
--- a/requirements.txt
+++ b/requirements.txt
@ -1,2 +1,14 @@
+aiodns==1.1.1
+aiohttp==3.0.1
+async-timeout==2.0.0
+attrs==17.4.0
+cchardet==2.1.1
+chardet==3.0.4
 Django==2.0.1
+idna==2.6
+idna-ssl==1.0.0
+multidict==4.1.0
+pycares==2.3.0
 pytz==2017.3
+yarl==1.1.1
+beautifulsoup4==4.6.0