diff --git a/crawl/__init__.py b/crawl/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/crawl/admin.py b/crawl/admin.py
new file mode 100644
index 0000000..8c38f3f
--- /dev/null
+++ b/crawl/admin.py
@@ -0,0 +1,3 @@
+from django.contrib import admin
+
+# Register your models here.
diff --git a/crawl/apps.py b/crawl/apps.py
new file mode 100644
index 0000000..96dcfeb
--- /dev/null
+++ b/crawl/apps.py
@@ -0,0 +1,5 @@
+from django.apps import AppConfig
+
+
+class CrawlConfig(AppConfig):
+    name = 'crawl'
diff --git a/crawl/crawl.py b/crawl/crawl.py
new file mode 100644
index 0000000..e8467f1
--- /dev/null
+++ b/crawl/crawl.py
@@ -0,0 +1,248 @@
+from threading import Thread
+from urllib.robotparser import RobotFileParser
+from urllib.error import URLError
+from urllib.parse import urlparse
+
+from ssl import CertificateError
+from random import sample, randrange
+import re
+from datetime import datetime, timedelta
+
+import asyncio
+import aiohttp
+import async_timeout
+
+from bs4 import BeautifulSoup, Comment
+
+# Ugly hack to use this module alone instead of integrating it with Django
+# from django.conf import settings
+
+# In-page bookmark links (href starting with "#") are detected in url_getter
+# below, so that the crawler does not follow this kind of link.
+
+HARD_LIMIT = 20
+MAX_PER_PAGE = 10
+
+FOOTER_URL = re.compile(".*footer.*")
+
+class Settings:
+    USER_AGENT = 'Blah'
+
+settings = Settings()
+startup_time = datetime.now()
+
+
+def url_getter(html, current_page, root_url):
+    links_list = []  # The final result
+    soup = BeautifulSoup(html, "html.parser")
+    # Get only the body
+    body = soup.find('body')
+    if not body:
+        return links_list
+    # Remove the footer, if any
+    if body.footer:
+        body.footer.decompose()
+    # Remove all HTML comments
+    comments = soup.find_all(text=lambda text: isinstance(text, Comment))
+    for comment in comments:
+        comment.extract()
+
+    footers = soup.find_all(id=FOOTER_URL)
+    for footer in footers:
+        footer.extract()
+
+    # Remove all bookmark links pointing to the current html page.
+    links = map(lambda link: link.get("href", ""), body.find_all("a"))
+    for link in links:
+        if link:  # Edge case: no href found.
+            if link.startswith("http"):
+                links_list.append(link)
+            elif link.startswith('/'):  # Site-relative link; prepend the root url
+                links_list.append(root_url + link)
+            elif link.startswith("#"):
+                continue
+            else:
+                links_list.append(current_page + "/" + link)
+
+    ## uniquifier that works with any Python version:
+    #seen = set()
+    #links_list = [x for x in links_list if x not in seen and not seen.add(x)]
+    # uniquifier relying on insertion-ordered dicts
+    # (an implementation detail of CPython 3.6, guaranteed from Python 3.7)
+    links_list = list(dict.fromkeys(links_list))
+
+    forbidden_words = ['login', 'agreement', 'mailto']
+    links_list = [link for link in links_list if not any(word in link.lower()
+                                                         for word in
+                                                         forbidden_words)]
+
+    return links_list
+
+
+
+
+class WebsiteSchedulerMeta(type):
+    """ Meta-class for WebsiteScheduler, allowing a singleton class-like
+    interface, but spawning one instance per canonical website URL """
+
+    _instances = {}
+    _canonicalize = re.compile(r'(https?://)?([^/]+)(/?|$)')
+
+    def canonical_url(cls, url):
+        """ Canonicalize a url """
+        return cls._canonicalize.search(url).groups()[1]
+
+    def __call__(cls, url, *args, **kwargs):
+        canonical = cls.canonical_url(url)
+        if canonical not in cls._instances:
+            cls._instances[canonical] = \
+                super(WebsiteSchedulerMeta, cls) \
+                .__call__(canonical, *args, **kwargs)
+        return cls._instances[canonical]
+
+
+class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
+    """ Schedule the accesses to a website according to its robots.txt """
+    def __init__(self, name):
+        self.name = name
+        self.last_crawled = datetime.fromtimestamp(0)
+        self.dead = False
+        try:
+            robots_url = self.urlroot() + 'robots.txt'
+            self.robot_parser = RobotFileParser(robots_url)
+            self.robot_parser.read()  # TODO async?
+        except (URLError, CertificateError):
+            try:
+                robots_url = self.unsafe_urlroot() + 'robots.txt'
+                self.robot_parser = RobotFileParser(robots_url)
+                self.robot_parser.read()
+            except URLError:  # Almost surely an offline website.
+                self.dead = True
+                self.crawl_delay = 0
+        except Exception as e:
+            print(e)
+            raise e
+        if not self.robot_parser.default_entry:
+            self.dead = True
+        if not self.dead:
+            delay = self.robot_parser.crawl_delay(settings.USER_AGENT)
+            if delay is None:
+                req_rate = self.robot_parser.request_rate(settings.USER_AGENT)
+                if req_rate is None:
+                    delay = 5
+                else:
+                    delay = req_rate.seconds / req_rate.requests  # seconds per request
+            self.crawl_delay = timedelta(seconds=delay)
+
+    def urlroot(self):
+        ''' Get the root url for this website '''
+        return 'https://{}/'.format(self.name)
+
+    def unsafe_urlroot(self):
+        return 'http://{}/'.format(self.name)
+
+    def fetch_delay(self):
+        ''' Get the delay needed before fetching a page is possible '''
+        can_fetch_time = self.last_crawled + self.crawl_delay
+        if can_fetch_time < datetime.now():
+            return timedelta(0)
+        return can_fetch_time - datetime.now()
+
+    def can_fetch(self, url):
+        ''' Check whether this program can fetch a given page '''
+        return (not self.dead) and self.robot_parser.can_fetch(settings.USER_AGENT, url)
+
+    def fetching(self):
+        ''' Tell the scheduler that a page is being fetched now '''
+        self.last_crawled = datetime.now()
+
+
+class CrawlingThread(Thread):
+    """ A separate thread for the crawling task. This is needed to use asyncio,
+    since the thread will need its own event loop.
""" + + def __init__(self): + super(CrawlingThread, self).__init__() + + def run(self): + tasks = [] + #tasks.append(async_crawler("http://plus.google.com/+Python")) + tasks.append(async_crawler('https://python.org/')) + + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + loop.run_until_complete(asyncio.wait(tasks)) + loop.close() + + +class PageGetter: + """ Asynchronously get a webpage, abiding by robots.txt """ + + def __init__(self, session, url): + self.url = url + self.session = session + + async def get(self, ssl=True): + """ Actually retrieve the webpage """ + scheduler = WebsiteScheduler(self.url) + if not scheduler.can_fetch(self.url): + return None + + delay = scheduler.fetch_delay() + while delay > timedelta(0): + await asyncio.sleep(delay.total_seconds()) + delay = scheduler.fetch_delay() + scheduler.fetching() + async with async_timeout.timeout(10): + async with self.session.get(self.url, ssl=ssl) as resp: + try: + return await resp.text() + except UnicodeDecodeError: + return None + + +async def async_print(url): + """ Debug function to follow what's actually happening """ + async with aiohttp.ClientSession() as session: + html = await PageGetter(session, url).get(ssl=False) + + print('GOT {}HTML for {} at {}'.format( + 'None ' if html is None else '', + url, + datetime.now() - startup_time)) + +async def async_crawler(url): + queue = [url] + crawled = [] + while queue and (len(crawled) < HARD_LIMIT): + async with aiohttp.ClientSession() as session: + try: + url = queue.pop(0) + except IndexError: + print("Error queue is empty") + return crawled + parsed_url = urlparse(url) + print("Crawling {}".format(url)) + html = await PageGetter(session, url).get(ssl=False) + if html: + new_urls = url_getter( + html, + url, + parsed_url.scheme + "://" + parsed_url.netloc + ) + crawled += [url] + if new_urls: + sampled = sample( + new_urls, + randrange(min(MAX_PER_PAGE, len(new_urls))) + ) + queue += [sample_url for sample_url in sampled if + sample_url not in queue and sample_url not in + crawled] + print(crawled) + return crawled + +if __name__ == '__main__': + crawl = CrawlingThread() + crawl.start() + crawl.join() diff --git a/crawl/migrations/__init__.py b/crawl/migrations/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/crawl/models.py b/crawl/models.py new file mode 100644 index 0000000..71a8362 --- /dev/null +++ b/crawl/models.py @@ -0,0 +1,3 @@ +from django.db import models + +# Create your models here. diff --git a/crawl/views.py b/crawl/views.py new file mode 100644 index 0000000..91ea44a --- /dev/null +++ b/crawl/views.py @@ -0,0 +1,3 @@ +from django.shortcuts import render + +# Create your views here. diff --git a/pinocchio/settings.py b/pinocchio/settings.py index 4c24915..ff1ce17 100644 --- a/pinocchio/settings.py +++ b/pinocchio/settings.py @@ -29,7 +29,8 @@ INSTALLED_APPS = [ 'django.contrib.messages', 'django.contrib.staticfiles', 'profiles', - 'histories' + 'histories', + 'crawl', ] MIDDLEWARE = [ @@ -103,3 +104,5 @@ USE_TZ = True # https://docs.djangoproject.com/en/2.0/howto/static-files/ STATIC_URL = '/static/' + +USER_AGENT = 'UnaffiliatedBot/0.1' diff --git a/requirements.txt b/requirements.txt index 3b91687..480760f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,14 @@ +aiodns==1.1.1 +aiohttp==3.0.1 +async-timeout==2.0.0 +attrs==17.4.0 +cchardet==2.1.1 +chardet==3.0.4 Django==2.0.1 +idna==2.6 +idna-ssl==1.0.0 +multidict==4.1.0 +pycares==2.3.0 pytz==2017.3 +yarl==1.1.1 +beautifulsoup4==4.6.0