From bef1fca5b9d725c3dc7bed7ba157fcc8b40e076c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Th=C3=A9ophile=20Bastian?=
Date: Tue, 20 Feb 2018 08:51:16 +0100
Subject: [PATCH 01/15] Init app 'crawl'

---
 crawl/__init__.py            | 0
 crawl/admin.py               | 3 +++
 crawl/apps.py                | 5 +++++
 crawl/migrations/__init__.py | 0
 crawl/models.py              | 3 +++
 crawl/views.py               | 3 +++
 pinocchio/settings.py        | 1 +
 7 files changed, 15 insertions(+)
 create mode 100644 crawl/__init__.py
 create mode 100644 crawl/admin.py
 create mode 100644 crawl/apps.py
 create mode 100644 crawl/migrations/__init__.py
 create mode 100644 crawl/models.py
 create mode 100644 crawl/views.py

diff --git a/crawl/__init__.py b/crawl/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/crawl/admin.py b/crawl/admin.py
new file mode 100644
index 0000000..8c38f3f
--- /dev/null
+++ b/crawl/admin.py
@@ -0,0 +1,3 @@
+from django.contrib import admin
+
+# Register your models here.
diff --git a/crawl/apps.py b/crawl/apps.py
new file mode 100644
index 0000000..96dcfeb
--- /dev/null
+++ b/crawl/apps.py
@@ -0,0 +1,5 @@
+from django.apps import AppConfig
+
+
+class CrawlConfig(AppConfig):
+    name = 'crawl'
diff --git a/crawl/migrations/__init__.py b/crawl/migrations/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/crawl/models.py b/crawl/models.py
new file mode 100644
index 0000000..71a8362
--- /dev/null
+++ b/crawl/models.py
@@ -0,0 +1,3 @@
+from django.db import models
+
+# Create your models here.
diff --git a/crawl/views.py b/crawl/views.py
new file mode 100644
index 0000000..91ea44a
--- /dev/null
+++ b/crawl/views.py
@@ -0,0 +1,3 @@
+from django.shortcuts import render
+
+# Create your views here.
diff --git a/pinocchio/settings.py b/pinocchio/settings.py
index 97c22e4..f917498 100644
--- a/pinocchio/settings.py
+++ b/pinocchio/settings.py
@@ -26,6 +26,7 @@ INSTALLED_APPS = [
     'django.contrib.messages',
     'django.contrib.staticfiles',
     'profiles',
+    'crawl',
 ]
 
 MIDDLEWARE = [

From c05c2561d2ef1de44021a8734271b693bdb33a78 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Th=C3=A9ophile=20Bastian?=
Date: Tue, 20 Feb 2018 12:48:16 +0100
Subject: [PATCH 02/15] Add crawler settings and requirements

---
 pinocchio/settings.py |  2 ++
 requirements.txt      | 11 +++++++++++
 2 files changed, 13 insertions(+)

diff --git a/pinocchio/settings.py b/pinocchio/settings.py
index f917498..f17e05d 100644
--- a/pinocchio/settings.py
+++ b/pinocchio/settings.py
@@ -100,3 +100,5 @@ USE_TZ = True
 # https://docs.djangoproject.com/en/2.0/howto/static-files/
 
 STATIC_URL = '/static/'
+
+USER_AGENT = 'UnaffiliatedBot/0.1'
diff --git a/requirements.txt b/requirements.txt
index 3b91687..bea30c4 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,13 @@
+aiodns==1.1.1
+aiohttp==3.0.1
+async-timeout==2.0.0
+attrs==17.4.0
+cchardet==2.1.1
+chardet==3.0.4
 Django==2.0.1
+idna==2.6
+idna-ssl==1.0.0
+multidict==4.1.0
+pycares==2.3.0
 pytz==2017.3
+yarl==1.1.1

From c97acb22b585ecda2e67507a158a7ebe76aaed43 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Th=C3=A9ophile=20Bastian?=
Date: Tue, 20 Feb 2018 12:48:53 +0100
Subject: [PATCH 03/15] Add tentative crawl file

Nothing functional, just tests
---
 crawl/crawl.py | 126 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 126 insertions(+)
 create mode 100644 crawl/crawl.py

diff --git a/crawl/crawl.py b/crawl/crawl.py
new file mode 100644
index 0000000..7d22422
--- /dev/null
+++ b/crawl/crawl.py
@@ -0,0 +1,126 @@
+from threading import Thread
+from urllib.robotparser import RobotFileParser
+
+import random
+
+import re
+from datetime import datetime, timedelta
+
+import asyncio
+import aiohttp
+import async_timeout
+
+#from django.conf import settings
+
+
+class Settings:
+    USER_AGENT = 'Blah'
+
+settings = Settings()
+startup_time = datetime.now()
+
+
+class WebsiteSchedulerMeta(type):
+    _instances = {}
+    _canonicalize = re.compile(r'(https?://)?([^/]+)(/?|$)')
+
+    def canonical_url(cls, url):
+        return cls._canonicalize.search(url).groups()[1]
+
+    def __call__(cls, url, *args, **kwargs):
+        canonical = cls.canonical_url(url)
+        if canonical not in cls._instances:
+            cls._instances[canonical] = \
+                super(WebsiteSchedulerMeta, cls) \
+                .__call__(canonical, *args, **kwargs)
+        return cls._instances[canonical]
+
+
+class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
+    """ Schedule the accesses to a website as of robots.txt """
+    def __init__(self, name):
+        self.name = name
+        self.last_crawled = datetime.fromtimestamp(0)
+        robots_url = self.urlroot() + 'robots.txt'
+        self.robot_parser = RobotFileParser(robots_url)
+        self.robot_parser.read()  # TODO async?
+
+        delay = self.robot_parser.crawl_delay(settings.USER_AGENT)
+        if delay is None:
+            req_rate = self.robot_parser.request_rate(settings.USER_AGENT)
+            if req_rate is None:
+                delay = 5
+            else:
+                delay = req_rate.requests, req_rate.seconds
+
+        self.crawl_delay = timedelta(seconds=delay)
+
+    def urlroot(self):
+        ''' Get the root url for this website '''
+        return 'https://{}/'.format(self.name)
+
+    def fetch_delay(self):
+        ''' Get the delay needed before fetching a page is possible '''
+        can_fetch_time = self.last_crawled + self.crawl_delay
+        if can_fetch_time < datetime.now():
+            return timedelta(0)
+        return can_fetch_time - datetime.now()
+
+    def can_fetch(self, url):
+        ''' Check whether this program can fetch a given page '''
+        return self.robot_parser.can_fetch(settings.USER_AGENT, url)
+
+    def fetching(self):
+        ''' Tell the scheduler that a page is being fetched now '''
+        self.last_crawled = datetime.now()
+
+
+class CrawlingThread(Thread):
+    def __init__(self):
+        super(CrawlingThread, self).__init__()
+
+    def run(self):
+        tasks = []
+        tasks.append(async_print('https://python.org'))
+        tasks.append(async_print('https://python.org/webstats/'))
+        tasks.append(async_print('https://python.org/3.5/'))
+
+        loop = asyncio.new_event_loop()
+        asyncio.set_event_loop(loop)
+        loop.run_until_complete(asyncio.wait(tasks))
+        loop.close()
+
+
+class PageGetter:
+    def __init__(self, session, url):
+        self.url = url
+        self.session = session
+
+    async def get(self):
+        scheduler = WebsiteScheduler(self.url)
+        if not scheduler.can_fetch(self.url):
+            return None
+
+        delay = scheduler.fetch_delay()
+        while delay > timedelta(0):
+            await asyncio.sleep(delay.total_seconds())
+            delay = scheduler.fetch_delay()
+        scheduler.fetching()
+        async with async_timeout.timeout(10):
+            async with self.session.get(self.url) as resp:
+                return await resp.text()
+
+
+async def async_print(url):
+    async with aiohttp.ClientSession() as session:
+        html = await PageGetter(session, url).get()
+        print('GOT {}HTML for {} at {}'.format(
+            'None ' if html is None else '',
+            url,
+            datetime.now() - startup_time))
+
+
+if __name__ == '__main__':
+    crawl = CrawlingThread()
+    crawl.start()
+    crawl.join()

From b05e642c79fabd33105c72c9a19fcd7e797a43d8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Th=C3=A9ophile=20Bastian?=
Date: Wed, 21 Feb 2018 11:54:41 +0100
Subject: [PATCH 04/15] Make the code somewhat readable

---
 crawl/crawl.py | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/crawl/crawl.py b/crawl/crawl.py
index 7d22422..6baafad 100644
--- a/crawl/crawl.py
+++ b/crawl/crawl.py
@@ -1,8 +1,6 @@
 from threading import Thread
 from urllib.robotparser import RobotFileParser
 
-import random
-
 import re
 from datetime import datetime, timedelta
 
@@ -10,8 +8,8 @@ import asyncio
 import aiohttp
 import async_timeout
 
-#from django.conf import settings
-
+# Ugly hack to use this module alone instead of integrating it with Django
+# from django.conf import settings
 
 class Settings:
     USER_AGENT = 'Blah'
@@ -21,10 +19,14 @@ startup_time = datetime.now()
 
 
 class WebsiteSchedulerMeta(type):
+    """ Meta-class for WebsiteScheduler, allowing a singleton class-like
+    interface, but spawning one instance per canonical website URL """
+
     _instances = {}
     _canonicalize = re.compile(r'(https?://)?([^/]+)(/?|$)')
 
     def canonical_url(cls, url):
+        """ Canonicalize a url """
         return cls._canonicalize.search(url).groups()[1]
 
     def __call__(cls, url, *args, **kwargs):
@@ -76,14 +78,15 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
 
 
 class CrawlingThread(Thread):
+    """ A separate thread for the crawling task. This is needed to use asyncio,
+    since the thread will need its own event loop. """
+
     def __init__(self):
         super(CrawlingThread, self).__init__()
 
     def run(self):
         tasks = []
         tasks.append(async_print('https://python.org'))
-        tasks.append(async_print('https://python.org/webstats/'))
-        tasks.append(async_print('https://python.org/3.5/'))
 
         loop = asyncio.new_event_loop()
         asyncio.set_event_loop(loop)
@@ -92,11 +95,14 @@ class CrawlingThread(Thread):
 
 
 class PageGetter:
+    """ Asynchronously get a webpage, abiding by robots.txt """
+
     def __init__(self, session, url):
         self.url = url
         self.session = session
 
     async def get(self):
+        """ Actually retrieve the webpage """
         scheduler = WebsiteScheduler(self.url)
         if not scheduler.can_fetch(self.url):
             return None
@@ -112,6 +118,7 @@ class PageGetter:
 
 
 async def async_print(url):
+    """ Debug function to follow what's actually happening """
     async with aiohttp.ClientSession() as session:
         html = await PageGetter(session, url).get()
         print('GOT {}HTML for {} at {}'.format(

From a907cad33d52865c6e6034b8cbd7d6c777a549b4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Oudin?=
Date: Wed, 21 Feb 2018 19:06:46 +0100
Subject: [PATCH 05/15] Start of url getter function

---
 crawl/crawl.py   | 28 +++++++++++++++++++++++++++-
 requirements.txt |  1 +
 2 files changed, 28 insertions(+), 1 deletion(-)

diff --git a/crawl/crawl.py b/crawl/crawl.py
index 6baafad..25d8de9 100644
--- a/crawl/crawl.py
+++ b/crawl/crawl.py
@@ -1,6 +1,7 @@
 from threading import Thread
 from urllib.robotparser import RobotFileParser
 
+from bs4 import BeautifulSoup, Comment
 import re
 from datetime import datetime, timedelta
 
@@ -11,6 +12,11 @@ import async_timeout
 # Ugly hack to use this module alone instead of integrating it with Django
 # from django.conf import settings
+# Gets all the direct bookmarks in the html.
+# We want this to avoid following this kind of bookmark
+
+BOOKMARK_URL = "#.*"
+
 
 class Settings:
     USER_AGENT = 'Blah'
 
@@ -18,6 +24,24 @@ settings = Settings()
 startup_time = datetime.now()
 
 
+def url_getter(html):
+    soup = BeautifulSoup(html, "html.parser")
+    # Get only the body
+    body = soup.find('body')
+    # remove the body
+    body.footer.decompose()
+    # remove all comments
+    comments = soup.findAll(text=lambda text:isinstance(text, Comment))
+    for comment in comments:
+        comment.extract()
+
+    # Remove all bookmark links pointing to the current html page.
+    links = body.find_all("a")
+    for link in links:
+        if re.match(BOOKMARK_URL, link["href"]):
+            link.extract()
+
+
 class WebsiteSchedulerMeta(type):
     """ Meta-class for WebsiteScheduler, allowing a singleton class-like
     interface, but spawning one instance per canonical website URL """
@@ -54,7 +78,6 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
                 delay = 5
             else:
                 delay = req_rate.requests, req_rate.seconds
-
         self.crawl_delay = timedelta(seconds=delay)
 
     def urlroot(self):
@@ -87,6 +110,7 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
     def run(self):
         tasks = []
         tasks.append(async_print('https://python.org'))
+        tasks.append(async_print('https://python.org/about/gettingstarted'))
 
         loop = asyncio.new_event_loop()
         asyncio.set_event_loop(loop)
@@ -116,11 +140,13 @@ class PageGetter:
             async with self.session.get(self.url) as resp:
                 return await resp.text()
 
+async def async_parser(html_text)
 
 async def async_print(url):
     """ Debug function to follow what's actually happening """
     async with aiohttp.ClientSession() as session:
         html = await PageGetter(session, url).get()
+
         print('GOT {}HTML for {} at {}'.format(
             'None ' if html is None else '',
             url,
diff --git a/requirements.txt b/requirements.txt
index bea30c4..480760f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -11,3 +11,4 @@ multidict==4.1.0
 pycares==2.3.0
 pytz==2017.3
 yarl==1.1.1
+beautifulsoup4==4.6.0

From 4e6ac5ac7baf8fd5cb1866fbd30efadc7e1e8044 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Oudin?=
Date: Wed, 21 Feb 2018 22:51:05 +0100
Subject: [PATCH 06/15] Url getter function : retrieves the list of so-called relevant links

---
 crawl/crawl.py | 24 ++++++++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/crawl/crawl.py b/crawl/crawl.py
index 25d8de9..76affb3 100644
--- a/crawl/crawl.py
+++ b/crawl/crawl.py
@@ -24,7 +24,8 @@ settings = Settings()
 startup_time = datetime.now()
-def url_getter(html):
+def url_getter(html, current_page, root_url):
+    links_list = [] # The final resutl
     soup = BeautifulSoup(html, "html.parser")
     # Get only the body
     body = soup.find('body')
     # remove the body
     body.footer.decompose()
@@ -38,8 +39,24 @@ def url_getter(html, current_page, root_url):
     # Remove all bookmark links pointing to the current html page.
     links = body.find_all("a")
     for link in links:
-        if re.match(BOOKMARK_URL, link["href"]):
-            link.extract()
+        if link.startswith("http"):
+            links_list.append(link)
+        elif link.startswith('/'): #Internal link, linking to page root url
+            link_list.append(root_url + link)
+        elif link.startswith("#"):
+            print("Invalid link : internal bookmark")
+        else:
+            links_list.append(current_page + link)
+
+    ## uniqifier works with python <= 3.6
+    #seen = set()
+    #links_list = [x for x in links_list if x not in seen and not seen.add(x)]
+
+    # uniqifier
+    # Works only with python >= 3.6
+    links_list = list(dict.fromkeys(seq))
+
+
 
 
 class WebsiteSchedulerMeta(type):

From 236e15296c014dd3e325a69719ff837f3491b53a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Oudin?=
Date: Wed, 21 Feb 2018 23:11:57 +0100
Subject: [PATCH 07/15] It can be useful to return the links list

---
 crawl/crawl.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/crawl/crawl.py b/crawl/crawl.py
index 76affb3..f18f4cf 100644
--- a/crawl/crawl.py
+++ b/crawl/crawl.py
@@ -51,11 +51,12 @@ def url_getter(html, current_page, root_url):
     ## uniqifier works with python <= 3.6
     #seen = set()
     #links_list = [x for x in links_list if x not in seen and not seen.add(x)]
-
     # uniqifier
     # Works only with python >= 3.6
     links_list = list(dict.fromkeys(seq))
+    return links_list
+
 
 
 

From e19e623df1f8d6f060299c1b682448ca73dde0bc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Oudin?=
Date: Thu, 22 Feb 2018 14:07:53 +0100
Subject: [PATCH 08/15] Multiple bug fixes. TODO : remove