From bef1fca5b9d725c3dc7bed7ba157fcc8b40e076c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Th=C3=A9ophile=20Bastian?=
Date: Tue, 20 Feb 2018 08:51:16 +0100
Subject: [PATCH 01/15] Init app 'crawl'

---
 crawl/__init__.py            | 0
 crawl/admin.py               | 3 +++
 crawl/apps.py                | 5 +++++
 crawl/migrations/__init__.py | 0
 crawl/models.py              | 3 +++
 crawl/views.py               | 3 +++
 pinocchio/settings.py        | 1 +
 7 files changed, 15 insertions(+)
 create mode 100644 crawl/__init__.py
 create mode 100644 crawl/admin.py
 create mode 100644 crawl/apps.py
 create mode 100644 crawl/migrations/__init__.py
 create mode 100644 crawl/models.py
 create mode 100644 crawl/views.py

diff --git a/crawl/__init__.py b/crawl/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/crawl/admin.py b/crawl/admin.py
new file mode 100644
index 0000000..8c38f3f
--- /dev/null
+++ b/crawl/admin.py
@@ -0,0 +1,3 @@
+from django.contrib import admin
+
+# Register your models here.
diff --git a/crawl/apps.py b/crawl/apps.py
new file mode 100644
index 0000000..96dcfeb
--- /dev/null
+++ b/crawl/apps.py
@@ -0,0 +1,5 @@
+from django.apps import AppConfig
+
+
+class CrawlConfig(AppConfig):
+    name = 'crawl'
diff --git a/crawl/migrations/__init__.py b/crawl/migrations/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/crawl/models.py b/crawl/models.py
new file mode 100644
index 0000000..71a8362
--- /dev/null
+++ b/crawl/models.py
@@ -0,0 +1,3 @@
+from django.db import models
+
+# Create your models here.
diff --git a/crawl/views.py b/crawl/views.py
new file mode 100644
index 0000000..91ea44a
--- /dev/null
+++ b/crawl/views.py
@@ -0,0 +1,3 @@
+from django.shortcuts import render
+
+# Create your views here.
diff --git a/pinocchio/settings.py b/pinocchio/settings.py
index 97c22e4..f917498 100644
--- a/pinocchio/settings.py
+++ b/pinocchio/settings.py
@@ -26,6 +26,7 @@ INSTALLED_APPS = [
     'django.contrib.messages',
     'django.contrib.staticfiles',
     'profiles',
+    'crawl',
 ]
 
 MIDDLEWARE = [

From c05c2561d2ef1de44021a8734271b693bdb33a78 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Th=C3=A9ophile=20Bastian?=
Date: Tue, 20 Feb 2018 12:48:16 +0100
Subject: [PATCH 02/15] Add crawler settings and requirements

---
 pinocchio/settings.py |  2 ++
 requirements.txt      | 11 +++++++++++
 2 files changed, 13 insertions(+)

diff --git a/pinocchio/settings.py b/pinocchio/settings.py
index f917498..f17e05d 100644
--- a/pinocchio/settings.py
+++ b/pinocchio/settings.py
@@ -100,3 +100,5 @@ USE_TZ = True
 # https://docs.djangoproject.com/en/2.0/howto/static-files/
 
 STATIC_URL = '/static/'
+
+USER_AGENT = 'UnaffiliatedBot/0.1'
diff --git a/requirements.txt b/requirements.txt
index 3b91687..bea30c4 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,13 @@
+aiodns==1.1.1
+aiohttp==3.0.1
+async-timeout==2.0.0
+attrs==17.4.0
+cchardet==2.1.1
+chardet==3.0.4
 Django==2.0.1
+idna==2.6
+idna-ssl==1.0.0
+multidict==4.1.0
+pycares==2.3.0
 pytz==2017.3
+yarl==1.1.1

From c97acb22b585ecda2e67507a158a7ebe76aaed43 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Th=C3=A9ophile=20Bastian?=
Date: Tue, 20 Feb 2018 12:48:53 +0100
Subject: [PATCH 03/15] Add tentative crawl file

Nothing functional, just tests
---
 crawl/crawl.py | 126 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 126 insertions(+)
 create mode 100644 crawl/crawl.py

diff --git a/crawl/crawl.py b/crawl/crawl.py
new file mode 100644
index 0000000..7d22422
--- /dev/null
+++ b/crawl/crawl.py
@@ -0,0 +1,126 @@
+from threading import Thread
+from urllib.robotparser import RobotFileParser
+
+import random
+
+import re
+from datetime import datetime, timedelta
+
+import asyncio
+import aiohttp
+import async_timeout
+
+#from django.conf import settings
+
+
+class Settings:
+    USER_AGENT = 'Blah'
+
+settings = Settings()
+startup_time = datetime.now()
+
+
+class WebsiteSchedulerMeta(type):
+    _instances = {}
+    _canonicalize = re.compile(r'(https?://)?([^/]+)(/?|$)')
+
+    def canonical_url(cls, url):
+        return cls._canonicalize.search(url).groups()[1]
+
+    def __call__(cls, url, *args, **kwargs):
+        canonical = cls.canonical_url(url)
+        if canonical not in cls._instances:
+            cls._instances[canonical] = \
+                super(WebsiteSchedulerMeta, cls) \
+                .__call__(canonical, *args, **kwargs)
+        return cls._instances[canonical]
+
+
+class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
+    """ Schedule the accesses to a website as of robots.txt """
+    def __init__(self, name):
+        self.name = name
+        self.last_crawled = datetime.fromtimestamp(0)
+        robots_url = self.urlroot() + 'robots.txt'
+        self.robot_parser = RobotFileParser(robots_url)
+        self.robot_parser.read()  # TODO async?
+
+        delay = self.robot_parser.crawl_delay(settings.USER_AGENT)
+        if delay is None:
+            req_rate = self.robot_parser.request_rate(settings.USER_AGENT)
+            if req_rate is None:
+                delay = 5
+            else:
+                delay = req_rate.requests, req_rate.seconds
+
+        self.crawl_delay = timedelta(seconds=delay)
+
+    def urlroot(self):
+        ''' Get the root url for this website '''
+        return 'https://{}/'.format(self.name)
+
+    def fetch_delay(self):
+        ''' Get the delay needed before fetching a page is possible '''
+        can_fetch_time = self.last_crawled + self.crawl_delay
+        if can_fetch_time < datetime.now():
+            return timedelta(0)
+        return can_fetch_time - datetime.now()
+
+    def can_fetch(self, url):
+        ''' Check whether this program can fetch a given page '''
+        return self.robot_parser.can_fetch(settings.USER_AGENT, url)
+
+    def fetching(self):
+        ''' Tell the scheduler that a page is being fetched now '''
+        self.last_crawled = datetime.now()
+
+
+class CrawlingThread(Thread):
+    def __init__(self):
+        super(CrawlingThread, self).__init__()
+
+    def run(self):
+        tasks = []
+        tasks.append(async_print('https://python.org'))
+        tasks.append(async_print('https://python.org/webstats/'))
+        tasks.append(async_print('https://python.org/3.5/'))
+
+        loop = asyncio.new_event_loop()
+        asyncio.set_event_loop(loop)
+        loop.run_until_complete(asyncio.wait(tasks))
+        loop.close()
+
+
+class PageGetter:
+    def __init__(self, session, url):
+        self.url = url
+        self.session = session
+
+    async def get(self):
+        scheduler = WebsiteScheduler(self.url)
+        if not scheduler.can_fetch(self.url):
+            return None
+
+        delay = scheduler.fetch_delay()
+        while delay > timedelta(0):
+            await asyncio.sleep(delay.total_seconds())
+            delay = scheduler.fetch_delay()
+        scheduler.fetching()
+        async with async_timeout.timeout(10):
+            async with self.session.get(self.url) as resp:
+                return await resp.text()
+
+
+async def async_print(url):
+    async with aiohttp.ClientSession() as session:
+        html = await PageGetter(session, url).get()
+        print('GOT {}HTML for {} at {}'.format(
+            'None ' if html is None else '',
+            url,
+            datetime.now() - startup_time))
+
+
+if __name__ == '__main__':
+    crawl = CrawlingThread()
+    crawl.start()
+    crawl.join()

From b05e642c79fabd33105c72c9a19fcd7e797a43d8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Th=C3=A9ophile=20Bastian?=
Date: Wed, 21 Feb 2018 11:54:41 +0100
Subject: [PATCH 04/15] Make the code somewhat readable

---
 crawl/crawl.py | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/crawl/crawl.py b/crawl/crawl.py
index 7d22422..6baafad 100644
--- a/crawl/crawl.py
+++ b/crawl/crawl.py
@@ -1,8 +1,6 @@
 from threading import Thread
 from urllib.robotparser import RobotFileParser
 
-import random
-
 import re
 from datetime import datetime, timedelta
 
@@ -10,8 +8,8 @@ import asyncio
 import aiohttp
 import async_timeout
 
-#from django.conf import settings
-
+# Ugly hack to use this module alone instead of integrating it with Django
+# from django.conf import settings
 
 class Settings:
     USER_AGENT = 'Blah'
@@ -21,10 +19,14 @@ startup_time = datetime.now()
 
 
 class WebsiteSchedulerMeta(type):
+    """ Meta-class for WebsiteScheduler, allowing a singleton class-like
+    interface, but spawning one instance per canonical website URL """
+
     _instances = {}
     _canonicalize = re.compile(r'(https?://)?([^/]+)(/?|$)')
 
     def canonical_url(cls, url):
+        """ Canonicalize a url """
         return cls._canonicalize.search(url).groups()[1]
 
     def __call__(cls, url, *args, **kwargs):
@@ -76,14 +78,15 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
 
 
 class CrawlingThread(Thread):
+    """ A separate thread for the crawling task. This is needed to use asyncio,
+    since the thread will need its own event loop. """
+
     def __init__(self):
         super(CrawlingThread, self).__init__()
 
     def run(self):
         tasks = []
         tasks.append(async_print('https://python.org'))
-        tasks.append(async_print('https://python.org/webstats/'))
-        tasks.append(async_print('https://python.org/3.5/'))
 
         loop = asyncio.new_event_loop()
         asyncio.set_event_loop(loop)
@@ -92,11 +95,14 @@ class CrawlingThread(Thread):
 
 
 class PageGetter:
+    """ Asynchronously get a webpage, abiding by robots.txt """
+
     def __init__(self, session, url):
         self.url = url
         self.session = session
 
     async def get(self):
+        """ Actually retrieve the webpage """
         scheduler = WebsiteScheduler(self.url)
         if not scheduler.can_fetch(self.url):
             return None
@@ -112,6 +118,7 @@ class PageGetter:
 
 
 async def async_print(url):
+    """ Debug function to follow what's actually happening """
     async with aiohttp.ClientSession() as session:
         html = await PageGetter(session, url).get()
         print('GOT {}HTML for {} at {}'.format(

From a907cad33d52865c6e6034b8cbd7d6c777a549b4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Oudin?=
Date: Wed, 21 Feb 2018 19:06:46 +0100
Subject: [PATCH 05/15] Start of url getter function

---
 crawl/crawl.py   | 28 +++++++++++++++++++++++++++-
 requirements.txt |  1 +
 2 files changed, 28 insertions(+), 1 deletion(-)

diff --git a/crawl/crawl.py b/crawl/crawl.py
index 6baafad..25d8de9 100644
--- a/crawl/crawl.py
+++ b/crawl/crawl.py
@@ -1,6 +1,7 @@
 from threading import Thread
 from urllib.robotparser import RobotFileParser
 
+from bs4 import BeautifulSoup, Comment
 import re
 from datetime import datetime, timedelta
 
@@ -11,6 +12,11 @@ import async_timeout
 # Ugly hack to use this module alone instead of integrating it with Django
 # from django.conf import settings
+# Gets all the direct bookmarks in the html.
+# We want this to avoid following this kind of bookmark
+
+BOOKMARK_URL = "#.*"
+
 
 class Settings:
     USER_AGENT = 'Blah'
 
@@ -18,6 +24,24 @@ settings = Settings()
 startup_time = datetime.now()
 
 
+def url_getter(html):
+    soup = BeautifulSoup(html, "html.parser")
+    # Get only the body
+    body = soup.find('body')
+    # remove the body
+    body.footer.decompose()
+    # remove all comments
+    comments = soup.findAll(text=lambda text:isinstance(text, Comment))
+    for comment in comments:
+        comment.extract()
+
+    # Remove all bookmark links pointing to the current html page.
+    links = body.find_all("a")
+    for link in links:
+        if re.match(BOOKMARK_URL, link["href"]):
+            link.extract()
+
+
 class WebsiteSchedulerMeta(type):
     """ Meta-class for WebsiteScheduler, allowing a singleton class-like
     interface, but spawning one instance per canonical website URL """
@@ -54,7 +78,6 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
                 delay = 5
             else:
                 delay = req_rate.requests, req_rate.seconds
-
         self.crawl_delay = timedelta(seconds=delay)
 
     def urlroot(self):
@@ -87,6 +110,7 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
     def run(self):
         tasks = []
         tasks.append(async_print('https://python.org'))
+        tasks.append(async_print('https://python.org/about/gettingstarted'))
 
         loop = asyncio.new_event_loop()
         asyncio.set_event_loop(loop)
@@ -116,11 +140,13 @@ class PageGetter:
             async with self.session.get(self.url) as resp:
                 return await resp.text()
 
+async def async_parser(html_text)
 
 async def async_print(url):
     """ Debug function to follow what's actually happening """
     async with aiohttp.ClientSession() as session:
         html = await PageGetter(session, url).get()
+
         print('GOT {}HTML for {} at {}'.format(
             'None ' if html is None else '',
             url,
diff --git a/requirements.txt b/requirements.txt
index bea30c4..480760f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -11,3 +11,4 @@ multidict==4.1.0
 pycares==2.3.0
 pytz==2017.3
 yarl==1.1.1
+beautifulsoup4==4.6.0

From 4e6ac5ac7baf8fd5cb1866fbd30efadc7e1e8044 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Oudin?=
Date: Wed, 21 Feb 2018 22:51:05 +0100
Subject: [PATCH 06/15] Url getter function : retrieves the list of so-called relevant links

---
 crawl/crawl.py | 24 ++++++++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/crawl/crawl.py b/crawl/crawl.py
index 25d8de9..76affb3 100644
--- a/crawl/crawl.py
+++ b/crawl/crawl.py
@@ -24,7 +24,8 @@ settings = Settings()
 startup_time = datetime.now()
-def url_getter(html):
+def url_getter(html, current_page, root_url):
+    links_list = [] # The final resutl
     soup = BeautifulSoup(html, "html.parser")
     # Get only the body
     body = soup.find('body')
     # remove the body
     body.footer.decompose()
@@ -38,8 +39,24 @@ def url_getter(html, current_page, root_url):
     # Remove all bookmark links pointing to the current html page.
     links = body.find_all("a")
     for link in links:
-        if re.match(BOOKMARK_URL, link["href"]):
-            link.extract()
+        if link.startswith("http"):
+            links_list.append(link)
+        elif link.startswith('/'): #Internal link, linking to page root url
+            link_list.append(root_url + link)
+        elif link.startswith("#"):
+            print("Invalid link : internal bookmark")
+        else:
+            links_list.append(current_page + link)
+
+    ## uniqifier works with python <= 3.6
+    #seen = set()
+    #links_list = [x for x in links_list if x not in seen and not seen.add(x)]
+
+    # uniqifier
+    # Works only with python >= 3.6
+    links_list = list(dict.fromkeys(seq))
+
+
 
 
 class WebsiteSchedulerMeta(type):

From 236e15296c014dd3e325a69719ff837f3491b53a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Oudin?=
Date: Wed, 21 Feb 2018 23:11:57 +0100
Subject: [PATCH 07/15] It can be useful to return the links list

---
 crawl/crawl.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/crawl/crawl.py b/crawl/crawl.py
index 76affb3..f18f4cf 100644
--- a/crawl/crawl.py
+++ b/crawl/crawl.py
@@ -51,11 +51,12 @@ def url_getter(html, current_page, root_url):
     ## uniqifier works with python <= 3.6
     #seen = set()
     #links_list = [x for x in links_list if x not in seen and not seen.add(x)]
-
     # uniqifier
     # Works only with python >= 3.6
     links_list = list(dict.fromkeys(seq))
+    return links_list
+
 
 
 

From e19e623df1f8d6f060299c1b682448ca73dde0bc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Oudin?=
Date: Thu, 22 Feb 2018 14:07:53 +0100
Subject: [PATCH 08/15] Multiple bug fixes. TODO : remove