From 4f0148cb63113475ff749d25d6047c3d70b2f5e7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Th=C3=A9ophile=20Bastian?=
Date: Mon, 26 Feb 2018 11:27:07 +0100
Subject: [PATCH] Crawler: use a random fingerprint

---
 crawl/crawl.py | 55 ++++++++++++++++++++++++++++++++------------------
 1 file changed, 35 insertions(+), 20 deletions(-)

diff --git a/crawl/crawl.py b/crawl/crawl.py
index 16afdc2..3d050a4 100644
--- a/crawl/crawl.py
+++ b/crawl/crawl.py
@@ -5,7 +5,7 @@ from urllib.error import URLError
 from urllib.parse import urlparse
 from ssl import CertificateError
 
-from random import sample, randrange
+from random import sample, randrange, randint
 import re
 from datetime import datetime, timedelta
 
@@ -15,6 +15,8 @@ import async_timeout
 
 from bs4 import BeautifulSoup, Comment
 
+from profiles.models import BrowserFingerprint
+
 # Ugly hack to use this module alone instead of integrating it with Django
 # from django.conf import settings
 
@@ -32,7 +34,6 @@ class Settings:
     USER_AGENT = 'Default User'
 
 settings = Settings()
-startup_time = datetime.min
 
 
 def url_getter(html, current_page, root_url):
@@ -82,8 +83,6 @@ def url_getter(html, current_page, root_url):
     return links_list
 
 
-
-
 class WebsiteSchedulerMeta(type):
     """ Meta-class for WebsiteScheduler, allowing a singleton class-like
     interface, but spawning one instance per canonical website URL """
@@ -106,11 +105,13 @@ class WebsiteSchedulerMeta(type):
 
 class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
     """ Schedule the accesses to a website as of robots.txt """
-    def __init__(self, name):
+    def __init__(self, name, user_agent):
         self.name = name
         self.last_crawled = datetime.fromtimestamp(0)
         self.dead = False
         self.can_fetch_b = False
+        self.user_agent = (user_agent if user_agent is not None
+                           else settings.USER_AGENT)
         if any(self.urlroot() in item for item in SEARCH_ENGINE):
             print("found a search engine for %s" % self.urlroot())
             self.crawl_delay = timedelta(seconds=5)
@@ -125,7 +126,7 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
                 robots_url = self.unsafe_urlroot() + 'robots.txt'
                 self.robot_parser = RobotFileParser(robots_url)
                 self.robot_parser.read()
-            except URLError: # Almost surely an offline website.
+            except URLError:  # Almost surely an offline website.
                 self.dead = True
                 self.crawl_delay = 0
         except Exception as e:
@@ -134,9 +135,9 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
             if not self.robot_parser.default_entry:
                 self.dead = True
             if not self.dead:
-                delay = self.robot_parser.crawl_delay(settings.USER_AGENT)
+                delay = self.robot_parser.crawl_delay(self.user_agent)
                 if delay is None:
-                    req_rate = self.robot_parser.request_rate(settings.USER_AGENT)
+                    req_rate = self.robot_parser.request_rate(self.user_agent)
                     if req_rate is None:
                         delay = 5
                     else:
@@ -159,7 +160,9 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
 
     def can_fetch(self, url):
         ''' Check whether this program can fetch a given page '''
-        return (self.can_fetch_b ) or ((not self.dead) and self.robot_parser.can_fetch(settings.USER_AGENT, url))
+        return ((self.can_fetch_b)
+                or ((not self.dead) and
+                    self.robot_parser.can_fetch(self.user_agent, url)))
 
     def fetching(self):
         ''' Tell the scheduler that a page is being fetched now '''
@@ -174,22 +177,25 @@ class CrawlingThread(Thread):
         global settings
         global SEARCH_ENGINE
         SEARCH_ENGINE = engine_list
+
+        nb_fingerprint = len(BrowserFingerprint.objects.all())
+        fingerprint = BrowserFingerprint.objects.all()[
+            randint(0, nb_fingerprint - 1)]
+        self.headers = fingerprint.serialize_headers()
+
         self.queue = queue
         super(CrawlingThread, self).__init__()
-        if user:
-            settings.USER_AGENT = user.serialize_headers()
         self.url = url
 
     def run(self):
-        global startup_time
         tasks = []
+
         #tasks.append(async_crawler("http://plus.google.com/+Python"))
         #tasks.append(async_crawler('https://python.org/'))
-        tasks.append(async_crawler(self.url, self.queue))
+        tasks.append(async_crawler(self.url, self.queue, self.headers))
 
         loop = asyncio.new_event_loop()
         asyncio.set_event_loop(loop)
-        startup_time = datetime.now()
         loop.run_until_complete(asyncio.wait(tasks))
         loop.close()
 
@@ -197,13 +203,16 @@ class CrawlingThread(Thread):
 
 class PageGetter:
     """ Asynchronously get a webpage, abiding by robots.txt """
-    def __init__(self, session, url):
+    headers = None
+
+    def __init__(self, session, url, user_agent):
         self.url = url
         self.session = session
+        self.user_agent = user_agent
 
     async def get(self, ssl=True):
         """ Actually retrieve the webpage """
-        scheduler = WebsiteScheduler(self.url)
+        scheduler = WebsiteScheduler(self.url, self.user_agent)
         if not scheduler.can_fetch(self.url):
             return None
 
@@ -226,16 +235,22 @@ async def async_print(url):
     async with aiohttp.ClientSession() as session:
         html = await PageGetter(session, url).get(ssl=False)
 
-    print('GOT {}HTML for {} at {}'.format(
+    print('GOT {}HTML for {}'.format(
         'None ' if html is None else '',
         url,
-        datetime.now() - startup_time))
+    ))
+
+
+async def async_crawler(url, queue, headers=None):
+    if headers is None:
+        headers = {
+            'User-Agent': settings.USER_AGENT,
+        }
 
-async def async_crawler(url, queue):
     queued = [url]
     crawled = []
     while queued and (len(crawled) < HARD_LIMIT):
-        async with aiohttp.ClientSession() as session:
+        async with aiohttp.ClientSession(headers=headers) as session:
             try:
                 url = queued.pop(0)
             except IndexError:
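
Note on the fingerprint selection added in CrawlingThread.__init__: indexing
BrowserFingerprint.objects.all() with randint(0, nb_fingerprint - 1) assumes
the table contains at least one row (randint(0, -1) raises ValueError on an
empty table), and len(queryset) loads every row just to count them. Below is
a minimal sketch of the same idea with an explicit guard; it assumes
BrowserFingerprint is a regular Django model, as the profiles.models import
suggests, and the helper name random_fingerprint is ours, not part of the
patch.

    # Hypothetical helper, not part of the patch: same random pick as in
    # CrawlingThread.__init__, but tolerant of an empty fingerprint table.
    from random import randint

    from profiles.models import BrowserFingerprint


    def random_fingerprint():
        """Return a random BrowserFingerprint, or None if none are stored."""
        # count() issues a single SELECT COUNT(*) instead of fetching every
        # row the way len(BrowserFingerprint.objects.all()) would.
        count = BrowserFingerprint.objects.count()
        if count == 0:
            return None
        # QuerySet indexing translates to LIMIT/OFFSET, so only the chosen
        # row is pulled from the database.
        return BrowserFingerprint.objects.all()[randint(0, count - 1)]

Django's order_by('?').first() is a one-line alternative, at the cost of a
random sort on the database side.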
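
For context on where those headers end up: the serialized fingerprint is
passed to aiohttp.ClientSession(headers=...), and aiohttp sends session-level
headers with every request made through that session, so one fingerprint
covers a whole crawl. A self-contained sketch of that mechanism, mirroring
the event-loop handling in CrawlingThread.run (the URL and header value are
placeholders):

    import asyncio

    import aiohttp


    async def fetch(url, headers):
        # Session-level headers are merged into every request on the session.
        async with aiohttp.ClientSession(headers=headers) as session:
            async with session.get(url) as resp:
                return await resp.text()


    if __name__ == '__main__':
        headers = {'User-Agent': 'Mozilla/5.0 (placeholder fingerprint)'}
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        print(loop.run_until_complete(fetch('https://example.org/', headers)))
        loop.close()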