From adb892ab7dceb30aa822d876f5ce756c7b9e450b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Oudin?=
Date: Mon, 26 Feb 2018 11:12:36 +0100
Subject: [PATCH] Check if crawling a search engine

---
 crawl/crawl.py      | 67 +++++++++++++++++++++++++++------------------
 histories/models.py |  4 ++-
 2 files changed, 44 insertions(+), 27 deletions(-)

diff --git a/crawl/crawl.py b/crawl/crawl.py
index 048114d..16afdc2 100644
--- a/crawl/crawl.py
+++ b/crawl/crawl.py
@@ -26,6 +26,8 @@ MAX_PER_PAGE = 10
 
 FOOTER_URL = re.compile(".*footer.*")
 
+SEARCH_ENGINE = []
+
 
 class Settings:
     USER_AGENT = 'Default User'
@@ -108,32 +110,38 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
         self.name = name
         self.last_crawled = datetime.fromtimestamp(0)
         self.dead = False
-        try:
-            robots_url = self.urlroot() + 'robots.txt'
-            self.robot_parser = RobotFileParser(robots_url)
-            self.robot_parser.read()  # TODO async?
-        except (URLError, CertificateError):
+        self.can_fetch_b = False
+        if any(self.urlroot() in item for item in SEARCH_ENGINE):
+            print("found a search engine for %s" % self.urlroot())
+            self.crawl_delay = timedelta(seconds=5)
+            self.can_fetch_b = True
+        else:
             try:
-                robots_url = self.unsafe_urlroot() + 'robots.txt'
+                robots_url = self.urlroot() + 'robots.txt'
                 self.robot_parser = RobotFileParser(robots_url)
-                self.robot_parser.read()
-            except URLError:  # Almost surely an offline website.
+                self.robot_parser.read()  # TODO async?
+            except (URLError, CertificateError):
+                try:
+                    robots_url = self.unsafe_urlroot() + 'robots.txt'
+                    self.robot_parser = RobotFileParser(robots_url)
+                    self.robot_parser.read()
+                except URLError:  # Almost surely an offline website.
+                    self.dead = True
+                    self.crawl_delay = 0
+            except Exception as e:
+                print(e)
+                raise e
+            if not self.robot_parser.default_entry:
                 self.dead = True
-                self.crawl_delay = 0
-        except Exception as e:
-            print(e)
-            raise e
-        if not self.robot_parser.default_entry:
-            self.dead = True
-        if not self.dead:
-            delay = self.robot_parser.crawl_delay(settings.USER_AGENT)
-            if delay is None:
-                req_rate = self.robot_parser.request_rate(settings.USER_AGENT)
-                if req_rate is None:
-                    delay = 5
-                else:
-                    delay = req_rate.requests, req_rate.seconds
-            self.crawl_delay = timedelta(seconds=delay)
+            if not self.dead:
+                delay = self.robot_parser.crawl_delay(settings.USER_AGENT)
+                if delay is None:
+                    req_rate = self.robot_parser.request_rate(settings.USER_AGENT)
+                    if req_rate is None:
+                        delay = 5
+                    else:
+                        delay = req_rate.requests, req_rate.seconds
+                self.crawl_delay = timedelta(seconds=delay)
 
     def urlroot(self):
         ''' Get the root url for this website '''
@@ -151,7 +159,7 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
 
     def can_fetch(self, url):
         ''' Check whether this program can fetch a given page '''
-        return (not self.dead) and self.robot_parser.can_fetch(settings.USER_AGENT, url)
+        return self.can_fetch_b or ((not self.dead) and self.robot_parser.can_fetch(settings.USER_AGENT, url))
 
     def fetching(self):
         ''' Tell the scheduler that a page is being fetched now '''
@@ -162,8 +170,10 @@ class CrawlingThread(Thread):
     """ A separate thread for the crawling task. This is needed to use asyncio,
     since the thread will need its own event loop.
""" - def __init__(self, user, url, queue): + def __init__(self, user, url, engine_list, queue): global settings + global SEARCH_ENGINE + SEARCH_ENGINE = engine_list self.queue = queue super(CrawlingThread, self).__init__() if user: @@ -204,6 +214,7 @@ class PageGetter: scheduler.fetching() async with async_timeout.timeout(10): async with self.session.get(self.url, verify_ssl=ssl) as resp: + print("Resp status %s" % resp.status) try: return await resp.text() except UnicodeDecodeError: @@ -248,11 +259,15 @@ async def async_crawler(url, queue): queued += [sample_url for sample_url in sampled if sample_url not in queued and sample_url not in crawled] + else: + print("No html received") print(crawled) queue.put(crawled) if __name__ == '__main__': queue = Queue() - crawl = CrawlingThread(None, "https://python.org/", queue) + crawl = CrawlingThread(None, + "https://google.com/search?q=fabriquer+masque+manif", + ["https://google.com/search/"], queue) crawl.start() crawl.join() diff --git a/histories/models.py b/histories/models.py index 649aff8..21797f1 100644 --- a/histories/models.py +++ b/histories/models.py @@ -77,7 +77,9 @@ def generate_partial_history(user, t_start): result.append((basis, timestamp)) timestamp += 5* random.weibullvariate(1, 1.5) queue = Queue() - crawler = crawl.CrawlingThread(user, basis, queue) + search_engine_query = profiles.SearchEngine.objects.all() + search_engine_list = [item.url for item in search_engine_query] + crawler = crawl.CrawlingThread(user, basis, search_engine_list, queue) crawler.start() crawler.join() urls = queue.get()