Check if crawling a search engine

Rémi Oudin 2018-02-26 11:12:36 +01:00
parent 15db8b4697
commit adb892ab7d
2 changed files with 44 additions and 27 deletions
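
The commit threads a list of known search-engine URLs into the crawler: CrawlingThread now takes an engine_list argument, stores it in the module-level SEARCH_ENGINE list, and WebsiteScheduler skips robots.txt for those sites, giving them a fixed five-second crawl delay and marking them as always fetchable. A rough standalone sketch of the new check (schedule_for is a hypothetical helper; in the diff the same test lives in WebsiteScheduler.__init__):

from datetime import timedelta

# Module-level whitelist; in the diff it is filled by CrawlingThread.__init__
# from its new engine_list argument.
SEARCH_ENGINE = ["https://google.com/search/"]

def schedule_for(urlroot):
    """Return (crawl_delay, can_fetch_b) for a site root, bypassing robots.txt
    when the root belongs to a known search engine."""
    if any(urlroot in item for item in SEARCH_ENGINE):
        # Same substring test as the diff: the site root must appear
        # inside one of the configured engine URLs.
        return timedelta(seconds=5), True
    # Otherwise the real scheduler falls back to robots.txt handling.
    return None, False

print(schedule_for("https://google.com/"))  # 5-second delay, robots.txt bypassed
print(schedule_for("https://python.org/"))  # (None, False): normal robots.txt path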

View File

@@ -26,6 +26,8 @@ MAX_PER_PAGE = 10
 FOOTER_URL = re.compile(".*footer.*")
 
+SEARCH_ENGINE = []
+
 class Settings:
     USER_AGENT = 'Default User'
@@ -108,32 +110,38 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
         self.name = name
         self.last_crawled = datetime.fromtimestamp(0)
         self.dead = False
-        try:
-            robots_url = self.urlroot() + 'robots.txt'
-            self.robot_parser = RobotFileParser(robots_url)
-            self.robot_parser.read() # TODO async?
-        except (URLError, CertificateError):
+        self.can_fetch_b = False
+        if any(self.urlroot() in item for item in SEARCH_ENGINE):
+            print("found a search engine for %s" % self.urlroot())
+            self.crawl_delay = timedelta(seconds=5)
+            self.can_fetch_b = True
+        else:
             try:
-                robots_url = self.unsafe_urlroot() + 'robots.txt'
+                robots_url = self.urlroot() + 'robots.txt'
                 self.robot_parser = RobotFileParser(robots_url)
-                self.robot_parser.read()
-            except URLError: # Almost surely an offline website.
+                self.robot_parser.read() # TODO async?
+            except (URLError, CertificateError):
+                try:
+                    robots_url = self.unsafe_urlroot() + 'robots.txt'
+                    self.robot_parser = RobotFileParser(robots_url)
+                    self.robot_parser.read()
+                except URLError: # Almost surely an offline website.
+                    self.dead = True
+                    self.crawl_delay = 0
+            except Exception as e:
+                print(e)
+                raise e
+            if not self.robot_parser.default_entry:
                 self.dead = True
-                self.crawl_delay = 0
-        except Exception as e:
-            print(e)
-            raise e
-        if not self.robot_parser.default_entry:
-            self.dead = True
-        if not self.dead:
-            delay = self.robot_parser.crawl_delay(settings.USER_AGENT)
-            if delay is None:
-                req_rate = self.robot_parser.request_rate(settings.USER_AGENT)
-                if req_rate is None:
-                    delay = 5
-                else:
-                    delay = req_rate.requests, req_rate.seconds
-            self.crawl_delay = timedelta(seconds=delay)
+            if not self.dead:
+                delay = self.robot_parser.crawl_delay(settings.USER_AGENT)
+                if delay is None:
+                    req_rate = self.robot_parser.request_rate(settings.USER_AGENT)
+                    if req_rate is None:
+                        delay = 5
+                    else:
+                        delay = req_rate.requests, req_rate.seconds
+                self.crawl_delay = timedelta(seconds=delay)
 
     def urlroot(self):
         ''' Get the root url for this website '''
@@ -151,7 +159,7 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
 
     def can_fetch(self, url):
         ''' Check whether this program can fetch a given page '''
-        return (not self.dead) and self.robot_parser.can_fetch(settings.USER_AGENT, url)
+        return (self.can_fetch_b ) or ((not self.dead) and self.robot_parser.can_fetch(settings.USER_AGENT, url))
 
     def fetching(self):
         ''' Tell the scheduler that a page is being fetched now '''
@@ -162,8 +170,10 @@ class CrawlingThread(Thread):
     """ A separate thread for the crawling task. This is needed to use asyncio,
     since the thread will need its own event loop. """
 
-    def __init__(self, user, url, queue):
+    def __init__(self, user, url, engine_list, queue):
         global settings
+        global SEARCH_ENGINE
+        SEARCH_ENGINE = engine_list
         self.queue = queue
         super(CrawlingThread, self).__init__()
         if user:
@@ -204,6 +214,7 @@ class PageGetter:
         scheduler.fetching()
         async with async_timeout.timeout(10):
             async with self.session.get(self.url, verify_ssl=ssl) as resp:
+                print("Resp status %s" % resp.status)
                 try:
                     return await resp.text()
                 except UnicodeDecodeError:
@@ -248,11 +259,15 @@ async def async_crawler(url, queue):
             queued += [sample_url for sample_url in sampled if
                        sample_url not in queued and sample_url not in
                        crawled]
+        else:
+            print("No html received")
     print(crawled)
     queue.put(crawled)
 
 if __name__ == '__main__':
     queue = Queue()
-    crawl = CrawlingThread(None, "https://python.org/", queue)
+    crawl = CrawlingThread(None,
+            "https://google.com/search?q=fabriquer+masque+manif",
+            ["https://google.com/search/"], queue)
     crawl.start()
     crawl.join()

View File

@@ -77,7 +77,9 @@ def generate_partial_history(user, t_start):
     result.append((basis, timestamp))
     timestamp += 5* random.weibullvariate(1, 1.5)
     queue = Queue()
-    crawler = crawl.CrawlingThread(user, basis, queue)
+    search_engine_query = profiles.SearchEngine.objects.all()
+    search_engine_list = [item.url for item in search_engine_query]
+    crawler = crawl.CrawlingThread(user, basis, search_engine_list, queue)
     crawler.start()
     crawler.join()
     urls = queue.get()
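
Putting the two files together, the intended call sequence is roughly the following (a sketch only: it assumes, as the hunk above does, that each profiles.SearchEngine row exposes a url field; crawl_with_engines is a hypothetical wrapper, not part of the commit):

from queue import Queue

import crawl      # crawler module patched in the first file above
import profiles   # Django app providing the SearchEngine model

def crawl_with_engines(user, start_url):
    """Build the known-engine list once, then hand it to the crawling thread."""
    engine_query = profiles.SearchEngine.objects.all()
    engine_list = [item.url for item in engine_query]
    queue = Queue()
    crawler = crawl.CrawlingThread(user, start_url, engine_list, queue)
    crawler.start()
    crawler.join()
    return queue.get()  # async_crawler puts the list of crawled URLs here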