Check if crawling a search engine

2018-02-26 11:12:36 +01:00 · 2018-02-26 11:12:36 +01:00 · adb892ab7d
parent 15db8b4697
commit adb892ab7d
2 changed files with 44 additions and 27 deletions
--- a/crawl/crawl.py
+++ b/crawl/crawl.py
@@ -26,6 +26,8 @@ MAX_PER_PAGE = 10
 FOOTER_URL = re.compile(".*footer.*")
 SEARCH_ENGINE = []
 class Settings:
    USER_AGENT = 'Default User'
@@ -108,6 +110,12 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
        self.name = name
        self.last_crawled = datetime.fromtimestamp(0)
        self.dead = False
        self.can_fetch_b = False
        if any(self.urlroot() in item for item in SEARCH_ENGINE):
            print("found a search engine for %s" % self.urlroot())
            self.crawl_delay = timedelta(seconds=5)
            self.can_fetch_b = True
        else:
            try:
                robots_url = self.urlroot() + 'robots.txt'
                self.robot_parser = RobotFileParser(robots_url)
@@ -151,7 +159,7 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
    def can_fetch(self, url):
        ''' Check whether this program can fetch a given page '''
-        return (not self.dead) and self.robot_parser.can_fetch(settings.USER_AGENT, url)
+        return (self.can_fetch_b ) or ((not self.dead) and self.robot_parser.can_fetch(settings.USER_AGENT, url))
    def fetching(self):
        ''' Tell the scheduler that a page is being fetched now '''
@@ -162,8 +170,10 @@ class CrawlingThread(Thread):
    """ A separate thread for the crawling task. This is needed to use asyncio,
    since the thread will need its own event loop. """
-    def __init__(self, user, url, queue):
+    def __init__(self, user, url, engine_list, queue):
        global settings
        global SEARCH_ENGINE
        SEARCH_ENGINE = engine_list
        self.queue = queue
        super(CrawlingThread, self).__init__()
        if user:
@@ -204,6 +214,7 @@ class PageGetter:
        scheduler.fetching()
        async with async_timeout.timeout(10):
            async with self.session.get(self.url, verify_ssl=ssl) as resp:
                print("Resp status %s" % resp.status)
                try:
                    return await resp.text()
                except UnicodeDecodeError:
@@ -248,11 +259,15 @@ async def async_crawler(url, queue):
                    queued += [sample_url for sample_url in sampled if
                              sample_url not in queued and sample_url not in
                              crawled]
            else:
                print("No html received")
    print(crawled)
    queue.put(crawled)
 if __name__ == '__main__':
    queue = Queue()
-    crawl = CrawlingThread(None, "https://python.org/", queue)
+    crawl = CrawlingThread(None,
                           "https://google.com/search?q=fabriquer+masque+manif",
                           ["https://google.com/search/"], queue)
    crawl.start()
    crawl.join()
--- a/histories/models.py
+++ b/histories/models.py
@@ -77,7 +77,9 @@ def generate_partial_history(user, t_start):
    result.append((basis, timestamp))
    timestamp += 5* random.weibullvariate(1, 1.5)
    queue = Queue()
-    crawler = crawl.CrawlingThread(user, basis, queue)
+    search_engine_query = profiles.SearchEngine.objects.all()
    search_engine_list = [item.url for item in search_engine_query]
    crawler = crawl.CrawlingThread(user, basis, search_engine_list, queue)
    crawler.start()
    crawler.join()
    urls = queue.get()