From adb892ab7dceb30aa822d876f5ce756c7b9e450b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Oudin?=
Date: Mon, 26 Feb 2018 11:12:36 +0100
Subject: [PATCH] Check if crawling a search engine

---
 crawl/crawl.py      | 67 +++++++++++++++++++++++++++------------------
 histories/models.py |  4 ++-
 2 files changed, 44 insertions(+), 27 deletions(-)

diff --git a/crawl/crawl.py b/crawl/crawl.py
index 048114d..16afdc2 100644
--- a/crawl/crawl.py
+++ b/crawl/crawl.py
@@ -26,6 +26,8 @@ MAX_PER_PAGE = 10
 
 FOOTER_URL = re.compile(".*footer.*")
 
+SEARCH_ENGINE = []
+
 
 class Settings:
     USER_AGENT = 'Default User'
@@ -108,32 +110,38 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
         self.name = name
         self.last_crawled = datetime.fromtimestamp(0)
         self.dead = False
-        try:
-            robots_url = self.urlroot() + 'robots.txt'
-            self.robot_parser = RobotFileParser(robots_url)
-            self.robot_parser.read()  # TODO async?
-        except (URLError, CertificateError):
+        self.can_fetch_b = False
+        if any(self.urlroot() in item for item in SEARCH_ENGINE):
+            print("found a search engine for %s" % self.urlroot())
+            self.crawl_delay = timedelta(seconds=5)
+            self.can_fetch_b = True
+        else:
             try:
-                robots_url = self.unsafe_urlroot() + 'robots.txt'
+                robots_url = self.urlroot() + 'robots.txt'
                 self.robot_parser = RobotFileParser(robots_url)
-                self.robot_parser.read()
-            except URLError:  # Almost surely an offline website.
+                self.robot_parser.read()  # TODO async?
+            except (URLError, CertificateError):
+                try:
+                    robots_url = self.unsafe_urlroot() + 'robots.txt'
+                    self.robot_parser = RobotFileParser(robots_url)
+                    self.robot_parser.read()
+                except URLError:  # Almost surely an offline website.
+                    self.dead = True
+                    self.crawl_delay = 0
+            except Exception as e:
+                print(e)
+                raise e
+            if not self.robot_parser.default_entry:
                 self.dead = True
-                self.crawl_delay = 0
-        except Exception as e:
-            print(e)
-            raise e
-        if not self.robot_parser.default_entry:
-            self.dead = True
-        if not self.dead:
-            delay = self.robot_parser.crawl_delay(settings.USER_AGENT)
-            if delay is None:
-                req_rate = self.robot_parser.request_rate(settings.USER_AGENT)
-                if req_rate is None:
-                    delay = 5
-                else:
-                    delay = req_rate.requests, req_rate.seconds
-            self.crawl_delay = timedelta(seconds=delay)
+            if not self.dead:
+                delay = self.robot_parser.crawl_delay(settings.USER_AGENT)
+                if delay is None:
+                    req_rate = self.robot_parser.request_rate(settings.USER_AGENT)
+                    if req_rate is None:
+                        delay = 5
+                    else:
+                        delay = req_rate.requests, req_rate.seconds
+                self.crawl_delay = timedelta(seconds=delay)
 
     def urlroot(self):
         ''' Get the root url for this website '''
@@ -151,7 +159,7 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
 
     def can_fetch(self, url):
         ''' Check whether this program can fetch a given page '''
-        return (not self.dead) and self.robot_parser.can_fetch(settings.USER_AGENT, url)
+        return self.can_fetch_b or ((not self.dead) and self.robot_parser.can_fetch(settings.USER_AGENT, url))
 
     def fetching(self):
         ''' Tell the scheduler that a page is being fetched now '''
@@ -162,8 +170,10 @@ class CrawlingThread(Thread):
     """ A separate thread for the crawling task. This is needed to use asyncio,
     since the thread will need its own event loop.
""" - def __init__(self, user, url, queue): + def __init__(self, user, url, engine_list, queue): global settings + global SEARCH_ENGINE + SEARCH_ENGINE = engine_list self.queue = queue super(CrawlingThread, self).__init__() if user: @@ -204,6 +214,7 @@ class PageGetter: scheduler.fetching() async with async_timeout.timeout(10): async with self.session.get(self.url, verify_ssl=ssl) as resp: + print("Resp status %s" % resp.status) try: return await resp.text() except UnicodeDecodeError: @@ -248,11 +259,15 @@ async def async_crawler(url, queue): queued += [sample_url for sample_url in sampled if sample_url not in queued and sample_url not in crawled] + else: + print("No html received") print(crawled) queue.put(crawled) if __name__ == '__main__': queue = Queue() - crawl = CrawlingThread(None, "https://python.org/", queue) + crawl = CrawlingThread(None, + "https://google.com/search?q=fabriquer+masque+manif", + ["https://google.com/search/"], queue) crawl.start() crawl.join() diff --git a/histories/models.py b/histories/models.py index 649aff8..21797f1 100644 --- a/histories/models.py +++ b/histories/models.py @@ -77,7 +77,9 @@ def generate_partial_history(user, t_start): result.append((basis, timestamp)) timestamp += 5* random.weibullvariate(1, 1.5) queue = Queue() - crawler = crawl.CrawlingThread(user, basis, queue) + search_engine_query = profiles.SearchEngine.objects.all() + search_engine_list = [item.url for item in search_engine_query] + crawler = crawl.CrawlingThread(user, basis, search_engine_list, queue) crawler.start() crawler.join() urls = queue.get()