From 8f1d69bc4170968a43eae904395833c18b5b8e6e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Th=C3=A9ophile=20Bastian?=
Date: Mon, 26 Feb 2018 11:27:07 +0100
Subject: [PATCH] Crawler: use a random fingerprint

---
 crawl/crawl.py | 55 ++++++++++++++++++++++++++++++++------------------
 1 file changed, 35 insertions(+), 20 deletions(-)

diff --git a/crawl/crawl.py b/crawl/crawl.py
index 16afdc2..3d050a4 100644
--- a/crawl/crawl.py
+++ b/crawl/crawl.py
@@ -5,7 +5,7 @@
 from urllib.error import URLError
 from urllib.parse import urlparse
 from ssl import CertificateError
-from random import sample, randrange
+from random import sample, randrange, randint
 import re
 from datetime import datetime, timedelta
 
@@ -15,6 +15,8 @@
 import async_timeout
 
 from bs4 import BeautifulSoup, Comment
 
+from profiles.models import BrowserFingerprint
+
 # Ugly hack to use this module alone instead of integrating it with Django
 # from django.conf import settings
@@ -32,7 +34,6 @@ class Settings:
     USER_AGENT = 'Default User'
 
 settings = Settings()
-startup_time = datetime.min
 
 
 def url_getter(html, current_page, root_url):
@@ -82,8 +83,6 @@ def url_getter(html, current_page, root_url):
     return links_list
 
 
-
-
 class WebsiteSchedulerMeta(type):
     """ Meta-class for WebsiteScheduler, allowing a singleton class-like
     interface, but spawning one instance per canonical website URL """
@@ -106,11 +105,13 @@ class WebsiteSchedulerMeta(type):
 
 class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
     """ Schedule the accesses to a website as of robots.txt """
-    def __init__(self, name):
+    def __init__(self, name, user_agent):
         self.name = name
         self.last_crawled = datetime.fromtimestamp(0)
         self.dead = False
         self.can_fetch_b = False
+        self.user_agent = (user_agent if user_agent is not None
+                           else settings.USER_AGENT)
         if any(self.urlroot() in item for item in SEARCH_ENGINE):
             print("found a search engine for %s" % self.urlroot())
             self.crawl_delay = timedelta(seconds=5)
@@ -125,7 +126,7 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
                 robots_url = self.unsafe_urlroot() + 'robots.txt'
                 self.robot_parser = RobotFileParser(robots_url)
                 self.robot_parser.read()
-            except URLError: # Almost surely an offline website.
+            except URLError:  # Almost surely an offline website.
                 self.dead = True
                 self.crawl_delay = 0
             except Exception as e:
@@ -134,9 +135,9 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
             if not self.robot_parser.default_entry:
                 self.dead = True
             if not self.dead:
-                delay = self.robot_parser.crawl_delay(settings.USER_AGENT)
+                delay = self.robot_parser.crawl_delay(self.user_agent)
                 if delay is None:
-                    req_rate = self.robot_parser.request_rate(settings.USER_AGENT)
+                    req_rate = self.robot_parser.request_rate(self.user_agent)
                     if req_rate is None:
                         delay = 5
                     else:
@@ -159,7 +160,9 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
 
     def can_fetch(self, url):
         ''' Check whether this program can fetch a given page '''
-        return (self.can_fetch_b ) or ((not self.dead) and self.robot_parser.can_fetch(settings.USER_AGENT, url))
+        return ((self.can_fetch_b)
+                or ((not self.dead) and
+                    self.robot_parser.can_fetch(self.user_agent, url)))
 
     def fetching(self):
         ''' Tell the scheduler that a page is being fetched now '''
@@ -174,22 +177,25 @@ class CrawlingThread(Thread):
         global settings
         global SEARCH_ENGINE
        SEARCH_ENGINE = engine_list
+
+        nb_fingerprint = len(BrowserFingerprint.objects.all())
+        fingerprint = BrowserFingerprint.objects.all()[
+            randint(0, nb_fingerprint - 1)]
+        self.headers = fingerprint.serialize_headers()
+
         self.queue = queue
         super(CrawlingThread, self).__init__()
-        if user:
-            settings.USER_AGENT = user.serialize_headers()
         self.url = url
 
     def run(self):
-        global startup_time
         tasks = []
+        #tasks.append(async_crawler("http://plus.google.com/+Python"))
         #tasks.append(async_crawler('https://python.org/'))
-        tasks.append(async_crawler(self.url, self.queue))
+        tasks.append(async_crawler(self.url, self.queue, self.headers))
 
         loop = asyncio.new_event_loop()
         asyncio.set_event_loop(loop)
-        startup_time = datetime.now()
         loop.run_until_complete(asyncio.wait(tasks))
         loop.close()
 
@@ -197,13 +203,16 @@ class PageGetter:
     """ Asynchronously get a webpage, abiding by robots.txt """
-    def __init__(self, session, url):
+    headers = None
+
+    def __init__(self, session, url, user_agent):
         self.url = url
         self.session = session
+        self.user_agent = user_agent
 
     async def get(self, ssl=True):
         """ Actually retrieve the webpage """
-        scheduler = WebsiteScheduler(self.url)
+        scheduler = WebsiteScheduler(self.url, self.user_agent)
         if not scheduler.can_fetch(self.url):
             return None
 
@@ -226,16 +235,22 @@ async def async_print(url):
     async with aiohttp.ClientSession() as session:
         html = await PageGetter(session, url).get(ssl=False)
-        print('GOT {}HTML for {} at {}'.format(
+        print('GOT {}HTML for {}'.format(
             'None ' if html is None else '',
             url,
-            datetime.now() - startup_time))
+        ))
+
+
+async def async_crawler(url, queue, headers=None):
+    if headers is None:
+        headers = {
+            'User-Agent': settings.USER_AGENT,
+        }
 
 
-async def async_crawler(url, queue):
     queued = [url]
     crawled = []
     while queued and (len(crawled) < HARD_LIMIT):
-        async with aiohttp.ClientSession() as session:
+        async with aiohttp.ClientSession(headers=headers) as session:
             try:
                 url = queued.pop(0)
             except IndexError:
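
A minimal sketch (not part of this patch) of the random-fingerprint selection
introduced in CrawlingThread.__init__ above, with a guard for an empty
BrowserFingerprint table; it assumes the model and serialize_headers() method
shown in the diff, and the helper name is hypothetical:

    from random import randint

    from profiles.models import BrowserFingerprint


    def random_fingerprint_headers(default_user_agent):
        """Pick a random stored fingerprint and return its serialized headers."""
        fingerprints = BrowserFingerprint.objects.all()
        count = fingerprints.count()  # single COUNT query instead of len() fetching every row
        if count == 0:
            # No fingerprint stored yet: fall back to the configured User-Agent.
            return {'User-Agent': default_user_agent}
        return fingerprints[randint(0, count - 1)].serialize_headers()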