Crawler: use a random fingerprint

This commit is contained in:
Théophile Bastian 2018-02-26 11:27:07 +01:00
parent 38ccd04d31
commit 8f1d69bc41

View file

@@ -5,7 +5,7 @@ from urllib.error import URLError
from urllib.parse import urlparse from urllib.parse import urlparse
from ssl import CertificateError from ssl import CertificateError
from random import sample, randrange from random import sample, randrange, randint
import re import re
from datetime import datetime, timedelta from datetime import datetime, timedelta
@@ -15,6 +15,8 @@ import async_timeout
from bs4 import BeautifulSoup, Comment from bs4 import BeautifulSoup, Comment
from profiles.models import BrowserFingerprint
# Ugly hack to use this module alone instead of integrating it with Django # Ugly hack to use this module alone instead of integrating it with Django
# from django.conf import settings # from django.conf import settings
@@ -32,7 +34,6 @@ class Settings:
USER_AGENT = 'Default User' USER_AGENT = 'Default User'
settings = Settings() settings = Settings()
startup_time = datetime.min
def url_getter(html, current_page, root_url): def url_getter(html, current_page, root_url):
@@ -82,8 +83,6 @@ def url_getter(html, current_page, root_url):
return links_list return links_list
class WebsiteSchedulerMeta(type): class WebsiteSchedulerMeta(type):
""" Meta-class for WebsiteScheduler, allowing a singleton class-like """ Meta-class for WebsiteScheduler, allowing a singleton class-like
interface, but spawning one instance per canonical website URL """ interface, but spawning one instance per canonical website URL """
@@ -106,11 +105,13 @@ class WebsiteSchedulerMeta(type):
class WebsiteScheduler(metaclass=WebsiteSchedulerMeta): class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
""" Schedule the accesses to a website as of robots.txt """ """ Schedule the accesses to a website as of robots.txt """
def __init__(self, name): def __init__(self, name, user_agent):
self.name = name self.name = name
self.last_crawled = datetime.fromtimestamp(0) self.last_crawled = datetime.fromtimestamp(0)
self.dead = False self.dead = False
self.can_fetch_b = False self.can_fetch_b = False
self.user_agent = (user_agent if user_agent is not None
else settings.USER_AGENT)
if any(self.urlroot() in item for item in SEARCH_ENGINE): if any(self.urlroot() in item for item in SEARCH_ENGINE):
print("found a search engine for %s" % self.urlroot()) print("found a search engine for %s" % self.urlroot())
self.crawl_delay = timedelta(seconds=5) self.crawl_delay = timedelta(seconds=5)
@@ -125,7 +126,7 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
robots_url = self.unsafe_urlroot() + 'robots.txt' robots_url = self.unsafe_urlroot() + 'robots.txt'
self.robot_parser = RobotFileParser(robots_url) self.robot_parser = RobotFileParser(robots_url)
self.robot_parser.read() self.robot_parser.read()
except URLError: # Almost surely an offline website. except URLError: # Almost surely an offline website.
self.dead = True self.dead = True
self.crawl_delay = 0 self.crawl_delay = 0
except Exception as e: except Exception as e:
@@ -134,9 +135,9 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
if not self.robot_parser.default_entry: if not self.robot_parser.default_entry:
self.dead = True self.dead = True
if not self.dead: if not self.dead:
delay = self.robot_parser.crawl_delay(settings.USER_AGENT) delay = self.robot_parser.crawl_delay(self.user_agent)
if delay is None: if delay is None:
req_rate = self.robot_parser.request_rate(settings.USER_AGENT) req_rate = self.robot_parser.request_rate(self.user_agent)
if req_rate is None: if req_rate is None:
delay = 5 delay = 5
else: else:
@@ -159,7 +160,9 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
def can_fetch(self, url): def can_fetch(self, url):
''' Check whether this program can fetch a given page ''' ''' Check whether this program can fetch a given page '''
return (self.can_fetch_b ) or ((not self.dead) and self.robot_parser.can_fetch(settings.USER_AGENT, url)) return ((self.can_fetch_b)
or ((not self.dead) and
self.robot_parser.can_fetch(self.user_agent, url)))
def fetching(self): def fetching(self):
''' Tell the scheduler that a page is being fetched now ''' ''' Tell the scheduler that a page is being fetched now '''
@@ -174,22 +177,25 @@ class CrawlingThread(Thread):
global settings global settings
global SEARCH_ENGINE global SEARCH_ENGINE
SEARCH_ENGINE = engine_list SEARCH_ENGINE = engine_list
nb_fingerprint = len(BrowserFingerprint.objects.all())
fingerprint = BrowserFingerprint.objects.all()[
randint(0, nb_fingerprint - 1)]
self.headers = fingerprint.serialize_headers()
self.queue = queue self.queue = queue
super(CrawlingThread, self).__init__() super(CrawlingThread, self).__init__()
if user:
settings.USER_AGENT = user.serialize_headers()
self.url = url self.url = url
def run(self): def run(self):
global startup_time
tasks = [] tasks = []
#tasks.append(async_crawler("http://plus.google.com/+Python")) #tasks.append(async_crawler("http://plus.google.com/+Python"))
#tasks.append(async_crawler('https://python.org/')) #tasks.append(async_crawler('https://python.org/'))
tasks.append(async_crawler(self.url, self.queue)) tasks.append(async_crawler(self.url, self.queue, self.headers))
loop = asyncio.new_event_loop() loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop) asyncio.set_event_loop(loop)
startup_time = datetime.now()
loop.run_until_complete(asyncio.wait(tasks)) loop.run_until_complete(asyncio.wait(tasks))
loop.close() loop.close()
@@ -197,13 +203,16 @@ class CrawlingThread(Thread):
class PageGetter: class PageGetter:
""" Asynchronously get a webpage, abiding by robots.txt """ """ Asynchronously get a webpage, abiding by robots.txt """
def __init__(self, session, url): headers = None
def __init__(self, session, url, user_agent):
self.url = url self.url = url
self.session = session self.session = session
self.user_agent = user_agent
async def get(self, ssl=True): async def get(self, ssl=True):
""" Actually retrieve the webpage """ """ Actually retrieve the webpage """
scheduler = WebsiteScheduler(self.url) scheduler = WebsiteScheduler(self.url, self.user_agent)
if not scheduler.can_fetch(self.url): if not scheduler.can_fetch(self.url):
return None return None
@@ -226,16 +235,22 @@ async def async_print(url):
async with aiohttp.ClientSession() as session: async with aiohttp.ClientSession() as session:
html = await PageGetter(session, url).get(ssl=False) html = await PageGetter(session, url).get(ssl=False)
print('GOT {}HTML for {} at {}'.format( print('GOT {}HTML for {}'.format(
'None ' if html is None else '', 'None ' if html is None else '',
url, url,
datetime.now() - startup_time)) ))
async def async_crawler(url, queue, headers=None):
if headers is None:
headers = {
'User-Agent': settings.USER_AGENT,
}
async def async_crawler(url, queue):
queued = [url] queued = [url]
crawled = [] crawled = []
while queued and (len(crawled) < HARD_LIMIT): while queued and (len(crawled) < HARD_LIMIT):
async with aiohttp.ClientSession() as session: async with aiohttp.ClientSession(headers=headers) as session:
try: try:
url = queued.pop(0) url = queued.pop(0)
except IndexError: except IndexError: