Crawler: use a random fingerprint
parent 4a8bd32516
commit 4f0148cb63
1 changed file with 35 additions and 20 deletions
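In short: instead of overwriting the global settings.USER_AGENT, each CrawlingThread now draws one random BrowserFingerprint, serializes it into HTTP headers, and passes those headers down to the aiohttp session and the robots.txt scheduler. A minimal sketch of the selection step, assuming (as the diff below does) that BrowserFingerprint is a Django model exposing serialize_headers():

    from random import randint
    from profiles.models import BrowserFingerprint

    # Pick one stored fingerprint at random; assumes the table is non-empty,
    # otherwise randint(0, -1) raises ValueError.
    fingerprints = BrowserFingerprint.objects.all()
    fingerprint = fingerprints[randint(0, len(fingerprints) - 1)]
    headers = fingerprint.serialize_headers()  # dict of header name -> value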
@@ -5,7 +5,7 @@ from urllib.error import URLError
 from urllib.parse import urlparse
 
 from ssl import CertificateError
-from random import sample, randrange
+from random import sample, randrange, randint
 import re
 from datetime import datetime, timedelta
 
@@ -15,6 +15,8 @@ import async_timeout
 
 from bs4 import BeautifulSoup, Comment
 
+from profiles.models import BrowserFingerprint
+
 # Ugly hack to use this module alone instead of integrating it with Django
 # from django.conf import settings
 
@@ -32,7 +34,6 @@ class Settings:
     USER_AGENT = 'Default User'
 
 settings = Settings()
-startup_time = datetime.min
 
 
 def url_getter(html, current_page, root_url):
@@ -82,8 +83,6 @@ def url_getter(html, current_page, root_url):
     return links_list
 
-
-
 
 class WebsiteSchedulerMeta(type):
     """ Meta-class for WebsiteScheduler, allowing a singleton class-like
     interface, but spawning one instance per canonical website URL """
@@ -106,11 +105,13 @@ class WebsiteSchedulerMeta(type):
 
 class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
     """ Schedule the accesses to a website as of robots.txt """
-    def __init__(self, name):
+    def __init__(self, name, user_agent):
         self.name = name
         self.last_crawled = datetime.fromtimestamp(0)
         self.dead = False
         self.can_fetch_b = False
+        self.user_agent = (user_agent if user_agent is not None
+                           else settings.USER_AGENT)
         if any(self.urlroot() in item for item in SEARCH_ENGINE):
             print("found a search engine for %s" % self.urlroot())
             self.crawl_delay = timedelta(seconds=5)
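WebsiteScheduler now receives the user agent explicitly and only falls back to settings.USER_AGENT when none is given. A small illustrative sketch of the new constructor contract (URL and agent string are placeholders); note that, because the metaclass hands out one instance per canonical website, the user agent supplied on first construction is the one that sticks for that site:

    scheduler = WebsiteScheduler('https://example.com/some/page', 'Mozilla/5.0 (sketch)')
    fallback = WebsiteScheduler('https://example.org/other', None)
    # fallback.user_agent == settings.USER_AGENT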
@@ -125,7 +126,7 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
                 robots_url = self.unsafe_urlroot() + 'robots.txt'
                 self.robot_parser = RobotFileParser(robots_url)
                 self.robot_parser.read()
             except URLError: # Almost surely an offline website.
                 self.dead = True
                 self.crawl_delay = 0
         except Exception as e:
@@ -134,9 +135,9 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
         if not self.robot_parser.default_entry:
             self.dead = True
         if not self.dead:
-            delay = self.robot_parser.crawl_delay(settings.USER_AGENT)
+            delay = self.robot_parser.crawl_delay(self.user_agent)
             if delay is None:
-                req_rate = self.robot_parser.request_rate(settings.USER_AGENT)
+                req_rate = self.robot_parser.request_rate(self.user_agent)
                 if req_rate is None:
                     delay = 5
                 else:
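For reference, urllib.robotparser resolves Crawl-delay and Request-rate per user agent, which is why the parser is now queried with self.user_agent rather than the global default. A standalone sketch (example.com is a placeholder):

    from urllib.robotparser import RobotFileParser

    parser = RobotFileParser('https://example.com/robots.txt')
    parser.read()

    ua = 'Mozilla/5.0 (sketch)'
    delay = parser.crawl_delay(ua)    # None unless a Crawl-delay applies to ua
    rate = parser.request_rate(ua)    # None or a (requests, seconds) named tuple
    if delay is None and rate is not None:
        delay = rate.seconds / rate.requests
    allowed = parser.can_fetch(ua, 'https://example.com/some/page')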
@@ -159,7 +160,9 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
 
     def can_fetch(self, url):
         ''' Check whether this program can fetch a given page '''
-        return (self.can_fetch_b ) or ((not self.dead) and self.robot_parser.can_fetch(settings.USER_AGENT, url))
+        return ((self.can_fetch_b)
+                or ((not self.dead) and
+                    self.robot_parser.can_fetch(self.user_agent, url)))
 
     def fetching(self):
         ''' Tell the scheduler that a page is being fetched now '''
@@ -174,22 +177,25 @@ class CrawlingThread(Thread):
         global settings
         global SEARCH_ENGINE
         SEARCH_ENGINE = engine_list
 
+        nb_fingerprint = len(BrowserFingerprint.objects.all())
+        fingerprint = BrowserFingerprint.objects.all()[
+            randint(0, nb_fingerprint - 1)]
+        self.headers = fingerprint.serialize_headers()
+
         self.queue = queue
         super(CrawlingThread, self).__init__()
-        if user:
-            settings.USER_AGENT = user.serialize_headers()
         self.url = url
 
     def run(self):
-        global startup_time
         tasks = []
 
         #tasks.append(async_crawler("http://plus.google.com/+Python"))
         #tasks.append(async_crawler('https://python.org/'))
-        tasks.append(async_crawler(self.url, self.queue))
+        tasks.append(async_crawler(self.url, self.queue, self.headers))
 
         loop = asyncio.new_event_loop()
         asyncio.set_event_loop(loop)
-        startup_time = datetime.now()
         loop.run_until_complete(asyncio.wait(tasks))
         loop.close()
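The selection above evaluates the queryset once for len() and again for the indexing; an equivalent sketch that lets the database pick the row instead, assuming BrowserFingerprint is an ordinary Django model (an empty table still needs handling either way):

    # order_by('?') asks the database for a random ordering; first() returns
    # None instead of raising when no fingerprints are stored.
    fingerprint = BrowserFingerprint.objects.order_by('?').first()
    headers = fingerprint.serialize_headers() if fingerprint else {}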
@@ -197,13 +203,16 @@ class CrawlingThread(Thread):
 class PageGetter:
     """ Asynchronously get a webpage, abiding by robots.txt """
 
-    def __init__(self, session, url):
+    headers = None
+
+    def __init__(self, session, url, user_agent):
         self.url = url
         self.session = session
+        self.user_agent = user_agent
 
     async def get(self, ssl=True):
         """ Actually retrieve the webpage """
-        scheduler = WebsiteScheduler(self.url)
+        scheduler = WebsiteScheduler(self.url, self.user_agent)
         if not scheduler.can_fetch(self.url):
             return None
 
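With the extra parameter, code constructing a PageGetter is expected to pass the user agent through; a minimal sketch of the new call shape (function name and values are illustrative, not part of the diff):

    import aiohttp

    async def fetch_one(url, headers):
        # The same identity is used for the request headers and for the
        # robots.txt check performed by WebsiteScheduler inside get().
        async with aiohttp.ClientSession(headers=headers) as session:
            getter = PageGetter(session, url, headers.get('User-Agent'))
            return await getter.get(ssl=False)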
@@ -226,16 +235,22 @@ async def async_print(url):
     async with aiohttp.ClientSession() as session:
         html = await PageGetter(session, url).get(ssl=False)
 
-    print('GOT {}HTML for {} at {}'.format(
+    print('GOT {}HTML for {}'.format(
         'None ' if html is None else '',
         url,
-        datetime.now() - startup_time))
+    ))
 
 
-async def async_crawler(url, queue):
+async def async_crawler(url, queue, headers=None):
+    if headers is None:
+        headers = {
+            'User-Agent': settings.USER_AGENT,
+        }
+
     queued = [url]
     crawled = []
     while queued and (len(crawled) < HARD_LIMIT):
-        async with aiohttp.ClientSession() as session:
+        async with aiohttp.ClientSession(headers=headers) as session:
             try:
                 url = queued.pop(0)
             except IndexError:
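aiohttp applies session-level headers to every request made from that session, so serializing the fingerprint once and handing the dict to ClientSession covers the whole crawl. A self-contained sketch of that mechanism (URL and header values are placeholders):

    import asyncio
    import aiohttp

    async def fetch(url, headers=None):
        headers = headers or {'User-Agent': 'Default User'}
        async with aiohttp.ClientSession(headers=headers) as session:
            async with session.get(url) as response:
                return await response.text()

    # asyncio.run(fetch('https://example.com/'))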