Compare commits

...

9 Commits

6 changed files with 271 additions and 76 deletions


@@ -5,7 +5,7 @@ from urllib.error import URLError
 from urllib.parse import urlparse
 from ssl import CertificateError
-from random import sample, randrange
+from random import sample, randrange, randint
 import re
 from datetime import datetime, timedelta
@@ -15,6 +15,8 @@ import async_timeout
 from bs4 import BeautifulSoup, Comment
+from profiles.models import BrowserFingerprint, SearchEngine

 # Ugly hack to use this module alone instead of integrating it with Django
 # from django.conf import settings
@@ -26,13 +28,11 @@ MAX_PER_PAGE = 10
 FOOTER_URL = re.compile(".*footer.*")
-SEARCH_ENGINE = []

 class Settings:
     USER_AGENT = 'Default User'

 settings = Settings()
-startup_time = datetime.min


 def url_getter(html, current_page, root_url):
@@ -82,8 +82,6 @@ def url_getter(html, current_page, root_url):
     return links_list


 class WebsiteSchedulerMeta(type):
     """ Meta-class for WebsiteScheduler, allowing a singleton class-like
     interface, but spawning one instance per canonical website URL """
@@ -106,12 +104,17 @@ class WebsiteSchedulerMeta(type):
 class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
     """ Schedule the accesses to a website as of robots.txt """
-    def __init__(self, name):
+
+    search_engines = []  # Must be set by CrawlingThread.__init__
+
+    def __init__(self, name, user_agent):
         self.name = name
         self.last_crawled = datetime.fromtimestamp(0)
         self.dead = False
         self.can_fetch_b = False
-        if any(self.urlroot() in item for item in SEARCH_ENGINE):
+        self.user_agent = (user_agent if user_agent is not None
+                           else settings.USER_AGENT)
+        if any(self.urlroot() in item for item in self.search_engines):
             print("found a search engine for %s" % self.urlroot())
             self.crawl_delay = timedelta(seconds=5)
             self.can_fetch_b = True
@@ -125,7 +128,7 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
                 robots_url = self.unsafe_urlroot() + 'robots.txt'
                 self.robot_parser = RobotFileParser(robots_url)
                 self.robot_parser.read()
             except URLError:  # Almost surely an offline website.
                 self.dead = True
                 self.crawl_delay = 0
             except Exception as e:
@@ -134,9 +137,9 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
             if not self.robot_parser.default_entry:
                 self.dead = True
             if not self.dead:
-                delay = self.robot_parser.crawl_delay(settings.USER_AGENT)
+                delay = self.robot_parser.crawl_delay(self.user_agent)
                 if delay is None:
-                    req_rate = self.robot_parser.request_rate(settings.USER_AGENT)
+                    req_rate = self.robot_parser.request_rate(self.user_agent)
                     if req_rate is None:
                         delay = 5
                     else:
@@ -159,7 +162,9 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
     def can_fetch(self, url):
         ''' Check whether this program can fetch a given page '''
-        return (self.can_fetch_b ) or ((not self.dead) and self.robot_parser.can_fetch(settings.USER_AGENT, url))
+        return ((self.can_fetch_b)
+                or ((not self.dead) and
+                    self.robot_parser.can_fetch(self.user_agent, url)))

     def fetching(self):
         ''' Tell the scheduler that a page is being fetched now '''
@@ -170,26 +175,28 @@ class CrawlingThread(Thread):
     """ A separate thread for the crawling task. This is needed to use asyncio,
     since the thread will need its own event loop. """

-    def __init__(self, user, url, engine_list, queue):
-        global settings
-        global SEARCH_ENGINE
-        SEARCH_ENGINE = engine_list
+    def __init__(self, url, queue):
+        engine_list = [engine.url for engine in SearchEngine.objects.all()]
+        WebsiteScheduler.search_engines = engine_list
+
+        nb_fingerprint = len(BrowserFingerprint.objects.all())
+        fingerprint = BrowserFingerprint.objects.all()[
+            randint(0, nb_fingerprint - 1)]
+        self.headers = fingerprint.serialize_headers()
+
         self.queue = queue
         super(CrawlingThread, self).__init__()
-        if user:
-            settings.USER_AGENT = user.serialize_headers()
         self.url = url

     def run(self):
-        global startup_time
         tasks = []
         #tasks.append(async_crawler("http://plus.google.com/+Python"))
         #tasks.append(async_crawler('https://python.org/'))
-        tasks.append(async_crawler(self.url, self.queue))
+        tasks.append(async_crawler(self.url, self.queue, self.headers))

         loop = asyncio.new_event_loop()
         asyncio.set_event_loop(loop)
-        startup_time = datetime.now()
         loop.run_until_complete(asyncio.wait(tasks))
         loop.close()
@@ -197,13 +204,16 @@ class CrawlingThread(Thread):
 class PageGetter:
     """ Asynchronously get a webpage, abiding by robots.txt """
-    def __init__(self, session, url):
+
+    headers = None
+
+    def __init__(self, session, url, user_agent):
         self.url = url
         self.session = session
+        self.user_agent = user_agent

     async def get(self, ssl=True):
         """ Actually retrieve the webpage """
-        scheduler = WebsiteScheduler(self.url)
+        scheduler = WebsiteScheduler(self.url, self.user_agent)
         if not scheduler.can_fetch(self.url):
             return None
@@ -226,16 +236,22 @@ async def async_print(url):
     async with aiohttp.ClientSession() as session:
         html = await PageGetter(session, url).get(ssl=False)

-        print('GOT {}HTML for {} at {}'.format(
+        print('GOT {}HTML for {}'.format(
             'None ' if html is None else '',
             url,
-            datetime.now() - startup_time))
+        ))

-async def async_crawler(url, queue):
+
+async def async_crawler(url, queue, headers=None):
+    if headers is None:
+        headers = {
+            'User-Agent': settings.USER_AGENT,
+        }
     queued = [url]
     crawled = []
     while queued and (len(crawled) < HARD_LIMIT):
-        async with aiohttp.ClientSession() as session:
+        async with aiohttp.ClientSession(headers=headers) as session:
             try:
                 url = queued.pop(0)
             except IndexError:
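
Side note on the new interface: after this change the caller no longer passes a user, search-engine list, or fingerprint to CrawlingThread; it only supplies a start URL and a queue, and the thread pulls SearchEngine and BrowserFingerprint rows from the database itself. A minimal usage sketch (assuming Django is configured and the profiles fixtures are imported), mirroring how generate_partial_history in histories/models.py drives it further down:

    from queue import Queue
    from crawl import crawl

    queue = Queue()
    crawler = crawl.CrawlingThread('https://python.org/', queue)  # url and queue only
    crawler.start()
    crawler.join()
    urls = queue.get()  # URLs collected by async_crawler during the crawl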

data/.gitignore (new file)

@@ -0,0 +1 @@
+nicknames_dict


@@ -0,0 +1,34 @@
+# Generated by Django 2.0.1 on 2018-02-25 19:08
+
+from django.db import migrations, models
+import django.db.models.deletion
+
+
+class Migration(migrations.Migration):
+
+    initial = True
+
+    dependencies = [
+        ('profiles', '0001_initial'),
+    ]
+
+    operations = [
+        migrations.CreateModel(
+            name='History',
+            fields=[
+                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
+                ('start_ts', models.DateTimeField(help_text='The starting timestamp of the history. Useful for cron-like structure.')),
+                ('played', models.BooleanField(default=False)),
+                ('user', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='profiles.Profile')),
+            ],
+        ),
+        migrations.CreateModel(
+            name='HistoryEntry',
+            fields=[
+                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
+                ('search', models.URLField(help_text='The url to be searched')),
+                ('timestamp', models.DateTimeField()),
+                ('history', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='histories.History')),
+            ],
+        ),
+    ]


@@ -3,14 +3,26 @@ entries, which looks like human-based browsing, according to a dedicated user
 interests, keywords...
 """

+from collections import namedtuple
 import random
 from math import floor
 from queue import Queue
+from xml.etree import ElementTree as ET
+from datetime import datetime

 from django.db import models
 import profiles.models as profiles
-from tor_runner import TorInstance
 from crawl import crawl
 from pinocchio.settings import HISTORY_MIN
+from .tor_runner import TorInstance
+
+
+class InvalidXml(Exception):
+    def __init__(self, what='unexpected XML data.'):
+        super().__init__()
+        self.what = what
+
+    def __str__(self):
+        return "Invalid XML: " + self.what


 class HistoryEntry(models.Model):
@@ -28,14 +40,48 @@ class HistoryEntry(models.Model):
         """
         return "{} : {}".format(self.timestamp, self.search)

+    def to_xml(self, xml_root):
+        entry = ET.Element('history')
+        entry_url = ET.Element('url')
+        entry_url.text = self.search
+        entry_ts = ET.Element('timestamp')
+        entry_ts.text = self.timestamp.timestamp()
+        entry.append(entry_url)
+        entry.append(entry_ts)
+        xml_root.append(entry)
+
+    @staticmethod
+    def from_xml(xml_root, in_history):
+        if xml_root.tag != 'history':
+            raise InvalidXml("expected <history> tag here.")
+        url, timestamp = None, None
+        for child in xml_root:
+            if child.tag == 'url':
+                url = child.text
+            elif child.tag == 'timestamp':
+                try:
+                    timestamp = datetime.fromtimestamp(child.text)
+                except TypeError:
+                    raise InvalidXml("invalid timestamp {}".format(child.text))
+            else:
+                raise InvalidXml("unknown tag {} as child of <history>".format(
+                    child.tag))
+        output = HistoryEntry()
+        output.search = url
+        output.timestamp = timestamp
+        output.history = in_history
+        return output
+

 class History(models.Model):
     """ A history for a user, containing some web connections (http, https).
     Each history is timed, in a human-behaviour manner. """

     start_ts = models.DateTimeField(
-        help_text='The starting timestamp of the history. Useful for cron-like '
-        'structure.'
+        help_text=('The starting timestamp of the history. Useful for '
+                   'cron-like structure.')
     )

     played = models.BooleanField(default=False)
@@ -47,18 +93,18 @@ class History(models.Model):
     def return_history(self):
         """ Returns the history, sorted by increasing timestamps
        """
-        history_set = self.history_set.order_by('timestamp')
-        history_set = [(item.search, item.timestamp.date()) for item in history_set]
-        return history_set
+        output_history = self.historyentry_set.order_by('timestamp')
+        output_history = [(item.search, item.timestamp.date())
+                          for item in output_history]
+        return output_history

     def __str__(self):
         """ Returns the string representation of a history.
        """
-        history_set = self.history_set.order_by('timestamp')
+        history_set = self.historyentry_set.order_by('timestamp')
         header = "[History]:\n"
         return header + "\n".join(history_set)

     def play_histories(self):
         """ Actually plays the history.
        """
@@ -66,6 +112,52 @@ class History(models.Model):
         runner = TorInstance(self.history)
         self.save()

+    def to_xml(self, xml_root):
+        ''' Exports the current history to xml '''
+        hist_node = ET.Element("history", attrib={
+            'start-ts': self.start_ts,
+            'played': 1 if self.played else 0,
+            'user': self.user.pk,
+        })
+        xml_root.append(hist_node)
+        for entry in self.historyentry_set:
+            entry.to_xml(hist_node)
+
+    @staticmethod
+    def from_xml(xml_root):
+        ''' Loads an history from an XML file '''
+        REQUIRED_ATTR = ['start-ts', 'played', 'user']
+        if xml_root.tag != 'history':
+            raise InvalidXml('unexpected node {} as root of an history'.format(
+                xml_root.tag))
+        for attr in REQUIRED_ATTR:
+            if attr not in xml_root.attrib:
+                raise InvalidXml(('missing attribute "{}" for tag of type '
+                                  'history').format(attr))
+        start_ts = xml_root.attrib['start-ts']
+        played = xml_root.attrib['played']
+        user_pk = xml_root.attrib['user']
+        users = History.objects.filter(pk=1)
+        if len(users) != 1:
+            raise InvalidXml('primary key for History {} is invalid'.format(
+                user_pk))
+        output = History()
+        output.start_ts = start_ts
+        output.played = played > 0
+        output.user = users[0]
+        for child in xml_root:
+            HistoryEntry.from_xml(child, output)
+        return output
+
+
+PartialHistoryEntry = namedtuple('PartialHistoryEntry',
+                                 ['url', 'timestamp'])
+

 def generate_partial_history(user, t_start):
     """ Generate the part of the history resulting from the crawl starting at
@@ -74,36 +166,51 @@ def generate_partial_history(user, t_start):
     timestamp = t_start
     result = []
     basis = generate_first_url(user)
-    result.append((basis, timestamp))
-    timestamp += 5* random.weibullvariate(1, 1.5)
+    result.append(PartialHistoryEntry(basis, timestamp))
+    t_start += 5 * random.weibullvariate(1, 1.5)
     queue = Queue()
-    search_engine_query = profiles.SearchEngine.objects.all()
-    search_engine_list = [item.url for item in search_engine_query]
-    crawler = crawl.CrawlingThread(user, basis, search_engine_list, queue)
+    crawler = crawl.CrawlingThread(basis, queue)
     crawler.start()
     crawler.join()
     urls = queue.get()
     for url in urls:
-        timestamp += 5* random.weibullvariate(1, 1.5)
-        result.append((url, timestamp))
+        t_start += 5 * random.weibullvariate(1, 1.5)
+        result.append(PartialHistoryEntry(url, timestamp))
     return result


 def generate_first_url(user):
     """ Generate the first url of a partial history, based on the user
     information. """
-    interest = random.choice(
-        [user.interests.keywords.all(), user.interests.places.all(),
-         user.interests.websites.all(), user.interests.events.all()
-         ]
-    )
+
+    def nonempty(seq):
+        out = []
+        for elt in seq:
+            if elt:
+                out.append(elt)
+        return out
+
+    all_keywords = profiles.Keyword.objects.filter(
+        interest__profile__in=[user])
+    all_websites = profiles.Website.objects.filter(
+        interest__profile__in=[user])
+    all_places = profiles.Place.objects.filter(
+        interest__profile__in=[user])
+    all_events = profiles.Event.objects.filter(
+        interest__profile__in=[user])
+
+    interest = random.choice(nonempty([
+        all_keywords,
+        all_websites,
+        all_places,
+        all_events,
+    ]))
+
     search_term = random.choice(interest)
     url = search_term.generate_url(user)
     return url


-def generate_history(user, ts_start):
+def generate_history(user, start_time):
     """ Generate a new history for the user `user`, starting from timestamp
     `ts_start`.

     A few heuristics are used in order to give the impression that the history
@@ -111,19 +218,27 @@ def generate_history(user, ts_start):
     """
     # let's define a new history object.
-    history = History(start_ts=ts_start, user=user)
+    history = History(start_ts=start_time, user=user)
     length = HISTORY_MIN + floor(10 * random.weibullvariate(1, 1.5))
+    history.full_clean()
+    history.save()

     history_line = 0
+    current_timestamp = start_time.timestamp()

     while history_line < length:
-        ts_start += 5 * random.weibullvariate(1, 2.8)
-        history_list = generate_partial_history(user, ts_start)
-        ts_start = history_list[-1].timestamp + 5 * random.weibullvariate(1, 5)
+        current_timestamp += 5 * random.weibullvariate(1, 2.8)
+        history_list = generate_partial_history(user, current_timestamp)
+        current_timestamp = \
+            history_list[-1].timestamp + 5 * random.weibullvariate(1, 5)
         for (url, timestamp) in history_list:
             new_line = HistoryEntry(
                 search=url,
-                timestamp=timestamp,
+                timestamp=datetime.fromtimestamp(timestamp),
                 history=history
             )
+            new_line.full_clean()
             new_line.save()
+
+    return history
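
For orientation, a rough sketch of how the new XML round-trip on History is meant to be used (hypothetical driver code: the 'histories' wrapper tag and the file name are assumptions, and it presupposes saved History rows with their entries):

    from xml.etree import ElementTree as ET

    # Export every history under one container element.
    root = ET.Element('histories')          # hypothetical wrapper tag
    for hist in History.objects.all():
        hist.to_xml(root)
    ET.ElementTree(root).write('histories.xml')

    # Re-import: each <history> child is handed back to History.from_xml.
    for node in ET.parse('histories.xml').getroot():
        restored = History.from_xml(node)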


@@ -1,11 +1,10 @@
 #!/bin/bash
 # -*- coding: UTF8 -*-
-/usr/bin/python3 manage.py import_browser_fp
-/usr/bin/python3 manage.py import_search_engine
-/usr/bin/python3 manage.py import_keywords
-/usr/bin/python3 manage.py import_website
-/usr/bin/python3 manage.py import_places
-/usr/bin/python3 manage.py import_events
-/usr/bin/python3 manage.py import_interests
+python3 manage.py import_browser_fp
+python3 manage.py import_search_engine
+python3 manage.py import_keywords
+python3 manage.py import_website
+python3 manage.py import_places
+python3 manage.py import_events
+python3 manage.py import_interests


@@ -12,12 +12,36 @@ from django.db import models
 BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

-NICKNAMES = open("/usr/share/dict/american-english").read().splitlines()
+NICKNAMES = None
 LASTNAMES = open(BASE_DIR + "/data/lastnames.txt").read().splitlines()
 FIRSTNAMES = open(BASE_DIR + "/data/firstnames.txt").read().splitlines()
 EMAIL_DOMAINS = open(BASE_DIR + "/data/email_domains.txt").read().splitlines()


+def require_nicknames(fct):
+    def read_file(path):
+        global NICKNAMES
+        print("Trying {}".format(path))
+        with open(path, 'r') as handle:
+            NICKNAMES = handle.read().splitlines()
+
+    nicknames_files = [
+        os.path.join(BASE_DIR, 'data/nicknames_dict'),
+        "/usr/share/dict/american-english",
+    ]
+    if NICKNAMES is None:
+        for nick_file in nicknames_files:
+            try:
+                read_file(nick_file)
+                break
+            except FileNotFoundError:
+                pass
+        if NICKNAMES is None:
+            raise FileNotFoundError
+    return fct
+
+
 class InvalidData(Exception):
     ''' Thrown when the DB contains invalid data, and cannot perform
     something '''
@@ -72,7 +96,7 @@ class Website(models.Model):
         elif rand <= 0.1:
             url = random.choice(self.notable_pages).url
         elif rand <= 0.8:
             search_term_text = self.name + " " + \
                 random.choice(self.keywords)
             url = user.search_engine.search_url(search_term_text)
         else:
@@ -122,7 +146,6 @@ class Event(models.Model):
         return user.search_engine.search_url(" ".join(possibilities))


 class BrowserFingerprint(models.Model):
     ''' A browser fingerprint, containing things like a user agent '''
@@ -147,11 +170,11 @@ class BrowserFingerprint(models.Model):
     def serialize_headers(self):
         return {
-            "Description" : str(self.description),
-            "User-Agent" : str(self.useragent),
-            "Accept-Encoding" : str(self.accept_encoding),
-            "Accept" : str(self.accept_default),
-            "Accept-Language" : str(self.accept_lang),
+            "Description": str(self.description),
+            "User-Agent": str(self.useragent),
+            "Accept-Encoding": str(self.accept_encoding),
+            "Accept": str(self.accept_default),
+            "Accept-Language": str(self.accept_lang),
         }
@@ -162,8 +185,8 @@ class SearchEngine(models.Model):
     url = models.URLField()
     query_pattern = models.CharField(max_length=256)  # This field is the
     # query pattern. It should contain a `{}`, which, when substituted with a
-    # search term (using `.format()`), must yield a URL that can be resolved to
-    # perform the search
+    # search term (using `.format()`), must yield a URL tail that can be
+    # concatenated with `url` to perform a search (eg. `?q={}` for ddg).

     def __str__(self):
         return self.name
@@ -171,9 +194,10 @@ class SearchEngine(models.Model):
     def search_url(self, search_term):
         ''' Obtain a url to search `search_term` with this search engine '''
         pattern = str(self.query_pattern)
+        search_term = str(search_term).replace(' ', '+')
         if '{}' not in pattern:
             raise InvalidData("Search engine {}: bad pattern".format(self))
-        return str(self.query_pattern).format(search_term)
+        return self.url + (str(self.query_pattern).format(search_term))


 class Interest(models.Model):
@@ -214,11 +238,13 @@ def generate_email(nick, first_name, last_name):
     if random.random() < 0.3:
         email = first_name + "." + last_name + "@" + domain
     else:
         email = nick + "@" + domain
     return email


+@require_nicknames
 def create_profile(nick=None):
-    nick = "".join(random.sample(NICKNAMES, random.randrange(2,5)))
+    nick = "".join(random.sample(NICKNAMES, random.randrange(2, 5)))
     first_name = random.choice(FIRSTNAMES)
     last_name = random.choice(LASTNAMES)
     email = generate_email(nick, first_name, last_name)
@@ -227,7 +253,11 @@ def create_profile(nick=None):
         first_name=first_name,
         last_name=last_name,
         email=email,
-        uses_url=(random.random() < 0.5),
+        uses_urls=(random.random() < 0.5),
     )
     profile.search_engine = random.choice(SearchEngine.objects.all())
     profile.browser_fingerprint = random.choice(BrowserFingerprint.objects.all())
+    profile.full_clean()
+    profile.save()
+    return profile
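
To illustrate the reworked SearchEngine.search_url: the query pattern is now only the URL tail, the engine's base url is prepended, and spaces in the search term become '+'. A small sketch with a hypothetical engine row (the field values are assumptions, following the ddg example in the comment above):

    engine = SearchEngine(name='DuckDuckGo',
                          url='https://duckduckgo.com/',
                          query_pattern='?q={}')
    engine.search_url('python asyncio')
    # -> 'https://duckduckgo.com/?q=python+asyncio'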