Compare commits


9 commits

6 changed files with 271 additions and 76 deletions


@ -5,7 +5,7 @@ from urllib.error import URLError
from urllib.parse import urlparse
from ssl import CertificateError
from random import sample, randrange
from random import sample, randrange, randint
import re
from datetime import datetime, timedelta
@ -15,6 +15,8 @@ import async_timeout
from bs4 import BeautifulSoup, Comment
from profiles.models import BrowserFingerprint, SearchEngine
# Ugly hack to use this module alone instead of integrating it with Django
# from django.conf import settings
@ -26,13 +28,11 @@ MAX_PER_PAGE = 10
FOOTER_URL = re.compile(".*footer.*")
SEARCH_ENGINE = []
class Settings:
USER_AGENT = 'Default User'
settings = Settings()
startup_time = datetime.min
def url_getter(html, current_page, root_url):
@ -82,8 +82,6 @@ def url_getter(html, current_page, root_url):
return links_list
class WebsiteSchedulerMeta(type):
""" Meta-class for WebsiteScheduler, allowing a singleton class-like
interface, but spawning one instance per canonical website URL """
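
For readers less familiar with this pattern, a minimal illustrative sketch (not taken from this changeset) of a metaclass that hands back one cached instance per key:

class PerKeyMeta(type):
    _instances = {}  # one instance per canonical key

    def __call__(cls, key, *args, **kwargs):
        # Intercept instantiation: build the object only the first time a
        # given key is seen, then keep returning that same object.
        if key not in cls._instances:
            cls._instances[key] = super().__call__(key, *args, **kwargs)
        return cls._instances[key]

class Resource(metaclass=PerKeyMeta):
    def __init__(self, key):
        self.key = key

assert Resource("https://example.org/") is Resource("https://example.org/")
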
@ -106,12 +104,17 @@ class WebsiteSchedulerMeta(type):
class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
""" Schedule the accesses to a website as of robots.txt """
def __init__(self, name):
search_engines = [] # Must be set by CrawlingThread.__init__
def __init__(self, name, user_agent):
self.name = name
self.last_crawled = datetime.fromtimestamp(0)
self.dead = False
self.can_fetch_b = False
if any(self.urlroot() in item for item in SEARCH_ENGINE):
self.user_agent = (user_agent if user_agent is not None
else settings.USER_AGENT)
if any(self.urlroot() in item for item in self.search_engines):
print("found a search engine for %s" % self.urlroot())
self.crawl_delay = timedelta(seconds=5)
self.can_fetch_b = True
@ -125,7 +128,7 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
robots_url = self.unsafe_urlroot() + 'robots.txt'
self.robot_parser = RobotFileParser(robots_url)
self.robot_parser.read()
except URLError: # Almost surely an offline website.
except URLError: # Almost surely an offline website.
self.dead = True
self.crawl_delay = 0
except Exception as e:
@ -134,9 +137,9 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
if not self.robot_parser.default_entry:
self.dead = True
if not self.dead:
delay = self.robot_parser.crawl_delay(settings.USER_AGENT)
delay = self.robot_parser.crawl_delay(self.user_agent)
if delay is None:
req_rate = self.robot_parser.request_rate(settings.USER_AGENT)
req_rate = self.robot_parser.request_rate(self.user_agent)
if req_rate is None:
delay = 5
else:
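
For context on the delay computation above, a short sketch of the urllib.robotparser calls it relies on (the URL and user agent here are placeholders):

from urllib.robotparser import RobotFileParser

parser = RobotFileParser("https://example.org/robots.txt")
parser.read()
delay = parser.crawl_delay("Default User")    # None unless robots.txt sets Crawl-delay
rate = parser.request_rate("Default User")    # None, or a named tuple (requests, seconds)
if delay is None:
    delay = 5 if rate is None else rate.seconds / rate.requests
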
@ -159,7 +162,9 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
def can_fetch(self, url):
''' Check whether this program can fetch a given page '''
return (self.can_fetch_b ) or ((not self.dead) and self.robot_parser.can_fetch(settings.USER_AGENT, url))
return ((self.can_fetch_b)
or ((not self.dead) and
self.robot_parser.can_fetch(self.user_agent, url)))
def fetching(self):
''' Tell the scheduler that a page is being fetched now '''
@ -170,26 +175,28 @@ class CrawlingThread(Thread):
""" A separate thread for the crawling task. This is needed to use asyncio,
since the thread will need its own event loop. """
def __init__(self, user, url, engine_list, queue):
global settings
global SEARCH_ENGINE
SEARCH_ENGINE = engine_list
def __init__(self, url, queue):
engine_list = [engine.url for engine in SearchEngine.objects.all()]
WebsiteScheduler.search_engines = engine_list
nb_fingerprint = len(BrowserFingerprint.objects.all())
fingerprint = BrowserFingerprint.objects.all()[
randint(0, nb_fingerprint - 1)]
self.headers = fingerprint.serialize_headers()
self.queue = queue
super(CrawlingThread, self).__init__()
if user:
settings.USER_AGENT = user.serialize_headers()
self.url = url
def run(self):
global startup_time
tasks = []
#tasks.append(async_crawler("http://plus.google.com/+Python"))
#tasks.append(async_crawler('https://python.org/'))
tasks.append(async_crawler(self.url, self.queue))
tasks.append(async_crawler(self.url, self.queue, self.headers))
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
startup_time = datetime.now()
loop.run_until_complete(asyncio.wait(tasks))
loop.close()
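
The docstring above explains why crawling happens in a dedicated thread: asyncio needs a per-thread event loop. A stripped-down sketch of that pattern (class and method names are illustrative):

import asyncio
from threading import Thread

class LoopThread(Thread):
    def run(self):
        # A freshly spawned thread has no event loop, so create and install one.
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        loop.run_until_complete(self.main())
        loop.close()

    async def main(self):
        await asyncio.sleep(0.1)

t = LoopThread()
t.start()
t.join()
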
@ -197,13 +204,16 @@ class CrawlingThread(Thread):
class PageGetter:
""" Asynchronously get a webpage, abiding by robots.txt """
def __init__(self, session, url):
headers = None
def __init__(self, session, url, user_agent):
self.url = url
self.session = session
self.user_agent = user_agent
async def get(self, ssl=True):
""" Actually retrieve the webpage """
scheduler = WebsiteScheduler(self.url)
scheduler = WebsiteScheduler(self.url, self.user_agent)
if not scheduler.can_fetch(self.url):
return None
@ -226,16 +236,22 @@ async def async_print(url):
async with aiohttp.ClientSession() as session:
html = await PageGetter(session, url).get(ssl=False)
print('GOT {}HTML for {} at {}'.format(
print('GOT {}HTML for {}'.format(
'None ' if html is None else '',
url,
datetime.now() - startup_time))
))
async def async_crawler(url, queue, headers=None):
if headers is None:
headers = {
'User-Agent': settings.USER_AGENT,
}
async def async_crawler(url, queue):
queued = [url]
crawled = []
while queued and (len(crawled) < HARD_LIMIT):
async with aiohttp.ClientSession() as session:
async with aiohttp.ClientSession(headers=headers) as session:
try:
url = queued.pop(0)
except IndexError:
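
The fingerprint headers are applied at session level, so aiohttp sends them with every request made through that session. A minimal usage sketch (URL and headers are placeholders):

import asyncio
import aiohttp

async def fetch(url, headers):
    async with aiohttp.ClientSession(headers=headers) as session:
        async with session.get(url) as response:
            return await response.text()

# asyncio.run(fetch("https://example.org/", {"User-Agent": "Default User"}))
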

data/.gitignore

@ -0,0 +1 @@
nicknames_dict


@ -0,0 +1,34 @@
# Generated by Django 2.0.1 on 2018-02-25 19:08
from django.db import migrations, models
import django.db.models.deletion
class Migration(migrations.Migration):
initial = True
dependencies = [
('profiles', '0001_initial'),
]
operations = [
migrations.CreateModel(
name='History',
fields=[
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('start_ts', models.DateTimeField(help_text='The starting timestamp of the history. Useful for cron-like structure.')),
('played', models.BooleanField(default=False)),
('user', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='profiles.Profile')),
],
),
migrations.CreateModel(
name='HistoryEntry',
fields=[
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('search', models.URLField(help_text='The url to be searched')),
('timestamp', models.DateTimeField()),
('history', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='histories.History')),
],
),
]


@ -3,14 +3,26 @@ entries, which look like human-based browsing, according to a dedicated user's
interests, keywords...
"""
from collections import namedtuple
import random
from math import floor
from queue import Queue
from xml.etree import ElementTree as ET
from datetime import datetime
from django.db import models
import profiles.models as profiles
from tor_runner import TorInstance
from crawl import crawl
from pinocchio.settings import HISTORY_MIN
from .tor_runner import TorInstance
class InvalidXml(Exception):
def __init__(self, what='unexpected XML data.'):
super().__init__()
self.what = what
def __str__(self):
return "Invalid XML: " + self.what
class HistoryEntry(models.Model):
@ -28,14 +40,48 @@ class HistoryEntry(models.Model):
"""
return "{} : {}".format(self.timestamp, self.search)
def to_xml(self, xml_root):
entry = ET.Element('history')
entry_url = ET.Element('url')
entry_url.text = self.search
entry_ts = ET.Element('timestamp')
entry_ts.text = str(self.timestamp.timestamp())
entry.append(entry_url)
entry.append(entry_ts)
xml_root.append(entry)
@staticmethod
def from_xml(xml_root, in_history):
if xml_root.tag != 'history':
raise InvalidXml("expected <history> tag here.")
url, timestamp = None, None
for child in xml_root:
if child.tag == 'url':
url = child.text
elif child.tag == 'timestamp':
try:
timestamp = datetime.fromtimestamp(float(child.text))
except (TypeError, ValueError):
raise InvalidXml("invalid timestamp {}".format(child.text))
else:
raise InvalidXml("unknown tag {} as child of <history>".format(
child.tag))
output = HistoryEntry()
output.search = url
output.timestamp = timestamp
output.history = in_history
return output
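
A short usage sketch of the two methods above (the `entry` and `history` objects are assumed to already exist; this is not part of the changeset):

from xml.etree import ElementTree as ET

root = ET.Element('root')
entry.to_xml(root)            # appends a node holding <url> and <timestamp>
rebuilt = HistoryEntry.from_xml(root[0], in_history=history)
assert rebuilt.search == entry.search
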
class History(models.Model):
""" A history for a user, containing some web connections (http, https).
Each history is timed so as to mimic human browsing behaviour. """
start_ts = models.DateTimeField(
help_text='The starting timestamp of the history. Useful for cron-like '
'structure.'
help_text=('The starting timestamp of the history. Useful for '
'cron-like structure.')
)
played = models.BooleanField(default=False)
@ -47,18 +93,18 @@ class History(models.Model):
def return_history(self):
""" Returns the history, sorted by increasing timestamps
"""
history_set = self.history_set.order_by('timestamp')
history_set = [(item.search, item.timestamp.date()) for item in history_set]
return history_set
output_history = self.historyentry_set.order_by('timestamp')
output_history = [(item.search, item.timestamp.date())
for item in output_history]
return output_history
def __str__(self):
""" Returns the string representation of a history.
"""
history_set = self.history_set.order_by('timestamp')
history_set = self.historyentry_set.order_by('timestamp')
header = "[History]:\n"
return header + "\n".join(history_set)
def play_histories(self):
""" Actually plays the history.
"""
@ -66,6 +112,52 @@ class History(models.Model):
runner = TorInstance(self.history)
self.save()
def to_xml(self, xml_root):
''' Exports the current history to xml '''
hist_node = ET.Element("history", attrib={
'start-ts': str(self.start_ts),
'played': '1' if self.played else '0',
'user': str(self.user.pk),
})
xml_root.append(hist_node)
for entry in self.historyentry_set.all():
entry.to_xml(hist_node)
@staticmethod
def from_xml(xml_root):
''' Loads a history from an XML node '''
REQUIRED_ATTR = ['start-ts', 'played', 'user']
if xml_root.tag != 'history':
raise InvalidXml('unexpected node {} as root of a history'.format(
xml_root.tag))
for attr in REQUIRED_ATTR:
if attr not in xml_root.attrib:
raise InvalidXml(('missing attribute "{}" for tag of type '
'history').format(attr))
start_ts = xml_root.attrib['start-ts']
played = xml_root.attrib['played']
user_pk = xml_root.attrib['user']
users = profiles.Profile.objects.filter(pk=user_pk)
if len(users) != 1:
raise InvalidXml('primary key for History {} is invalid'.format(
user_pk))
output = History()
output.start_ts = start_ts
output.played = int(played) > 0
output.user = users[0]
for child in xml_root:
HistoryEntry.from_xml(child, output)
return output
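
And a matching sketch for serialising a whole History (illustrative only, assuming `history` is a saved instance with its entries):

from xml.etree import ElementTree as ET

root = ET.Element('histories')
history.to_xml(root)
print(ET.tostring(root, encoding='unicode'))
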
PartialHistoryEntry = namedtuple('PartialHistoryEntry',
['url', 'timestamp'])
def generate_partial_history(user, t_start):
""" Generate the part of the history resulting from the crawl starting at
@ -74,36 +166,51 @@ def generate_partial_history(user, t_start):
timestamp = t_start
result = []
basis = generate_first_url(user)
result.append((basis, timestamp))
timestamp += 5* random.weibullvariate(1, 1.5)
result.append(PartialHistoryEntry(basis, timestamp))
t_start += 5 * random.weibullvariate(1, 1.5)
queue = Queue()
search_engine_query = profiles.SearchEngine.objects.all()
search_engine_list = [item.url for item in search_engine_query]
crawler = crawl.CrawlingThread(user, basis, search_engine_list, queue)
crawler = crawl.CrawlingThread(basis, queue)
crawler.start()
crawler.join()
urls = queue.get()
for url in urls:
timestamp += 5* random.weibullvariate(1, 1.5)
result.append((url, timestamp))
t_start += 5 * random.weibullvariate(1, 1.5)
result.append(PartialHistoryEntry(url, timestamp))
return result
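
The inter-click gaps above are drawn from a Weibull distribution; a quick sketch of what those draws look like:

import random

# 5 * Weibull(scale=1, shape=1.5): usually a few seconds, occasionally longer.
gaps = [5 * random.weibullvariate(1, 1.5) for _ in range(5)]
print(gaps)
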
def generate_first_url(user):
""" Generate the first url of a partial history, based on the user
information. """
interest = random.choice(
[user.interests.keywords.all(), user.interests.places.all(),
user.interests.websites.all(), user.interests.events.all()
]
)
def nonempty(seq):
out = []
for elt in seq:
if elt:
out.append(elt)
return out
all_keywords = profiles.Keyword.objects.filter(
interest__profile__in=[user])
all_websites = profiles.Website.objects.filter(
interest__profile__in=[user])
all_places = profiles.Place.objects.filter(
interest__profile__in=[user])
all_events = profiles.Event.objects.filter(
interest__profile__in=[user])
interest = random.choice(nonempty([
all_keywords,
all_websites,
all_places,
all_events,
]))
search_term = random.choice(interest)
url = search_term.generate_url(user)
return url
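
The nonempty() helper keeps random.choice from ever seeing an empty pool; random.choice itself works directly on a queryset because querysets support len() and indexing. A condensed illustration of the selection step:

pools = [all_keywords, all_websites, all_places, all_events]
pools = [pool for pool in pools if pool]      # drop interests the user does not have
interest = random.choice(pools)               # pick one non-empty queryset
search_term = random.choice(interest)         # pick one row from it
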
def generate_history(user, ts_start):
def generate_history(user, start_time):
""" Generate a new history for the user `user`, starting from timestamp
`ts_start`.
A few heuristics are used in order to give the impression that the history
@ -111,19 +218,27 @@ def generate_history(user, ts_start):
"""
# let's define a new history object.
history = History(start_ts=ts_start, user=user)
history = History(start_ts=start_time, user=user)
length = HISTORY_MIN + floor(10 * random.weibullvariate(1, 1.5))
history.full_clean()
history.save()
history_line = 0
current_timestamp = start_time.timestamp()
while history_line < length:
ts_start += 5 * random.weibullvariate(1, 2.8)
history_list = generate_partial_history(user, ts_start)
ts_start = history_list[-1].timestamp + 5 * random.weibullvariate(1, 5)
current_timestamp += 5 * random.weibullvariate(1, 2.8)
history_list = generate_partial_history(user, current_timestamp)
current_timestamp = \
history_list[-1].timestamp + 5 * random.weibullvariate(1, 5)
for (url, timestamp) in history_list:
new_line = HistoryEntry(
search=url,
timestamp=timestamp,
timestamp=datetime.fromtimestamp(timestamp),
history=history
)
new_line.full_clean()
new_line.save()
return history
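
generate_history works with POSIX timestamps (floats) internally and converts back to datetime objects when creating entries; the conversion pair used here, for reference:

from datetime import datetime

now = datetime.now()
as_float = now.timestamp()                    # datetime -> POSIX seconds
back = datetime.fromtimestamp(as_float)       # POSIX seconds -> datetime
assert abs((back - now).total_seconds()) < 1e-5
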


@ -1,11 +1,10 @@
#!/bin/bash
# -*- coding: UTF8 -*-
/usr/bin/python3 manage.py import_browser_fp
/usr/bin/python3 manage.py import_search_engine
/usr/bin/python3 manage.py import_keywords
/usr/bin/python3 manage.py import_website
/usr/bin/python3 manage.py import_places
/usr/bin/python3 manage.py import_events
/usr/bin/python3 manage.py import_interests
python3 manage.py import_browser_fp
python3 manage.py import_search_engine
python3 manage.py import_keywords
python3 manage.py import_website
python3 manage.py import_places
python3 manage.py import_events
python3 manage.py import_interests


@ -12,12 +12,36 @@ from django.db import models
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
NICKNAMES = open("/usr/share/dict/american-english").read().splitlines()
NICKNAMES = None
LASTNAMES = open(BASE_DIR + "/data/lastnames.txt").read().splitlines()
FIRSTNAMES = open(BASE_DIR + "/data/firstnames.txt").read().splitlines()
EMAIL_DOMAINS = open(BASE_DIR + "/data/email_domains.txt").read().splitlines()
def require_nicknames(fct):
def read_file(path):
global NICKNAMES
print("Trying {}".format(path))
with open(path, 'r') as handle:
NICKNAMES = handle.read().splitlines()
nicknames_files = [
os.path.join(BASE_DIR, 'data/nicknames_dict'),
"/usr/share/dict/american-english",
]
if NICKNAMES is None:
for nick_file in nicknames_files:
try:
read_file(nick_file)
break
except FileNotFoundError:
pass
if NICKNAMES is None:
raise FileNotFoundError
return fct
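
A sketch of how the decorator above is meant to be used: at decoration time it fills NICKNAMES from the first dictionary file it can read (data/nicknames_dict, then the system word list), and otherwise leaves the decorated function untouched:

@require_nicknames
def pick_nickname():
    return random.choice(NICKNAMES)
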
class InvalidData(Exception):
''' Raised when the DB contains invalid data and the requested operation
cannot be performed '''
@ -72,7 +96,7 @@ class Website(models.Model):
elif rand <= 0.1:
url = random.choice(self.notable_pages).url
elif rand <= 0.8:
search_term_text = self.name + " " + \
search_term_text = self.name + " " + \
random.choice(self.keywords)
url = user.search_engine.search_url(search_term_text)
else:
@ -122,7 +146,6 @@ class Event(models.Model):
return user.search_engine.search_url(" ".join(possibilities))
class BrowserFingerprint(models.Model):
''' A browser fingerprint, containing things like a user agent '''
@ -147,11 +170,11 @@ class BrowserFingerprint(models.Model):
def serialize_headers(self):
return {
"Description" : str(self.description),
"User-Agent" : str(self.useragent),
"Accept-Encoding" : str(self.accept_encoding),
"Accept" : str(self.accept_default),
"Accept-Language" : str(self.accept_lang),
"Description": str(self.description),
"User-Agent": str(self.useragent),
"Accept-Encoding": str(self.accept_encoding),
"Accept": str(self.accept_default),
"Accept-Language": str(self.accept_lang),
}
@ -162,8 +185,8 @@ class SearchEngine(models.Model):
url = models.URLField()
query_pattern = models.CharField(max_length=256) # This field is the
# query pattern. It should contain a `{}`, which, when substituted with a
# search term (using `.format()`), must yield a URL that can be resolved to
# perform the search
# search term (using `.format()`), must yield a URL tail that can be
# concatenated with `url` to perform a search (eg. `?q={}` for ddg).
def __str__(self):
return self.name
@ -171,9 +194,10 @@ class SearchEngine(models.Model):
def search_url(self, search_term):
''' Obtain a url to search `search_term` with this search engine '''
pattern = str(self.query_pattern)
search_term = str(search_term).replace(' ', '+')
if '{}' not in pattern:
raise InvalidData("Search engine {}: bad pattern".format(self))
return str(self.query_pattern).format(search_term)
return self.url + (str(self.query_pattern).format(search_term))
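
To make the url/query_pattern split concrete, a hypothetical engine using the `?q={}` pattern mentioned in the comment above:

engine = SearchEngine(name='ddg',
                      url='https://duckduckgo.com/',
                      query_pattern='?q={}')
print(engine.search_url('hello world'))
# -> https://duckduckgo.com/?q=hello+world
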
class Interest(models.Model):
@ -214,11 +238,13 @@ def generate_email(nick, first_name, last_name):
if random.random() < 0.3:
email = first_name + "." + last_name + "@" + domain
else:
email = nick + "@" + domain
email = nick + "@" + domain
return email
@require_nicknames
def create_profile(nick=None):
nick = "".join(random.sample(NICKNAMES, random.randrange(2,5)))
nick = "".join(random.sample(NICKNAMES, random.randrange(2, 5)))
first_name = random.choice(FIRSTNAMES)
last_name = random.choice(LASTNAMES)
email = generate_email(nick, first_name, last_name)
@ -227,7 +253,11 @@ def create_profile(nick=None):
first_name=first_name,
last_name=last_name,
email=email,
uses_url=(random.random() < 0.5),
uses_urls=(random.random() < 0.5),
)
profile.search_engine = random.choice(SearchEngine.objects.all())
profile.browser_fingerprint = random.choice(BrowserFingerprint.objects.all())
profile.full_clean()
profile.save()
return profile