Compare commits


No commits in common. "master" and "rdf" have entirely different histories.
master ... rdf

30 changed files with 81 additions and 7777 deletions

@@ -1,6 +1,3 @@
# mpri-webdam
Generate realistic fake browsing histories for borderline and/or activist
users, to hide real traffic from global surveillance.
Lacks proper documentation at the moment `:(`
Generate loads of fake histories. Because this course has to be validated somehow.

@@ -4,7 +4,7 @@ from urllib.error import URLError
from urllib.parse import urlparse
from ssl import CertificateError
from random import sample, randrange, randint
from random import sample, randrange
import re
from datetime import datetime, timedelta
@@ -14,8 +14,6 @@ import async_timeout
from bs4 import BeautifulSoup, Comment
from profiles.models import BrowserFingerprint, SearchEngine
# Ugly hack to use this module alone instead of integrating it with Django
# from django.conf import settings
@@ -27,11 +25,11 @@ MAX_PER_PAGE = 10
FOOTER_URL = re.compile(".*footer.*")
class Settings:
USER_AGENT = 'Default User'
USER_AGENT = 'Blah'
settings = Settings()
startup_time = datetime.now()
def url_getter(html, current_page, root_url):
@@ -62,7 +60,7 @@ def url_getter(html, current_page, root_url):
elif link.startswith('/'): #Internal link, linking to page root url
links_list.append(root_url + link)
elif link.startswith("#"):
continue
print("Invalid link : internal bookmark")
else:
links_list.append(current_page + "/" + link)
@@ -73,14 +71,11 @@ def url_getter(html, current_page, root_url):
# Works only with python >= 3.6
links_list = list(dict.fromkeys(links_list))
forbidden_words = ['login', 'agreement', 'mailto', 'settings']
links_list = [link for link in links_list if not any(word in link.lower()
for word in
forbidden_words)]
return links_list
class WebsiteSchedulerMeta(type):
""" Meta-class for WebsiteScheduler, allowing a singleton class-like
interface, but spawning one instance per canonical website URL """
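The diff does not show the metaclass body, so here is a minimal sketch of the per-canonical-URL singleton pattern the docstring describes; the class names, the _instances cache and the urlparse-based key are assumptions, not the project's actual code.

from urllib.parse import urlparse

class PerSiteSingletonMeta(type):
    """ One instance per canonical website root; later calls reuse it. """
    _instances = {}  # canonical root -> instance (assumed cache)

    def __call__(cls, url, *args, **kwargs):
        parsed = urlparse(url)
        key = "{}://{}/".format(parsed.scheme, parsed.netloc)  # canonical root
        if key not in cls._instances:
            cls._instances[key] = super().__call__(url, *args, **kwargs)
        return cls._instances[key]

class Site(metaclass=PerSiteSingletonMeta):
    def __init__(self, url):
        self.url = url

assert Site("https://example.org/a") is Site("https://example.org/b")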
@@ -103,47 +98,34 @@ class WebsiteSchedulerMeta(type):
class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
""" Schedule the accesses to a website as of robots.txt """
search_engines = [] # Must be set by CrawlingThread.__init__
def __init__(self, name, user_agent):
def __init__(self, name):
self.name = name
self.last_crawled = datetime.fromtimestamp(0)
self.dead = False
self.can_fetch_b = False
self.user_agent = (user_agent if user_agent is not None
else settings.USER_AGENT)
if any(self.urlroot() in item for item in self.search_engines):
print("found a search engine for %s" % self.urlroot())
self.crawl_delay = timedelta(seconds=5)
self.can_fetch_b = True
else:
try:
robots_url = self.urlroot() + 'robots.txt'
self.robot_parser = RobotFileParser(robots_url)
self.robot_parser.read() # TODO async?
except (URLError, CertificateError):
try:
robots_url = self.urlroot() + 'robots.txt'
robots_url = self.unsafe_urlroot() + 'robots.txt'
self.robot_parser = RobotFileParser(robots_url)
self.robot_parser.read() # TODO async?
except (URLError, CertificateError):
try:
robots_url = self.unsafe_urlroot() + 'robots.txt'
self.robot_parser = RobotFileParser(robots_url)
self.robot_parser.read()
except URLError: # Almost surely an offline website.
self.dead = True
self.crawl_delay = 0
except Exception as e:
print(e)
raise e
if not self.robot_parser.default_entry:
self.robot_parser.read()
except URLError: # Almost surely an offline website.
self.dead = True
if not self.dead:
delay = self.robot_parser.crawl_delay(self.user_agent)
if delay is None:
req_rate = self.robot_parser.request_rate(self.user_agent)
if req_rate is None:
delay = 5
else:
delay = req_rate.requests, req_rate.seconds
self.crawl_delay = timedelta(seconds=delay)
self.crawl_delay = 0
except Exception as e:
print(e)
raise e
if not self.dead:
delay = self.robot_parser.crawl_delay(settings.USER_AGENT)
if delay is None:
req_rate = self.robot_parser.request_rate(settings.USER_AGENT)
if req_rate is None:
delay = 5
else:
delay = req_rate.requests, req_rate.seconds
self.crawl_delay = timedelta(seconds=delay)
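The block above derives a per-site delay from robots.txt: Crawl-delay first, then Request-rate, then a 5-second default. As written, `delay = req_rate.requests, req_rate.seconds` builds a tuple, which `timedelta(seconds=...)` would reject; the sketch below assumes the intent was seconds per request and is not the project's code.

from datetime import timedelta
from urllib.robotparser import RobotFileParser

def crawl_delay_for(robots_url, user_agent, default=5):
    """ Hedged helper: per-site crawl delay derived from robots.txt. """
    parser = RobotFileParser(robots_url)
    parser.read()
    delay = parser.crawl_delay(user_agent)
    if delay is None:
        rate = parser.request_rate(user_agent)  # e.g. "Request-rate: 1/10"
        delay = default if rate is None else rate.seconds / rate.requests
    return timedelta(seconds=delay)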
def urlroot(self):
''' Get the root url for this website '''
@@ -161,9 +143,7 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
def can_fetch(self, url):
''' Check whether this program can fetch a given page '''
return ((self.can_fetch_b)
or ((not self.dead) and
self.robot_parser.can_fetch(self.user_agent, url)))
return (not self.dead) and self.robot_parser.can_fetch(settings.USER_AGENT, url)
def fetching(self):
''' Tell the scheduler that a page is being fetched now '''
@@ -174,47 +154,30 @@ class CrawlingThread(Thread):
""" A separate thread for the crawling task. This is needed to use asyncio,
since the thread will need its own event loop. """
def __init__(self, url):
engine_list = [engine.url for engine in SearchEngine.objects.all()]
WebsiteScheduler.search_engines = engine_list
nb_fingerprint = len(BrowserFingerprint.objects.all())
fingerprint = BrowserFingerprint.objects.all()[
randint(0, nb_fingerprint - 1)]
self.headers = fingerprint.serialize_headers()
self.output_tree = []
def __init__(self):
super(CrawlingThread, self).__init__()
self.url = url
def run(self):
tasks = []
#tasks.append(async_crawler("http://plus.google.com/+Python"))
#tasks.append(async_crawler('https://python.org/'))
tasks.append(run_crawl(self.url, self.output_tree, self.headers))
tasks.append(async_print('https://python.org/'))
try:
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
loop.run_until_complete(asyncio.wait(tasks))
finally:
loop.close()
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
loop.run_until_complete(asyncio.wait(tasks))
loop.close()
class PageGetter:
""" Asynchronously get a webpage, abiding by robots.txt """
headers = None
def __init__(self, session, url, user_agent):
def __init__(self, session, url):
self.url = url
self.session = session
self.user_agent = user_agent
async def get(self, ssl=True):
""" Actually retrieve the webpage """
scheduler = WebsiteScheduler(self.url, self.user_agent)
scheduler = WebsiteScheduler(self.url)
if not scheduler.can_fetch(self.url):
return None
@@ -224,7 +187,7 @@ class PageGetter:
delay = scheduler.fetch_delay()
scheduler.fetching()
async with async_timeout.timeout(10):
async with self.session.get(self.url, verify_ssl=ssl) as resp:
async with self.session.get(self.url, ssl=ssl) as resp:
try:
return await resp.text()
except UnicodeDecodeError:
@@ -234,89 +197,44 @@ class PageGetter:
async def async_print(url):
""" Debug function to follow what's actually happening """
async with aiohttp.ClientSession() as session:
html = await PageGetter(session, url,
settings.USER_AGENT).get(ssl=False)
html = await PageGetter(session, url).get(ssl=False)
print('GOT {}HTML for {}'.format(
print('GOT {}HTML for {} at {}'.format(
'None ' if html is None else '',
url,
))
datetime.now() - startup_time))
async def async_crawler(url):
queue = [url]
crawled = []
while queue or (len(crawled) < HARD_LIMIT):
async with aiohttp.ClientSession() as session:
try:
url = queue.pop(0)
except IndexError:
print("Error queue is empty")
return crawled
parsed_url = urlparse(url)
print("Crawling {}".format(url))
html = await PageGetter(session, url).get(ssl=False)
if html:
new_urls = url_getter(
html,
url,
parsed_url.scheme + "://" + parsed_url.netloc
)
crawled += [url]
if new_urls:
sampled = sample(
new_urls,
randrange(min(MAX_PER_PAGE, len(new_urls)))
)
queue += [sample_url for sample_url in sampled if
sample_url not in queue and sample_url not in
crawled]
print(crawled)
class CrawlElem:
''' Describes a crawled element, to be assembled into a tree '''
def __init__(self, url, parent):
self.url = url
self.parent = parent
async def run_crawl(url, output_tree, headers=None):
''' Starts a crawling session '''
if headers is None:
headers = {}
if 'User-Agent' not in headers:
headers['User-Agent'] = settings.USER_AGENT
user_agent = headers['User-Agent']
crawled = set()
async with aiohttp.ClientSession(headers=headers) as session:
await async_crawler(
url, output_tree, crawled, user_agent, session, None)
def simplify_url(url):
anchor = url.find('#')
if anchor >= 0:
url = url[:anchor]
prot = url.find('://')
if prot >= 0:
url = url[prot+3:]
if url.startswith('www.'):
url = url[4:]
return url
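simplify_url is the deduplication key for the `crawled` set: the anchor, the scheme and a leading "www." are stripped. A quick sanity check with illustrative URLs:

assert simplify_url('https://www.example.org/page#section') == 'example.org/page'
assert simplify_url('http://example.org/a') == 'example.org/a'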
async def async_crawler(url, out_tree, crawled, user_agent, session, parent):
if len(crawled) >= HARD_LIMIT:
return
crawled.add(simplify_url(url))
parsed_url = urlparse(url)
print("Crawling {}".format(url))
try:
with async_timeout.timeout(3):
html = await PageGetter(session, url, user_agent).get(ssl=False)
except asyncio.TimeoutError:
return
new_tasks = []
if html:
this_elem = CrawlElem(url, parent)
out_tree.append(this_elem)
new_urls = url_getter(
html,
url,
parsed_url.scheme + "://" + parsed_url.netloc
)
if new_urls:
sampled = sample(
new_urls,
randrange(min(MAX_PER_PAGE, len(new_urls)))
)
for sample_url in sampled:
if simplify_url(sample_url) not in crawled:
new_tasks.append(async_crawler(
sample_url, out_tree, crawled, user_agent, session,
this_elem))
else:
print("No html received")
if len(crawled) >= HARD_LIMIT:
return
if new_tasks:
await asyncio.wait(new_tasks)
if __name__ == '__main__':
crawl = CrawlingThread()
crawl.start()
crawl.join()
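A hedged usage sketch of the threaded crawler, mirroring how histories/models.py drives it further down; it assumes the url-taking __init__ from one side of the diff, the URL is illustrative, and the constructor touches the Django models, so it only runs inside the project.

crawler = CrawlingThread('https://python.org/')   # assumed url-taking variant
crawler.start()
crawler.join()                                    # wait for the crawl's event loop to finish
for elem in crawler.output_tree:                  # CrawlElem(url, parent) nodes
    print(elem.url, '<-', elem.parent.url if elem.parent else 'root')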

data/.gitignore
@@ -1 +0,0 @@
nicknames_dict

@@ -1,14 +0,0 @@
gmail.com
protonmail.com
riseup.net
tutanoto.com
outlook.fr
fastmail.com
yandex.com
aim.com
icloud.com
yahoo.com
fmx.fr
mail.com
hushmail.com
inbox.com

@@ -1,27 +0,0 @@
[
{
"name" : "Atelier Anti-Pub",
"date" : "07/03/2018 19:00 UTC",
"place" : {
"name" : "Centre Social Autogéré Vaydom",
"address" : "37 rue Marceau, Ivry-sur-Seine",
"lat" : "48.81787",
"lon" : "2.38032"
}
},
{
"name" : "Rassemblement de soutien pour Bure",
"date" : "27/02/2018 17:00 UTC",
"place" : {
"name" : "Place Saint-Michel",
"address" : "Place Saint-Michel, 75005 Paris",
"lat" : "48.85374",
"lon" : "2.34455"
}
},
{
"name" : "Création d'un serveur mail",
"date" : "15/02/2018 12:00 UTC",
"place" : "La Mutinerie"
}
]

@@ -1,200 +0,0 @@
Jean
Marie
Philippe
Nathalie
Michel
Isabelle
Alain
Sylvie
Patrick
Catherine
Nicolas
Martine
Christophe
Christine
Pierre
Françoise
Christian
Valérie
Éric
Sandrine
Frédéric
Stéphanie
Laurent
Véronique
Stéphane
Sophie
David
Céline
Pascal
Chantal
Daniel
Patricia
Alexandre
Anne
Julien
Brigitte
Thierry
Julie
Olivier
Monique
Bernard
Aurélie
Thomas
Nicole
Sébastien
Laurence
Gérard
Annie
Didier
Émilie
Dominique
Dominique
Vincent
Virginie
François
Corinne
Bruno
Élodie
Guillaume
Christelle
Jérôme
Camille
Jacques
Caroline
Marc
Léa
Maxime
Sarah
Romain
Florence
Claude
Laetitia
Antoine
Audrey
Franck
Hélène
Jean-Pierre
Laura
Anthony
Manon
Kévin
Michèle
Gilles
Cécile
Cédric
Christiane
Serge
Béatrice
André
Claire
Mathieu
Nadine
Benjamin
Delphine
Patrice
Pauline
Fabrice
Karine
Joël
Mélanie
Jérémy
Marion
Clément
Chloe
Arnaud
Jacqueline
Denis
Elisabeth
Paul
Evelyne
Lucas
Marine
Hervé
Claudine
Jean-Claude
Anais
Sylvain
Lucie
Yves
Danielle
Ludovic
Carole
Guy
Fabienne
Florian
Mathilde
Damien
Sandra
Alexis
Pascale
Mickaël
Annick
Quentin
Charlotte
Emmanuel
Emma
Louis
Severine
Benoît
Sabrina
Jean-Luc
Amandine
Fabien
Myriam
Francis
Jocelyne
Hugo
Alexandra
Jonathan
Angelique
Loïc
Josiane
Xavier
Joelle
Théo
Agnes
Adrien
Mireille
Raphaël
Vanessa
Jean-François
Justine
Grégory
Sonia
Robert
Bernadette
Michaël
Emmanuelle
Valentin
Oceane
Cyril
Amelie
Jean-Marc
Clara
René
Maryse
Lionel
Anne-marie
Yannick
Fanny
Enzo
Magali
Yannis
Marie-christine
Jean-Michel
Morgane
Baptiste
Ines
Matthieu
Nadia
Rémi
Muriel
Georges
Jessica
Aurélien
Laure
Nathan
Genevieve
Jean-Paul
Estelle

@@ -1,55 +0,0 @@
[
{
"name": "occupation",
"keywords": [
{"keyword" : "ZAD NDDL"},
{"keyword" : "Organiser un squat"},
{"keyword" : "mobilisation et rassemblement"}
],
"places": [
{"place" : "Zad NDDL"},
{"place" : "Zad Bure"}
],
"websites": [
{"website": "zad nadir"}
],
"events": [
{"event": "Rassemblement de soutien pour Bure"}
]
},
{
"name": "LGBT",
"keywords": [
{"keyword" : "Discrimniation sexistes, quelles actions ?"},
{"keyword" : "gender queer Paris"},
{"keyword" : "Existrans Paris"}
],
"places": [
{"place" : "La Mutinerie"}
],
"websites": [
{"website": "emmaclit"},
{"website": "paris-luttes info"}
],
"events": [
{"event": "Création d'un serveur mail"}
]
},
{
"name": "Anti pub",
"keywords": [
{"keyword" : "Affichage JCDecaux"},
{"keyword" : "Anti-pub"},
{"keyword" : "Journée contre la publicité"}
],
"places": [
{"place" : "Centre Social Autogéré Vaydom"}
],
"websites": [
{"website": "paris-luttes info"}
],
"events": [
{"event": "Atelier Anti-Pub"}
]
}
]

@@ -1,17 +0,0 @@
{
"list": [
{ "keyword" : "gender queer Paris"},
{"keyword" : "fabriquer masque manif"},
{"keyword" : "Se protéger en manif"},
{"keyword" : "Legal team manif France"},
{"keyword" : "Guide juridique GAV"},
{"keyword" : "Échec du capitaisme"},
{"keyword" : "Bienfait du communisme"},
{"keyword" : "Le comité invisible"},
{"keyword" : "À nos enfants"},
{"keyword" : "Squats sur Paris"},
{"keyword" : "Local facho à Strasbourg"},
{"keyword" : "Discrimation sexistes, quelles actions ?"},
{"keyword" : "Pourquoi la lutte des classes"}
]
}

@@ -1,200 +0,0 @@
Martin
Bernard
Thomas
Petit
Robert
Richard
Durand
Dubois
Moreau
Laurent
Simon
Michel
Lefebvre
Leroy
Roux
David
Bertrand
Morel
Fournier
Girard
Bonnet
Dupont
Lambert
Fontaine
Rousseau
Vincent
Muller
Lefevre
Faure
Andre
Mercier
Blanc
Guerin
Boyer
Garnier
Chevalier
Francois
Legrand
Gauthier
Garcia
Perrin
Robin
Clement
Morin
Nicolas
Henry
Roussel
Mathieu
Gautier
Masson
Marchand
Duval
Denis
Dumont
Marie
Lemaire
Noel
Meyer
Dufour
Meunier
Brun
Blanchard
Giraud
Joly
Riviere
Lucas
Brunet
Gaillard
Barbier
Arnaud
Martinez
Gerard
Roche
Renard
Schmitt
Roy
Leroux
Colin
Vidal
Caron
Picard
Roger
Fabre
Aubert
Lemoine
Renaud
Dumas
Lacroix
Olivier
Philippe
Bourgeois
Pierre
Benoit
Rey
Leclerc
Payet
Rolland
Leclercq
Guillaume
Lecomte
Lopez
Jean
Dupuy
Guillot
Hubert
Berger
Carpentier
Sanchez
Dupuis
Moulin
Louis
Deschamps
Huet
Vasseur
Perez
Boucher
Fleury
Royer
Klein
Jacquet
Adam
Paris
Poirier
Marty
Aubry
Guyot
Carre
Charles
Renault
Charpentier
Menard
Maillard
Baron
Bertin
Bailly
Herve
Schneider
Fernandez
Le Gall
Collet
Leger
Bouvier
Julien
Prevost
Millet
Perrot
Daniel
Le Roux
Cousin
Germain
Breton
Besson
Langlois
Remy
Le Goff
Pelletier
Leveque
Perrier
Leblanc
Barre
Lebrun
Marchal
Weber
Mallet
Hamon
Boulanger
Jacob
Monnier
Michaud
Rodriguez
Guichard
Gillet
Etienne
Grondin
Poulain
Tessier
Chevallier
Collin
Chauvin
Da Silva
Bouchet
Gay
Lemaitre
Benard
Marechal
Humbert
Reynaud
Antoine
Hoarau
Perret
Barthelemy
Cordier
Pichon
Lejeune
Gilbert
Lamy
Delaunay
Pasquier
Carlier
Laporte

@@ -1,26 +0,0 @@
[
{
"place" : {
"name" : "Zad NDDL",
"address" : "Notre-Dame-des-landes, 44111",
"lat" : "47.3435",
"lon": "-1.7367"
}
},
{
"place" : {
"name" : "La Mutinerie",
"address" : "176 - 178 rue Saint Martin, 75003 Paris",
"lat" : "48.8625665",
"lon": "2.3522237"
}
},
{
"place" : {
"name" : "Zad Bure",
"address" : "2 rue de l'Église, 55290 Bure",
"lat" : "48.502",
"lon": "5.351"
}
}
]

@@ -1,44 +0,0 @@
[
{
"searchengine": {
"name":"Google",
"url":"https://google.com/",
"query_pattern": "search?q={}"
}
},
{
"searchengine": {
"name":"Duckduckgo",
"url":"https://duckduckgo.com/",
"query_pattern":"?q={}"
}
},
{
"searchengine": {
"name":"Duckduckgo Lite",
"url":"https://duckduckgo.com/lite/",
"query_pattern":"?q={}"
}
},
{
"searchengine": {
"name":"Qwant",
"url":"https://www.qwant.com/",
"query_pattern":"?q={}"
}
},
{
"searchengine": {
"name":"Qwant lite",
"url":"https://lite.qwant.com/",
"query_pattern":"?q={}"
}
},
{
"searchengine": {
"name":"Framabee",
"url":"https://framabee.org/",
"query_pattern":"?q={}"
}
}
]

File diff suppressed because it is too large.

@@ -1,93 +0,0 @@
[
{
"name":"emmaclit",
"url":"https://emmaclit.com/",
"keywords": [
{"keyword":"Charge mentale"},
{"keyword":"Un autre regard"},
{"keyword":"Un petit poutou"},
{"keyword":"solidarité"},
{"keyword":"dédicace"}
],
"notable_pages": [
{"webpage": "https://emmaclit.com/2017/05/09/repartition-des-taches-hommes-femmes/"},
{"webpage": "https://emmaclit.com/2016/12/01/une-famille-parmi-dautres/"},
{"webpage": "https://emmaclit.com/2017/09/11/travaille-pourquoi/"}
]
},
{
"name":"paris-luttes info",
"url":"https://paris-luttes.info/",
"keywords": [
{"keyword":"manifestations"},
{"keyword":"solidarité immigré·e·s"},
{"keyword":"grève salariés"},
{"keyword":"prison"},
{"keyword":"violence policère"}
],
"notable_pages": [
{"webpage": "https://paris-luttes.info/-analyse-et-reflexion-?lang=fr"},
{"webpage": "https://paris-luttes.info/comment-publier-sur-paris-luttes-134?lang=fr"},
{"webpage": "https://paris-luttes.info/pourquoi-et-comment-utiliser-tor-9013?lang=fr"}
]
},
{
"name":"zad nadir",
"url":"http://zad.nadir.org/",
"keywords": [
{"keyword":"Écologie"},
{"keyword":"opération césar"},
{"keyword":"expulsion vinci"},
{"keyword":"adresse"},
{"keyword":"la wardine"},
{"keyword":"route des chicanes"},
{"keyword":"opposition à l'aéroport Grand Ouest"}
],
"notable_pages": [
{"webpage": "http://zad.nadir.org/spip.php?article86&lang=fr"},
{"webpage": "http://zad.nadir.org/spip.php?article515&lang=fr"},
{"webpage": "http://zad.nadir.org/spip.php?rubrique71"},
{"webpage": "https://zad.nadir.org/spip.php?rubrique70"}
]
},
{
"name":"Fnac",
"url":"https://www.fnac.com/",
"keywords": [
{"keyword":"smartphone"},
{"keyword":"SAV"},
{"keyword":"Macbook"},
{"keyword":"TV"},
{"keyword":"PC Gaming"},
{"keyword":"DVD"},
{"keyword":"Home Cinema Philips"},
{"keyword":"Billeterie"}
],
"notable_pages": [
{"webpage": "https://www.fnac.com/Informatique/shi48966/w-4#bl=MMinfo"},
{"webpage": "https://www.fnac.com/Service/default.aspx#bl=footer"},
{"webpage": "https://www.fnac.com/Ventes-Flash/shi42245/w-4#bl=marktlink1"},
{"webpage": "https://www.fnac.com/Home-cinema-barre-de-son-et-enceinte-TV/shi474916/w-4#bl=MMtvh"}
]
},
{
"name":"Sea Shepherd",
"url":"https://www.seashepherd.fr/",
"keywords": [
{"keyword":"pirates"},
{"keyword":"Phoques"},
{"keyword":"Paul Watson"},
{"keyword":"harponnage"},
{"keyword":"seal"},
{"keyword":"Chasse aux dauphins"},
{"keyword":"participation"},
{"keyword":"boutique"}
],
"notable_pages": [
{"webpage": "http://www.seashepherd.fr/index.php/qui-sommes-nous"},
{"webpage": "http://nyamba.seashepherd.info/"},
{"webpage": "http://seashepherd-shop.com/en/"},
{"webpage": "http://seashepherd.fr/index.php/qui-sommes-nous/sea-shepherd-france"}
]
}
]

@@ -1,16 +0,0 @@
from django.core.management.base import BaseCommand
from profiles import models as profiles
from histories.models import generate_history
from datetime import datetime
class Command(BaseCommand):
''' Generates a history and prints the related XML '''
def add_arguments(self, parser):
pass
def handle(self, *args, **kwargs):
prof = profiles.Profile.objects.all()[0]
history = generate_history(prof, datetime.now())
print(history.to_xml_string())

@@ -1,34 +0,0 @@
# Generated by Django 2.0.1 on 2018-02-25 19:08
from django.db import migrations, models
import django.db.models.deletion
class Migration(migrations.Migration):
initial = True
dependencies = [
('profiles', '0001_initial'),
]
operations = [
migrations.CreateModel(
name='History',
fields=[
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('start_ts', models.DateTimeField(help_text='The starting timestamp of the history. Useful for cron-like structure.')),
('played', models.BooleanField(default=False)),
('user', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='profiles.Profile')),
],
),
migrations.CreateModel(
name='HistoryEntry',
fields=[
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('search', models.URLField(help_text='The url to be searched')),
('timestamp', models.DateTimeField()),
('history', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='histories.History')),
],
),
]

@@ -1,276 +1,3 @@
""" Models for the history. This history should be able to generate history
entries, which looks like human-based browsing, according to a dedicated user
interests, keywords...
"""
from collections import namedtuple
import random
import asyncio
from math import floor
from xml.etree import ElementTree as ET
from datetime import datetime
from django.db import models
from django.core.exceptions import ValidationError
import profiles.models as profiles
from crawl import crawl
from pinocchio.settings import HISTORY_MIN
from .tor_runner import TorInstance
class InvalidXml(Exception):
def __init__(self, what='unexpected XML data.'):
super().__init__()
self.what = what
def __str__(self):
return "Invalid XML: " + self.what
class HistoryEntry(models.Model):
""" A history entry, aka a url, and a timestamp.
"""
search = models.URLField(help_text="The url to be searched")
timestamp = models.DateTimeField()
history = models.ForeignKey(
'History',
on_delete=models.CASCADE
)
def __str__(self):
""" Returns the string representation of a history entry.
"""
return "{} : {}".format(self.timestamp, self.search)
def to_xml(self, xml_root):
entry = ET.Element('history')
entry_url = ET.Element('url')
entry_url.text = str(self.search)
entry_ts = ET.Element('timestamp')
entry_ts.text = str(self.timestamp.timestamp())
entry.append(entry_url)
entry.append(entry_ts)
xml_root.append(entry)
@staticmethod
def from_xml(xml_root, in_history):
if xml_root.tag != 'history':
raise InvalidXml("expected <history> tag here.")
url, timestamp = None, None
for child in xml_root:
if child.tag == 'url':
url = child.text
elif child.tag == 'timestamp':
try:
timestamp = datetime.fromtimestamp(child.text)
except TypeError:
raise InvalidXml("invalid timestamp {}".format(child.text))
else:
raise InvalidXml("unknown tag {} as child of <history>".format(
child.tag))
output = HistoryEntry()
output.search = url
output.timestamp = timestamp
output.history = in_history
return output
class History(models.Model):
""" A history for a user, containing some web connections (http, https).
Each history is timed, in a human-behaviour manner. """
start_ts = models.DateTimeField(
help_text=('The starting timestamp of the history. Useful for '
'cron-like structure.')
)
played = models.BooleanField(default=False)
user = models.ForeignKey(
profiles.Profile,
on_delete=models.CASCADE
)
def return_history(self):
""" Returns the history, sorted by increasing timestamps
"""
output_history = self.historyentry_set.order_by('timestamp')
output_history = [(item.search, item.timestamp.date())
for item in output_history]
return output_history
def __str__(self):
""" Returns the string representation of a history.
"""
entries = self.historyentry_set.order_by('timestamp')
output = "[History]:\n"
for entry in entries:
output += str(entry) + '\n'
return output
async def _handler(self):
runner = await TorInstance.create(self.return_history(), self.user.browser_fingerprint.serialize_headers())
await runner.run()
self.played = True
self.save()
def play_histories(self):
""" Actually plays the history.
"""
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
loop.run_until_complete(asyncio.wait([self._handler()]))
def to_xml(self, xml_root=None):
''' Exports the current history to xml '''
standalone = False
if xml_root is None:
standalone = True
xml_root = ET.Element('root')
hist_node = ET.Element("history", attrib={
'start-ts': str(self.start_ts),
'played': '1' if self.played else '0',
'user': str(self.user.pk),
})
xml_root.append(hist_node)
for entry in self.historyentry_set.all():
entry.to_xml(hist_node)
if standalone:
return xml_root
def to_xml_string(self):
xml = self.to_xml()
return ET.tostring(xml)
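Taken together, History.to_xml wraps each entry emitted by HistoryEntry.to_xml, so to_xml_string() produces XML of roughly this shape (attribute and text values are illustrative, not real data):

# <root>
#   <history start-ts="2018-02-25 19:08:00" played="0" user="1">
#     <history>
#       <url>https://duckduckgo.com/?q=ZAD+NDDL</url>
#       <timestamp>1519585680.0</timestamp>
#     </history>
#   </history>
# </root>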
@staticmethod
def from_xml(xml_root):
''' Loads a history from an XML file '''
REQUIRED_ATTR = ['start-ts', 'played', 'user']
if xml_root.tag != 'history':
raise InvalidXml('unexpected node {} as root of an history'.format(
xml_root.tag))
for attr in REQUIRED_ATTR:
if attr not in xml_root.attrib:
raise InvalidXml(('missing attribute "{}" for tag of type '
'history').format(attr))
start_ts = xml_root.attrib['start-ts']
played = xml_root.attrib['played']
user_pk = xml_root.attrib['user']
users = profiles.Profile.objects.filter(pk=user_pk)
if len(users) != 1:
raise InvalidXml('primary key for History {} is invalid'.format(
user_pk))
output = History()
output.start_ts = start_ts
output.played = played > 0
output.user = users[0]
for child in xml_root:
HistoryEntry.from_xml(child, output)
return output
PartialHistoryEntry = namedtuple('PartialHistoryEntry',
['url', 'timestamp'])
def generate_partial_history(user, t_start):
""" Generate the part of the history resulting from the crawl starting at
the given url.
"""
timestamp = t_start
result = []
basis = generate_first_url(user)
t_start += 5 * random.weibullvariate(1, 1.5)
crawler = crawl.CrawlingThread(basis)
crawler.start()
crawler.join()
urls_tree = crawler.output_tree
open_time = {}
for elem in urls_tree:
url, parent = elem.url, elem.parent
timestamp = 0
if parent is None:
timestamp = t_start
else:
timestamp = open_time[parent] + 5 * random.weibullvariate(1, 1.5)
open_time[elem] = timestamp
result.append(PartialHistoryEntry(url, timestamp))
return result
def generate_first_url(user):
""" Generate the first url of a partial history, based on the user
information. """
def nonempty(seq):
out = []
for elt in seq:
if elt:
out.append(elt)
return out
all_keywords = profiles.Keyword.objects.filter(
interest__profile__in=[user])
all_websites = profiles.Website.objects.filter(
interest__profile__in=[user])
all_places = profiles.Place.objects.filter(
interest__profile__in=[user])
all_events = profiles.Event.objects.filter(
interest__profile__in=[user])
interest = random.choice(nonempty([
all_keywords,
all_websites,
all_places,
all_events,
]))
search_term = random.choice(interest)
url = search_term.generate_url(user)
return url
def generate_history(user, start_time):
""" Generate a new history for the user `user`, starting from timestamp
`ts_start`.
A few heuristics are used in order to give the impression that the history
is actually played by a user.
"""
# let's define a new history object.
history = History(start_ts=start_time, user=user)
length = HISTORY_MIN + floor(10 * random.weibullvariate(1, 1.5))
history.full_clean()
history.save()
current_timestamp = start_time.timestamp()
hist_size = 0
while hist_size < length:
current_timestamp += 5 * random.weibullvariate(1, 2.8)
history_list = generate_partial_history(user, current_timestamp)
current_timestamp = \
history_list[-1].timestamp + 5 * random.weibullvariate(1, 5)
for (url, timestamp) in history_list:
if len(url) < 200:
new_line = HistoryEntry(
search=url,
timestamp=datetime.fromtimestamp(timestamp),
history=history
)
try:
new_line.full_clean()
new_line.save()
hist_size += 1
except ValidationError:
continue
return history
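The timing heuristic above draws every gap as 5 * random.weibullvariate(1, k), i.e. positive pauses of a few seconds whose spread depends on the shape k; a quick, self-contained illustration (values are only an example):

import random

random.seed(0)  # reproducible illustration only
for k in (1.5, 2.8, 5):
    gaps = [5 * random.weibullvariate(1, k) for _ in range(3)]
    print(k, [round(g, 1) for g in gaps])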
# Create your models here.

@@ -1,123 +0,0 @@
"""
Module that handles Tor instance creation in order to safely replay histories
"""
import shutil
import datetime as dt
from time import sleep
import asyncio
import aiohttp
from aiosocks.connector import ProxyConnector, ProxyClientRequest
import async_timeout
import stem.process as tor
class TorInstance():
"""
A tor instance object, with some useful information.
It is designed to be used as a worker in order to replay a history.
"""
BASE_SOCKS_PORT = 40000
BASE_CONTROL_PORT = 20000
BASE_DATA_DIR = "/tmp/tor{}/"
TOR_RUNNER = 0
@classmethod
async def create(cls, history, headers):
""" Factory creation of tor processes"""
socks_port = cls.BASE_SOCKS_PORT + cls.TOR_RUNNER
control_port = cls.BASE_CONTROL_PORT + cls.TOR_RUNNER
data_dir = cls.BASE_DATA_DIR.format(cls.TOR_RUNNER)
TorInstance.TOR_RUNNER += 1
self = TorInstance()
self.socks_port = socks_port
self.control_port = control_port
self.data_dir = data_dir
self.history = history
self.headers = headers
self.proxy = "socks5://127.0.0.1:{}".format(self.socks_port)
self.create_session()
self.process = tor.launch_tor_with_config(
config={
'ControlPort' : str(control_port),
'SocksPort' : str(socks_port),
'DataDir' : data_dir
}
)
return self
def __init__(self):
self.socks_port = 0
self.control_port = 0
self.data_dir = ""
self.history = None
self.proxy = ""
self.headers = {}
self.session = None
self.process = None
async def run(self):
""" Runs the Tor Instance on the history.
"""
while (self.history) and (dt.datetime.combine(self.history[0][1],
dt.datetime.min.time()) -
dt.datetime.now()).total_seconds() >= 10:
print("Sleeping")
sleep(10)
while self.history:
item = self.history.pop(0)
async with async_timeout.timeout(30):
await(self.query(item[0]))
now = dt.datetime.now()
print(self.history[0])
if now <= dt.datetime.combine(self.history[0][1], dt.datetime.min.time()):
sleep((dt.datetime.combine(self.history[0][1], dt.datetime.min.time()) - now).total_seconds())
def create_session(self):
""" Create a aiohttp session.
"""
conn = ProxyConnector(remote_resolve=True)
self.session = aiohttp.ClientSession(
connector=conn,
headers=self.headers,
request_class=ProxyClientRequest
)
async def query(self, url):
""" Performs a query.
"""
async with async_timeout.timeout(30):
async with self.session.get(
url,
proxy=self.proxy,
proxy_auth=None) as resp:
try:
return await resp.text()
except UnicodeDecodeError:
return None
def __str__(self):
""" Utility function """
return ('[TOR] SOCKSPort: {0.socks_port}, ControlPort: '
'{0.control_port}, DataDir: {0.data_dir}'.format(self))
async def kill(self):
""" Kills the process and remove the data dir"""
self.process.kill()
self.session.close()
shutil.rmtree(self.data_dir)
async def main():
""" Test function """
for _ in range(3):
instance = await TorInstance.create(None, {"user-agent" : "Blah"})
await instance.query("https://python.org/")
print("One page received")
await instance.kill()
if __name__ == "__main__":
LOOP = asyncio.get_event_loop()
LOOP.run_until_complete(main())

@@ -13,9 +13,6 @@ https://docs.djangoproject.com/en/2.0/ref/settings/
import os
from .settings_local import BASE_DIR, DEBUG, SECRET_KEY, DATABASES
HISTORY_MIN = 25
ALLOWED_HOSTS = []
@@ -29,7 +26,6 @@ INSTALLED_APPS = [
'django.contrib.messages',
'django.contrib.staticfiles',
'profiles',
'histories',
'crawl',
]
@@ -97,7 +93,7 @@ USE_I18N = True
USE_L10N = True
USE_TZ = False # We don't really care, we want POSIX timestamps
USE_TZ = True
# Static files (CSS, JavaScript, Images)

@@ -1,10 +0,0 @@
#!/bin/bash
# -*- coding: UTF8 -*-
python3 manage.py import_browser_fp
python3 manage.py import_search_engine
python3 manage.py import_keywords
python3 manage.py import_website
python3 manage.py import_places
python3 manage.py import_events
python3 manage.py import_interests

@@ -1,41 +0,0 @@
""" Small module that import browser fingerprints into the databose,
based on the data listed in https://huit.re/user-agent-json.
"""
import json
from django.core.management.base import BaseCommand
from django.db import models
from profiles.models import BrowserFingerprint
def import_file(filename):
with open(filename, mode='r') as file:
data = json.load(file)
data = data[0]["list"]
for os_agent in data:
for useragent in os_agent["useragents"]:
import_useragent(useragent)
def import_useragent(useragent):
fingerprint = BrowserFingerprint(
description=useragent.get("description", ""),
useragent=useragent.get("useragent", ""),
appname=useragent.get("appname", ""),
appversion=useragent.get("appversion", ""),
platform=useragent.get("platform", ""),
vendor=useragent.get("vendor", ""),
vendorsub=useragent.get("vendorsub", ""),
buildID=useragent.get("buildID", ""),
oscpu=useragent.get("oscpu", ""),
accept_encoding=useragent.get("accept_encoding", ""),
accept_default=useragent.get("accept_default", ""),
accept_lang=useragent.get("accept_lang", ""),
pixeldepth=int(useragent.get("pixeldepth", 0)),
colordepth=int(useragent.get("colordepth", 0)),
screens=useragent.get("screen", ""),
)
print(fingerprint)
fingerprint.save()
class Command(BaseCommand):
def handle(self, *args, **kwargs):
import_file("data/user-agent.json")

@@ -1,41 +0,0 @@
""" Small module that import events into the database.
"""
import json
from datetime import datetime
from django.core.management.base import BaseCommand
from django.db import models
from profiles.models import Place, Event
def import_file(filename):
with open(filename, mode='r') as file:
data = json.load(file)
for event in data:
import_event(event)
def import_place(_place):
place = Place(
name=_place.get("name", ""),
address=_place.get("address", ""),
lat=float(_place.get("lat", 0)),
lon=float(_place.get("lon", 0))
)
place.save()
return place
def import_event(_event):
if isinstance(_event["place"], str):
place = Place.objects.get(name=_event["place"])
else:
place = import_place(_event["place"])
event = Event(
name=_event.get("name", ""),
date=datetime.strptime(_event.get("date", "01/01/1970 00:00 UTC"), "%d/%m/%Y %H:%M %Z"),
place=place
)
#print(event)
event.save()
class Command(BaseCommand):
def handle(self, *args, **kwargs):
import_file("data/events.json")

@@ -1,51 +0,0 @@
""" Small module that import interests into the database.
"""
import json
from datetime import datetime
from django.core.management.base import BaseCommand
from django.db import models
from django.core.exceptions import ObjectDoesNotExist
from profiles.models import Keyword, Interest, Place, Website, Event
def import_file(filename):
with open(filename, mode='r') as file:
data = json.load(file)
for interest in data:
import_interest(interest)
def import_interest(_interest):
keywords = []
places = []
websites = []
for keyword in _interest.get("keywords", []):
try:
stored = Keyword.objects.get(text=keyword["keyword"])
keywords.append(stored)
except ObjectDoesNotExist:
new_keyword = Keyword(text=keyword["keyword"])
new_keyword.save()
keywords.append(new_keyword)
print("New keyword %s" % new_keyword)
for place in _interest.get("places", []):
places.append(Place.objects.get(name=place["place"]))
for website in _interest.get("websites", []):
websites.append(Website.objects.get(name=website["website"]))
interest = Interest(
name=_interest.get("name", ""),
)
interest.save()
for keyword in keywords:
print(keyword)
interest.keywords.add(keyword)
for place in places:
interest.places.add(place)
for website in websites:
interest.websites.add(website)
interest.save()
class Command(BaseCommand):
def handle(self, *args, **kwargs):
import_file("data/interests.json")

@@ -1,20 +0,0 @@
""" Small module that import browser fingerprints into the databose,
based on the data listed in https://huit.re/user-agent-json.
"""
import json
from django.core.management.base import BaseCommand
from django.db import models
from profiles.models import Keyword
def import_file(filename):
with open(filename, mode='r') as file:
data = json.load(file)
for _keyword in data["list"]:
keyword = Keyword(text=_keyword.get("keyword", ""))
keyword.save()
class Command(BaseCommand):
def handle(self, *args, **kwargs):
import_file("data/keywords.json")

@@ -1,27 +0,0 @@
""" Small module that import browser fingerprints into the databose,
based on the data listed in https://huit.re/user-agent-json.
"""
import json
from django.core.management.base import BaseCommand
from django.db import models
from profiles.models import Place
def import_file(filename):
with open(filename, mode='r') as file:
data = json.load(file)
for place in data:
import_place(place["place"])
def import_place(_place):
place = Place(
name=_place.get("name", ""),
address=_place.get("address", ""),
lat=float(_place.get("lat", 0)),
lon=float(_place.get("lon", 0))
)
place.save()
class Command(BaseCommand):
def handle(self, *args, **kwargs):
import_file("data/place.json")

@@ -1,27 +0,0 @@
""" Small module that import browser fingerprints into the databose,
based on the data listed in https://huit.re/user-agent-json.
"""
import json
from django.core.management.base import BaseCommand
from django.db import models
from profiles.models import SearchEngine
def import_file(filename):
with open(filename, mode='r') as file:
data = json.load(file)
for search_engine in data:
import_search_engine(search_engine["searchengine"])
def import_search_engine(engine):
search_engine = SearchEngine(
name=engine.get("name", ""),
url=engine.get("url", ""),
query_pattern=engine.get("query_pattern", "")
)
#print(search_engine)
search_engine.save()
class Command(BaseCommand):
def handle(self, *args, **kwargs):
import_file("data/search_engine.json")

@@ -1,46 +0,0 @@
""" Small module that import events into the database.
"""
import json
from datetime import datetime
from django.core.management.base import BaseCommand
from django.db import models
from profiles.models import Webpage, Website, Keyword
def import_file(filename):
with open(filename, mode='r') as file:
data = json.load(file)
for website in data:
import_website(website)
def import_website(_website):
keywords = []
webpages = []
for keyword in _website.get("keywords", []):
new_keyword = Keyword(
text=keyword.get("keyword", "")
)
new_keyword.save()
keywords.append(new_keyword)
for webpage in _website.get("notable_pages",[]):
new_webpage = Webpage(
url=webpage.get("webpage", "")
)
new_webpage.save()
webpages.append(new_webpage)
website = Website(
name=_website.get("name", ""),
url=_website.get("url", ""),
)
website.save()
for keyword in keywords:
website.keywords.add(keyword)
for webpage in webpages:
website.notable_pages.add(webpage)
print(website)
#website.save()
class Command(BaseCommand):
def handle(self, *args, **kwargs):
import_file("data/website.json")

@@ -1,46 +1,5 @@
"""
A django module that defines a profile, and all the information that can be
stored in a profile.
It stores interests, technical information such as the browser fingerprint,
the preferred search engine, and whether the user is likely to directly use urls
or to type queries into the search engine.
"""
import os
import random
from django.db import models
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
NICKNAMES = None
LASTNAMES = open(BASE_DIR + "/data/lastnames.txt").read().splitlines()
FIRSTNAMES = open(BASE_DIR + "/data/firstnames.txt").read().splitlines()
EMAIL_DOMAINS = open(BASE_DIR + "/data/email_domains.txt").read().splitlines()
def require_nicknames(fct):
def read_file(path):
global NICKNAMES
print("Trying {}".format(path))
with open(path, 'r') as handle:
NICKNAMES = handle.read().splitlines()
nicknames_files = [
os.path.join(BASE_DIR, 'data/nicknames_dict'),
"/usr/share/dict/american-english",
]
if NICKNAMES is None:
for nick_file in nicknames_files:
try:
read_file(nick_file)
break
except FileNotFoundError:
pass
if NICKNAMES is None:
raise FileNotFoundError
return fct
class InvalidData(Exception):
''' Thrown when the DB contains invalid data, and cannot perform
@@ -62,14 +21,8 @@ class Keyword(models.Model):
def __str__(self):
return self.text
def generate_url(self, user):
""" Generates the url for a keyword, based on the user search engine.
"""
return user.search_engine.search_url(self)
class Webpage(models.Model):
''' A webpage url '''
url = models.URLField()
def __str__(self):
@@ -87,22 +40,6 @@ class Website(models.Model):
def __str__(self):
return self.name
def generate_url(self, user):
""" Generates the url in case the interest chosen is a website.
"""
rand = random.random()
if user.uses_urls:
url = self.url
elif rand <= 0.1:
url = random.choice(self.notable_pages.all()).url
elif rand <= 0.8:
search_term_text = self.name + " " + \
str(random.choice(self.keywords.all()))
url = user.search_engine.search_url(search_term_text)
else:
url = user.search_engine.search_url(self.name)
return url
class Place(models.Model):
''' A real-life place '''
@@ -115,16 +52,6 @@ class Place(models.Model):
def __str__(self):
return self.name
def generate_url(self, user):
""" Generates the url for a place.
"""
rand = random.random()
if rand < 1/2:
url = user.search_engine.search_url(self.name)
else:
url = user.search_engine.search_url(self.address)
return url
class Event(models.Model):
''' A real-life event (protests, meeting, ...) '''
@@ -136,15 +63,6 @@ class Event(models.Model):
def __str__(self):
return self.name
def generate_url(self, user):
""" generate the url for an event object.
"""
possibilities = random.sample(
[self.name, self.date, self.place],
3
)
return user.search_engine.search_url(" ".join(possibilities))
class BrowserFingerprint(models.Model):
''' A browser fingerprint, containing things like a user agent '''
@@ -168,15 +86,6 @@ class BrowserFingerprint(models.Model):
def __str__(self):
return self.description
def serialize_headers(self):
return {
"Description": str(self.description),
"User-Agent": str(self.useragent),
"Accept-Encoding": str(self.accept_encoding),
"Accept": str(self.accept_default),
"Accept-Language": str(self.accept_lang),
}
class SearchEngine(models.Model):
''' A search engine, and all the data needed to use it '''
@@ -185,8 +94,8 @@ class SearchEngine(models.Model):
url = models.URLField()
query_pattern = models.CharField(max_length=256) # This field is the
# query pattern. It should contain a `{}`, which, when substituted with a
# search term (using `.format()`), must yield a URL tail that can be
# concatenated with `url` to perform a search (eg. `?q={}` for ddg).
# search term (using `.format()`), must yield a URL that can be resolved to
# perform the search
def __str__(self):
return self.name
@@ -194,10 +103,9 @@
def search_url(self, search_term):
''' Obtain a url to search `search_term` with this search engine '''
pattern = str(self.query_pattern)
search_term = str(search_term).replace(' ', '+')
if '{}' not in pattern:
raise InvalidData("Search engine {}: bad pattern".format(self))
return self.url + (str(self.query_pattern).format(search_term))
return str(self.query_pattern).format(search_term)
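With the Duckduckgo entry from data/search_engine.json above as an assumed example (url 'https://duckduckgo.com/', pattern '?q={}'), the two return lines differ as follows: one concatenates the engine url with the formatted pattern, the other returns the formatted pattern alone.

url, pattern = "https://duckduckgo.com/", "?q={}"   # values taken from data/search_engine.json
term = "gender queer Paris".replace(' ', '+')
print(url + pattern.format(term))   # https://duckduckgo.com/?q=gender+queer+Paris
print(pattern.format(term))         # ?q=gender+queer+Paris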
class Interest(models.Model):
@@ -231,35 +139,3 @@ class Profile(models.Model):
on_delete=models.CASCADE)
browser_fingerprint = models.ForeignKey(BrowserFingerprint,
on_delete=models.CASCADE)
def generate_email(nick, first_name, last_name):
domain = random.choice(EMAIL_DOMAINS)
if random.random() < 0.3:
email = first_name + "." + last_name + "@" + domain
else:
email = nick + "@" + domain
return email
@require_nicknames
def create_profile(nick=None):
nick = "".join(random.sample(NICKNAMES, random.randrange(2, 5)))
first_name = random.choice(FIRSTNAMES)
last_name = random.choice(LASTNAMES)
email = generate_email(nick, first_name, last_name)
profile = Profile(
nick=nick,
first_name=first_name,
last_name=last_name,
email=email,
uses_urls=(random.random() < 0.5),
)
profile.search_engine = random.choice(SearchEngine.objects.all())
profile.browser_fingerprint = random.choice(BrowserFingerprint.objects.all())
profile.full_clean()
profile.save()
profile.interests.add(random.choice(Interest.objects.all()))
profile.save()
return profile

@@ -1,5 +1,5 @@
aiodns==1.1.1
aiohttp==2.3.2
aiohttp==3.0.1
async-timeout==2.0.0
attrs==17.4.0
cchardet==2.1.1
@@ -12,8 +12,5 @@ pycares==2.3.0
pytz==2017.3
yarl==1.1.1
beautifulsoup4==4.6.0
stem==1.6.0
pycurl==7.43.0.1
rdflib==4.2.2
git+https://github.com/tobast/RDFSerializer.git
aiosocks==0.2.6
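The aiohttp bump from 2.3.2 to 3.0.1 lines up with the verify_ssl=ssl to ssl=ssl change in PageGetter.get above: aiohttp 3 folds the certificate options into a single ssl argument. A minimal sketch of the 3.x call style, with an illustrative URL:

import asyncio
import aiohttp

async def fetch_insecure(url):
    async with aiohttp.ClientSession() as session:
        # aiohttp >= 3.0: ssl=False skips certificate verification
        # (aiohttp 2.x spelled this verify_ssl=False)
        async with session.get(url, ssl=False) as resp:
            return await resp.text()

# asyncio.get_event_loop().run_until_complete(fetch_insecure('https://python.org/'))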