Compare commits


84 commits
rdf ... master

Author SHA1 Message Date
Rémi Oudin 89d1f8301a Remove duplicated url in history 2018-02-26 17:46:49 +01:00
Théophile Bastian 379b53e6ce Fix printing in gen_history 2018-02-26 17:25:04 +01:00
Théophile Bastian c94841c17b Add gen_history django-admin command 2018-02-26 17:25:04 +01:00
Rémi Oudin 97107d9bec Merge branch 'master' of git.tobast.fr:tobast/mpri-webdam 2018-02-26 17:12:26 +01:00
Rémi Oudin dedc66bb9d Bug fix 2018-02-26 17:12:19 +01:00
Théophile Bastian d3d04739e7 Add DuckDuckGo lite search engine to stock data
This search engine works better than the others
2018-02-26 17:10:18 +01:00
Théophile Bastian b88aeffd5a Helpful README 2018-02-26 17:09:05 +01:00
Rémi Oudin 7c8ec7351c Merge branch 'master' of git.tobast.fr:tobast/mpri-webdam 2018-02-26 17:04:09 +01:00
Théophile Bastian 2005c0f24f Add xml string gen 2018-02-26 17:03:27 +01:00
Rémi Oudin 392e16b797 Merge branch 'histories_models' 2018-02-26 17:03:27 +01:00
Théophile Bastian 185c1cf8a4 Fix XML generation 2018-02-26 17:00:53 +01:00
Rémi Oudin 9dd1954067 Partial runner fix 2018-02-26 17:00:53 +01:00
Rémi Oudin 04270e88c0 Bug fix 2018-02-26 17:00:12 +01:00
Théophile Bastian 6bc64ceb7a Add requirement for aiohttp 2018-02-26 16:38:16 +01:00
Rémi Oudin 15e0c2a11c Partial runner fix 2018-02-26 16:37:51 +01:00
Rémi Oudin 2b07779f5c Bug fix 2018-02-26 16:37:32 +01:00
Théophile Bastian 8cdc50c04e Fix stupid typo 2018-02-26 16:34:43 +01:00
Rémi Oudin 22fa039f1b Remove debug print 2018-02-26 16:23:14 +01:00
Théophile Bastian e4ad8c7ce6 Towards a working XML export 2018-02-26 15:58:30 +01:00
Théophile Bastian 67ad232533 Add a timeout to a single page retrieval 2018-02-26 15:42:36 +01:00
Théophile Bastian e140d4a8a7 Fix merge remanences 2018-02-26 15:37:05 +01:00
Théophile Bastian 98fe69ba62 Real async crawling 2018-02-26 15:30:38 +01:00
Théophile Bastian 968ff6d24c More robust crawling 2018-02-26 15:29:36 +01:00
Rémi Oudin 5d4bd30e20 Exception handling 2018-02-26 15:15:03 +01:00
Rémi Oudin bdfa285e6b We do not want to use settings 2018-02-26 15:14:53 +01:00
Rémi Oudin 65f777f00f Should get the objects and not the Manager 2018-02-26 15:04:26 +01:00
Rémi Oudin 236e40d359 Sanity check 2018-02-26 14:57:46 +01:00
Rémi Oudin 22017cea91 Typo in data u_u 2018-02-26 14:56:22 +01:00
Rémi Oudin 549c861908 Bug fixé 2018-02-26 14:38:26 +01:00
Rémi Oudin 517be1d822 Merge rdf branch 2018-02-26 14:11:06 +01:00
Rémi Oudin c4f63a92b2 Error in the merge, mea culpa 2018-02-26 14:01:29 +01:00
Rémi Oudin db067e56fc Typo 2018-02-26 13:59:34 +01:00
Rémi Oudin 33bdae96e4 merge commit from histories_tobast into histories_models 2018-02-26 12:59:38 +01:00
Rémi Oudin 526aad1364 Add interests 2018-02-26 12:33:23 +01:00
Théophile Bastian 02e91bb2b7 Fix function calls 2018-02-26 11:56:02 +01:00
Théophile Bastian 3e5fc2f9b3 Fix search engine URL generation 2018-02-26 11:49:24 +01:00
Théophile Bastian 45ddbff91a Crawling and histories: fix a lot of stuff 2018-02-26 11:49:24 +01:00
Théophile Bastian e6d587bffd Actually save to DB a created history 2018-02-26 11:49:24 +01:00
Théophile Bastian 8baf408e02 Use dict from data/nicknames_dict for nicknames 2018-02-26 11:49:24 +01:00
Théophile Bastian 6463e348ac Fix populate.sh exec path 2018-02-26 11:48:51 +01:00
Théophile Bastian 22064ebee3 Histories: xml import/export — untested
To be tested when history generation is available
2018-02-26 11:48:51 +01:00
Théophile Bastian a4de51b84a Crawl: do not use global SEARCH_ENGINES 2018-02-26 11:48:51 +01:00
Théophile Bastian 4f0148cb63 Crawler: use a random fingerprint 2018-02-26 11:48:51 +01:00
Théophile Bastian 4a8bd32516 Fix tor_runner import 2018-02-26 11:48:51 +01:00
Rémi Oudin 44cf26df8f It can be useful to save a new object 2018-02-26 11:42:45 +01:00
Rémi Oudin adb892ab7d Check if crawling a search engine 2018-02-26 11:12:36 +01:00
Rémi Oudin 15db8b4697 Change option name due to downgrade of aiohttp 2018-02-26 10:23:32 +01:00
Rémi Oudin d6b26c0a46 Better use of history 2018-02-26 10:05:33 +01:00
Rémi Oudin 8f5c4f3f0f Use datetimes 2018-02-26 09:49:24 +01:00
Rémi Oudin 71d9e18eec Add headers support 2018-02-25 23:56:51 +01:00
Rémi Oudin 8ad46c0481 Bug fix, syntax erro 2018-02-25 21:59:29 +01:00
Rémi Oudin f66c978466 Tor runner has a run function to replay the history 2018-02-25 21:53:28 +01:00
Rémi Oudin 0a676a2f65 PEP8 2018-02-25 21:34:20 +01:00
Rémi Oudin e074d96f02 tor_runner can make requests 2018-02-25 21:27:15 +01:00
Rémi Oudin 93b235cb6c Fix interests import 2018-02-25 21:20:52 +01:00
Rémi Oudin ae5699c089 Basic tor runner 2018-02-25 19:42:58 +01:00
Rémi Oudin f7313ff659 Add populate.sh script 2018-02-25 16:16:04 +01:00
Rémi Oudin 0661fe0f01 Fix path 2018-02-25 16:10:38 +01:00
Rémi Oudin 4b19febdf6 Add interests 2018-02-25 16:10:22 +01:00
Théophile Bastian 15323c3465 [REBASE ME] Crawl: enhance efficiency and output a tree 2018-02-25 15:08:06 +01:00
Rémi Oudin 05a2e2ca3f Partial generation of profiles 2018-02-25 13:18:12 +01:00
Rémi Oudin d4aefb6bb7 Load the data 2018-02-25 13:17:44 +01:00
Rémi Oudin 3eb82a4a0b data for names and emails 2018-02-25 13:17:27 +01:00
Rémi Oudin 7c0fb7dda1 Better naming 2018-02-25 11:49:44 +01:00
Rémi Oudin ee32e5385b Finished data import 2018-02-25 11:49:11 +01:00
Rémi Oudin bc7348f677 Integration of crawl module in histories 2018-02-24 23:17:24 +01:00
Rémi Oudin 60bfc8cb77 Merge branch 'crawl' into histories_models 2018-02-24 18:44:27 +01:00
Rémi Oudin 12c8c652d7 Serialisation function 2018-02-24 18:40:27 +01:00
Rémi Oudin c58f42476f Missing script for 854481d 2018-02-24 17:22:52 +01:00
Rémi Oudin 854481dbd3 Import utilities 2018-02-24 17:21:41 +01:00
Rémi Oudin d19c2e8216 Add mailto adresses to forbidden list 2018-02-24 15:41:46 +01:00
Rémi Oudin e56c088632 Better filter 2018-02-24 11:39:04 +01:00
Rémi Oudin f0b8672c89 Silly me. (bis) 2018-02-23 10:44:51 +01:00
Rémi Oudin f6da179820 If robots.txt file is invalid, abort mission. 2018-02-23 10:36:14 +01:00
Rémi Oudin 5decd205fb Typos + improvements 2018-02-22 11:06:45 +01:00
Rémi Oudin ad0ad0a783 Command to add browser fingerprint data 2018-02-21 16:50:27 +01:00
Rémi Oudin cd4d8a4c3f More generic code using @8f4458b 2018-02-21 11:50:28 +01:00
Rémi Oudin 8f4458b009 Url generation method, for more genericity 2018-02-21 11:37:44 +01:00
Rémi Oudin 5539f57139 Add missing docstrings 2018-02-21 11:35:53 +01:00
Rémi Oudin 4920de5838 Going on in the generation of history 2018-02-20 23:42:21 +01:00
Rémi Oudin 7c13ee17d4 Skeleton of history generation 2018-02-19 22:56:16 +01:00
Rémi Oudin 7f343d8ad8 Better formatting 2018-02-19 13:59:29 +01:00
Rémi Oudin 3b0fa27951 Add histories application to settings file 2018-02-19 13:59:29 +01:00
Rémi Oudin 60f09bd4d3 Add basic models for histories 2018-02-19 13:58:55 +01:00
30 changed files with 7779 additions and 83 deletions

View file

@ -1,3 +1,6 @@
# mpri-webdam
Generate plenty of fake browsing histories. Because this course has to be validated somehow.
Generate realistic fake browsing histories for borderline and/or activist
users, to hide real traffic from global surveillance.
Lacks proper documentation at the moment `:(`

View file

@ -4,7 +4,7 @@ from urllib.error import URLError
from urllib.parse import urlparse
from ssl import CertificateError
from random import sample, randrange
from random import sample, randrange, randint
import re
from datetime import datetime, timedelta
@ -14,6 +14,8 @@ import async_timeout
from bs4 import BeautifulSoup, Comment
from profiles.models import BrowserFingerprint, SearchEngine
# Ugly hack to use this module alone instead of integrating it with Django
# from django.conf import settings
@ -25,11 +27,11 @@ MAX_PER_PAGE = 10
FOOTER_URL = re.compile(".*footer.*")
class Settings:
USER_AGENT = 'Blah'
USER_AGENT = 'Default User'
settings = Settings()
startup_time = datetime.now()
def url_getter(html, current_page, root_url):
@ -60,7 +62,7 @@ def url_getter(html, current_page, root_url):
elif link.startswith('/'): #Internal link, linking to page root url
links_list.append(root_url + link)
elif link.startswith("#"):
print("Invalid link : internal bookmark")
continue
else:
links_list.append(current_page + "/" + link)
@ -71,11 +73,14 @@ def url_getter(html, current_page, root_url):
# Works only with python >= 3.6
links_list = list(dict.fromkeys(links_list))
forbidden_words = ['login', 'agreement', 'mailto', 'settings']
links_list = [link for link in links_list if not any(word in link.lower()
for word in
forbidden_words)]
return links_list
class WebsiteSchedulerMeta(type):
""" Meta-class for WebsiteScheduler, allowing a singleton class-like
interface, but spawning one instance per canonical website URL """
@ -98,34 +103,47 @@ class WebsiteSchedulerMeta(type):
class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
""" Schedule the accesses to a website as of robots.txt """
def __init__(self, name):
search_engines = [] # Must be set by CrawlingThread.__init__
def __init__(self, name, user_agent):
self.name = name
self.last_crawled = datetime.fromtimestamp(0)
self.dead = False
try:
robots_url = self.urlroot() + 'robots.txt'
self.robot_parser = RobotFileParser(robots_url)
self.robot_parser.read() # TODO async?
except (URLError, CertificateError):
self.can_fetch_b = False
self.user_agent = (user_agent if user_agent is not None
else settings.USER_AGENT)
if any(self.urlroot() in item for item in self.search_engines):
print("found a search engine for %s" % self.urlroot())
self.crawl_delay = timedelta(seconds=5)
self.can_fetch_b = True
else:
try:
robots_url = self.unsafe_urlroot() + 'robots.txt'
robots_url = self.urlroot() + 'robots.txt'
self.robot_parser = RobotFileParser(robots_url)
self.robot_parser.read()
except URLError: # Almost surely an offline website.
self.robot_parser.read() # TODO async?
except (URLError, CertificateError):
try:
robots_url = self.unsafe_urlroot() + 'robots.txt'
self.robot_parser = RobotFileParser(robots_url)
self.robot_parser.read()
except URLError: # Almost surely an offline website.
self.dead = True
self.crawl_delay = 0
except Exception as e:
print(e)
raise e
if not self.robot_parser.default_entry:
self.dead = True
self.crawl_delay = 0
except Exception as e:
print(e)
raise e
if not self.dead:
delay = self.robot_parser.crawl_delay(settings.USER_AGENT)
if delay is None:
req_rate = self.robot_parser.request_rate(settings.USER_AGENT)
if req_rate is None:
delay = 5
else:
delay = req_rate.requests, req_rate.seconds
self.crawl_delay = timedelta(seconds=delay)
if not self.dead:
delay = self.robot_parser.crawl_delay(self.user_agent)
if delay is None:
req_rate = self.robot_parser.request_rate(self.user_agent)
if req_rate is None:
delay = 5
else:
delay = req_rate.requests, req_rate.seconds
self.crawl_delay = timedelta(seconds=delay)
def urlroot(self):
''' Get the root url for this website '''
@ -143,7 +161,9 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
def can_fetch(self, url):
''' Check whether this program can fetch a given page '''
return (not self.dead) and self.robot_parser.can_fetch(settings.USER_AGENT, url)
return ((self.can_fetch_b)
or ((not self.dead) and
self.robot_parser.can_fetch(self.user_agent, url)))
def fetching(self):
''' Tell the scheduler that a page is being fetched now '''
@ -154,30 +174,47 @@ class CrawlingThread(Thread):
""" A separate thread for the crawling task. This is needed to use asyncio,
since the thread will need its own event loop. """
def __init__(self):
def __init__(self, url):
engine_list = [engine.url for engine in SearchEngine.objects.all()]
WebsiteScheduler.search_engines = engine_list
nb_fingerprint = len(BrowserFingerprint.objects.all())
fingerprint = BrowserFingerprint.objects.all()[
randint(0, nb_fingerprint - 1)]
self.headers = fingerprint.serialize_headers()
self.output_tree = []
super(CrawlingThread, self).__init__()
self.url = url
def run(self):
tasks = []
#tasks.append(async_crawler("http://plus.google.com/+Python"))
tasks.append(async_print('https://python.org/'))
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
loop.run_until_complete(asyncio.wait(tasks))
loop.close()
#tasks.append(async_crawler("http://plus.google.com/+Python"))
#tasks.append(async_crawler('https://python.org/'))
tasks.append(run_crawl(self.url, self.output_tree, self.headers))
try:
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
loop.run_until_complete(asyncio.wait(tasks))
finally:
loop.close()
class PageGetter:
""" Asynchronously get a webpage, abiding by robots.txt """
def __init__(self, session, url):
headers = None
def __init__(self, session, url, user_agent):
self.url = url
self.session = session
self.user_agent = user_agent
async def get(self, ssl=True):
""" Actually retrieve the webpage """
scheduler = WebsiteScheduler(self.url)
scheduler = WebsiteScheduler(self.url, self.user_agent)
if not scheduler.can_fetch(self.url):
return None
@ -187,7 +224,7 @@ class PageGetter:
delay = scheduler.fetch_delay()
scheduler.fetching()
async with async_timeout.timeout(10):
async with self.session.get(self.url, ssl=ssl) as resp:
async with self.session.get(self.url, verify_ssl=ssl) as resp:
try:
return await resp.text()
except UnicodeDecodeError:
@ -197,44 +234,89 @@ class PageGetter:
async def async_print(url):
""" Debug function to follow what's actually happening """
async with aiohttp.ClientSession() as session:
html = await PageGetter(session, url).get(ssl=False)
html = await PageGetter(session, url,
settings.USER_AGENT).get(ssl=False)
print('GOT {}HTML for {} at {}'.format(
print('GOT {}HTML for {}'.format(
'None ' if html is None else '',
url,
datetime.now() - startup_time))
))
async def async_crawler(url):
queue = [url]
crawled = []
while queue or (len(crawled) < HARD_LIMIT):
async with aiohttp.ClientSession() as session:
try:
url = queue.pop(0)
except IndexError:
print("Error queue is empty")
return crawled
parsed_url = urlparse(url)
print("Crawling {}".format(url))
html = await PageGetter(session, url).get(ssl=False)
if html:
new_urls = url_getter(
html,
url,
parsed_url.scheme + "://" + parsed_url.netloc
)
crawled += [url]
if new_urls:
sampled = sample(
new_urls,
randrange(min(MAX_PER_PAGE, len(new_urls)))
)
queue += [sample_url for sample_url in sampled if
sample_url not in queue and sample_url not in
crawled]
print(crawled)
if __name__ == '__main__':
crawl = CrawlingThread()
crawl.start()
crawl.join()
class CrawlElem:
''' Describes a crawled element, to be assembled into a tree '''
def __init__(self, url, parent):
self.url = url
self.parent = parent
async def run_crawl(url, output_tree, headers=None):
''' Starts a crawling session '''
if headers is None:
headers = {}
if 'User-Agent' not in headers:
headers['User-Agent'] = settings.USER_AGENT
user_agent = headers['User-Agent']
crawled = set()
async with aiohttp.ClientSession(headers=headers) as session:
await async_crawler(
url, output_tree, crawled, user_agent, session, None)
def simplify_url(url):
anchor = url.find('#')
if anchor >= 0:
url = url[:anchor]
prot = url.find('://')
if prot >= 0:
url = url[prot+3:]
if url.startswith('www.'):
url = url[4:]
return url
async def async_crawler(url, out_tree, crawled, user_agent, session, parent):
if len(crawled) >= HARD_LIMIT:
return
crawled.add(simplify_url(url))
parsed_url = urlparse(url)
print("Crawling {}".format(url))
try:
with async_timeout.timeout(3):
html = await PageGetter(session, url, user_agent).get(ssl=False)
except asyncio.TimeoutError:
return
new_tasks = []
if html:
this_elem = CrawlElem(url, parent)
out_tree.append(this_elem)
new_urls = url_getter(
html,
url,
parsed_url.scheme + "://" + parsed_url.netloc
)
if new_urls:
sampled = sample(
new_urls,
randrange(min(MAX_PER_PAGE, len(new_urls)))
)
for sample_url in sampled:
if simplify_url(sample_url) not in crawled:
new_tasks.append(async_crawler(
sample_url, out_tree, crawled, user_agent, session,
this_elem))
else:
print("No html received")
if len(crawled) >= HARD_LIMIT:
return
if new_tasks:
await asyncio.wait(new_tasks)
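
For context, a minimal sketch of how the reworked crawler is meant to be driven, mirroring its use in histories/models.py further down in this diff. It assumes a configured Django project with the profiles data already imported; the seed URL is only illustrative.

```python
# Minimal usage sketch of the new CrawlingThread API (assumes a configured
# Django project, since __init__ reads SearchEngine/BrowserFingerprint rows).
from crawl import crawl

crawler = crawl.CrawlingThread("https://python.org/")  # illustrative seed URL
crawler.start()
crawler.join()

# output_tree is a flat list of CrawlElem(url, parent) nodes.
for elem in crawler.output_tree:
    parent_url = elem.parent.url if elem.parent is not None else None
    print(elem.url, "<-", parent_url)
```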

1
data/.gitignore vendored Normal file
View file

@ -0,0 +1 @@
nicknames_dict

14
data/email_domains.txt Normal file
View file

@ -0,0 +1,14 @@
gmail.com
protonmail.com
riseup.net
tutanota.com
outlook.fr
fastmail.com
yandex.com
aim.com
icloud.com
yahoo.com
fmx.fr
mail.com
hushmail.com
inbox.com

27
data/events.json Normal file
View file

@ -0,0 +1,27 @@
[
{
"name" : "Atelier Anti-Pub",
"date" : "07/03/2018 19:00 UTC",
"place" : {
"name" : "Centre Social Autogéré Vaydom",
"address" : "37 rue Marceau, Ivry-sur-Seine",
"lat" : "48.81787",
"lon" : "2.38032"
}
},
{
"name" : "Rassemblement de soutien pour Bure",
"date" : "27/02/2018 17:00 UTC",
"place" : {
"name" : "Place Saint-Michel",
"address" : "Place Saint-Michel, 75005 Paris",
"lat" : "48.85374",
"lon" : "2.34455"
}
},
{
"name" : "Création d'un serveur mail",
"date" : "15/02/2018 12:00 UTC",
"place" : "La Mutinerie"
}
]

200
data/firstnames.txt Normal file
View file

@ -0,0 +1,200 @@
Jean
Marie
Philippe
Nathalie
Michel
Isabelle
Alain
Sylvie
Patrick
Catherine
Nicolas
Martine
Christophe
Christine
Pierre
Françoise
Christian
Valérie
Éric
Sandrine
Frédéric
Stéphanie
Laurent
Véronique
Stéphane
Sophie
David
Céline
Pascal
Chantal
Daniel
Patricia
Alexandre
Anne
Julien
Brigitte
Thierry
Julie
Olivier
Monique
Bernard
Aurélie
Thomas
Nicole
Sébastien
Laurence
Gérard
Annie
Didier
Émilie
Dominique
Dominique
Vincent
Virginie
François
Corinne
Bruno
Élodie
Guillaume
Christelle
Jérôme
Camille
Jacques
Caroline
Marc
Léa
Maxime
Sarah
Romain
Florence
Claude
Laetitia
Antoine
Audrey
Franck
Hélène
Jean-Pierre
Laura
Anthony
Manon
Kévin
Michèle
Gilles
Cécile
Cédric
Christiane
Serge
Béatrice
André
Claire
Mathieu
Nadine
Benjamin
Delphine
Patrice
Pauline
Fabrice
Karine
Joël
Mélanie
Jérémy
Marion
Clément
Chloe
Arnaud
Jacqueline
Denis
Elisabeth
Paul
Evelyne
Lucas
Marine
Hervé
Claudine
Jean-Claude
Anais
Sylvain
Lucie
Yves
Danielle
Ludovic
Carole
Guy
Fabienne
Florian
Mathilde
Damien
Sandra
Alexis
Pascale
Mickaël
Annick
Quentin
Charlotte
Emmanuel
Emma
Louis
Severine
Benoît
Sabrina
Jean-Luc
Amandine
Fabien
Myriam
Francis
Jocelyne
Hugo
Alexandra
Jonathan
Angelique
Loïc
Josiane
Xavier
Joelle
Théo
Agnes
Adrien
Mireille
Raphaël
Vanessa
Jean-François
Justine
Grégory
Sonia
Robert
Bernadette
Michaël
Emmanuelle
Valentin
Oceane
Cyril
Amelie
Jean-Marc
Clara
René
Maryse
Lionel
Anne-marie
Yannick
Fanny
Enzo
Magali
Yannis
Marie-christine
Jean-Michel
Morgane
Baptiste
Ines
Matthieu
Nadia
Rémi
Muriel
Georges
Jessica
Aurélien
Laure
Nathan
Genevieve
Jean-Paul
Estelle

55
data/interests.json Normal file
View file

@ -0,0 +1,55 @@
[
{
"name": "occupation",
"keywords": [
{"keyword" : "ZAD NDDL"},
{"keyword" : "Organiser un squat"},
{"keyword" : "mobilisation et rassemblement"}
],
"places": [
{"place" : "Zad NDDL"},
{"place" : "Zad Bure"}
],
"websites": [
{"website": "zad nadir"}
],
"events": [
{"event": "Rassemblement de soutien pour Bure"}
]
},
{
"name": "LGBT",
"keywords": [
{"keyword" : "Discrimniation sexistes, quelles actions ?"},
{"keyword" : "gender queer Paris"},
{"keyword" : "Existrans Paris"}
],
"places": [
{"place" : "La Mutinerie"}
],
"websites": [
{"website": "emmaclit"},
{"website": "paris-luttes info"}
],
"events": [
{"event": "Création d'un serveur mail"}
]
},
{
"name": "Anti pub",
"keywords": [
{"keyword" : "Affichage JCDecaux"},
{"keyword" : "Anti-pub"},
{"keyword" : "Journée contre la publicité"}
],
"places": [
{"place" : "Centre Social Autogéré Vaydom"}
],
"websites": [
{"website": "paris-luttes info"}
],
"events": [
{"event": "Atelier Anti-Pub"}
]
}
]

17
data/keywords.json Normal file
View file

@ -0,0 +1,17 @@
{
"list": [
{ "keyword" : "gender queer Paris"},
{"keyword" : "fabriquer masque manif"},
{"keyword" : "Se protéger en manif"},
{"keyword" : "Legal team manif France"},
{"keyword" : "Guide juridique GAV"},
{"keyword" : "Échec du capitaisme"},
{"keyword" : "Bienfait du communisme"},
{"keyword" : "Le comité invisible"},
{"keyword" : "À nos enfants"},
{"keyword" : "Squats sur Paris"},
{"keyword" : "Local facho à Strasbourg"},
{"keyword" : "Discrimation sexistes, quelles actions ?"},
{"keyword" : "Pourquoi la lutte des classes"}
]
}

200
data/lastnames.txt Normal file
View file

@ -0,0 +1,200 @@
Martin
Bernard
Thomas
Petit
Robert
Richard
Durand
Dubois
Moreau
Laurent
Simon
Michel
Lefebvre
Leroy
Roux
David
Bertrand
Morel
Fournier
Girard
Bonnet
Dupont
Lambert
Fontaine
Rousseau
Vincent
Muller
Lefevre
Faure
Andre
Mercier
Blanc
Guerin
Boyer
Garnier
Chevalier
Francois
Legrand
Gauthier
Garcia
Perrin
Robin
Clement
Morin
Nicolas
Henry
Roussel
Mathieu
Gautier
Masson
Marchand
Duval
Denis
Dumont
Marie
Lemaire
Noel
Meyer
Dufour
Meunier
Brun
Blanchard
Giraud
Joly
Riviere
Lucas
Brunet
Gaillard
Barbier
Arnaud
Martinez
Gerard
Roche
Renard
Schmitt
Roy
Leroux
Colin
Vidal
Caron
Picard
Roger
Fabre
Aubert
Lemoine
Renaud
Dumas
Lacroix
Olivier
Philippe
Bourgeois
Pierre
Benoit
Rey
Leclerc
Payet
Rolland
Leclercq
Guillaume
Lecomte
Lopez
Jean
Dupuy
Guillot
Hubert
Berger
Carpentier
Sanchez
Dupuis
Moulin
Louis
Deschamps
Huet
Vasseur
Perez
Boucher
Fleury
Royer
Klein
Jacquet
Adam
Paris
Poirier
Marty
Aubry
Guyot
Carre
Charles
Renault
Charpentier
Menard
Maillard
Baron
Bertin
Bailly
Herve
Schneider
Fernandez
Le Gall
Collet
Leger
Bouvier
Julien
Prevost
Millet
Perrot
Daniel
Le Roux
Cousin
Germain
Breton
Besson
Langlois
Remy
Le Goff
Pelletier
Leveque
Perrier
Leblanc
Barre
Lebrun
Marchal
Weber
Mallet
Hamon
Boulanger
Jacob
Monnier
Michaud
Rodriguez
Guichard
Gillet
Etienne
Grondin
Poulain
Tessier
Chevallier
Collin
Chauvin
Da Silva
Bouchet
Gay
Lemaitre
Benard
Marechal
Humbert
Reynaud
Antoine
Hoarau
Perret
Barthelemy
Cordier
Pichon
Lejeune
Gilbert
Lamy
Delaunay
Pasquier
Carlier
Laporte

26
data/place.json Normal file
View file

@ -0,0 +1,26 @@
[
{
"place" : {
"name" : "Zad NDDL",
"address" : "Notre-Dame-des-landes, 44111",
"lat" : "47.3435",
"lon": "-1.7367"
}
},
{
"place" : {
"name" : "La Mutinerie",
"address" : "176 - 178 rue Saint Martin, 75003 Paris",
"lat" : "48.8625665",
"lon": "2.3522237"
}
},
{
"place" : {
"name" : "Zad Bure",
"address" : "2 rue de l'Église, 55290 Bure",
"lat" : "48.502",
"lon": "5.351"
}
}
]

44
data/search_engine.json Normal file
View file

@ -0,0 +1,44 @@
[
{
"searchengine": {
"name":"Google",
"url":"https://google.com/",
"query_pattern": "search?q={}"
}
},
{
"searchengine": {
"name":"Duckduckgo",
"url":"https://duckduckgo.com/",
"query_pattern":"?q={}"
}
},
{
"searchengine": {
"name":"Duckduckgo Lite",
"url":"https://duckduckgo.com/lite/",
"query_pattern":"?q={}"
}
},
{
"searchengine": {
"name":"Qwant",
"url":"https://www.qwant.com/",
"query_pattern":"?q={}"
}
},
{
"searchengine": {
"name":"Qwant lite",
"url":"https://lite.qwant.com/",
"query_pattern":"?q={}"
}
},
{
"searchengine": {
"name":"Framabee",
"url":"https://framabee.org/",
"query_pattern":"?q={}"
}
}
]
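
As a sanity check, here is how one of these entries is composed into a full query URL by the SearchEngine.search_url() change further down in this diff — a small sketch, with the values taken from the JSON above and data/keywords.json.

```python
# Sketch: compose a query URL the way the new SearchEngine.search_url() does.
url = "https://duckduckgo.com/lite/"
query_pattern = "?q={}"
search_term = "gender queer Paris".replace(' ', '+')

print(url + query_pattern.format(search_term))
# -> https://duckduckgo.com/lite/?q=gender+queer+Paris
```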

6094
data/user-agent.json Normal file

File diff suppressed because it is too large

93
data/website.json Normal file
View file

@ -0,0 +1,93 @@
[
{
"name":"emmaclit",
"url":"https://emmaclit.com/",
"keywords": [
{"keyword":"Charge mentale"},
{"keyword":"Un autre regard"},
{"keyword":"Un petit poutou"},
{"keyword":"solidarité"},
{"keyword":"dédicace"}
],
"notable_pages": [
{"webpage": "https://emmaclit.com/2017/05/09/repartition-des-taches-hommes-femmes/"},
{"webpage": "https://emmaclit.com/2016/12/01/une-famille-parmi-dautres/"},
{"webpage": "https://emmaclit.com/2017/09/11/travaille-pourquoi/"}
]
},
{
"name":"paris-luttes info",
"url":"https://paris-luttes.info/",
"keywords": [
{"keyword":"manifestations"},
{"keyword":"solidarité immigré·e·s"},
{"keyword":"grève salariés"},
{"keyword":"prison"},
{"keyword":"violence policère"}
],
"notable_pages": [
{"webpage": "https://paris-luttes.info/-analyse-et-reflexion-?lang=fr"},
{"webpage": "https://paris-luttes.info/comment-publier-sur-paris-luttes-134?lang=fr"},
{"webpage": "https://paris-luttes.info/pourquoi-et-comment-utiliser-tor-9013?lang=fr"}
]
},
{
"name":"zad nadir",
"url":"http://zad.nadir.org/",
"keywords": [
{"keyword":"Écologie"},
{"keyword":"opération césar"},
{"keyword":"expulsion vinci"},
{"keyword":"adresse"},
{"keyword":"la wardine"},
{"keyword":"route des chicanes"},
{"keyword":"opposition à l'aéroport Grand Ouest"}
],
"notable_pages": [
{"webpage": "http://zad.nadir.org/spip.php?article86&lang=fr"},
{"webpage": "http://zad.nadir.org/spip.php?article515&lang=fr"},
{"webpage": "http://zad.nadir.org/spip.php?rubrique71"},
{"webpage": "https://zad.nadir.org/spip.php?rubrique70"}
]
},
{
"name":"Fnac",
"url":"https://www.fnac.com/",
"keywords": [
{"keyword":"smartphone"},
{"keyword":"SAV"},
{"keyword":"Macbook"},
{"keyword":"TV"},
{"keyword":"PC Gaming"},
{"keyword":"DVD"},
{"keyword":"Home Cinema Philips"},
{"keyword":"Billeterie"}
],
"notable_pages": [
{"webpage": "https://www.fnac.com/Informatique/shi48966/w-4#bl=MMinfo"},
{"webpage": "https://www.fnac.com/Service/default.aspx#bl=footer"},
{"webpage": "https://www.fnac.com/Ventes-Flash/shi42245/w-4#bl=marktlink1"},
{"webpage": "https://www.fnac.com/Home-cinema-barre-de-son-et-enceinte-TV/shi474916/w-4#bl=MMtvh"}
]
},
{
"name":"Sea Shepherd",
"url":"https://www.seashepherd.fr/",
"keywords": [
{"keyword":"pirates"},
{"keyword":"Phoques"},
{"keyword":"Paul Watson"},
{"keyword":"harponnage"},
{"keyword":"seal"},
{"keyword":"Chasse aux dauphins"},
{"keyword":"participation"},
{"keyword":"boutique"}
],
"notable_pages": [
{"webpage": "http://www.seashepherd.fr/index.php/qui-sommes-nous"},
{"webpage": "http://nyamba.seashepherd.info/"},
{"webpage": "http://seashepherd-shop.com/en/"},
{"webpage": "http://seashepherd.fr/index.php/qui-sommes-nous/sea-shepherd-france"}
]
}
]

View file

@ -0,0 +1,16 @@
from django.core.management.base import BaseCommand
from profiles import models as profiles
from histories.models import generate_history
from datetime import datetime
class Command(BaseCommand):
''' Generates a history and prints the related XML '''
def add_arguments(self, parser):
pass
def handle(self, *args, **kwargs):
prof = profiles.Profile.objects.all()[0]
history = generate_history(prof, datetime.now())
print(history.to_xml_string())
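
This command is meant to be run like the other management commands listed in populate.sh below, i.e. `python3 manage.py gen_history`. The equivalent programmatic call (e.g. from a test) would be a sketch along these lines:

```python
# Hedged sketch: invoke the command programmatically (assumes a populated
# database, e.g. after running populate.sh, so at least one Profile exists).
from django.core.management import call_command

call_command("gen_history")  # generates a history and prints its XML
```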

View file

@ -0,0 +1,34 @@
# Generated by Django 2.0.1 on 2018-02-25 19:08
from django.db import migrations, models
import django.db.models.deletion
class Migration(migrations.Migration):
initial = True
dependencies = [
('profiles', '0001_initial'),
]
operations = [
migrations.CreateModel(
name='History',
fields=[
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('start_ts', models.DateTimeField(help_text='The starting timestamp of the history. Useful for cron-like structure.')),
('played', models.BooleanField(default=False)),
('user', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='profiles.Profile')),
],
),
migrations.CreateModel(
name='HistoryEntry',
fields=[
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('search', models.URLField(help_text='The url to be searched')),
('timestamp', models.DateTimeField()),
('history', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='histories.History')),
],
),
]

View file

@ -1,3 +1,276 @@
from django.db import models
""" Models for the history. This history should be able to generate history
entries, which looks like human-based browsing, according to a dedicated user
interests, keywords...
"""
# Create your models here.
from collections import namedtuple
import random
import asyncio
from math import floor
from xml.etree import ElementTree as ET
from datetime import datetime
from django.db import models
from django.core.exceptions import ValidationError
import profiles.models as profiles
from crawl import crawl
from pinocchio.settings import HISTORY_MIN
from .tor_runner import TorInstance
class InvalidXml(Exception):
def __init__(self, what='unexpected XML data.'):
super().__init__()
self.what = what
def __str__(self):
return "Invalid XML: " + self.what
class HistoryEntry(models.Model):
""" A history entry, aka a url, and a timestamp.
"""
search = models.URLField(help_text="The url to be searched")
timestamp = models.DateTimeField()
history = models.ForeignKey(
'History',
on_delete=models.CASCADE
)
def __str__(self):
""" Returns the string representation of a history entry.
"""
return "{} : {}".format(self.timestamp, self.search)
def to_xml(self, xml_root):
entry = ET.Element('history')
entry_url = ET.Element('url')
entry_url.text = str(self.search)
entry_ts = ET.Element('timestamp')
entry_ts.text = str(self.timestamp.timestamp())
entry.append(entry_url)
entry.append(entry_ts)
xml_root.append(entry)
@staticmethod
def from_xml(xml_root, in_history):
if xml_root.tag != 'history':
raise InvalidXml("expected <history> tag here.")
url, timestamp = None, None
for child in xml_root:
if child.tag == 'url':
url = child.text
elif child.tag == 'timestamp':
try:
timestamp = datetime.fromtimestamp(float(child.text))
except (TypeError, ValueError):
raise InvalidXml("invalid timestamp {}".format(child.text))
else:
raise InvalidXml("unknown tag {} as child of <history>".format(
child.tag))
output = HistoryEntry()
output.search = url
output.timestamp = timestamp
output.history = in_history
return output
class History(models.Model):
""" A history for a user, containing some web connections (http, https).
Each history is timed so as to mimic human behaviour. """
start_ts = models.DateTimeField(
help_text=('The starting timestamp of the history. Useful for '
'cron-like structure.')
)
played = models.BooleanField(default=False)
user = models.ForeignKey(
profiles.Profile,
on_delete=models.CASCADE
)
def return_history(self):
""" Returns the history, sorted by increasing timestamps
"""
output_history = self.historyentry_set.order_by('timestamp')
output_history = [(item.search, item.timestamp.date())
for item in output_history]
return output_history
def __str__(self):
""" Returns the string representation of a history.
"""
entries = self.historyentry_set.order_by('timestamp')
output = "[History]:\n"
for entry in entries:
output += str(entry) + '\n'
return output
async def _handler(self):
runner = await TorInstance.create(self.return_history(), self.user.browser_fingerprint.serialize_headers())
await runner.run()
self.played = True
self.save()
def play_histories(self):
""" Actually plays the history.
"""
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
loop.run_until_complete(asyncio.wait([self._handler()]))
def to_xml(self, xml_root=None):
''' Exports the current history to xml '''
standalone = False
if xml_root is None:
standalone = True
xml_root = ET.Element('root')
hist_node = ET.Element("history", attrib={
'start-ts': str(self.start_ts),
'played': '1' if self.played else '0',
'user': str(self.user.pk),
})
xml_root.append(hist_node)
for entry in self.historyentry_set.all():
entry.to_xml(hist_node)
if standalone:
return xml_root
def to_xml_string(self):
xml = self.to_xml()
return ET.tostring(xml)
@staticmethod
def from_xml(xml_root):
''' Loads a history from an XML tree '''
REQUIRED_ATTR = ['start-ts', 'played', 'user']
if xml_root.tag != 'history':
raise InvalidXml('unexpected node {} as root of an history'.format(
xml_root.tag))
for attr in REQUIRED_ATTR:
if attr not in xml_root.attrib:
raise InvalidXml(('missing attribute "{}" for tag of type '
'history').format(attr))
start_ts = xml_root.attrib['start-ts']
played = xml_root.attrib['played']
user_pk = xml_root.attrib['user']
users = profiles.Profile.objects.filter(pk=user_pk)
if len(users) != 1:
raise InvalidXml('primary key for Profile {} is invalid'.format(
user_pk))
output = History()
output.start_ts = start_ts
output.played = int(played) > 0
output.user = users[0]
for child in xml_root:
HistoryEntry.from_xml(child, output)
return output
PartialHistoryEntry = namedtuple('PartialHistoryEntry',
['url', 'timestamp'])
def generate_partial_history(user, t_start):
""" Generate the part of the history resulting from the crawl starting at
the given url.
"""
timestamp = t_start
result = []
basis = generate_first_url(user)
t_start += 5 * random.weibullvariate(1, 1.5)
crawler = crawl.CrawlingThread(basis)
crawler.start()
crawler.join()
urls_tree = crawler.output_tree
open_time = {}
for elem in urls_tree:
url, parent = elem.url, elem.parent
timestamp = 0
if parent is None:
timestamp = t_start
else:
timestamp = open_time[parent] + 5 * random.weibullvariate(1, 1.5)
open_time[elem] = timestamp
result.append(PartialHistoryEntry(url, timestamp))
return result
def generate_first_url(user):
""" Generate the first url of a partial history, based on the user
information. """
def nonempty(seq):
out = []
for elt in seq:
if elt:
out.append(elt)
return out
all_keywords = profiles.Keyword.objects.filter(
interest__profile__in=[user])
all_websites = profiles.Website.objects.filter(
interest__profile__in=[user])
all_places = profiles.Place.objects.filter(
interest__profile__in=[user])
all_events = profiles.Event.objects.filter(
interest__profile__in=[user])
interest = random.choice(nonempty([
all_keywords,
all_websites,
all_places,
all_events,
]))
search_term = random.choice(interest)
url = search_term.generate_url(user)
return url
def generate_history(user, start_time):
""" Generate a new history for the user `user`, starting from timestamp
`start_time`.
A few heuristics are used in order to give the impression that the history
is actually played by a user.
"""
# let's define a new history object.
history = History(start_ts=start_time, user=user)
length = HISTORY_MIN + floor(10 * random.weibullvariate(1, 1.5))
history.full_clean()
history.save()
current_timestamp = start_time.timestamp()
hist_size = 0
while hist_size < length:
current_timestamp += 5 * random.weibullvariate(1, 2.8)
history_list = generate_partial_history(user, current_timestamp)
current_timestamp = \
history_list[-1].timestamp + 5 * random.weibullvariate(1, 5)
for (url, timestamp) in history_list:
if len(url) < 200:
new_line = HistoryEntry(
search=url,
timestamp=datetime.fromtimestamp(timestamp),
history=history
)
try:
new_line.full_clean()
new_line.save()
hist_size += 1
except ValidationError:
continue
return history
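
Putting the pieces of this module together, a rough end-to-end sketch (assuming at least one Profile exists, e.g. one created via profiles.models.create_profile() shown later in this diff):

```python
# Rough usage sketch of the history API defined above.
from datetime import datetime

from profiles.models import Profile
from histories.models import generate_history

profile = Profile.objects.first()
history = generate_history(profile, datetime.now())

print(history.to_xml_string())  # XML export of the generated entries
# history.play_histories()      # would replay the history through Tor
```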

123
histories/tor_runner.py Normal file
View file

@ -0,0 +1,123 @@
"""
Module that handles Tor instance creation in order to safely run histories
"""
import shutil
import datetime as dt
from time import sleep
import asyncio
import aiohttp
from aiosocks.connector import ProxyConnector, ProxyClientRequest
import async_timeout
import stem.process as tor
class TorInstance():
"""
A tor instance object, with some useful information.
It is designed to be used as a worker in order to replay a history.
"""
BASE_SOCKS_PORT = 40000
BASE_CONTROL_PORT = 20000
BASE_DATA_DIR = "/tmp/tor{}/"
TOR_RUNNER = 0
@classmethod
async def create(cls, history, headers):
""" Factory creation of tor processes"""
socks_port = cls.BASE_SOCKS_PORT + cls.TOR_RUNNER
control_port = cls.BASE_CONTROL_PORT + cls.TOR_RUNNER
data_dir = cls.BASE_DATA_DIR.format(cls.TOR_RUNNER)
TorInstance.TOR_RUNNER += 1
self = TorInstance()
self.socks_port = socks_port
self.control_port = control_port
self.data_dir = data_dir
self.history = history
self.headers = headers
self.proxy = "socks5://127.0.0.1:{}".format(self.socks_port)
self.create_session()
self.process = tor.launch_tor_with_config(
config={
'ControlPort' : str(control_port),
'SocksPort' : str(socks_port),
'DataDir' : data_dir
}
)
return self
def __init__(self):
self.socks_port = 0
self.control_port = 0
self.data_dir = ""
self.history = None
self.proxy = ""
self.headers = {}
self.session = None
self.process = None
async def run(self):
""" Runs the Tor Instance on the history.
"""
while (self.history) and (dt.datetime.combine(self.history[0][1],
dt.datetime.min.time()) -
dt.datetime.now()).total_seconds() >= 10:
print("Sleeping")
sleep(10)
while self.history:
item = self.history.pop(0)
async with async_timeout.timeout(30):
await(self.query(item[0]))
now = dt.datetime.now()
if not self.history:  # last entry was just replayed, nothing left to pace
break
print(self.history[0])
if now <= dt.datetime.combine(self.history[0][1], dt.datetime.min.time()):
sleep((dt.datetime.combine(self.history[0][1], dt.datetime.min.time()) - now).total_seconds())
def create_session(self):
""" Create a aiohttp session.
"""
conn = ProxyConnector(remote_resolve=True)
self.session = aiohttp.ClientSession(
connector=conn,
headers=self.headers,
request_class=ProxyClientRequest
)
async def query(self, url):
""" Performs a query.
"""
async with async_timeout.timeout(30):
async with self.session.get(
url,
proxy=self.proxy,
proxy_auth=None) as resp:
try:
return await resp.text()
except UnicodeDecodeError:
return None
def __str__(self):
""" Utility function """
return ('[TOR] SOCKSPort: {0.socks_port}, ControlPort: '
'{0.control_port}, DataDir: {0.data_dir}'.format(self))
async def kill(self):
""" Kills the process and remove the data dir"""
self.process.kill()
self.session.close()
shutil.rmtree(self.data_dir)
async def main():
""" Test function """
for _ in range(3):
instance = await TorInstance.create(None, {"user-agent" : "Blah"})
await instance.query("https://python.org/")
print("One page received")
await instance.kill()
if __name__ == "__main__":
LOOP = asyncio.get_event_loop()
LOOP.run_until_complete(main())

View file

@ -13,6 +13,9 @@ https://docs.djangoproject.com/en/2.0/ref/settings/
import os
from .settings_local import BASE_DIR, DEBUG, SECRET_KEY, DATABASES
HISTORY_MIN = 25
ALLOWED_HOSTS = []
@ -26,6 +29,7 @@ INSTALLED_APPS = [
'django.contrib.messages',
'django.contrib.staticfiles',
'profiles',
'histories',
'crawl',
]
@ -93,7 +97,7 @@ USE_I18N = True
USE_L10N = True
USE_TZ = True
USE_TZ = False # We don't really care, we want POSIX timestamps
# Static files (CSS, JavaScript, Images)

10
populate.sh Normal file
View file

@ -0,0 +1,10 @@
#!/bin/bash
# -*- coding: UTF8 -*-
python3 manage.py import_browser_fp
python3 manage.py import_search_engine
python3 manage.py import_keywords
python3 manage.py import_website
python3 manage.py import_places
python3 manage.py import_events
python3 manage.py import_interests

View file

View file

View file

@ -0,0 +1,41 @@
""" Small module that import browser fingerprints into the databose,
based on the data listed in https://huit.re/user-agent-json.
"""
import json
from django.core.management.base import BaseCommand
from django.db import models
from profiles.models import BrowserFingerprint
def import_file(filename):
with open(filename, mode='r') as file:
data = json.load(file)
data = data[0]["list"]
for os_agent in data:
for useragent in os_agent["useragents"]:
import_useragent(useragent)
def import_useragent(useragent):
fingerprint = BrowserFingerprint(
description=useragent.get("description", ""),
useragent=useragent.get("useragent", ""),
appname=useragent.get("appname", ""),
appversion=useragent.get("appversion", ""),
platform=useragent.get("appversion", ""),
vendor=useragent.get("vendor", ""),
vendorsub=useragent.get("vendorsub", ""),
buildID=useragent.get("buildID", ""),
oscpu=useragent.get("oscpu", ""),
accept_encoding=useragent.get("accept_encoding", ""),
accept_default=useragent.get("accept_default", ""),
accept_lang=useragent.get("accept_lang", ""),
pixeldepth=int(useragent.get("pixeldepth", 0)),
colordepth=int(useragent.get("colordepth", 0)),
screens=useragent.get("screen", ""),
)
print(fingerprint)
fingerprint.save()
class Command(BaseCommand):
def handle(self, *args, **kwargs):
import_file("data/user-agent.json")

View file

@ -0,0 +1,41 @@
""" Small module that import events into the database.
"""
import json
from datetime import datetime
from django.core.management.base import BaseCommand
from django.db import models
from profiles.models import Place, Event
def import_file(filename):
with open(filename, mode='r') as file:
data = json.load(file)
for event in data:
import_event(event)
def import_place(_place):
place = Place(
name=_place.get("name", ""),
address=_place.get("address", ""),
lat=float(_place.get("lat", 0)),
lon=float(_place.get("lon", 0))
)
place.save()
return place
def import_event(_event):
if isinstance(_event["place"], str):
place = Place.objects.get(name=_event["place"])
else:
place = import_place(_event["place"])
event = Event(
name=_event.get("name", ""),
date=datetime.strptime(_event.get("date", "01/01/1970 00:00 UTC"), "%d/%m/%Y %H:%M %Z"),
place=place
)
#print(event)
event.save()
class Command(BaseCommand):
def handle(self, *args, **kwargs):
import_file("data/events.json")

View file

@ -0,0 +1,51 @@
""" Small module that import interests into the database.
"""
import json
from datetime import datetime
from django.core.management.base import BaseCommand
from django.db import models
from django.core.exceptions import ObjectDoesNotExist
from profiles.models import Keyword, Interest, Place, Website, Event
def import_file(filename):
with open(filename, mode='r') as file:
data = json.load(file)
for interest in data:
import_interest(interest)
def import_interest(_interest):
keywords = []
places = []
websites = []
for keyword in _interest.get("keywords", []):
try:
stored = Keyword.objects.get(text=keyword["keyword"])
keywords.append(stored)
except ObjectDoesNotExist:
new_keyword = Keyword(text=keyword["keyword"])
new_keyword.save()
keywords.append(new_keyword)
print("New keyword %s" % new_keyword)
for place in _interest.get("places", []):
places.append(Place.objects.get(name=place["place"]))
for website in _interest.get("websites", []):
websites.append(Website.objects.get(name=website["website"]))
interest = Interest(
name=_interest.get("name", ""),
)
interest.save()
for keyword in keywords:
print(keyword)
interest.keywords.add(keyword)
for place in places:
interest.places.add(place)
for website in websites:
interest.websites.add(website)
interest.save()
class Command(BaseCommand):
def handle(self, *args, **kwargs):
import_file("data/interests.json")

View file

@ -0,0 +1,20 @@
""" Small module that import browser fingerprints into the databose,
based on the data listed in https://huit.re/user-agent-json.
"""
import json
from django.core.management.base import BaseCommand
from django.db import models
from profiles.models import Keyword
def import_file(filename):
with open(filename, mode='r') as file:
data = json.load(file)
for _keyword in data["list"]:
keyword = Keyword(text=_keyword.get("keyword", ""))
keyword.save()
class Command(BaseCommand):
def handle(self, *args, **kwargs):
import_file("data/keywords.json")

View file

@ -0,0 +1,27 @@
""" Small module that import browser fingerprints into the databose,
based on the data listed in https://huit.re/user-agent-json.
"""
import json
from django.core.management.base import BaseCommand
from django.db import models
from profiles.models import Place
def import_file(filename):
with open(filename, mode='r') as file:
data = json.load(file)
for place in data:
import_place(place["place"])
def import_place(_place):
place = Place(
name=_place.get("name", ""),
address=_place.get("address", ""),
lat=float(_place.get("lat", 0)),
lon=float(_place.get("lon", 0))
)
place.save()
class Command(BaseCommand):
def handle(self, *args, **kwargs):
import_file("data/place.json")

View file

@ -0,0 +1,27 @@
""" Small module that import browser fingerprints into the databose,
based on the data listed in https://huit.re/user-agent-json.
"""
import json
from django.core.management.base import BaseCommand
from django.db import models
from profiles.models import SearchEngine
def import_file(filename):
with open(filename, mode='r') as file:
data = json.load(file)
for search_engine in data:
import_search_engine(search_engine["searchengine"])
def import_search_engine(engine):
search_engine = SearchEngine(
name=engine.get("name", ""),
url=engine.get("url", ""),
query_pattern=engine.get("query_pattern", "")
)
#print(search_engine)
search_engine.save()
class Command(BaseCommand):
def handle(self, *args, **kwargs):
import_file("data/search_engine.json")

View file

@ -0,0 +1,46 @@
""" Small module that import events into the database.
"""
import json
from datetime import datetime
from django.core.management.base import BaseCommand
from django.db import models
from profiles.models import Webpage, Website, Keyword
def import_file(filename):
with open(filename, mode='r') as file:
data = json.load(file)
for website in data:
import_website(website)
def import_website(_website):
keywords = []
webpages = []
for keyword in _website.get("keywords", []):
new_keyword = Keyword(
text=keyword.get("keyword", "")
)
new_keyword.save()
keywords.append(new_keyword)
for webpage in _website.get("notable_pages",[]):
new_webpage = Webpage(
url=webpage.get("keyword", "")
)
new_webpage.save()
webpages.append(new_webpage)
website = Website(
name=_website.get("name", ""),
url=_website.get("url", ""),
)
website.save()
for keyword in keywords:
website.keywords.add(keyword)
for webpage in webpages:
website.notable_pages.add(webpage)
print(website)
#website.save()
class Command(BaseCommand):
def handle(self, *args, **kwargs):
import_file("data/website.json")

View file

@ -1,5 +1,46 @@
"""
A django module that defines a profile, and all the information that can be
stored in a profile.
It stores interests, technical information such as the browser fingerprint,
the preferred search engine, and whether the user is likely to use urls
directly or to type queries into the search engine.
"""
import os
import random
from django.db import models
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
NICKNAMES = None
LASTNAMES = open(BASE_DIR + "/data/lastnames.txt").read().splitlines()
FIRSTNAMES = open(BASE_DIR + "/data/firstnames.txt").read().splitlines()
EMAIL_DOMAINS = open(BASE_DIR + "/data/email_domains.txt").read().splitlines()
def require_nicknames(fct):
def read_file(path):
global NICKNAMES
print("Trying {}".format(path))
with open(path, 'r') as handle:
NICKNAMES = handle.read().splitlines()
nicknames_files = [
os.path.join(BASE_DIR, 'data/nicknames_dict'),
"/usr/share/dict/american-english",
]
if NICKNAMES is None:
for nick_file in nicknames_files:
try:
read_file(nick_file)
break
except FileNotFoundError:
pass
if NICKNAMES is None:
raise FileNotFoundError
return fct
class InvalidData(Exception):
''' Thrown when the DB contains invalid data, and cannot perform
@ -21,8 +62,14 @@ class Keyword(models.Model):
def __str__(self):
return self.text
def generate_url(self, user):
""" Generates the url for a keyword, based on the user search engine.
"""
return user.search_engine.search_url(self)
class Webpage(models.Model):
''' A webpage url '''
url = models.URLField()
def __str__(self):
@ -40,6 +87,22 @@ class Website(models.Model):
def __str__(self):
return self.name
def generate_url(self, user):
""" Generates the url in case the interest chosen is a website.
"""
rand = random.random()
if user.uses_urls:
url = self.url
elif rand <= 0.1:
url = random.choice(self.notable_pages.all()).url
elif rand <= 0.8:
search_term_text = self.name + " " + \
str(random.choice(self.keywords.all()))
url = user.search_engine.search_url(search_term_text)
else:
url = user.search_engine.search_url(self.name)
return url
class Place(models.Model):
''' A real-life place '''
@ -52,6 +115,16 @@ class Place(models.Model):
def __str__(self):
return self.name
def generate_url(self, user):
""" Generates the url for a place.
"""
rand = random.random()
if rand < 1/2:
url = user.search_engine.search_url(self.name)
else:
url = user.search_engine.search_url(self.address)
return url
class Event(models.Model):
''' A real-life event (protests, meeting, ...) '''
@ -63,6 +136,15 @@ class Event(models.Model):
def __str__(self):
return self.name
def generate_url(self, user):
""" generate the url for an event object.
"""
possibilities = random.sample(
[self.name, str(self.date), str(self.place)],  # cast to str so ' '.join() works
3
)
return user.search_engine.search_url(" ".join(possibilities))
class BrowserFingerprint(models.Model):
''' A browser fingerprint, containing things like a user agent '''
@ -86,6 +168,15 @@ class BrowserFingerprint(models.Model):
def __str__(self):
return self.description
def serialize_headers(self):
return {
"Description": str(self.description),
"User-Agent": str(self.useragent),
"Accept-Encoding": str(self.accept_encoding),
"Accept": str(self.accept_default),
"Accept-Language": str(self.accept_lang),
}
class SearchEngine(models.Model):
''' A search engine, and all the data needed to use it '''
@ -94,8 +185,8 @@ class SearchEngine(models.Model):
url = models.URLField()
query_pattern = models.CharField(max_length=256) # This field is the
# query pattern. It should contain a `{}`, which, when substituted with a
# search term (using `.format()`), must yield a URL that can be resolved to
# perform the search
# search term (using `.format()`), must yield a URL tail that can be
# concatenated with `url` to perform a search (eg. `?q={}` for ddg).
def __str__(self):
return self.name
@ -103,9 +194,10 @@ class SearchEngine(models.Model):
def search_url(self, search_term):
''' Obtain a url to search `search_term` with this search engine '''
pattern = str(self.query_pattern)
search_term = str(search_term).replace(' ', '+')
if '{}' not in pattern:
raise InvalidData("Search engine {}: bad pattern".format(self))
return str(self.query_pattern).format(search_term)
return self.url + (str(self.query_pattern).format(search_term))
class Interest(models.Model):
@ -139,3 +231,35 @@ class Profile(models.Model):
on_delete=models.CASCADE)
browser_fingerprint = models.ForeignKey(BrowserFingerprint,
on_delete=models.CASCADE)
def generate_email(nick, first_name, last_name):
domain = random.choice(EMAIL_DOMAINS)
if random.random() < 0.3:
email = first_name + "." + last_name + "@" + domain
else:
email = nick + "@" + domain
return email
@require_nicknames
def create_profile(nick=None):
nick = "".join(random.sample(NICKNAMES, random.randrange(2, 5)))
first_name = random.choice(FIRSTNAMES)
last_name = random.choice(LASTNAMES)
email = generate_email(nick, first_name, last_name)
profile = Profile(
nick=nick,
first_name=first_name,
last_name=last_name,
email=email,
uses_urls=(random.random() < 0.5),
)
profile.search_engine = random.choice(SearchEngine.objects.all())
profile.browser_fingerprint = random.choice(BrowserFingerprint.objects.all())
profile.full_clean()
profile.save()
profile.interests.add(random.choice(Interest.objects.all()))
profile.save()
return profile
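
A short sketch of how the new profile helpers are meant to be used (assumes the import_* commands above / populate.sh have filled the database and a nicknames dictionary is available):

```python
# Sketch: create a random profile and derive one search URL from a keyword.
from profiles.models import create_profile, Keyword

profile = create_profile()
keyword = Keyword.objects.first()
if keyword is not None:
    # e.g. "https://duckduckgo.com/lite/?q=..." depending on the profile's
    # search engine
    print(keyword.generate_url(profile))
```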

View file

@ -1,5 +1,5 @@
aiodns==1.1.1
aiohttp==3.0.1
aiohttp==2.3.2
async-timeout==2.0.0
attrs==17.4.0
cchardet==2.1.1
@ -12,5 +12,8 @@ pycares==2.3.0
pytz==2017.3
yarl==1.1.1
beautifulsoup4==4.6.0
stem==1.6.0
pycurl==7.43.0.1
rdflib==4.2.2
git+https://github.com/tobast/RDFSerializer.git
aiosocks==0.2.6