Compare commits


84 commits
rdf ... master

Author SHA1 Message Date
Rémi Oudin 89d1f8301a Remove duplicated url in history 2018-02-26 17:46:49 +01:00
Théophile Bastian 379b53e6ce Fix printing in gen_history 2018-02-26 17:25:04 +01:00
Théophile Bastian c94841c17b Add gen_history django-admin command 2018-02-26 17:25:04 +01:00
Rémi Oudin 97107d9bec Merge branch 'master' of git.tobast.fr:tobast/mpri-webdam 2018-02-26 17:12:26 +01:00
Rémi Oudin dedc66bb9d Bug fix 2018-02-26 17:12:19 +01:00
Théophile Bastian d3d04739e7 Add DuckDuckGo lite search engine to stock data
This search engine works better than the others
2018-02-26 17:10:18 +01:00
Théophile Bastian b88aeffd5a Helpful README 2018-02-26 17:09:05 +01:00
Rémi Oudin 7c8ec7351c Merge branch 'master' of git.tobast.fr:tobast/mpri-webdam 2018-02-26 17:04:09 +01:00
Théophile Bastian 2005c0f24f Add xml string gen 2018-02-26 17:03:27 +01:00
Rémi Oudin 392e16b797 Merge branch 'histories_models' 2018-02-26 17:03:27 +01:00
Théophile Bastian 185c1cf8a4 Fix XML generation 2018-02-26 17:00:53 +01:00
Rémi Oudin 9dd1954067 Partial runner fix 2018-02-26 17:00:53 +01:00
Rémi Oudin 04270e88c0 Bug fix 2018-02-26 17:00:12 +01:00
Théophile Bastian 6bc64ceb7a Add requirement for aiohttp 2018-02-26 16:38:16 +01:00
Rémi Oudin 15e0c2a11c Partial runner fix 2018-02-26 16:37:51 +01:00
Rémi Oudin 2b07779f5c Bug fix 2018-02-26 16:37:32 +01:00
Théophile Bastian 8cdc50c04e Fix stupid typo 2018-02-26 16:34:43 +01:00
Rémi Oudin 22fa039f1b Remove debug print 2018-02-26 16:23:14 +01:00
Théophile Bastian e4ad8c7ce6 Towards a working XML export 2018-02-26 15:58:30 +01:00
Théophile Bastian 67ad232533 Add a timeout to a single page retrieval 2018-02-26 15:42:36 +01:00
Théophile Bastian e140d4a8a7 Fix merge remanences 2018-02-26 15:37:05 +01:00
Théophile Bastian 98fe69ba62 Real async crawling 2018-02-26 15:30:38 +01:00
Théophile Bastian 968ff6d24c More robust crawling 2018-02-26 15:29:36 +01:00
Rémi Oudin 5d4bd30e20 Exception handling 2018-02-26 15:15:03 +01:00
Rémi Oudin bdfa285e6b We do not want to use settings 2018-02-26 15:14:53 +01:00
Rémi Oudin 65f777f00f Should get the objects and not the Manager 2018-02-26 15:04:26 +01:00
Rémi Oudin 236e40d359 Sanity check 2018-02-26 14:57:46 +01:00
Rémi Oudin 22017cea91 Typo in data u_u 2018-02-26 14:56:22 +01:00
Rémi Oudin 549c861908 Bug fixé 2018-02-26 14:38:26 +01:00
Rémi Oudin 517be1d822 Merge rdf branch 2018-02-26 14:11:06 +01:00
Rémi Oudin c4f63a92b2 Error in the merge, mea culpa 2018-02-26 14:01:29 +01:00
Rémi Oudin db067e56fc Typo 2018-02-26 13:59:34 +01:00
Rémi Oudin 33bdae96e4 merge commit from histories_tobast into histories_models 2018-02-26 12:59:38 +01:00
Rémi Oudin 526aad1364 Add interests 2018-02-26 12:33:23 +01:00
Théophile Bastian 02e91bb2b7 Fix function calls 2018-02-26 11:56:02 +01:00
Théophile Bastian 3e5fc2f9b3 Fix search engine URL generation 2018-02-26 11:49:24 +01:00
Théophile Bastian 45ddbff91a Crawling and histories: fix a lot of stuff 2018-02-26 11:49:24 +01:00
Théophile Bastian e6d587bffd Actually save to DB a created history 2018-02-26 11:49:24 +01:00
Théophile Bastian 8baf408e02 Use dict from data/nicknames_dict for nicknames 2018-02-26 11:49:24 +01:00
Théophile Bastian 6463e348ac Fix populate.sh exec path 2018-02-26 11:48:51 +01:00
Théophile Bastian 22064ebee3 Histories: xml import/export — untested
To be tested when history generation is available
2018-02-26 11:48:51 +01:00
Théophile Bastian a4de51b84a Crawl: do not use global SEARCH_ENGINES 2018-02-26 11:48:51 +01:00
Théophile Bastian 4f0148cb63 Crawler: use a random fingerprint 2018-02-26 11:48:51 +01:00
Théophile Bastian 4a8bd32516 Fix tor_runner import 2018-02-26 11:48:51 +01:00
Rémi Oudin 44cf26df8f It can be useful to save a new object 2018-02-26 11:42:45 +01:00
Rémi Oudin adb892ab7d Check if crawling a search engine 2018-02-26 11:12:36 +01:00
Rémi Oudin 15db8b4697 Change option name due to downgrade of aiohttp 2018-02-26 10:23:32 +01:00
Rémi Oudin d6b26c0a46 Better use of history 2018-02-26 10:05:33 +01:00
Rémi Oudin 8f5c4f3f0f Use datetimes 2018-02-26 09:49:24 +01:00
Rémi Oudin 71d9e18eec Add headers support 2018-02-25 23:56:51 +01:00
Rémi Oudin 8ad46c0481 Bug fix, syntax erro 2018-02-25 21:59:29 +01:00
Rémi Oudin f66c978466 Tor runner has a run function to replay the history 2018-02-25 21:53:28 +01:00
Rémi Oudin 0a676a2f65 PEP8 2018-02-25 21:34:20 +01:00
Rémi Oudin e074d96f02 tor_runner can make requests 2018-02-25 21:27:15 +01:00
Rémi Oudin 93b235cb6c Fix interests import 2018-02-25 21:20:52 +01:00
Rémi Oudin ae5699c089 Basic tor runner 2018-02-25 19:42:58 +01:00
Rémi Oudin f7313ff659 Add populate.sh script 2018-02-25 16:16:04 +01:00
Rémi Oudin 0661fe0f01 Fix path 2018-02-25 16:10:38 +01:00
Rémi Oudin 4b19febdf6 Add interests 2018-02-25 16:10:22 +01:00
Théophile Bastian 15323c3465 [REBASE ME] Crawl: enhance efficiency and output a tree 2018-02-25 15:08:06 +01:00
Rémi Oudin 05a2e2ca3f Partial generation of profiles 2018-02-25 13:18:12 +01:00
Rémi Oudin d4aefb6bb7 Load the data 2018-02-25 13:17:44 +01:00
Rémi Oudin 3eb82a4a0b data for names and emails 2018-02-25 13:17:27 +01:00
Rémi Oudin 7c0fb7dda1 Better naming 2018-02-25 11:49:44 +01:00
Rémi Oudin ee32e5385b Finished data import 2018-02-25 11:49:11 +01:00
Rémi Oudin bc7348f677 Integration of crawl module in histories 2018-02-24 23:17:24 +01:00
Rémi Oudin 60bfc8cb77 Merge branch 'crawl' into histories_models 2018-02-24 18:44:27 +01:00
Rémi Oudin 12c8c652d7 Serialisation function 2018-02-24 18:40:27 +01:00
Rémi Oudin c58f42476f Missing script for 854481d 2018-02-24 17:22:52 +01:00
Rémi Oudin 854481dbd3 Import utilities 2018-02-24 17:21:41 +01:00
Rémi Oudin d19c2e8216 Add mailto adresses to forbidden list 2018-02-24 15:41:46 +01:00
Rémi Oudin e56c088632 Better filter 2018-02-24 11:39:04 +01:00
Rémi Oudin f0b8672c89 Silly me. (bis) 2018-02-23 10:44:51 +01:00
Rémi Oudin f6da179820 If robots.txt file is invalid, abort mission. 2018-02-23 10:36:14 +01:00
Rémi Oudin 5decd205fb Typos + improvements 2018-02-22 11:06:45 +01:00
Rémi Oudin ad0ad0a783 Command to add browser fingerprint data 2018-02-21 16:50:27 +01:00
Rémi Oudin cd4d8a4c3f More generic code using @8f4458b 2018-02-21 11:50:28 +01:00
Rémi Oudin 8f4458b009 Url generation method, for more genericity 2018-02-21 11:37:44 +01:00
Rémi Oudin 5539f57139 Add missing docstrings 2018-02-21 11:35:53 +01:00
Rémi Oudin 4920de5838 Going on in the generation of history 2018-02-20 23:42:21 +01:00
Rémi Oudin 7c13ee17d4 Skeleton of history generation 2018-02-19 22:56:16 +01:00
Rémi Oudin 7f343d8ad8 Better formatting 2018-02-19 13:59:29 +01:00
Rémi Oudin 3b0fa27951 Add histories application to settings file 2018-02-19 13:59:29 +01:00
Rémi Oudin 60f09bd4d3 Add basic models for histories 2018-02-19 13:58:55 +01:00
30 changed files with 7779 additions and 83 deletions

View file

@ -1,3 +1,6 @@
# mpri-webdam
Generate plenty of fake browsing histories. Because this course has to be validated somehow.
Generate realistic fake browsing histories for borderline and/or activist
users, to hide real traffic from global surveillance.
Lacks proper documentation at the moment `:(`

View file

@ -4,7 +4,7 @@ from urllib.error import URLError
from urllib.parse import urlparse
from ssl import CertificateError
from random import sample, randrange
from random import sample, randrange, randint
import re
from datetime import datetime, timedelta
@ -14,6 +14,8 @@ import async_timeout
from bs4 import BeautifulSoup, Comment
from profiles.models import BrowserFingerprint, SearchEngine
# Ugly hack to use this module alone instead of integrating it with Django
# from django.conf import settings
@ -25,11 +27,11 @@ MAX_PER_PAGE = 10
FOOTER_URL = re.compile(".*footer.*")
class Settings:
USER_AGENT = 'Blah'
USER_AGENT = 'Default User'
settings = Settings()
startup_time = datetime.now()
def url_getter(html, current_page, root_url):
@ -60,7 +62,7 @@ def url_getter(html, current_page, root_url):
elif link.startswith('/'): #Internal link, linking to page root url
links_list.append(root_url + link)
elif link.startswith("#"):
print("Invalid link : internal bookmark")
continue
else:
links_list.append(current_page + "/" + link)
@ -71,11 +73,14 @@ def url_getter(html, current_page, root_url):
# Works only with python >= 3.6
links_list = list(dict.fromkeys(links_list))
forbidden_words = ['login', 'agreement', 'mailto', 'settings']
links_list = [link for link in links_list if not any(word in link.lower()
for word in
forbidden_words)]
return links_list
class WebsiteSchedulerMeta(type):
""" Meta-class for WebsiteScheduler, allowing a singleton class-like
interface, but spawning one instance per canonical website URL """
@ -98,34 +103,47 @@ class WebsiteSchedulerMeta(type):
class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
""" Schedule the accesses to a website as of robots.txt """
def __init__(self, name):
search_engines = [] # Must be set by CrawlingThread.__init__
def __init__(self, name, user_agent):
self.name = name
self.last_crawled = datetime.fromtimestamp(0)
self.dead = False
try:
robots_url = self.urlroot() + 'robots.txt'
self.robot_parser = RobotFileParser(robots_url)
self.robot_parser.read() # TODO async?
except (URLError, CertificateError):
self.can_fetch_b = False
self.user_agent = (user_agent if user_agent is not None
else settings.USER_AGENT)
if any(self.urlroot() in item for item in self.search_engines):
print("found a search engine for %s" % self.urlroot())
self.crawl_delay = timedelta(seconds=5)
self.can_fetch_b = True
else:
try:
robots_url = self.unsafe_urlroot() + 'robots.txt'
robots_url = self.urlroot() + 'robots.txt'
self.robot_parser = RobotFileParser(robots_url)
self.robot_parser.read()
except URLError: # Almost surely an offline website.
self.robot_parser.read() # TODO async?
except (URLError, CertificateError):
try:
robots_url = self.unsafe_urlroot() + 'robots.txt'
self.robot_parser = RobotFileParser(robots_url)
self.robot_parser.read()
except URLError: # Almost surely an offline website.
self.dead = True
self.crawl_delay = 0
except Exception as e:
print(e)
raise e
if not self.robot_parser.default_entry:
self.dead = True
self.crawl_delay = 0
except Exception as e:
print(e)
raise e
if not self.dead:
delay = self.robot_parser.crawl_delay(settings.USER_AGENT)
if delay is None:
req_rate = self.robot_parser.request_rate(settings.USER_AGENT)
if req_rate is None:
delay = 5
else:
delay = req_rate.requests, req_rate.seconds
self.crawl_delay = timedelta(seconds=delay)
if not self.dead:
delay = self.robot_parser.crawl_delay(self.user_agent)
if delay is None:
req_rate = self.robot_parser.request_rate(self.user_agent)
if req_rate is None:
delay = 5
else:
delay = req_rate.requests, req_rate.seconds
self.crawl_delay = timedelta(seconds=delay)
def urlroot(self):
''' Get the root url for this website '''
@ -143,7 +161,9 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
def can_fetch(self, url):
''' Check whether this program can fetch a given page '''
return (not self.dead) and self.robot_parser.can_fetch(settings.USER_AGENT, url)
return ((self.can_fetch_b)
or ((not self.dead) and
self.robot_parser.can_fetch(self.user_agent, url)))
def fetching(self):
''' Tell the scheduler that a page is being fetched now '''
@ -154,30 +174,47 @@ class CrawlingThread(Thread):
""" A separate thread for the crawling task. This is needed to use asyncio,
since the thread will need its own event loop. """
def __init__(self):
def __init__(self, url):
engine_list = [engine.url for engine in SearchEngine.objects.all()]
WebsiteScheduler.search_engines = engine_list
nb_fingerprint = len(BrowserFingerprint.objects.all())
fingerprint = BrowserFingerprint.objects.all()[
randint(0, nb_fingerprint - 1)]
self.headers = fingerprint.serialize_headers()
self.output_tree = []
super(CrawlingThread, self).__init__()
self.url = url
def run(self):
tasks = []
#tasks.append(async_crawler("http://plus.google.com/+Python"))
tasks.append(async_print('https://python.org/'))
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
loop.run_until_complete(asyncio.wait(tasks))
loop.close()
#tasks.append(async_crawler("http://plus.google.com/+Python"))
#tasks.append(async_crawler('https://python.org/'))
tasks.append(run_crawl(self.url, self.output_tree, self.headers))
try:
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
loop.run_until_complete(asyncio.wait(tasks))
finally:
loop.close()
class PageGetter:
""" Asynchronously get a webpage, abiding by robots.txt """
def __init__(self, session, url):
headers = None
def __init__(self, session, url, user_agent):
self.url = url
self.session = session
self.user_agent = user_agent
async def get(self, ssl=True):
""" Actually retrieve the webpage """
scheduler = WebsiteScheduler(self.url)
scheduler = WebsiteScheduler(self.url, self.user_agent)
if not scheduler.can_fetch(self.url):
return None
@ -187,7 +224,7 @@ class PageGetter:
delay = scheduler.fetch_delay()
scheduler.fetching()
async with async_timeout.timeout(10):
async with self.session.get(self.url, ssl=ssl) as resp:
async with self.session.get(self.url, verify_ssl=ssl) as resp:
try:
return await resp.text()
except UnicodeDecodeError:
@ -197,44 +234,89 @@ class PageGetter:
async def async_print(url):
""" Debug function to follow what's actually happening """
async with aiohttp.ClientSession() as session:
html = await PageGetter(session, url).get(ssl=False)
html = await PageGetter(session, url,
settings.USER_AGENT).get(ssl=False)
print('GOT {}HTML for {} at {}'.format(
print('GOT {}HTML for {}'.format(
'None ' if html is None else '',
url,
datetime.now() - startup_time))
))
async def async_crawler(url):
queue = [url]
crawled = []
while queue or (len(crawled) < HARD_LIMIT):
async with aiohttp.ClientSession() as session:
try:
url = queue.pop(0)
except IndexError:
print("Error queue is empty")
return crawled
parsed_url = urlparse(url)
print("Crawling {}".format(url))
html = await PageGetter(session, url).get(ssl=False)
if html:
new_urls = url_getter(
html,
url,
parsed_url.scheme + "://" + parsed_url.netloc
)
crawled += [url]
if new_urls:
sampled = sample(
new_urls,
randrange(min(MAX_PER_PAGE, len(new_urls)))
)
queue += [sample_url for sample_url in sampled if
sample_url not in queue and sample_url not in
crawled]
print(crawled)
if __name__ == '__main__':
crawl = CrawlingThread()
crawl.start()
crawl.join()
class CrawlElem:
''' Describes a crawled element, to be assembled into a tree '''
def __init__(self, url, parent):
self.url = url
self.parent = parent
async def run_crawl(url, output_tree, headers=None):
''' Starts a crawling session '''
if headers is None:
headers = {}
if 'User-Agent' not in headers:
headers['User-Agent'] = settings.USER_AGENT
user_agent = headers['User-Agent']
crawled = set()
async with aiohttp.ClientSession(headers=headers) as session:
await async_crawler(
url, output_tree, crawled, user_agent, session, None)
def simplify_url(url):
anchor = url.find('#')
if anchor >= 0:
url = url[:anchor]
prot = url.find('://')
if prot >= 0:
url = url[prot+3:]
if url.startswith('www.'):
url = url[4:]
return url
async def async_crawler(url, out_tree, crawled, user_agent, session, parent):
if len(crawled) >= HARD_LIMIT:
return
crawled.add(simplify_url(url))
parsed_url = urlparse(url)
print("Crawling {}".format(url))
try:
with async_timeout.timeout(3):
html = await PageGetter(session, url, user_agent).get(ssl=False)
except asyncio.TimeoutError:
return
new_tasks = []
if html:
this_elem = CrawlElem(url, parent)
out_tree.append(this_elem)
new_urls = url_getter(
html,
url,
parsed_url.scheme + "://" + parsed_url.netloc
)
if new_urls:
sampled = sample(
new_urls,
randrange(min(MAX_PER_PAGE, len(new_urls)))
)
for sample_url in sampled:
if simplify_url(sample_url) not in crawled:
new_tasks.append(async_crawler(
sample_url, out_tree, crawled, user_agent, session,
this_elem))
else:
print("No html received")
if len(crawled) >= HARD_LIMIT:
return
if new_tasks:
await asyncio.wait(new_tasks)
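
For context, a minimal sketch of how the reworked crawler is meant to be driven, mirroring its use in histories/models.py further down in this diff. It assumes a configured Django project with the profiles data already imported; the seed URL is only illustrative.

```python
# Minimal usage sketch of the new CrawlingThread API (assumes a configured
# Django project, since __init__ reads SearchEngine/BrowserFingerprint rows).
from crawl import crawl

crawler = crawl.CrawlingThread("https://python.org/")  # illustrative seed URL
crawler.start()
crawler.join()

# output_tree is a flat list of CrawlElem(url, parent) nodes.
for elem in crawler.output_tree:
    parent_url = elem.parent.url if elem.parent is not None else None
    print(elem.url, "<-", parent_url)
```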

1
data/.gitignore vendored Normal file
View file

@ -0,0 +1 @@
nicknames_dict

14
data/email_domains.txt Normal file
View file

@ -0,0 +1,14 @@
gmail.com
protonmail.com
riseup.net
tutanota.com
outlook.fr
fastmail.com
yandex.com
aim.com
icloud.com
yahoo.com
fmx.fr
mail.com
hushmail.com
inbox.com

27
data/events.json Normal file
View file

@ -0,0 +1,27 @@
[
{
"name" : "Atelier Anti-Pub",
"date" : "07/03/2018 19:00 UTC",
"place" : {
"name" : "Centre Social Autogéré Vaydom",
"address" : "37 rue Marceau, Ivry-sur-Seine",
"lat" : "48.81787",
"lon" : "2.38032"
}
},
{
"name" : "Rassemblement de soutien pour Bure",
"date" : "27/02/2018 17:00 UTC",
"place" : {
"name" : "Place Saint-Michel",
"address" : "Place Saint-Michel, 75005 Paris",
"lat" : "48.85374",
"lon" : "2.34455"
}
},
{
"name" : "Création d'un serveur mail",
"date" : "15/02/2018 12:00 UTC",
"place" : "La Mutinerie"
}
]

200
data/firstnames.txt Normal file
View file

@ -0,0 +1,200 @@
Jean
Marie
Philippe
Nathalie
Michel
Isabelle
Alain
Sylvie
Patrick
Catherine
Nicolas
Martine
Christophe
Christine
Pierre
Françoise
Christian
Valérie
Éric
Sandrine
Frédéric
Stéphanie
Laurent
Véronique
Stéphane
Sophie
David
Céline
Pascal
Chantal
Daniel
Patricia
Alexandre
Anne
Julien
Brigitte
Thierry
Julie
Olivier
Monique
Bernard
Aurélie
Thomas
Nicole
Sébastien
Laurence
Gérard
Annie
Didier
Émilie
Dominique
Dominique
Vincent
Virginie
François
Corinne
Bruno
Élodie
Guillaume
Christelle
Jérôme
Camille
Jacques
Caroline
Marc
Léa
Maxime
Sarah
Romain
Florence
Claude
Laetitia
Antoine
Audrey
Franck
Hélène
Jean-Pierre
Laura
Anthony
Manon
Kévin
Michèle
Gilles
Cécile
Cédric
Christiane
Serge
Béatrice
André
Claire
Mathieu
Nadine
Benjamin
Delphine
Patrice
Pauline
Fabrice
Karine
Joël
Mélanie
Jérémy
Marion
Clément
Chloe
Arnaud
Jacqueline
Denis
Elisabeth
Paul
Evelyne
Lucas
Marine
Hervé
Claudine
Jean-Claude
Anais
Sylvain
Lucie
Yves
Danielle
Ludovic
Carole
Guy
Fabienne
Florian
Mathilde
Damien
Sandra
Alexis
Pascale
Mickaël
Annick
Quentin
Charlotte
Emmanuel
Emma
Louis
Severine
Benoît
Sabrina
Jean-Luc
Amandine
Fabien
Myriam
Francis
Jocelyne
Hugo
Alexandra
Jonathan
Angelique
Loïc
Josiane
Xavier
Joelle
Théo
Agnes
Adrien
Mireille
Raphaël
Vanessa
Jean-François
Justine
Grégory
Sonia
Robert
Bernadette
Michaël
Emmanuelle
Valentin
Oceane
Cyril
Amelie
Jean-Marc
Clara
René
Maryse
Lionel
Anne-marie
Yannick
Fanny
Enzo
Magali
Yannis
Marie-christine
Jean-Michel
Morgane
Baptiste
Ines
Matthieu
Nadia
Rémi
Muriel
Georges
Jessica
Aurélien
Laure
Nathan
Genevieve
Jean-Paul
Estelle

55
data/interests.json Normal file
View file

@ -0,0 +1,55 @@
[
{
"name": "occupation",
"keywords": [
{"keyword" : "ZAD NDDL"},
{"keyword" : "Organiser un squat"},
{"keyword" : "mobilisation et rassemblement"}
],
"places": [
{"place" : "Zad NDDL"},
{"place" : "Zad Bure"}
],
"websites": [
{"website": "zad nadir"}
],
"events": [
{"event": "Rassemblement de soutien pour Bure"}
]
},
{
"name": "LGBT",
"keywords": [
{"keyword" : "Discrimniation sexistes, quelles actions ?"},
{"keyword" : "gender queer Paris"},
{"keyword" : "Existrans Paris"}
],
"places": [
{"place" : "La Mutinerie"}
],
"websites": [
{"website": "emmaclit"},
{"website": "paris-luttes info"}
],
"events": [
{"event": "Création d'un serveur mail"}
]
},
{
"name": "Anti pub",
"keywords": [
{"keyword" : "Affichage JCDecaux"},
{"keyword" : "Anti-pub"},
{"keyword" : "Journée contre la publicité"}
],
"places": [
{"place" : "Centre Social Autogéré Vaydom"}
],
"websites": [
{"website": "paris-luttes info"}
],
"events": [
{"event": "Atelier Anti-Pub"}
]
}
]

17
data/keywords.json Normal file
View file

@ -0,0 +1,17 @@
{
"list": [
{ "keyword" : "gender queer Paris"},
{"keyword" : "fabriquer masque manif"},
{"keyword" : "Se protéger en manif"},
{"keyword" : "Legal team manif France"},
{"keyword" : "Guide juridique GAV"},
{"keyword" : "Échec du capitaisme"},
{"keyword" : "Bienfait du communisme"},
{"keyword" : "Le comité invisible"},
{"keyword" : "À nos enfants"},
{"keyword" : "Squats sur Paris"},
{"keyword" : "Local facho à Strasbourg"},
{"keyword" : "Discrimation sexistes, quelles actions ?"},
{"keyword" : "Pourquoi la lutte des classes"}
]
}

200
data/lastnames.txt Normal file
View file

@ -0,0 +1,200 @@
Martin
Bernard
Thomas
Petit
Robert
Richard
Durand
Dubois
Moreau
Laurent
Simon
Michel
Lefebvre
Leroy
Roux
David
Bertrand
Morel
Fournier
Girard
Bonnet
Dupont
Lambert
Fontaine
Rousseau
Vincent
Muller
Lefevre
Faure
Andre
Mercier
Blanc
Guerin
Boyer
Garnier
Chevalier
Francois
Legrand
Gauthier
Garcia
Perrin
Robin
Clement
Morin
Nicolas
Henry
Roussel
Mathieu
Gautier
Masson
Marchand
Duval
Denis
Dumont
Marie
Lemaire
Noel
Meyer
Dufour
Meunier
Brun
Blanchard
Giraud
Joly
Riviere
Lucas
Brunet
Gaillard
Barbier
Arnaud
Martinez
Gerard
Roche
Renard
Schmitt
Roy
Leroux
Colin
Vidal
Caron
Picard
Roger
Fabre
Aubert
Lemoine
Renaud
Dumas
Lacroix
Olivier
Philippe
Bourgeois
Pierre
Benoit
Rey
Leclerc
Payet
Rolland
Leclercq
Guillaume
Lecomte
Lopez
Jean
Dupuy
Guillot
Hubert
Berger
Carpentier
Sanchez
Dupuis
Moulin
Louis
Deschamps
Huet
Vasseur
Perez
Boucher
Fleury
Royer
Klein
Jacquet
Adam
Paris
Poirier
Marty
Aubry
Guyot
Carre
Charles
Renault
Charpentier
Menard
Maillard
Baron
Bertin
Bailly
Herve
Schneider
Fernandez
Le Gall
Collet
Leger
Bouvier
Julien
Prevost
Millet
Perrot
Daniel
Le Roux
Cousin
Germain
Breton
Besson
Langlois
Remy
Le Goff
Pelletier
Leveque
Perrier
Leblanc
Barre
Lebrun
Marchal
Weber
Mallet
Hamon
Boulanger
Jacob
Monnier
Michaud
Rodriguez
Guichard
Gillet
Etienne
Grondin
Poulain
Tessier
Chevallier
Collin
Chauvin
Da Silva
Bouchet
Gay
Lemaitre
Benard
Marechal
Humbert
Reynaud
Antoine
Hoarau
Perret
Barthelemy
Cordier
Pichon
Lejeune
Gilbert
Lamy
Delaunay
Pasquier
Carlier
Laporte

26
data/place.json Normal file
View file

@ -0,0 +1,26 @@
[
{
"place" : {
"name" : "Zad NDDL",
"address" : "Notre-Dame-des-landes, 44111",
"lat" : "47.3435",
"lon": "-1.7367"
}
},
{
"place" : {
"name" : "La Mutinerie",
"address" : "176 - 178 rue Saint Martin, 75003 Paris",
"lat" : "48.8625665",
"lon": "2.3522237"
}
},
{
"place" : {
"name" : "Zad Bure",
"address" : "2 rue de l'Église, 55290 Bure",
"lat" : "48.502",
"lon": "5.351"
}
}
]

44
data/search_engine.json Normal file
View file

@ -0,0 +1,44 @@
[
{
"searchengine": {
"name":"Google",
"url":"https://google.com/",
"query_pattern": "search?q={}"
}
},
{
"searchengine": {
"name":"Duckduckgo",
"url":"https://duckduckgo.com/",
"query_pattern":"?q={}"
}
},
{
"searchengine": {
"name":"Duckduckgo Lite",
"url":"https://duckduckgo.com/lite/",
"query_pattern":"?q={}"
}
},
{
"searchengine": {
"name":"Qwant",
"url":"https://www.qwant.com/",
"query_pattern":"?q={}"
}
},
{
"searchengine": {
"name":"Qwant lite",
"url":"https://lite.qwant.com/",
"query_pattern":"?q={}"
}
},
{
"searchengine": {
"name":"Framabee",
"url":"https://framabee.org/",
"query_pattern":"?q={}"
}
}
]
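
As a sanity check, here is how one of these entries is composed into a full query URL by the SearchEngine.search_url() change further down in this diff — a small sketch, with the values taken from the JSON above and data/keywords.json.

```python
# Sketch: compose a query URL the way the new SearchEngine.search_url() does.
url = "https://duckduckgo.com/lite/"
query_pattern = "?q={}"
search_term = "gender queer Paris".replace(' ', '+')

print(url + query_pattern.format(search_term))
# -> https://duckduckgo.com/lite/?q=gender+queer+Paris
```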

6094
data/user-agent.json Normal file

File diff suppressed because it is too large

93
data/website.json Normal file
View file

@ -0,0 +1,93 @@
[
{
"name":"emmaclit",
"url":"https://emmaclit.com/",
"keywords": [
{"keyword":"Charge mentale"},
{"keyword":"Un autre regard"},
{"keyword":"Un petit poutou"},
{"keyword":"solidarité"},
{"keyword":"dédicace"}
],
"notable_pages": [
{"webpage": "https://emmaclit.com/2017/05/09/repartition-des-taches-hommes-femmes/"},
{"webpage": "https://emmaclit.com/2016/12/01/une-famille-parmi-dautres/"},
{"webpage": "https://emmaclit.com/2017/09/11/travaille-pourquoi/"}
]
},
{
"name":"paris-luttes info",
"url":"https://paris-luttes.info/",
"keywords": [
{"keyword":"manifestations"},
{"keyword":"solidarité immigré·e·s"},
{"keyword":"grève salariés"},
{"keyword":"prison"},
{"keyword":"violence policère"}
],
"notable_pages": [
{"webpage": "https://paris-luttes.info/-analyse-et-reflexion-?lang=fr"},
{"webpage": "https://paris-luttes.info/comment-publier-sur-paris-luttes-134?lang=fr"},
{"webpage": "https://paris-luttes.info/pourquoi-et-comment-utiliser-tor-9013?lang=fr"}
]
},
{
"name":"zad nadir",
"url":"http://zad.nadir.org/",
"keywords": [
{"keyword":"Écologie"},
{"keyword":"opération césar"},
{"keyword":"expulsion vinci"},
{"keyword":"adresse"},
{"keyword":"la wardine"},
{"keyword":"route des chicanes"},
{"keyword":"opposition à l'aéroport Grand Ouest"}
],
"notable_pages": [
{"webpage": "http://zad.nadir.org/spip.php?article86&lang=fr"},
{"webpage": "http://zad.nadir.org/spip.php?article515&lang=fr"},
{"webpage": "http://zad.nadir.org/spip.php?rubrique71"},
{"webpage": "https://zad.nadir.org/spip.php?rubrique70"}
]
},
{
"name":"Fnac",
"url":"https://www.fnac.com/",
"keywords": [
{"keyword":"smartphone"},
{"keyword":"SAV"},
{"keyword":"Macbook"},
{"keyword":"TV"},
{"keyword":"PC Gaming"},
{"keyword":"DVD"},
{"keyword":"Home Cinema Philips"},
{"keyword":"Billeterie"}
],
"notable_pages": [
{"webpage": "https://www.fnac.com/Informatique/shi48966/w-4#bl=MMinfo"},
{"webpage": "https://www.fnac.com/Service/default.aspx#bl=footer"},
{"webpage": "https://www.fnac.com/Ventes-Flash/shi42245/w-4#bl=marktlink1"},
{"webpage": "https://www.fnac.com/Home-cinema-barre-de-son-et-enceinte-TV/shi474916/w-4#bl=MMtvh"}
]
},
{
"name":"Sea Shepherd",
"url":"https://www.seashepherd.fr/",
"keywords": [
{"keyword":"pirates"},
{"keyword":"Phoques"},
{"keyword":"Paul Watson"},
{"keyword":"harponnage"},
{"keyword":"seal"},
{"keyword":"Chasse aux dauphins"},
{"keyword":"participation"},
{"keyword":"boutique"}
],
"notable_pages": [
{"webpage": "http://www.seashepherd.fr/index.php/qui-sommes-nous"},
{"webpage": "http://nyamba.seashepherd.info/"},
{"webpage": "http://seashepherd-shop.com/en/"},
{"webpage": "http://seashepherd.fr/index.php/qui-sommes-nous/sea-shepherd-france"}
]
}
]

View file

@ -0,0 +1,16 @@
from django.core.management.base import BaseCommand
from profiles import models as profiles
from histories.models import generate_history
from datetime import datetime
class Command(BaseCommand):
''' Generates a history and prints the related XML '''
def add_arguments(self, parser):
pass
def handle(self, *args, **kwargs):
prof = profiles.Profile.objects.all()[0]
history = generate_history(prof, datetime.now())
print(history.to_xml_string())
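
This command is meant to be run like the other management commands listed in populate.sh below, i.e. `python3 manage.py gen_history`. The equivalent programmatic call (e.g. from a test) would be a sketch along these lines:

```python
# Hedged sketch: invoke the command programmatically (assumes a populated
# database, e.g. after running populate.sh, so at least one Profile exists).
from django.core.management import call_command

call_command("gen_history")  # generates a history and prints its XML
```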

View file

@ -0,0 +1,34 @@
# Generated by Django 2.0.1 on 2018-02-25 19:08
from django.db import migrations, models
import django.db.models.deletion
class Migration(migrations.Migration):
initial = True
dependencies = [
('profiles', '0001_initial'),
]
operations = [
migrations.CreateModel(
name='History',
fields=[
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('start_ts', models.DateTimeField(help_text='The starting timestamp of the history. Useful for cron-like structure.')),
('played', models.BooleanField(default=False)),
('user', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='profiles.Profile')),
],
),
migrations.CreateModel(
name='HistoryEntry',
fields=[
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('search', models.URLField(help_text='The url to be searched')),
('timestamp', models.DateTimeField()),
('history', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='histories.History')),
],
),
]

View file

@ -1,3 +1,276 @@
from django.db import models
""" Models for the history. This history should be able to generate history
entries, which looks like human-based browsing, according to a dedicated user
interests, keywords...
"""
# Create your models here.
from collections import namedtuple
import random
import asyncio
from math import floor
from xml.etree import ElementTree as ET
from datetime import datetime
from django.db import models
from django.core.exceptions import ValidationError
import profiles.models as profiles
from crawl import crawl
from pinocchio.settings import HISTORY_MIN
from .tor_runner import TorInstance
class InvalidXml(Exception):
def __init__(self, what='unexpected XML data.'):
super().__init__()
self.what = what
def __str__(self):
return "Invalid XML: " + self.what
class HistoryEntry(models.Model):
""" A history entry, aka a url, and a timestamp.
"""
search = models.URLField(help_text="The url to be searched")
timestamp = models.DateTimeField()
history = models.ForeignKey(
'History',
on_delete=models.CASCADE
)
def __str__(self):
""" Returns the string representation of a history entry.
"""
return "{} : {}".format(self.timestamp, self.search)
def to_xml(self, xml_root):
entry = ET.Element('history')
entry_url = ET.Element('url')
entry_url.text = str(self.search)
entry_ts = ET.Element('timestamp')
entry_ts.text = str(self.timestamp.timestamp())
entry.append(entry_url)
entry.append(entry_ts)
xml_root.append(entry)
@staticmethod
def from_xml(xml_root, in_history):
if xml_root.tag != 'history':
raise InvalidXml("expected <history> tag here.")
url, timestamp = None, None
for child in xml_root:
if child.tag == 'url':
url = child.text
elif child.tag == 'timestamp':
try:
timestamp = datetime.fromtimestamp(float(child.text))
except (TypeError, ValueError):
raise InvalidXml("invalid timestamp {}".format(child.text))
else:
raise InvalidXml("unknown tag {} as child of <history>".format(
child.tag))
output = HistoryEntry()
output.search = url
output.timestamp = timestamp
output.history = in_history
return output
class History(models.Model):
""" A history for a user, containing some web connections (http, https).
Each history is timed so as to mimic human behaviour. """
start_ts = models.DateTimeField(
help_text=('The starting timestamp of the history. Useful for '
'cron-like structure.')
)
played = models.BooleanField(default=False)
user = models.ForeignKey(
profiles.Profile,
on_delete=models.CASCADE
)
def return_history(self):
""" Returns the history, sorted by increasing timestamps
"""
output_history = self.historyentry_set.order_by('timestamp')
output_history = [(item.search, item.timestamp.date())
for item in output_history]
return output_history
def __str__(self):
""" Returns the string representation of a history.
"""
entries = self.historyentry_set.order_by('timestamp')
output = "[History]:\n"
for entry in entries:
output += str(entry) + '\n'
return output
async def _handler(self):
runner = await TorInstance.create(self.return_history(), self.user.browser_fingerprint.serialize_headers())
await runner.run()
self.played = True
self.save()
def play_histories(self):
""" Actually plays the history.
"""
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
loop.run_until_complete(asyncio.wait([self._handler()]))
def to_xml(self, xml_root=None):
''' Exports the current history to xml '''
standalone = False
if xml_root is None:
standalone = True
xml_root = ET.Element('root')
hist_node = ET.Element("history", attrib={
'start-ts': str(self.start_ts),
'played': '1' if self.played else '0',
'user': str(self.user.pk),
})
xml_root.append(hist_node)
for entry in self.historyentry_set.all():
entry.to_xml(hist_node)
if standalone:
return xml_root
def to_xml_string(self):
xml = self.to_xml()
return ET.tostring(xml)
@staticmethod
def from_xml(xml_root):
''' Loads a history from an XML tree '''
REQUIRED_ATTR = ['start-ts', 'played', 'user']
if xml_root.tag != 'history':
raise InvalidXml('unexpected node {} as root of an history'.format(
xml_root.tag))
for attr in REQUIRED_ATTR:
if attr not in xml_root.attrib:
raise InvalidXml(('missing attribute "{}" for tag of type '
'history').format(attr))
start_ts = xml_root.attrib['start-ts']
played = xml_root.attrib['played']
user_pk = xml_root.attrib['user']
users = profiles.Profile.objects.filter(pk=user_pk)
if len(users) != 1:
raise InvalidXml('primary key for Profile {} is invalid'.format(
user_pk))
output = History()
output.start_ts = start_ts
output.played = int(played) > 0
output.user = users[0]
for child in xml_root:
HistoryEntry.from_xml(child, output)
return output
PartialHistoryEntry = namedtuple('PartialHistoryEntry',
['url', 'timestamp'])
def generate_partial_history(user, t_start):
""" Generate the part of the history resulting from the crawl starting at
the given url.
"""
timestamp = t_start
result = []
basis = generate_first_url(user)
t_start += 5 * random.weibullvariate(1, 1.5)
crawler = crawl.CrawlingThread(basis)
crawler.start()
crawler.join()
urls_tree = crawler.output_tree
open_time = {}
for elem in urls_tree:
url, parent = elem.url, elem.parent
timestamp = 0
if parent is None:
timestamp = t_start
else:
timestamp = open_time[parent] + 5 * random.weibullvariate(1, 1.5)
open_time[elem] = timestamp
result.append(PartialHistoryEntry(url, timestamp))
return result
def generate_first_url(user):
""" Generate the first url of a partial history, based on the user
information. """
def nonempty(seq):
out = []
for elt in seq:
if elt:
out.append(elt)
return out
all_keywords = profiles.Keyword.objects.filter(
interest__profile__in=[user])
all_websites = profiles.Website.objects.filter(
interest__profile__in=[user])
all_places = profiles.Place.objects.filter(
interest__profile__in=[user])
all_events = profiles.Event.objects.filter(
interest__profile__in=[user])
interest = random.choice(nonempty([
all_keywords,
all_websites,
all_places,
all_events,
]))
search_term = random.choice(interest)
url = search_term.generate_url(user)
return url
def generate_history(user, start_time):
""" Generate a new history for the user `user`, starting from timestamp
`start_time`.
A few heuristics are used in order to give the impression that the history
is actually played by a user.
"""
# let's define a new history object.
history = History(start_ts=start_time, user=user)
length = HISTORY_MIN + floor(10 * random.weibullvariate(1, 1.5))
history.full_clean()
history.save()
current_timestamp = start_time.timestamp()
hist_size = 0
while hist_size < length:
current_timestamp += 5 * random.weibullvariate(1, 2.8)
history_list = generate_partial_history(user, current_timestamp)
current_timestamp = \
history_list[-1].timestamp + 5 * random.weibullvariate(1, 5)
for (url, timestamp) in history_list:
if len(url) < 200:
new_line = HistoryEntry(
search=url,
timestamp=datetime.fromtimestamp(timestamp),
history=history
)
try:
new_line.full_clean()
new_line.save()
hist_size += 1
except ValidationError:
continue
return history
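
Putting the pieces of this module together, a rough end-to-end sketch (assuming at least one Profile exists, e.g. one created via profiles.models.create_profile() shown later in this diff):

```python
# Rough usage sketch of the history API defined above.
from datetime import datetime

from profiles.models import Profile
from histories.models import generate_history

profile = Profile.objects.first()
history = generate_history(profile, datetime.now())

print(history.to_xml_string())  # XML export of the generated entries
# history.play_histories()      # would replay the history through Tor
```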

123
histories/tor_runner.py Normal file
View file

@ -0,0 +1,123 @@
"""
Module that handles Tor instance creation in order to safely run histories
"""
import shutil
import datetime as dt
from time import sleep
import asyncio
import aiohttp
from aiosocks.connector import ProxyConnector, ProxyClientRequest
import async_timeout
import stem.process as tor
class TorInstance():
"""
A tor instance object, with some useful information.
It is designed to be used as a worker in order to replay a history.
"""
BASE_SOCKS_PORT = 40000
BASE_CONTROL_PORT = 20000
BASE_DATA_DIR = "/tmp/tor{}/"
TOR_RUNNER = 0
@classmethod
async def create(cls, history, headers):
""" Factory creation of tor processes"""
socks_port = cls.BASE_SOCKS_PORT + cls.TOR_RUNNER
control_port = cls.BASE_CONTROL_PORT + cls.TOR_RUNNER
data_dir = cls.BASE_DATA_DIR.format(cls.TOR_RUNNER)
TorInstance.TOR_RUNNER += 1
self = TorInstance()
self.socks_port = socks_port
self.control_port = control_port
self.data_dir = data_dir
self.history = history
self.headers = headers
self.proxy = "socks5://127.0.0.1:{}".format(self.socks_port)
self.create_session()
self.process = tor.launch_tor_with_config(
config={
'ControlPort' : str(control_port),
'SocksPort' : str(socks_port),
'DataDir' : data_dir
}
)
return self
def __init__(self):
self.socks_port = 0
self.control_port = 0
self.data_dir = ""
self.history = None
self.proxy = ""
self.headers = {}
self.session = None
self.process = None
async def run(self):
""" Runs the Tor Instance on the history.
"""
while (self.history) and (dt.datetime.combine(self.history[0][1],
dt.datetime.min.time()) -
dt.datetime.now()).total_seconds() >= 10:
print("Sleeping")
sleep(10)
while self.history:
item = self.history.pop(0)
async with async_timeout.timeout(30):
await(self.query(item[0]))
now = dt.datetime.now()
if not self.history:  # last entry was just replayed, nothing left to pace
break
print(self.history[0])
if now <= dt.datetime.combine(self.history[0][1], dt.datetime.min.time()):
sleep((dt.datetime.combine(self.history[0][1], dt.datetime.min.time()) - now).total_seconds())
def create_session(self):
""" Create a aiohttp session.
"""
conn = ProxyConnector(remote_resolve=True)
self.session = aiohttp.ClientSession(
connector=conn,
headers=self.headers,
request_class=ProxyClientRequest
)
async def query(self, url):
""" Performs a query.
"""
async with async_timeout.timeout(30):
async with self.session.get(
url,
proxy=self.proxy,
proxy_auth=None) as resp:
try:
return await resp.text()
except UnicodeDecodeError:
return None
def __str__(self):
""" Utility function """
return ('[TOR] SOCKSPort: {0.socks_port}, ControlPort: '
'{0.control_port}, DataDir: {0.data_dir}'.format(self))
async def kill(self):
""" Kills the process and remove the data dir"""
self.process.kill()
self.session.close()
shutil.rmtree(self.data_dir)
async def main():
""" Test function """
for _ in range(3):
instance = await TorInstance.create(None, {"user-agent" : "Blah"})
await instance.query("https://python.org/")
print("One page received")
await instance.kill()
if __name__ == "__main__":
LOOP = asyncio.get_event_loop()
LOOP.run_until_complete(main())

View file

@ -13,6 +13,9 @@ https://docs.djangoproject.com/en/2.0/ref/settings/
import os
from .settings_local import BASE_DIR, DEBUG, SECRET_KEY, DATABASES
HISTORY_MIN = 25
ALLOWED_HOSTS = []
@ -26,6 +29,7 @@ INSTALLED_APPS = [
'django.contrib.messages',
'django.contrib.staticfiles',
'profiles',
'histories',
'crawl',
]
@ -93,7 +97,7 @@ USE_I18N = True
USE_L10N = True
USE_TZ = True
USE_TZ = False # We don't really care, we want POSIX timestamps
# Static files (CSS, JavaScript, Images)

10
populate.sh Normal file
View file

@ -0,0 +1,10 @@
#!/bin/bash
# -*- coding: UTF8 -*-
python3 manage.py import_browser_fp
python3 manage.py import_search_engine
python3 manage.py import_keywords
python3 manage.py import_website
python3 manage.py import_places
python3 manage.py import_events
python3 manage.py import_interests

View file

View file

View file

@ -0,0 +1,41 @@
""" Small module that import browser fingerprints into the databose,
based on the data listed in https://huit.re/user-agent-json.
"""
import json
from django.core.management.base import BaseCommand
from django.db import models
from profiles.models import BrowserFingerprint
def import_file(filename):
with open(filename, mode='r') as file:
data = json.load(file)
data = data[0]["list"]
for os_agent in data:
for useragent in os_agent["useragents"]:
import_useragent(useragent)
def import_useragent(useragent):
fingerprint = BrowserFingerprint(
description=useragent.get("description", ""),
useragent=useragent.get("useragent", ""),
appname=useragent.get("appname", ""),
appversion=useragent.get("appversion", ""),
platform=useragent.get("appversion", ""),
vendor=useragent.get("vendor", ""),
vendorsub=useragent.get("vendorsub", ""),
buildID=useragent.get("buildID", ""),
oscpu=useragent.get("oscpu", ""),
accept_encoding=useragent.get("accept_encoding", ""),
accept_default=useragent.get("accept_default", ""),
accept_lang=useragent.get("accept_lang", ""),
pixeldepth=int(useragent.get("pixeldepth", 0)),
colordepth=int(useragent.get("colordepth", 0)),
screens=useragent.get("screen", ""),
)
print(fingerprint)
fingerprint.save()
class Command(BaseCommand):
def handle(self, *args, **kwargs):
import_file("data/user-agent.json")

View file

@ -0,0 +1,41 @@
""" Small module that import events into the database.
"""
import json
from datetime import datetime
from django.core.management.base import BaseCommand
from django.db import models
from profiles.models import Place, Event
def import_file(filename):
with open(filename, mode='r') as file:
data = json.load(file)
for event in data:
import_event(event)
def import_place(_place):
place = Place(
name=_place.get("name", ""),
address=_place.get("address", ""),
lat=float(_place.get("lat", 0)),
lon=float(_place.get("lon", 0))
)
place.save()
return place
def import_event(_event):
if isinstance(_event["place"], str):
place = Place.objects.get(name=_event["place"])
else:
place = import_place(_event["place"])
event = Event(
name=_event.get("name", ""),
date=datetime.strptime(_event.get("date", "01/01/1970 00:00 UTC"), "%d/%m/%Y %H:%M %Z"),
place=place
)
#print(event)
event.save()
class Command(BaseCommand):
def handle(self, *args, **kwargs):
import_file("data/events.json")

View file

@ -0,0 +1,51 @@
""" Small module that import interests into the database.
"""
import json
from datetime import datetime
from django.core.management.base import BaseCommand
from django.db import models
from django.core.exceptions import ObjectDoesNotExist
from profiles.models import Keyword, Interest, Place, Website, Event
def import_file(filename):
with open(filename, mode='r') as file:
data = json.load(file)
for interest in data:
import_interest(interest)
def import_interest(_interest):
keywords = []
places = []
websites = []
for keyword in _interest.get("keywords", []):
try:
stored = Keyword.objects.get(text=keyword["keyword"])
keywords.append(stored)
except ObjectDoesNotExist:
new_keyword = Keyword(text=keyword["keyword"])
new_keyword.save()
keywords.append(new_keyword)
print("New keyword %s" % new_keyword)
for place in _interest.get("places", []):
places.append(Place.objects.get(name=place["place"]))
for website in _interest.get("websites", []):
websites.append(Website.objects.get(name=website["website"]))
interest = Interest(
name=_interest.get("name", ""),
)
interest.save()
for keyword in keywords:
print(keyword)
interest.keywords.add(keyword)
for place in places:
interest.places.add(place)
for website in websites:
interest.websites.add(website)
interest.save()
class Command(BaseCommand):
def handle(self, *args, **kwargs):
import_file("data/interests.json")

View file

@ -0,0 +1,20 @@
""" Small module that import browser fingerprints into the databose,
based on the data listed in https://huit.re/user-agent-json.
"""
import json
from django.core.management.base import BaseCommand
from django.db import models
from profiles.models import Keyword
def import_file(filename):
with open(filename, mode='r') as file:
data = json.load(file)
for _keyword in data["list"]:
keyword = Keyword(text=_keyword.get("keyword", ""))
keyword.save()
class Command(BaseCommand):
def handle(self, *args, **kwargs):
import_file("data/keywords.json")

View file

@ -0,0 +1,27 @@
""" Small module that import browser fingerprints into the databose,
based on the data listed in https://huit.re/user-agent-json.
"""
import json
from django.core.management.base import BaseCommand
from django.db import models
from profiles.models import Place
def import_file(filename):
with open(filename, mode='r') as file:
data = json.load(file)
for place in data:
import_place(place["place"])
def import_place(_place):
place = Place(
name=_place.get("name", ""),
address=_place.get("address", ""),
lat=float(_place.get("lat", 0)),
lon=float(_place.get("lon", 0))
)
place.save()
class Command(BaseCommand):
def handle(self, *args, **kwargs):
import_file("data/place.json")

View file

@ -0,0 +1,27 @@
""" Small module that import browser fingerprints into the databose,
based on the data listed in https://huit.re/user-agent-json.
"""
import json
from django.core.management.base import BaseCommand
from django.db import models
from profiles.models import SearchEngine
def import_file(filename):
with open(filename, mode='r') as file:
data = json.load(file)
for search_engine in data:
import_search_engine(search_engine["searchengine"])
def import_search_engine(engine):
search_engine = SearchEngine(
name=engine.get("name", ""),
url=engine.get("url", ""),
query_pattern=engine.get("query_pattern", "")
)
#print(search_engine)
search_engine.save()
class Command(BaseCommand):
def handle(self, *args, **kwargs):
import_file("data/search_engine.json")

View file

@ -0,0 +1,46 @@
""" Small module that import events into the database.
"""
import json
from datetime import datetime
from django.core.management.base import BaseCommand
from django.db import models
from profiles.models import Webpage, Website, Keyword
def import_file(filename):
with open(filename, mode='r') as file:
data = json.load(file)
for website in data:
import_website(website)
def import_website(_website):
keywords = []
webpages = []
for keyword in _website.get("keywords", []):
new_keyword = Keyword(
text=keyword.get("keyword", "")
)
new_keyword.save()
keywords.append(new_keyword)
for webpage in _website.get("notable_pages",[]):
new_webpage = Webpage(
url=webpage.get("keyword", "")
)
new_webpage.save()
webpages.append(new_webpage)
website = Website(
name=_website.get("name", ""),
url=_website.get("url", ""),
)
website.save()
for keyword in keywords:
website.keywords.add(keyword)
for webpage in webpages:
website.notable_pages.add(webpage)
print(website)
#website.save()
class Command(BaseCommand):
def handle(self, *args, **kwargs):
import_file("data/website.json")

View file

@ -1,5 +1,46 @@
"""
A django module that defines a profile, and all the information that can be
stored in a profile.
It stores interests, technical information such as the browser fingerprint,
the preferred search engine, and whether the user is likely to use urls
directly or to type queries into the search engine.
"""
import os
import random
from django.db import models
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
NICKNAMES = None
LASTNAMES = open(BASE_DIR + "/data/lastnames.txt").read().splitlines()
FIRSTNAMES = open(BASE_DIR + "/data/firstnames.txt").read().splitlines()
EMAIL_DOMAINS = open(BASE_DIR + "/data/email_domains.txt").read().splitlines()
def require_nicknames(fct):
def read_file(path):
global NICKNAMES
print("Trying {}".format(path))
with open(path, 'r') as handle:
NICKNAMES = handle.read().splitlines()
nicknames_files = [
os.path.join(BASE_DIR, 'data/nicknames_dict'),
"/usr/share/dict/american-english",
]
if NICKNAMES is None:
for nick_file in nicknames_files:
try:
read_file(nick_file)
break
except FileNotFoundError:
pass
if NICKNAMES is None:
raise FileNotFoundError
return fct
class InvalidData(Exception):
''' Thrown when the DB contains invalid data, and cannot perform
@ -21,8 +62,14 @@ class Keyword(models.Model):
def __str__(self):
return self.text
def generate_url(self, user):
""" Generates the url for a keyword, based on the user search engine.
"""
return user.search_engine.search_url(self)
class Webpage(models.Model):
''' A webpage url '''
url = models.URLField()
def __str__(self):
@ -40,6 +87,22 @@ class Website(models.Model):
def __str__(self):
return self.name
def generate_url(self, user):
""" Generates the url in case the interest chosen is a website.
"""
rand = random.random()
if user.uses_urls:
url = self.url
elif rand <= 0.1:
url = random.choice(self.notable_pages.all()).url
elif rand <= 0.8:
search_term_text = self.name + " " + \
str(random.choice(self.keywords.all()))
url = user.search_engine.search_url(search_term_text)
else:
url = user.search_engine.search_url(self.name)
return url
class Place(models.Model):
''' A real-life place '''
@ -52,6 +115,16 @@ class Place(models.Model):
def __str__(self):
return self.name
def generate_url(self, user):
""" Generates the url for a place.
"""
rand = random.random()
if rand < 1/2:
url = user.search_engine.search_url(self.name)
else:
url = user.search_engine.search_url(self.address)
return url
class Event(models.Model):
''' A real-life event (protests, meeting, ...) '''
@ -63,6 +136,15 @@ class Event(models.Model):
def __str__(self):
return self.name
def generate_url(self, user):
""" generate the url for an event object.
"""
possibilities = random.sample(
[self.name, str(self.date), str(self.place)],  # cast to str so ' '.join() works
3
)
return user.search_engine.search_url(" ".join(possibilities))
class BrowserFingerprint(models.Model):
''' A browser fingerprint, containing things like a user agent '''
@ -86,6 +168,15 @@ class BrowserFingerprint(models.Model):
def __str__(self):
return self.description
def serialize_headers(self):
return {
"Description": str(self.description),
"User-Agent": str(self.useragent),
"Accept-Encoding": str(self.accept_encoding),
"Accept": str(self.accept_default),
"Accept-Language": str(self.accept_lang),
}
class SearchEngine(models.Model):
''' A search engine, and all the data needed to use it '''
@ -94,8 +185,8 @@ class SearchEngine(models.Model):
url = models.URLField()
query_pattern = models.CharField(max_length=256) # This field is the
# query pattern. It should contain a `{}`, which, when substituted with a
# search term (using `.format()`), must yield a URL that can be resolved to
# perform the search
# search term (using `.format()`), must yield a URL tail that can be
# concatenated with `url` to perform a search (eg. `?q={}` for ddg).
def __str__(self):
return self.name
@ -103,9 +194,10 @@ class SearchEngine(models.Model):
def search_url(self, search_term):
''' Obtain a url to search `search_term` with this search engine '''
pattern = str(self.query_pattern)
search_term = str(search_term).replace(' ', '+')
if '{}' not in pattern:
raise InvalidData("Search engine {}: bad pattern".format(self))
return str(self.query_pattern).format(search_term)
return self.url + (str(self.query_pattern).format(search_term))
class Interest(models.Model):
@ -139,3 +231,35 @@ class Profile(models.Model):
on_delete=models.CASCADE)
browser_fingerprint = models.ForeignKey(BrowserFingerprint,
on_delete=models.CASCADE)
def generate_email(nick, first_name, last_name):
domain = random.choice(EMAIL_DOMAINS)
if random.random() < 0.3:
email = first_name + "." + last_name + "@" + domain
else:
email = nick + "@" + domain
return email
@require_nicknames
def create_profile(nick=None):
nick = "".join(random.sample(NICKNAMES, random.randrange(2, 5)))
first_name = random.choice(FIRSTNAMES)
last_name = random.choice(LASTNAMES)
email = generate_email(nick, first_name, last_name)
profile = Profile(
nick=nick,
first_name=first_name,
last_name=last_name,
email=email,
uses_urls=(random.random() < 0.5),
)
profile.search_engine = random.choice(SearchEngine.objects.all())
profile.browser_fingerprint = random.choice(BrowserFingerprint.objects.all())
profile.full_clean()
profile.save()
profile.interests.add(random.choice(Interest.objects.all()))
profile.save()
return profile
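
A short sketch of how the new profile helpers are meant to be used (assumes the import_* commands above / populate.sh have filled the database and a nicknames dictionary is available):

```python
# Sketch: create a random profile and derive one search URL from a keyword.
from profiles.models import create_profile, Keyword

profile = create_profile()
keyword = Keyword.objects.first()
if keyword is not None:
    # e.g. "https://duckduckgo.com/lite/?q=..." depending on the profile's
    # search engine
    print(keyword.generate_url(profile))
```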

View file

@ -1,5 +1,5 @@
aiodns==1.1.1
aiohttp==3.0.1
aiohttp==2.3.2
async-timeout==2.0.0
attrs==17.4.0
cchardet==2.1.1
@ -12,5 +12,8 @@ pycares==2.3.0
pytz==2017.3
yarl==1.1.1
beautifulsoup4==4.6.0
stem==1.6.0
pycurl==7.43.0.1
rdflib==4.2.2
git+https://github.com/tobast/RDFSerializer.git
aiosocks==0.2.6