Compare commits


No commits in common. "master" and "rdf" have entirely different histories.
master ... rdf

30 changed files with 81 additions and 7777 deletions

@@ -1,6 +1,3 @@
# mpri-webdam
Generate realistic fake browsing histories for borderline and/or activist
users, to hide real traffic from global surveillance.
Lacks proper documentation at the moment `:(`
Generate loads of fake histories. Because this course has to be validated somehow.

@@ -4,7 +4,7 @@ from urllib.error import URLError
from urllib.parse import urlparse
from ssl import CertificateError
from random import sample, randrange, randint
from random import sample, randrange
import re
from datetime import datetime, timedelta
@@ -14,8 +14,6 @@ import async_timeout
from bs4 import BeautifulSoup, Comment
from profiles.models import BrowserFingerprint, SearchEngine
# Ugly hack to use this module alone instead of integrating it with Django
# from django.conf import settings
@@ -27,11 +25,11 @@ MAX_PER_PAGE = 10
FOOTER_URL = re.compile(".*footer.*")
class Settings:
USER_AGENT = 'Default User'
USER_AGENT = 'Blah'
settings = Settings()
startup_time = datetime.now()
def url_getter(html, current_page, root_url):
@@ -62,7 +60,7 @@ def url_getter(html, current_page, root_url):
elif link.startswith('/'): #Internal link, linking to page root url
links_list.append(root_url + link)
elif link.startswith("#"):
continue
print("Invalid link : internal bookmark")
else:
links_list.append(current_page + "/" + link)
@@ -73,14 +71,11 @@ def url_getter(html, current_page, root_url):
# Works only with python >= 3.6
links_list = list(dict.fromkeys(links_list))
forbidden_words = ['login', 'agreement', 'mailto', 'settings']
links_list = [link for link in links_list if not any(word in link.lower()
for word in
forbidden_words)]
return links_list
class WebsiteSchedulerMeta(type):
""" Meta-class for WebsiteScheduler, allowing a singleton class-like
interface, but spawning one instance per canonical website URL """
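The diff does not show the metaclass body, so here is a minimal sketch of the per-canonical-URL singleton pattern the docstring describes; the class names, the _instances cache and the urlparse-based key are assumptions, not the project's actual code.

from urllib.parse import urlparse

class PerSiteSingletonMeta(type):
    """ One instance per canonical website root; later calls reuse it. """
    _instances = {}  # canonical root -> instance (assumed cache)

    def __call__(cls, url, *args, **kwargs):
        parsed = urlparse(url)
        key = "{}://{}/".format(parsed.scheme, parsed.netloc)  # canonical root
        if key not in cls._instances:
            cls._instances[key] = super().__call__(url, *args, **kwargs)
        return cls._instances[key]

class Site(metaclass=PerSiteSingletonMeta):
    def __init__(self, url):
        self.url = url

assert Site("https://example.org/a") is Site("https://example.org/b")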
@@ -103,47 +98,34 @@ class WebsiteSchedulerMeta(type):
class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
""" Schedule the accesses to a website as of robots.txt """
search_engines = [] # Must be set by CrawlingThread.__init__
def __init__(self, name, user_agent):
def __init__(self, name):
self.name = name
self.last_crawled = datetime.fromtimestamp(0)
self.dead = False
self.can_fetch_b = False
self.user_agent = (user_agent if user_agent is not None
else settings.USER_AGENT)
if any(self.urlroot() in item for item in self.search_engines):
print("found a search engine for %s" % self.urlroot())
self.crawl_delay = timedelta(seconds=5)
self.can_fetch_b = True
else:
try:
robots_url = self.urlroot() + 'robots.txt'
self.robot_parser = RobotFileParser(robots_url)
self.robot_parser.read() # TODO async?
except (URLError, CertificateError):
try:
robots_url = self.urlroot() + 'robots.txt'
robots_url = self.unsafe_urlroot() + 'robots.txt'
self.robot_parser = RobotFileParser(robots_url)
self.robot_parser.read() # TODO async?
except (URLError, CertificateError):
try:
robots_url = self.unsafe_urlroot() + 'robots.txt'
self.robot_parser = RobotFileParser(robots_url)
self.robot_parser.read()
except URLError: # Almost surely an offline website.
self.dead = True
self.crawl_delay = 0
except Exception as e:
print(e)
raise e
if not self.robot_parser.default_entry:
self.robot_parser.read()
except URLError: # Almost surely an offline website.
self.dead = True
if not self.dead:
delay = self.robot_parser.crawl_delay(self.user_agent)
if delay is None:
req_rate = self.robot_parser.request_rate(self.user_agent)
if req_rate is None:
delay = 5
else:
delay = req_rate.requests, req_rate.seconds
self.crawl_delay = timedelta(seconds=delay)
self.crawl_delay = 0
except Exception as e:
print(e)
raise e
if not self.dead:
delay = self.robot_parser.crawl_delay(settings.USER_AGENT)
if delay is None:
req_rate = self.robot_parser.request_rate(settings.USER_AGENT)
if req_rate is None:
delay = 5
else:
delay = req_rate.requests, req_rate.seconds
self.crawl_delay = timedelta(seconds=delay)
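The block above derives a per-site delay from robots.txt: Crawl-delay first, then Request-rate, then a 5-second default. As written, `delay = req_rate.requests, req_rate.seconds` builds a tuple, which `timedelta(seconds=...)` would reject; the sketch below assumes the intent was seconds per request and is not the project's code.

from datetime import timedelta
from urllib.robotparser import RobotFileParser

def crawl_delay_for(robots_url, user_agent, default=5):
    """ Hedged helper: per-site crawl delay derived from robots.txt. """
    parser = RobotFileParser(robots_url)
    parser.read()
    delay = parser.crawl_delay(user_agent)
    if delay is None:
        rate = parser.request_rate(user_agent)  # e.g. "Request-rate: 1/10"
        delay = default if rate is None else rate.seconds / rate.requests
    return timedelta(seconds=delay)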
def urlroot(self):
''' Get the root url for this website '''
@@ -161,9 +143,7 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
def can_fetch(self, url):
''' Check whether this program can fetch a given page '''
return ((self.can_fetch_b)
or ((not self.dead) and
self.robot_parser.can_fetch(self.user_agent, url)))
return (not self.dead) and self.robot_parser.can_fetch(settings.USER_AGENT, url)
def fetching(self):
''' Tell the scheduler that a page is being fetched now '''
@@ -174,47 +154,30 @@ class CrawlingThread(Thread):
""" A separate thread for the crawling task. This is needed to use asyncio,
since the thread will need its own event loop. """
def __init__(self, url):
engine_list = [engine.url for engine in SearchEngine.objects.all()]
WebsiteScheduler.search_engines = engine_list
nb_fingerprint = len(BrowserFingerprint.objects.all())
fingerprint = BrowserFingerprint.objects.all()[
randint(0, nb_fingerprint - 1)]
self.headers = fingerprint.serialize_headers()
self.output_tree = []
def __init__(self):
super(CrawlingThread, self).__init__()
self.url = url
def run(self):
tasks = []
#tasks.append(async_crawler("http://plus.google.com/+Python"))
#tasks.append(async_crawler('https://python.org/'))
tasks.append(run_crawl(self.url, self.output_tree, self.headers))
tasks.append(async_print('https://python.org/'))
try:
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
loop.run_until_complete(asyncio.wait(tasks))
finally:
loop.close()
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
loop.run_until_complete(asyncio.wait(tasks))
loop.close()
class PageGetter:
""" Asynchronously get a webpage, abiding by robots.txt """
headers = None
def __init__(self, session, url, user_agent):
def __init__(self, session, url):
self.url = url
self.session = session
self.user_agent = user_agent
async def get(self, ssl=True):
""" Actually retrieve the webpage """
scheduler = WebsiteScheduler(self.url, self.user_agent)
scheduler = WebsiteScheduler(self.url)
if not scheduler.can_fetch(self.url):
return None
@@ -224,7 +187,7 @@ class PageGetter:
delay = scheduler.fetch_delay()
scheduler.fetching()
async with async_timeout.timeout(10):
async with self.session.get(self.url, verify_ssl=ssl) as resp:
async with self.session.get(self.url, ssl=ssl) as resp:
try:
return await resp.text()
except UnicodeDecodeError:
@@ -234,89 +197,44 @@ class PageGetter:
async def async_print(url):
""" Debug function to follow what's actually happening """
async with aiohttp.ClientSession() as session:
html = await PageGetter(session, url,
settings.USER_AGENT).get(ssl=False)
html = await PageGetter(session, url).get(ssl=False)
print('GOT {}HTML for {}'.format(
print('GOT {}HTML for {} at {}'.format(
'None ' if html is None else '',
url,
))
datetime.now() - startup_time))
async def async_crawler(url):
queue = [url]
crawled = []
while queue or (len(crawled) < HARD_LIMIT):
async with aiohttp.ClientSession() as session:
try:
url = queue.pop(0)
except IndexError:
print("Error queue is empty")
return crawled
parsed_url = urlparse(url)
print("Crawling {}".format(url))
html = await PageGetter(session, url).get(ssl=False)
if html:
new_urls = url_getter(
html,
url,
parsed_url.scheme + "://" + parsed_url.netloc
)
crawled += [url]
if new_urls:
sampled = sample(
new_urls,
randrange(min(MAX_PER_PAGE, len(new_urls)))
)
queue += [sample_url for sample_url in sampled if
sample_url not in queue and sample_url not in
crawled]
print(crawled)
class CrawlElem:
''' Describes a crawled element, to be assembled into a tree '''
def __init__(self, url, parent):
self.url = url
self.parent = parent
async def run_crawl(url, output_tree, headers=None):
''' Starts a crawling session '''
if headers is None:
headers = {}
if 'User-Agent' not in headers:
headers['User-Agent'] = settings.USER_AGENT
user_agent = headers['User-Agent']
crawled = set()
async with aiohttp.ClientSession(headers=headers) as session:
await async_crawler(
url, output_tree, crawled, user_agent, session, None)
def simplify_url(url):
anchor = url.find('#')
if anchor >= 0:
url = url[:anchor]
prot = url.find('://')
if prot >= 0:
url = url[prot+3:]
if url.startswith('www.'):
url = url[4:]
return url
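simplify_url is the deduplication key for the `crawled` set: the anchor, the scheme and a leading "www." are stripped. A quick sanity check with illustrative URLs:

assert simplify_url('https://www.example.org/page#section') == 'example.org/page'
assert simplify_url('http://example.org/a') == 'example.org/a'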
async def async_crawler(url, out_tree, crawled, user_agent, session, parent):
if len(crawled) >= HARD_LIMIT:
return
crawled.add(simplify_url(url))
parsed_url = urlparse(url)
print("Crawling {}".format(url))
try:
with async_timeout.timeout(3):
html = await PageGetter(session, url, user_agent).get(ssl=False)
except asyncio.TimeoutError:
return
new_tasks = []
if html:
this_elem = CrawlElem(url, parent)
out_tree.append(this_elem)
new_urls = url_getter(
html,
url,
parsed_url.scheme + "://" + parsed_url.netloc
)
if new_urls:
sampled = sample(
new_urls,
randrange(min(MAX_PER_PAGE, len(new_urls)))
)
for sample_url in sampled:
if simplify_url(sample_url) not in crawled:
new_tasks.append(async_crawler(
sample_url, out_tree, crawled, user_agent, session,
this_elem))
else:
print("No html received")
if len(crawled) >= HARD_LIMIT:
return
if new_tasks:
await asyncio.wait(new_tasks)
if __name__ == '__main__':
crawl = CrawlingThread()
crawl.start()
crawl.join()
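A hedged usage sketch of the threaded crawler, mirroring how histories/models.py drives it further down; it assumes the url-taking __init__ from one side of the diff, the URL is illustrative, and the constructor touches the Django models, so it only runs inside the project.

crawler = CrawlingThread('https://python.org/')   # assumed url-taking variant
crawler.start()
crawler.join()                                    # wait for the crawl's event loop to finish
for elem in crawler.output_tree:                  # CrawlElem(url, parent) nodes
    print(elem.url, '<-', elem.parent.url if elem.parent else 'root')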

data/.gitignore
@@ -1 +0,0 @@
nicknames_dict

@@ -1,14 +0,0 @@
gmail.com
protonmail.com
riseup.net
tutanoto.com
outlook.fr
fastmail.com
yandex.com
aim.com
icloud.com
yahoo.com
fmx.fr
mail.com
hushmail.com
inbox.com

@@ -1,27 +0,0 @@
[
{
"name" : "Atelier Anti-Pub",
"date" : "07/03/2018 19:00 UTC",
"place" : {
"name" : "Centre Social Autogéré Vaydom",
"address" : "37 rue Marceau, Ivry-sur-Seine",
"lat" : "48.81787",
"lon" : "2.38032"
}
},
{
"name" : "Rassemblement de soutien pour Bure",
"date" : "27/02/2018 17:00 UTC",
"place" : {
"name" : "Place Saint-Michel",
"address" : "Place Saint-Michel, 75005 Paris",
"lat" : "48.85374",
"lon" : "2.34455"
}
},
{
"name" : "Création d'un serveur mail",
"date" : "15/02/2018 12:00 UTC",
"place" : "La Mutinerie"
}
]

@@ -1,200 +0,0 @@
Jean
Marie
Philippe
Nathalie
Michel
Isabelle
Alain
Sylvie
Patrick
Catherine
Nicolas
Martine
Christophe
Christine
Pierre
Françoise
Christian
Valérie
Éric
Sandrine
Frédéric
Stéphanie
Laurent
Véronique
Stéphane
Sophie
David
Céline
Pascal
Chantal
Daniel
Patricia
Alexandre
Anne
Julien
Brigitte
Thierry
Julie
Olivier
Monique
Bernard
Aurélie
Thomas
Nicole
Sébastien
Laurence
Gérard
Annie
Didier
Émilie
Dominique
Dominique
Vincent
Virginie
François
Corinne
Bruno
Élodie
Guillaume
Christelle
Jérôme
Camille
Jacques
Caroline
Marc
Léa
Maxime
Sarah
Romain
Florence
Claude
Laetitia
Antoine
Audrey
Franck
Hélène
Jean-Pierre
Laura
Anthony
Manon
Kévin
Michèle
Gilles
Cécile
Cédric
Christiane
Serge
Béatrice
André
Claire
Mathieu
Nadine
Benjamin
Delphine
Patrice
Pauline
Fabrice
Karine
Joël
Mélanie
Jérémy
Marion
Clément
Chloe
Arnaud
Jacqueline
Denis
Elisabeth
Paul
Evelyne
Lucas
Marine
Hervé
Claudine
Jean-Claude
Anais
Sylvain
Lucie
Yves
Danielle
Ludovic
Carole
Guy
Fabienne
Florian
Mathilde
Damien
Sandra
Alexis
Pascale
Mickaël
Annick
Quentin
Charlotte
Emmanuel
Emma
Louis
Severine
Benoît
Sabrina
Jean-Luc
Amandine
Fabien
Myriam
Francis
Jocelyne
Hugo
Alexandra
Jonathan
Angelique
Loïc
Josiane
Xavier
Joelle
Théo
Agnes
Adrien
Mireille
Raphaël
Vanessa
Jean-François
Justine
Grégory
Sonia
Robert
Bernadette
Michaël
Emmanuelle
Valentin
Oceane
Cyril
Amelie
Jean-Marc
Clara
René
Maryse
Lionel
Anne-marie
Yannick
Fanny
Enzo
Magali
Yannis
Marie-christine
Jean-Michel
Morgane
Baptiste
Ines
Matthieu
Nadia
Rémi
Muriel
Georges
Jessica
Aurélien
Laure
Nathan
Genevieve
Jean-Paul
Estelle

@@ -1,55 +0,0 @@
[
{
"name": "occupation",
"keywords": [
{"keyword" : "ZAD NDDL"},
{"keyword" : "Organiser un squat"},
{"keyword" : "mobilisation et rassemblement"}
],
"places": [
{"place" : "Zad NDDL"},
{"place" : "Zad Bure"}
],
"websites": [
{"website": "zad nadir"}
],
"events": [
{"event": "Rassemblement de soutien pour Bure"}
]
},
{
"name": "LGBT",
"keywords": [
{"keyword" : "Discrimniation sexistes, quelles actions ?"},
{"keyword" : "gender queer Paris"},
{"keyword" : "Existrans Paris"}
],
"places": [
{"place" : "La Mutinerie"}
],
"websites": [
{"website": "emmaclit"},
{"website": "paris-luttes info"}
],
"events": [
{"event": "Création d'un serveur mail"}
]
},
{
"name": "Anti pub",
"keywords": [
{"keyword" : "Affichage JCDecaux"},
{"keyword" : "Anti-pub"},
{"keyword" : "Journée contre la publicité"}
],
"places": [
{"place" : "Centre Social Autogéré Vaydom"}
],
"websites": [
{"website": "paris-luttes info"}
],
"events": [
{"event": "Atelier Anti-Pub"}
]
}
]

@@ -1,17 +0,0 @@
{
"list": [
{ "keyword" : "gender queer Paris"},
{"keyword" : "fabriquer masque manif"},
{"keyword" : "Se protéger en manif"},
{"keyword" : "Legal team manif France"},
{"keyword" : "Guide juridique GAV"},
{"keyword" : "Échec du capitaisme"},
{"keyword" : "Bienfait du communisme"},
{"keyword" : "Le comité invisible"},
{"keyword" : "À nos enfants"},
{"keyword" : "Squats sur Paris"},
{"keyword" : "Local facho à Strasbourg"},
{"keyword" : "Discrimation sexistes, quelles actions ?"},
{"keyword" : "Pourquoi la lutte des classes"}
]
}

@@ -1,200 +0,0 @@
Martin
Bernard
Thomas
Petit
Robert
Richard
Durand
Dubois
Moreau
Laurent
Simon
Michel
Lefebvre
Leroy
Roux
David
Bertrand
Morel
Fournier
Girard
Bonnet
Dupont
Lambert
Fontaine
Rousseau
Vincent
Muller
Lefevre
Faure
Andre
Mercier
Blanc
Guerin
Boyer
Garnier
Chevalier
Francois
Legrand
Gauthier
Garcia
Perrin
Robin
Clement
Morin
Nicolas
Henry
Roussel
Mathieu
Gautier
Masson
Marchand
Duval
Denis
Dumont
Marie
Lemaire
Noel
Meyer
Dufour
Meunier
Brun
Blanchard
Giraud
Joly
Riviere
Lucas
Brunet
Gaillard
Barbier
Arnaud
Martinez
Gerard
Roche
Renard
Schmitt
Roy
Leroux
Colin
Vidal
Caron
Picard
Roger
Fabre
Aubert
Lemoine
Renaud
Dumas
Lacroix
Olivier
Philippe
Bourgeois
Pierre
Benoit
Rey
Leclerc
Payet
Rolland
Leclercq
Guillaume
Lecomte
Lopez
Jean
Dupuy
Guillot
Hubert
Berger
Carpentier
Sanchez
Dupuis
Moulin
Louis
Deschamps
Huet
Vasseur
Perez
Boucher
Fleury
Royer
Klein
Jacquet
Adam
Paris
Poirier
Marty
Aubry
Guyot
Carre
Charles
Renault
Charpentier
Menard
Maillard
Baron
Bertin
Bailly
Herve
Schneider
Fernandez
Le Gall
Collet
Leger
Bouvier
Julien
Prevost
Millet
Perrot
Daniel
Le Roux
Cousin
Germain
Breton
Besson
Langlois
Remy
Le Goff
Pelletier
Leveque
Perrier
Leblanc
Barre
Lebrun
Marchal
Weber
Mallet
Hamon
Boulanger
Jacob
Monnier
Michaud
Rodriguez
Guichard
Gillet
Etienne
Grondin
Poulain
Tessier
Chevallier
Collin
Chauvin
Da Silva
Bouchet
Gay
Lemaitre
Benard
Marechal
Humbert
Reynaud
Antoine
Hoarau
Perret
Barthelemy
Cordier
Pichon
Lejeune
Gilbert
Lamy
Delaunay
Pasquier
Carlier
Laporte

@@ -1,26 +0,0 @@
[
{
"place" : {
"name" : "Zad NDDL",
"address" : "Notre-Dame-des-landes, 44111",
"lat" : "47.3435",
"lon": "-1.7367"
}
},
{
"place" : {
"name" : "La Mutinerie",
"address" : "176 - 178 rue Saint Martin, 75003 Paris",
"lat" : "48.8625665",
"lon": "2.3522237"
}
},
{
"place" : {
"name" : "Zad Bure",
"address" : "2 rue de l'Église, 55290 Bure",
"lat" : "48.502",
"lon": "5.351"
}
}
]

@@ -1,44 +0,0 @@
[
{
"searchengine": {
"name":"Google",
"url":"https://google.com/",
"query_pattern": "search?q={}"
}
},
{
"searchengine": {
"name":"Duckduckgo",
"url":"https://duckduckgo.com/",
"query_pattern":"?q={}"
}
},
{
"searchengine": {
"name":"Duckduckgo Lite",
"url":"https://duckduckgo.com/lite/",
"query_pattern":"?q={}"
}
},
{
"searchengine": {
"name":"Qwant",
"url":"https://www.qwant.com/",
"query_pattern":"?q={}"
}
},
{
"searchengine": {
"name":"Qwant lite",
"url":"https://lite.qwant.com/",
"query_pattern":"?q={}"
}
},
{
"searchengine": {
"name":"Framabee",
"url":"https://framabee.org/",
"query_pattern":"?q={}"
}
}
]

File diff suppressed because it is too large.

@@ -1,93 +0,0 @@
[
{
"name":"emmaclit",
"url":"https://emmaclit.com/",
"keywords": [
{"keyword":"Charge mentale"},
{"keyword":"Un autre regard"},
{"keyword":"Un petit poutou"},
{"keyword":"solidarité"},
{"keyword":"dédicace"}
],
"notable_pages": [
{"webpage": "https://emmaclit.com/2017/05/09/repartition-des-taches-hommes-femmes/"},
{"webpage": "https://emmaclit.com/2016/12/01/une-famille-parmi-dautres/"},
{"webpage": "https://emmaclit.com/2017/09/11/travaille-pourquoi/"}
]
},
{
"name":"paris-luttes info",
"url":"https://paris-luttes.info/",
"keywords": [
{"keyword":"manifestations"},
{"keyword":"solidarité immigré·e·s"},
{"keyword":"grève salariés"},
{"keyword":"prison"},
{"keyword":"violence policère"}
],
"notable_pages": [
{"webpage": "https://paris-luttes.info/-analyse-et-reflexion-?lang=fr"},
{"webpage": "https://paris-luttes.info/comment-publier-sur-paris-luttes-134?lang=fr"},
{"webpage": "https://paris-luttes.info/pourquoi-et-comment-utiliser-tor-9013?lang=fr"}
]
},
{
"name":"zad nadir",
"url":"http://zad.nadir.org/",
"keywords": [
{"keyword":"Écologie"},
{"keyword":"opération césar"},
{"keyword":"expulsion vinci"},
{"keyword":"adresse"},
{"keyword":"la wardine"},
{"keyword":"route des chicanes"},
{"keyword":"opposition à l'aéroport Grand Ouest"}
],
"notable_pages": [
{"webpage": "http://zad.nadir.org/spip.php?article86&lang=fr"},
{"webpage": "http://zad.nadir.org/spip.php?article515&lang=fr"},
{"webpage": "http://zad.nadir.org/spip.php?rubrique71"},
{"webpage": "https://zad.nadir.org/spip.php?rubrique70"}
]
},
{
"name":"Fnac",
"url":"https://www.fnac.com/",
"keywords": [
{"keyword":"smartphone"},
{"keyword":"SAV"},
{"keyword":"Macbook"},
{"keyword":"TV"},
{"keyword":"PC Gaming"},
{"keyword":"DVD"},
{"keyword":"Home Cinema Philips"},
{"keyword":"Billeterie"}
],
"notable_pages": [
{"webpage": "https://www.fnac.com/Informatique/shi48966/w-4#bl=MMinfo"},
{"webpage": "https://www.fnac.com/Service/default.aspx#bl=footer"},
{"webpage": "https://www.fnac.com/Ventes-Flash/shi42245/w-4#bl=marktlink1"},
{"webpage": "https://www.fnac.com/Home-cinema-barre-de-son-et-enceinte-TV/shi474916/w-4#bl=MMtvh"}
]
},
{
"name":"Sea Shepherd",
"url":"https://www.seashepherd.fr/",
"keywords": [
{"keyword":"pirates"},
{"keyword":"Phoques"},
{"keyword":"Paul Watson"},
{"keyword":"harponnage"},
{"keyword":"seal"},
{"keyword":"Chasse aux dauphins"},
{"keyword":"participation"},
{"keyword":"boutique"}
],
"notable_pages": [
{"webpage": "http://www.seashepherd.fr/index.php/qui-sommes-nous"},
{"webpage": "http://nyamba.seashepherd.info/"},
{"webpage": "http://seashepherd-shop.com/en/"},
{"webpage": "http://seashepherd.fr/index.php/qui-sommes-nous/sea-shepherd-france"}
]
}
]

@@ -1,16 +0,0 @@
from django.core.management.base import BaseCommand
from profiles import models as profiles
from histories.models import generate_history
from datetime import datetime
class Command(BaseCommand):
''' Generates a history and prints the related XML '''
def add_arguments(self, parser):
pass
def handle(self, *args, **kwargs):
prof = profiles.Profile.objects.all()[0]
history = generate_history(prof, datetime.now())
print(history.to_xml_string())

@@ -1,34 +0,0 @@
# Generated by Django 2.0.1 on 2018-02-25 19:08
from django.db import migrations, models
import django.db.models.deletion
class Migration(migrations.Migration):
initial = True
dependencies = [
('profiles', '0001_initial'),
]
operations = [
migrations.CreateModel(
name='History',
fields=[
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('start_ts', models.DateTimeField(help_text='The starting timestamp of the history. Useful for cron-like structure.')),
('played', models.BooleanField(default=False)),
('user', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='profiles.Profile')),
],
),
migrations.CreateModel(
name='HistoryEntry',
fields=[
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('search', models.URLField(help_text='The url to be searched')),
('timestamp', models.DateTimeField()),
('history', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='histories.History')),
],
),
]

@@ -1,276 +1,3 @@
""" Models for the history. This history should be able to generate history
entries, which looks like human-based browsing, according to a dedicated user
interests, keywords...
"""
from collections import namedtuple
import random
import asyncio
from math import floor
from xml.etree import ElementTree as ET
from datetime import datetime
from django.db import models
from django.core.exceptions import ValidationError
import profiles.models as profiles
from crawl import crawl
from pinocchio.settings import HISTORY_MIN
from .tor_runner import TorInstance
class InvalidXml(Exception):
def __init__(self, what='unexpected XML data.'):
super().__init__()
self.what = what
def __str__(self):
return "Invalid XML: " + self.what
class HistoryEntry(models.Model):
""" A history entry, aka a url, and a timestamp.
"""
search = models.URLField(help_text="The url to be searched")
timestamp = models.DateTimeField()
history = models.ForeignKey(
'History',
on_delete=models.CASCADE
)
def __str__(self):
""" Returns the string representation of a history entry.
"""
return "{} : {}".format(self.timestamp, self.search)
def to_xml(self, xml_root):
entry = ET.Element('history')
entry_url = ET.Element('url')
entry_url.text = str(self.search)
entry_ts = ET.Element('timestamp')
entry_ts.text = str(self.timestamp.timestamp())
entry.append(entry_url)
entry.append(entry_ts)
xml_root.append(entry)
@staticmethod
def from_xml(xml_root, in_history):
if xml_root.tag != 'history':
raise InvalidXml("expected <history> tag here.")
url, timestamp = None, None
for child in xml_root:
if child.tag == 'url':
url = child.text
elif child.tag == 'timestamp':
try:
timestamp = datetime.fromtimestamp(child.text)
except TypeError:
raise InvalidXml("invalid timestamp {}".format(child.text))
else:
raise InvalidXml("unknown tag {} as child of <history>".format(
child.tag))
output = HistoryEntry()
output.search = url
output.timestamp = timestamp
output.history = in_history
return output
class History(models.Model):
""" A history for a user, containing some web connections (http, https).
Each history is timed, in a human-behaviour manner. """
start_ts = models.DateTimeField(
help_text=('The starting timestamp of the history. Useful for '
'cron-like structure.')
)
played = models.BooleanField(default=False)
user = models.ForeignKey(
profiles.Profile,
on_delete=models.CASCADE
)
def return_history(self):
""" Returns the history, sorted by increasing timestamps
"""
output_history = self.historyentry_set.order_by('timestamp')
output_history = [(item.search, item.timestamp.date())
for item in output_history]
return output_history
def __str__(self):
""" Returns the string representation of a history.
"""
entries = self.historyentry_set.order_by('timestamp')
output = "[History]:\n"
for entry in entries:
output += str(entry) + '\n'
return output
async def _handler(self):
runner = await TorInstance.create(self.return_history(), self.user.browser_fingerprint.serialize_headers())
await runner.run()
self.played = True
self.save()
def play_histories(self):
""" Actually plays the history.
"""
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
loop.run_until_complete(asyncio.wait([self._handler()]))
def to_xml(self, xml_root=None):
''' Exports the current history to xml '''
standalone = False
if xml_root is None:
standalone = True
xml_root = ET.Element('root')
hist_node = ET.Element("history", attrib={
'start-ts': str(self.start_ts),
'played': '1' if self.played else '0',
'user': str(self.user.pk),
})
xml_root.append(hist_node)
for entry in self.historyentry_set.all():
entry.to_xml(hist_node)
if standalone:
return xml_root
def to_xml_string(self):
xml = self.to_xml()
return ET.tostring(xml)
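Taken together, History.to_xml wraps each entry emitted by HistoryEntry.to_xml, so to_xml_string() produces XML of roughly this shape (attribute and text values are illustrative, not real data):

# <root>
#   <history start-ts="2018-02-25 19:08:00" played="0" user="1">
#     <history>
#       <url>https://duckduckgo.com/?q=ZAD+NDDL</url>
#       <timestamp>1519585680.0</timestamp>
#     </history>
#   </history>
# </root>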
@staticmethod
def from_xml(xml_root):
''' Loads a history from an XML file '''
REQUIRED_ATTR = ['start-ts', 'played', 'user']
if xml_root.tag != 'history':
raise InvalidXml('unexpected node {} as root of an history'.format(
xml_root.tag))
for attr in REQUIRED_ATTR:
if attr not in xml_root.attrib:
raise InvalidXml(('missing attribute "{}" for tag of type '
'history').format(attr))
start_ts = xml_root.attrib['start-ts']
played = xml_root.attrib['played']
user_pk = xml_root.attrib['user']
users = profiles.Profile.objects.filter(pk=user_pk)
if len(users) != 1:
raise InvalidXml('primary key for History {} is invalid'.format(
user_pk))
output = History()
output.start_ts = start_ts
output.played = played > 0
output.user = users[0]
for child in xml_root:
HistoryEntry.from_xml(child, output)
return output
PartialHistoryEntry = namedtuple('PartialHistoryEntry',
['url', 'timestamp'])
def generate_partial_history(user, t_start):
""" Generate the part of the history resulting from the crawl starting at
the given url.
"""
timestamp = t_start
result = []
basis = generate_first_url(user)
t_start += 5 * random.weibullvariate(1, 1.5)
crawler = crawl.CrawlingThread(basis)
crawler.start()
crawler.join()
urls_tree = crawler.output_tree
open_time = {}
for elem in urls_tree:
url, parent = elem.url, elem.parent
timestamp = 0
if parent is None:
timestamp = t_start
else:
timestamp = open_time[parent] + 5 * random.weibullvariate(1, 1.5)
open_time[elem] = timestamp
result.append(PartialHistoryEntry(url, timestamp))
return result
def generate_first_url(user):
""" Generate the first url of a partial history, based on the user
information. """
def nonempty(seq):
out = []
for elt in seq:
if elt:
out.append(elt)
return out
all_keywords = profiles.Keyword.objects.filter(
interest__profile__in=[user])
all_websites = profiles.Website.objects.filter(
interest__profile__in=[user])
all_places = profiles.Place.objects.filter(
interest__profile__in=[user])
all_events = profiles.Event.objects.filter(
interest__profile__in=[user])
interest = random.choice(nonempty([
all_keywords,
all_websites,
all_places,
all_events,
]))
search_term = random.choice(interest)
url = search_term.generate_url(user)
return url
def generate_history(user, start_time):
""" Generate a new history for the user `user`, starting from timestamp
`ts_start`.
A few heuristics are used in order to give the impression that the history
is actually played by a user.
"""
# let's define a new history object.
history = History(start_ts=start_time, user=user)
length = HISTORY_MIN + floor(10 * random.weibullvariate(1, 1.5))
history.full_clean()
history.save()
current_timestamp = start_time.timestamp()
hist_size = 0
while hist_size < length:
current_timestamp += 5 * random.weibullvariate(1, 2.8)
history_list = generate_partial_history(user, current_timestamp)
current_timestamp = \
history_list[-1].timestamp + 5 * random.weibullvariate(1, 5)
for (url, timestamp) in history_list:
if len(url) < 200:
new_line = HistoryEntry(
search=url,
timestamp=datetime.fromtimestamp(timestamp),
history=history
)
try:
new_line.full_clean()
new_line.save()
hist_size += 1
except ValidationError:
continue
return history
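The timing heuristic above draws every gap as 5 * random.weibullvariate(1, k), i.e. positive pauses of a few seconds whose spread depends on the shape k; a quick, self-contained illustration (values are only an example):

import random

random.seed(0)  # reproducible illustration only
for k in (1.5, 2.8, 5):
    gaps = [5 * random.weibullvariate(1, k) for _ in range(3)]
    print(k, [round(g, 1) for g in gaps])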
# Create your models here.

@@ -1,123 +0,0 @@
"""
Module that handles Tor instance creation in order to safely replay histories
"""
import shutil
import datetime as dt
from time import sleep
import asyncio
import aiohttp
from aiosocks.connector import ProxyConnector, ProxyClientRequest
import async_timeout
import stem.process as tor
class TorInstance():
"""
A tor instance object, with some useful information.
It is designed to be used as a worker in order to replay a history.
"""
BASE_SOCKS_PORT = 40000
BASE_CONTROL_PORT = 20000
BASE_DATA_DIR = "/tmp/tor{}/"
TOR_RUNNER = 0
@classmethod
async def create(cls, history, headers):
""" Factory creation of tor processes"""
socks_port = cls.BASE_SOCKS_PORT + cls.TOR_RUNNER
control_port = cls.BASE_CONTROL_PORT + cls.TOR_RUNNER
data_dir = cls.BASE_DATA_DIR.format(cls.TOR_RUNNER)
TorInstance.TOR_RUNNER += 1
self = TorInstance()
self.socks_port = socks_port
self.control_port = control_port
self.data_dir = data_dir
self.history = history
self.headers = headers
self.proxy = "socks5://127.0.0.1:{}".format(self.socks_port)
self.create_session()
self.process = tor.launch_tor_with_config(
config={
'ControlPort' : str(control_port),
'SocksPort' : str(socks_port),
'DataDir' : data_dir
}
)
return self
def __init__(self):
self.socks_port = 0
self.control_port = 0
self.data_dir = ""
self.history = None
self.proxy = ""
self.headers = {}
self.session = None
self.process = None
async def run(self):
""" Runs the Tor Instance on the history.
"""
while (self.history) and (dt.datetime.combine(self.history[0][1],
dt.datetime.min.time()) -
dt.datetime.now()).total_seconds() >= 10:
print("Sleeping")
sleep(10)
while self.history:
item = self.history.pop(0)
async with async_timeout.timeout(30):
await(self.query(item[0]))
now = dt.datetime.now()
print(self.history[0])
if now <= dt.datetime.combine(self.history[0][1], dt.datetime.min.time()):
sleep((dt.datetime.combine(self.history[0][1], dt.datetime.min.time()) - now).total_seconds())
def create_session(self):
""" Create a aiohttp session.
"""
conn = ProxyConnector(remote_resolve=True)
self.session = aiohttp.ClientSession(
connector=conn,
headers=self.headers,
request_class=ProxyClientRequest
)
async def query(self, url):
""" Performs a query.
"""
async with async_timeout.timeout(30):
async with self.session.get(
url,
proxy=self.proxy,
proxy_auth=None) as resp:
try:
return await resp.text()
except UnicodeDecodeError:
return None
def __str__(self):
""" Utility function """
return ('[TOR] SOCKSPort: {0.socks_port}, ControlPort: '
'{0.control_port}, DataDir: {0.data_dir}'.format(self))
async def kill(self):
""" Kills the process and remove the data dir"""
self.process.kill()
self.session.close()
shutil.rmtree(self.data_dir)
async def main():
""" Test function """
for _ in range(3):
instance = await TorInstance.create(None, {"user-agent" : "Blah"})
await instance.query("https://python.org/")
print("One page received")
await instance.kill()
if __name__ == "__main__":
LOOP = asyncio.get_event_loop()
LOOP.run_until_complete(main())

@@ -13,9 +13,6 @@ https://docs.djangoproject.com/en/2.0/ref/settings/
import os
from .settings_local import BASE_DIR, DEBUG, SECRET_KEY, DATABASES
HISTORY_MIN = 25
ALLOWED_HOSTS = []
@@ -29,7 +26,6 @@ INSTALLED_APPS = [
'django.contrib.messages',
'django.contrib.staticfiles',
'profiles',
'histories',
'crawl',
]
@@ -97,7 +93,7 @@ USE_I18N = True
USE_L10N = True
USE_TZ = False # We don't really care, we want POSIX timestamps
USE_TZ = True
# Static files (CSS, JavaScript, Images)

@@ -1,10 +0,0 @@
#!/bin/bash
# -*- coding: UTF8 -*-
python3 manage.py import_browser_fp
python3 manage.py import_search_engine
python3 manage.py import_keywords
python3 manage.py import_website
python3 manage.py import_places
python3 manage.py import_events
python3 manage.py import_interests

@@ -1,41 +0,0 @@
""" Small module that import browser fingerprints into the databose,
based on the data listed in https://huit.re/user-agent-json.
"""
import json
from django.core.management.base import BaseCommand
from django.db import models
from profiles.models import BrowserFingerprint
def import_file(filename):
with open(filename, mode='r') as file:
data = json.load(file)
data = data[0]["list"]
for os_agent in data:
for useragent in os_agent["useragents"]:
import_useragent(useragent)
def import_useragent(useragent):
fingerprint = BrowserFingerprint(
description=useragent.get("description", ""),
useragent=useragent.get("useragent", ""),
appname=useragent.get("appname", ""),
appversion=useragent.get("appversion", ""),
platform=useragent.get("platform", ""),
vendor=useragent.get("vendor", ""),
vendorsub=useragent.get("vendorsub", ""),
buildID=useragent.get("buildID", ""),
oscpu=useragent.get("oscpu", ""),
accept_encoding=useragent.get("accept_encoding", ""),
accept_default=useragent.get("accept_default", ""),
accept_lang=useragent.get("accept_lang", ""),
pixeldepth=int(useragent.get("pixeldepth", 0)),
colordepth=int(useragent.get("colordepth", 0)),
screens=useragent.get("screen", ""),
)
print(fingerprint)
fingerprint.save()
class Command(BaseCommand):
def handle(self, *args, **kwargs):
import_file("data/user-agent.json")

@@ -1,41 +0,0 @@
""" Small module that import events into the database.
"""
import json
from datetime import datetime
from django.core.management.base import BaseCommand
from django.db import models
from profiles.models import Place, Event
def import_file(filename):
with open(filename, mode='r') as file:
data = json.load(file)
for event in data:
import_event(event)
def import_place(_place):
place = Place(
name=_place.get("name", ""),
address=_place.get("address", ""),
lat=float(_place.get("lat", 0)),
lon=float(_place.get("lon", 0))
)
place.save()
return place
def import_event(_event):
if isinstance(_event["place"], str):
place = Place.objects.get(name=_event["place"])
else:
place = import_place(_event["place"])
event = Event(
name=_event.get("name", ""),
date=datetime.strptime(_event.get("date", "01/01/1970 00:00 UTC"), "%d/%m/%Y %H:%M %Z"),
place=place
)
#print(event)
event.save()
class Command(BaseCommand):
def handle(self, *args, **kwargs):
import_file("data/events.json")

@@ -1,51 +0,0 @@
""" Small module that import interests into the database.
"""
import json
from datetime import datetime
from django.core.management.base import BaseCommand
from django.db import models
from django.core.exceptions import ObjectDoesNotExist
from profiles.models import Keyword, Interest, Place, Website, Event
def import_file(filename):
with open(filename, mode='r') as file:
data = json.load(file)
for interest in data:
import_interest(interest)
def import_interest(_interest):
keywords = []
places = []
websites = []
for keyword in _interest.get("keywords", []):
try:
stored = Keyword.objects.get(text=keyword["keyword"])
keywords.append(stored)
except ObjectDoesNotExist:
new_keyword = Keyword(text=keyword["keyword"])
new_keyword.save()
keywords.append(new_keyword)
print("New keyword %s" % new_keyword)
for place in _interest.get("places", []):
places.append(Place.objects.get(name=place["place"]))
for website in _interest.get("websites", []):
websites.append(Website.objects.get(name=website["website"]))
interest = Interest(
name=_interest.get("name", ""),
)
interest.save()
for keyword in keywords:
print(keyword)
interest.keywords.add(keyword)
for place in places:
interest.places.add(place)
for website in websites:
interest.websites.add(website)
interest.save()
class Command(BaseCommand):
def handle(self, *args, **kwargs):
import_file("data/interests.json")

@@ -1,20 +0,0 @@
""" Small module that import browser fingerprints into the databose,
based on the data listed in https://huit.re/user-agent-json.
"""
import json
from django.core.management.base import BaseCommand
from django.db import models
from profiles.models import Keyword
def import_file(filename):
with open(filename, mode='r') as file:
data = json.load(file)
for _keyword in data["list"]:
keyword = Keyword(text=_keyword.get("keyword", ""))
keyword.save()
class Command(BaseCommand):
def handle(self, *args, **kwargs):
import_file("data/keywords.json")

@@ -1,27 +0,0 @@
""" Small module that import browser fingerprints into the databose,
based on the data listed in https://huit.re/user-agent-json.
"""
import json
from django.core.management.base import BaseCommand
from django.db import models
from profiles.models import Place
def import_file(filename):
with open(filename, mode='r') as file:
data = json.load(file)
for place in data:
import_place(place["place"])
def import_place(_place):
place = Place(
name=_place.get("name", ""),
address=_place.get("address", ""),
lat=float(_place.get("lat", 0)),
lon=float(_place.get("lon", 0))
)
place.save()
class Command(BaseCommand):
def handle(self, *args, **kwargs):
import_file("data/place.json")

@@ -1,27 +0,0 @@
""" Small module that import browser fingerprints into the databose,
based on the data listed in https://huit.re/user-agent-json.
"""
import json
from django.core.management.base import BaseCommand
from django.db import models
from profiles.models import SearchEngine
def import_file(filename):
with open(filename, mode='r') as file:
data = json.load(file)
for search_engine in data:
import_search_engine(search_engine["searchengine"])
def import_search_engine(engine):
search_engine = SearchEngine(
name=engine.get("name", ""),
url=engine.get("url", ""),
query_pattern=engine.get("query_pattern", "")
)
#print(search_engine)
search_engine.save()
class Command(BaseCommand):
def handle(self, *args, **kwargs):
import_file("data/search_engine.json")

@@ -1,46 +0,0 @@
""" Small module that import events into the database.
"""
import json
from datetime import datetime
from django.core.management.base import BaseCommand
from django.db import models
from profiles.models import Webpage, Website, Keyword
def import_file(filename):
with open(filename, mode='r') as file:
data = json.load(file)
for website in data:
import_website(website)
def import_website(_website):
keywords = []
webpages = []
for keyword in _website.get("keywords", []):
new_keyword = Keyword(
text=keyword.get("keyword", "")
)
new_keyword.save()
keywords.append(new_keyword)
for webpage in _website.get("notable_pages",[]):
new_webpage = Webpage(
url=webpage.get("webpage", "")
)
new_webpage.save()
webpages.append(new_webpage)
website = Website(
name=_website.get("name", ""),
url=_website.get("url", ""),
)
website.save()
for keyword in keywords:
website.keywords.add(keyword)
for webpage in webpages:
website.notable_pages.add(webpage)
print(website)
#website.save()
class Command(BaseCommand):
def handle(self, *args, **kwargs):
import_file("data/website.json")

@@ -1,46 +1,5 @@
"""
A django module that defines a profile, and all the information that can be
stored in a profile.
It stores interests, technical information such as the browser fingerprint,
the preferred search engine, and whether the user is likely to directly use urls
or to type queries into the search engine.
"""
import os
import random
from django.db import models
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
NICKNAMES = None
LASTNAMES = open(BASE_DIR + "/data/lastnames.txt").read().splitlines()
FIRSTNAMES = open(BASE_DIR + "/data/firstnames.txt").read().splitlines()
EMAIL_DOMAINS = open(BASE_DIR + "/data/email_domains.txt").read().splitlines()
def require_nicknames(fct):
def read_file(path):
global NICKNAMES
print("Trying {}".format(path))
with open(path, 'r') as handle:
NICKNAMES = handle.read().splitlines()
nicknames_files = [
os.path.join(BASE_DIR, 'data/nicknames_dict'),
"/usr/share/dict/american-english",
]
if NICKNAMES is None:
for nick_file in nicknames_files:
try:
read_file(nick_file)
break
except FileNotFoundError:
pass
if NICKNAMES is None:
raise FileNotFoundError
return fct
class InvalidData(Exception):
''' Thrown when the DB contains invalid data, and cannot perform
@@ -62,14 +21,8 @@ class Keyword(models.Model):
def __str__(self):
return self.text
def generate_url(self, user):
""" Generates the url for a keyword, based on the user search engine.
"""
return user.search_engine.search_url(self)
class Webpage(models.Model):
''' A webpage url '''
url = models.URLField()
def __str__(self):
@@ -87,22 +40,6 @@ class Website(models.Model):
def __str__(self):
return self.name
def generate_url(self, user):
""" Generates the url in case the interest chosen is a website.
"""
rand = random.random()
if user.uses_urls:
url = self.url
elif rand <= 0.1:
url = random.choice(self.notable_pages.all()).url
elif rand <= 0.8:
search_term_text = self.name + " " + \
str(random.choice(self.keywords.all()))
url = user.search_engine.search_url(search_term_text)
else:
url = user.search_engine.search_url(self.name)
return url
class Place(models.Model):
''' A real-life place '''
@@ -115,16 +52,6 @@ class Place(models.Model):
def __str__(self):
return self.name
def generate_url(self, user):
""" Generates the url for a place.
"""
rand = random.random()
if rand < 1/2:
url = user.search_engine.search_url(self.name)
else:
url = user.search_engine.search_url(self.address)
return url
class Event(models.Model):
''' A real-life event (protests, meeting, ...) '''
@@ -136,15 +63,6 @@ class Event(models.Model):
def __str__(self):
return self.name
def generate_url(self, user):
""" generate the url for an event object.
"""
possibilities = random.sample(
[self.name, self.date, self.place],
3
)
return user.search_engine.search_url(" ".join(possibilities))
class BrowserFingerprint(models.Model):
''' A browser fingerprint, containing things like a user agent '''
@@ -168,15 +86,6 @@ class BrowserFingerprint(models.Model):
def __str__(self):
return self.description
def serialize_headers(self):
return {
"Description": str(self.description),
"User-Agent": str(self.useragent),
"Accept-Encoding": str(self.accept_encoding),
"Accept": str(self.accept_default),
"Accept-Language": str(self.accept_lang),
}
class SearchEngine(models.Model):
''' A search engine, and all the data needed to use it '''
@@ -185,8 +94,8 @@ class SearchEngine(models.Model):
url = models.URLField()
query_pattern = models.CharField(max_length=256) # This field is the
# query pattern. It should contain a `{}`, which, when substituted with a
# search term (using `.format()`), must yield a URL tail that can be
# concatenated with `url` to perform a search (eg. `?q={}` for ddg).
# search term (using `.format()`), must yield a URL that can be resolved to
# perform the search
def __str__(self):
return self.name
@@ -194,10 +103,9 @@
def search_url(self, search_term):
''' Obtain a url to search `search_term` with this search engine '''
pattern = str(self.query_pattern)
search_term = str(search_term).replace(' ', '+')
if '{}' not in pattern:
raise InvalidData("Search engine {}: bad pattern".format(self))
return self.url + (str(self.query_pattern).format(search_term))
return str(self.query_pattern).format(search_term)
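With the Duckduckgo entry from data/search_engine.json above as an assumed example (url 'https://duckduckgo.com/', pattern '?q={}'), the two return lines differ as follows: one concatenates the engine url with the formatted pattern, the other returns the formatted pattern alone.

url, pattern = "https://duckduckgo.com/", "?q={}"   # values taken from data/search_engine.json
term = "gender queer Paris".replace(' ', '+')
print(url + pattern.format(term))   # https://duckduckgo.com/?q=gender+queer+Paris
print(pattern.format(term))         # ?q=gender+queer+Paris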
class Interest(models.Model):
@@ -231,35 +139,3 @@ class Profile(models.Model):
on_delete=models.CASCADE)
browser_fingerprint = models.ForeignKey(BrowserFingerprint,
on_delete=models.CASCADE)
def generate_email(nick, first_name, last_name):
domain = random.choice(EMAIL_DOMAINS)
if random.random() < 0.3:
email = first_name + "." + last_name + "@" + domain
else:
email = nick + "@" + domain
return email
@require_nicknames
def create_profile(nick=None):
nick = "".join(random.sample(NICKNAMES, random.randrange(2, 5)))
first_name = random.choice(FIRSTNAMES)
last_name = random.choice(LASTNAMES)
email = generate_email(nick, first_name, last_name)
profile = Profile(
nick=nick,
first_name=first_name,
last_name=last_name,
email=email,
uses_urls=(random.random() < 0.5),
)
profile.search_engine = random.choice(SearchEngine.objects.all())
profile.browser_fingerprint = random.choice(BrowserFingerprint.objects.all())
profile.full_clean()
profile.save()
profile.interests.add(random.choice(Interest.objects.all()))
profile.save()
return profile

@@ -1,5 +1,5 @@
aiodns==1.1.1
aiohttp==2.3.2
aiohttp==3.0.1
async-timeout==2.0.0
attrs==17.4.0
cchardet==2.1.1
@@ -12,8 +12,5 @@ pycares==2.3.0
pytz==2017.3
yarl==1.1.1
beautifulsoup4==4.6.0
stem==1.6.0
pycurl==7.43.0.1
rdflib==4.2.2
git+https://github.com/tobast/RDFSerializer.git
aiosocks==0.2.6
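The aiohttp bump from 2.3.2 to 3.0.1 lines up with the verify_ssl=ssl to ssl=ssl change in PageGetter.get above: aiohttp 3 folds the certificate options into a single ssl argument. A minimal sketch of the 3.x call style, with an illustrative URL:

import asyncio
import aiohttp

async def fetch_insecure(url):
    async with aiohttp.ClientSession() as session:
        # aiohttp >= 3.0: ssl=False skips certificate verification
        # (aiohttp 2.x spelled this verify_ssl=False)
        async with session.get(url, ssl=False) as resp:
            return await resp.text()

# asyncio.get_event_loop().run_until_complete(fetch_insecure('https://python.org/'))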