Compare commits
48 commits: bc7348f677 ... 67ad232533

Author | SHA1 | Date
---|---|---
Théophile Bastian | 67ad232533 |
Théophile Bastian | e140d4a8a7 |
Théophile Bastian | 98fe69ba62 |
Théophile Bastian | 968ff6d24c |
Rémi Oudin | 5d4bd30e20 |
Rémi Oudin | bdfa285e6b |
Rémi Oudin | 65f777f00f |
Rémi Oudin | 236e40d359 |
Rémi Oudin | 22017cea91 |
Rémi Oudin | 549c861908 |
Rémi Oudin | 517be1d822 |
Rémi Oudin | c4f63a92b2 |
Rémi Oudin | db067e56fc |
Rémi Oudin | 33bdae96e4 |
Rémi Oudin | 526aad1364 |
Théophile Bastian | 02e91bb2b7 |
Théophile Bastian | 3e5fc2f9b3 |
Théophile Bastian | 45ddbff91a |
Théophile Bastian | e6d587bffd |
Théophile Bastian | 8baf408e02 |
Théophile Bastian | 6463e348ac |
Théophile Bastian | 22064ebee3 |
Théophile Bastian | a4de51b84a |
Théophile Bastian | 4f0148cb63 |
Théophile Bastian | 4a8bd32516 |
Rémi Oudin | 44cf26df8f |
Rémi Oudin | adb892ab7d |
Rémi Oudin | 15db8b4697 |
Rémi Oudin | d6b26c0a46 |
Rémi Oudin | 8f5c4f3f0f |
Rémi Oudin | 71d9e18eec |
Rémi Oudin | 8ad46c0481 |
Rémi Oudin | f66c978466 |
Rémi Oudin | 0a676a2f65 |
Rémi Oudin | e074d96f02 |
Rémi Oudin | 93b235cb6c |
Rémi Oudin | ae5699c089 |
Rémi Oudin | f7313ff659 |
Rémi Oudin | 0661fe0f01 |
Rémi Oudin | 4b19febdf6 |
Théophile Bastian | 15323c3465 |
Théophile Bastian | c3bcdea1eb |
Rémi Oudin | 05a2e2ca3f |
Rémi Oudin | d4aefb6bb7 |
Rémi Oudin | 3eb82a4a0b |
Rémi Oudin | 7c0fb7dda1 |
Rémi Oudin | ee32e5385b |
Théophile Bastian | 2732e4115f |
.gitignore (vendored, 1 addition)

@@ -65,3 +65,4 @@ venv/
 # Django stuff
 db.sqlite3

+_vimrc_local.vim

crawl/crawl.py (233 changes)

@@ -1,11 +1,10 @@
 from threading import Thread
-from queue import Queue
 from urllib.robotparser import RobotFileParser
 from urllib.error import URLError
 from urllib.parse import urlparse

 from ssl import CertificateError
-from random import sample, randrange
+from random import sample, randrange, randint
 import re
 from datetime import datetime, timedelta

@@ -15,6 +14,8 @@ import async_timeout

 from bs4 import BeautifulSoup, Comment

+from profiles.models import BrowserFingerprint, SearchEngine
+
 # Ugly hack to use this module alone instead of integrating it with Django
 # from django.conf import settings

@@ -26,11 +27,11 @@ MAX_PER_PAGE = 10

 FOOTER_URL = re.compile(".*footer.*")


 class Settings:
     USER_AGENT = 'Default User'

 settings = Settings()
-startup_time = datetime.min
+


 def url_getter(html, current_page, root_url):
@@ -72,7 +73,7 @@ def url_getter(html, current_page, root_url):
     # Works only with python >= 3.6
     links_list = list(dict.fromkeys(links_list))

-    forbidden_words = ['login', 'agreement', 'mailto']
+    forbidden_words = ['login', 'agreement', 'mailto', 'settings']
     links_list = [link for link in links_list if not any(word in link.lower()
                                                          for word in
                                                          forbidden_words)]
@@ -80,8 +81,6 @@ def url_getter(html, current_page, root_url):
     return links_list

-
-

 class WebsiteSchedulerMeta(type):
     """ Meta-class for WebsiteScheduler, allowing a singleton class-like
     interface, but spawning one instance per canonical website URL """
@@ -104,36 +103,47 @@ class WebsiteSchedulerMeta(type):

 class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
     """ Schedule the accesses to a website as of robots.txt """
-    def __init__(self, name):
+
+    search_engines = []  # Must be set by CrawlingThread.__init__
+
+    def __init__(self, name, user_agent):
         self.name = name
         self.last_crawled = datetime.fromtimestamp(0)
         self.dead = False
-        try:
-            robots_url = self.urlroot() + 'robots.txt'
-            self.robot_parser = RobotFileParser(robots_url)
-            self.robot_parser.read()  # TODO async?
-        except (URLError, CertificateError):
-            try:
-                robots_url = self.unsafe_urlroot() + 'robots.txt'
-                self.robot_parser = RobotFileParser(robots_url)
-                self.robot_parser.read()
-            except URLError:  # Almost surely an offline website.
-                self.dead = True
-                self.crawl_delay = 0
-        except Exception as e:
-            print(e)
-            raise e
-        if not self.robot_parser.default_entry:
-            self.dead = True
-        if not self.dead:
-            delay = self.robot_parser.crawl_delay(settings.USER_AGENT)
-            if delay is None:
-                req_rate = self.robot_parser.request_rate(settings.USER_AGENT)
-                if req_rate is None:
-                    delay = 5
-                else:
-                    delay = req_rate.requests, req_rate.seconds
-            self.crawl_delay = timedelta(seconds=delay)
+        self.can_fetch_b = False
+        self.user_agent = (user_agent if user_agent is not None
+                           else settings.USER_AGENT)
+        if any(self.urlroot() in item for item in self.search_engines):
+            print("found a search engine for %s" % self.urlroot())
+            self.crawl_delay = timedelta(seconds=5)
+            self.can_fetch_b = True
+        else:
+            try:
+                robots_url = self.urlroot() + 'robots.txt'
+                self.robot_parser = RobotFileParser(robots_url)
+                self.robot_parser.read()  # TODO async?
+            except (URLError, CertificateError):
+                try:
+                    robots_url = self.unsafe_urlroot() + 'robots.txt'
+                    self.robot_parser = RobotFileParser(robots_url)
+                    self.robot_parser.read()
+                except URLError:  # Almost surely an offline website.
+                    self.dead = True
+                    self.crawl_delay = 0
+            except Exception as e:
+                print(e)
+                raise e
+            if not self.robot_parser.default_entry:
+                self.dead = True
+            if not self.dead:
+                delay = self.robot_parser.crawl_delay(self.user_agent)
+                if delay is None:
+                    req_rate = self.robot_parser.request_rate(self.user_agent)
+                    if req_rate is None:
+                        delay = 5
+                    else:
+                        delay = req_rate.requests, req_rate.seconds
+                self.crawl_delay = timedelta(seconds=delay)

     def urlroot(self):
         ''' Get the root url for this website '''
@@ -151,7 +161,9 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):

     def can_fetch(self, url):
         ''' Check whether this program can fetch a given page '''
-        return (not self.dead) and self.robot_parser.can_fetch(settings.USER_AGENT, url)
+        return ((self.can_fetch_b)
+                or ((not self.dead) and
+                    self.robot_parser.can_fetch(self.user_agent, url)))

     def fetching(self):
         ''' Tell the scheduler that a page is being fetched now '''
@@ -162,38 +174,47 @@ class CrawlingThread(Thread):
     """ A separate thread for the crawling task. This is needed to use asyncio,
     since the thread will need its own event loop. """

-    def __init__(self, user, url, queue):
-        global settings
-        self.queue = queue
+    def __init__(self, url):
+        engine_list = [engine.url for engine in SearchEngine.objects.all()]
+        WebsiteScheduler.search_engines = engine_list
+
+        nb_fingerprint = len(BrowserFingerprint.objects.all())
+        fingerprint = BrowserFingerprint.objects.all()[
+            randint(0, nb_fingerprint - 1)]
+        self.headers = fingerprint.serialize_headers()

+        self.output_tree = []
         super(CrawlingThread, self).__init__()
-        if user:
-            settings.USER_AGENT = user.serialize_headers()
         self.url = url

     def run(self):
-        global startup_time
         tasks = []

         #tasks.append(async_crawler("http://plus.google.com/+Python"))
         #tasks.append(async_crawler('https://python.org/'))
-        tasks.append(async_crawler(self.url, self.queue))
+        tasks.append(run_crawl(self.url, self.output_tree, self.headers))

-        loop = asyncio.new_event_loop()
-        asyncio.set_event_loop(loop)
-        startup_time = datetime.now()
-        loop.run_until_complete(asyncio.wait(tasks))
-        loop.close()
+        try:
+            loop = asyncio.new_event_loop()
+            asyncio.set_event_loop(loop)
+            loop.run_until_complete(asyncio.wait(tasks))
+        finally:
+            loop.close()


 class PageGetter:
     """ Asynchronously get a webpage, abiding by robots.txt """

-    def __init__(self, session, url):
+    headers = None
+
+    def __init__(self, session, url, user_agent):
         self.url = url
         self.session = session
+        self.user_agent = user_agent

     async def get(self, ssl=True):
         """ Actually retrieve the webpage """
-        scheduler = WebsiteScheduler(self.url)
+        scheduler = WebsiteScheduler(self.url, self.user_agent)
         if not scheduler.can_fetch(self.url):
             return None

@@ -203,7 +224,8 @@ class PageGetter:
         delay = scheduler.fetch_delay()
         scheduler.fetching()
         async with async_timeout.timeout(10):
-            async with self.session.get(self.url, ssl=ssl) as resp:
+            async with self.session.get(self.url, verify_ssl=ssl) as resp:
+                print("Resp status %s" % resp.status)
                 try:
                     return await resp.text()
                 except UnicodeDecodeError:
@@ -213,46 +235,89 @@ class PageGetter:
 async def async_print(url):
     """ Debug function to follow what's actually happening """
     async with aiohttp.ClientSession() as session:
-        html = await PageGetter(session, url).get(ssl=False)
+        html = await PageGetter(session, url,
+                                settings.USER_AGENT).get(ssl=False)

-    print('GOT {}HTML for {} at {}'.format(
+    print('GOT {}HTML for {}'.format(
         'None ' if html is None else '',
         url,
-        datetime.now() - startup_time))
+    ))

-async def async_crawler(url, queue):
-    queued = [url]
-    crawled = []
-    while queued and (len(crawled) < HARD_LIMIT):
-        async with aiohttp.ClientSession() as session:
-            try:
-                url = queued.pop(0)
-            except IndexError:
-                print("Error queue is empty")
-                return crawled
-            parsed_url = urlparse(url)
-            print("Crawling {}".format(url))
-            html = await PageGetter(session, url).get(ssl=False)
-            if html:
-                new_urls = url_getter(
-                    html,
-                    url,
-                    parsed_url.scheme + "://" + parsed_url.netloc
-                )
-                crawled += [url]
-                if new_urls:
-                    sampled = sample(
-                        new_urls,
-                        randrange(min(MAX_PER_PAGE, len(new_urls)))
-                    )
-                    queued += [sample_url for sample_url in sampled if
-                               sample_url not in queued and sample_url not in
-                               crawled]
-        print(crawled)
-    queue.put(crawled)
-
-if __name__ == '__main__':
-    queue = Queue()
-    crawl = CrawlingThread(None, "https://python.org/", queue)
-    crawl.start()
-    crawl.join()
+class CrawlElem:
+    ''' Describes a crawled element, to be assembled into a tree '''
+
+    def __init__(self, url, parent):
+        self.url = url
+        self.parent = parent
+
+
+async def run_crawl(url, output_tree, headers=None):
+    ''' Starts a crawling session '''
+
+    if headers is None:
+        headers = {}
+    if 'User-Agent' not in headers:
+        headers['User-Agent'] = settings.USER_AGENT
+
+    user_agent = headers['User-Agent']
+    crawled = set()
+
+    async with aiohttp.ClientSession(headers=headers) as session:
+        await async_crawler(
+            url, output_tree, crawled, user_agent, session, None)
+
+
+def simplify_url(url):
+    anchor = url.find('#')
+    if anchor >= 0:
+        url = url[:anchor]
+
+    prot = url.find('://')
+    if prot >= 0:
+        url = url[prot+3:]
+
+    if url.startswith('www.'):
+        url = url[4:]
+
+    return url
+
+
+async def async_crawler(url, out_tree, crawled, user_agent, session, parent):
+    if len(crawled) >= HARD_LIMIT:
+        return
+    crawled.add(simplify_url(url))
+    parsed_url = urlparse(url)
+    print("Crawling {}".format(url))
+    try:
+        with async_timeout.timeout(3):
+            html = await PageGetter(session, url, user_agent).get(ssl=False)
+    except asyncio.TimeoutError:
+        return
+
+    new_tasks = []
+
+    if html:
+        this_elem = CrawlElem(url, parent)
+        out_tree.append(this_elem)
+        new_urls = url_getter(
+            html,
+            url,
+            parsed_url.scheme + "://" + parsed_url.netloc
+        )
+        if new_urls:
+            sampled = sample(
+                new_urls,
+                randrange(min(MAX_PER_PAGE, len(new_urls)))
+            )
+            for sample_url in sampled:
+                if simplify_url(sample_url) not in crawled:
+                    new_tasks.append(async_crawler(
+                        sample_url, out_tree, crawled, user_agent, session,
+                        this_elem))
+    else:
+        print("No html received")
+    if len(crawled) >= HARD_LIMIT:
+        return
+    if new_tasks:
+        await asyncio.wait(new_tasks)

data/.gitignore (vendored, new file, 1 line)

nicknames_dict

data/email_domains.txt (new file, 14 lines)

gmail.com
protonmail.com
riseup.net
tutanoto.com
outlook.fr
fastmail.com
yandex.com
aim.com
icloud.com
yahoo.com
fmx.fr
mail.com
hushmail.com
inbox.com

data/events.json (new file, 27 lines)

[
    {
        "name" : "Atelier Anti-Pub",
        "date" : "07/03/2018 19:00 UTC",
        "place" : {
            "name" : "Centre Social Autogéré Vaydom",
            "address" : "37 rue Marceau, Ivry-sur-Seine",
            "lat" : "48.81787",
            "lon" : "2.38032"
        }
    },
    {
        "name" : "Rassemblement de soutien pour Bure",
        "date" : "27/02/2018 17:00 UTC",
        "place" : {
            "name" : "Place Saint-Michel",
            "address" : "Place Saint-Michel, 75005 Paris",
            "lat" : "48.85374",
            "lon" : "2.34455"
        }
    },
    {
        "name" : "Création d'un serveur mail",
        "date" : "15/02/2018 12:00 UTC",
        "place" : "La Mutinerie"
    }
]

data/firstnames.txt (new file, 200 lines)

Jean
Marie
Philippe
Nathalie
Michel
Isabelle
Alain
Sylvie
Patrick
Catherine
Nicolas
Martine
Christophe
Christine
Pierre
Françoise
Christian
Valérie
Éric
Sandrine
Frédéric
Stéphanie
Laurent
Véronique
Stéphane
Sophie
David
Céline
Pascal
Chantal
Daniel
Patricia
Alexandre
Anne
Julien
Brigitte
Thierry
Julie
Olivier
Monique
Bernard
Aurélie
Thomas
Nicole
Sébastien
Laurence
Gérard
Annie
Didier
Émilie
Dominique
Dominique
Vincent
Virginie
François
Corinne
Bruno
Élodie
Guillaume
Christelle
Jérôme
Camille
Jacques
Caroline
Marc
Léa
Maxime
Sarah
Romain
Florence
Claude
Laetitia
Antoine
Audrey
Franck
Hélène
Jean-Pierre
Laura
Anthony
Manon
Kévin
Michèle
Gilles
Cécile
Cédric
Christiane
Serge
Béatrice
André
Claire
Mathieu
Nadine
Benjamin
Delphine
Patrice
Pauline
Fabrice
Karine
Joël
Mélanie
Jérémy
Marion
Clément
Chloe
Arnaud
Jacqueline
Denis
Elisabeth
Paul
Evelyne
Lucas
Marine
Hervé
Claudine
Jean-Claude
Anais
Sylvain
Lucie
Yves
Danielle
Ludovic
Carole
Guy
Fabienne
Florian
Mathilde
Damien
Sandra
Alexis
Pascale
Mickaël
Annick
Quentin
Charlotte
Emmanuel
Emma
Louis
Severine
Benoît
Sabrina
Jean-Luc
Amandine
Fabien
Myriam
Francis
Jocelyne
Hugo
Alexandra
Jonathan
Angelique
Loïc
Josiane
Xavier
Joelle
Théo
Agnes
Adrien
Mireille
Raphaël
Vanessa
Jean-François
Justine
Grégory
Sonia
Robert
Bernadette
Michaël
Emmanuelle
Valentin
Oceane
Cyril
Amelie
Jean-Marc
Clara
René
Maryse
Lionel
Anne-marie
Yannick
Fanny
Enzo
Magali
Yannis
Marie-christine
Jean-Michel
Morgane
Baptiste
Ines
Matthieu
Nadia
Rémi
Muriel
Georges
Jessica
Aurélien
Laure
Nathan
Genevieve
Jean-Paul
Estelle

data/interests.json (new file, 55 lines)

[
    {
        "name": "occupation",
        "keywords": [
            {"keyword" : "ZAD NDDL"},
            {"keyword" : "Organiser un squat"},
            {"keyword" : "mobilisation et rassemblement"}
        ],
        "places": [
            {"place" : "Zad NDDL"},
            {"place" : "Zad Bure"}
        ],
        "websites": [
            {"website": "zad nadir"}
        ],
        "events": [
            {"event": "Rassemblement de soutien pour Bure"}
        ]
    },
    {
        "name": "LGBT",
        "keywords": [
            {"keyword" : "Discrimniation sexistes, quelles actions ?"},
            {"keyword" : "gender queer Paris"},
            {"keyword" : "Existrans Paris"}
        ],
        "places": [
            {"place" : "La Mutinerie"}
        ],
        "websites": [
            {"website": "emmaclit"},
            {"website": "paris-luttes info"}
        ],
        "events": [
            {"event": "Création d'un serveur mail"}
        ]
    },
    {
        "name": "Anti pub",
        "keywords": [
            {"keyword" : "Affichage JCDecaux"},
            {"keyword" : "Anti-pub"},
            {"keyword" : "Journée contre la publicité"}
        ],
        "places": [
            {"place" : "Centre Social Autogéré Vaydom"}
        ],
        "websites": [
            {"website": "paris-luttes info"}
        ],
        "events": [
            {"event": "Atelier Anti-Pub"}
        ]
    }
]

data/lastnames.txt (new file, 200 lines)

Martin
Bernard
Thomas
Petit
Robert
Richard
Durand
Dubois
Moreau
Laurent
Simon
Michel
Lefebvre
Leroy
Roux
David
Bertrand
Morel
Fournier
Girard
Bonnet
Dupont
Lambert
Fontaine
Rousseau
Vincent
Muller
Lefevre
Faure
Andre
Mercier
Blanc
Guerin
Boyer
Garnier
Chevalier
Francois
Legrand
Gauthier
Garcia
Perrin
Robin
Clement
Morin
Nicolas
Henry
Roussel
Mathieu
Gautier
Masson
Marchand
Duval
Denis
Dumont
Marie
Lemaire
Noel
Meyer
Dufour
Meunier
Brun
Blanchard
Giraud
Joly
Riviere
Lucas
Brunet
Gaillard
Barbier
Arnaud
Martinez
Gerard
Roche
Renard
Schmitt
Roy
Leroux
Colin
Vidal
Caron
Picard
Roger
Fabre
Aubert
Lemoine
Renaud
Dumas
Lacroix
Olivier
Philippe
Bourgeois
Pierre
Benoit
Rey
Leclerc
Payet
Rolland
Leclercq
Guillaume
Lecomte
Lopez
Jean
Dupuy
Guillot
Hubert
Berger
Carpentier
Sanchez
Dupuis
Moulin
Louis
Deschamps
Huet
Vasseur
Perez
Boucher
Fleury
Royer
Klein
Jacquet
Adam
Paris
Poirier
Marty
Aubry
Guyot
Carre
Charles
Renault
Charpentier
Menard
Maillard
Baron
Bertin
Bailly
Herve
Schneider
Fernandez
Le Gall
Collet
Leger
Bouvier
Julien
Prevost
Millet
Perrot
Daniel
Le Roux
Cousin
Germain
Breton
Besson
Langlois
Remy
Le Goff
Pelletier
Leveque
Perrier
Leblanc
Barre
Lebrun
Marchal
Weber
Mallet
Hamon
Boulanger
Jacob
Monnier
Michaud
Rodriguez
Guichard
Gillet
Etienne
Grondin
Poulain
Tessier
Chevallier
Collin
Chauvin
Da Silva
Bouchet
Gay
Lemaitre
Benard
Marechal
Humbert
Reynaud
Antoine
Hoarau
Perret
Barthelemy
Cordier
Pichon
Lejeune
Gilbert
Lamy
Delaunay
Pasquier
Carlier
Laporte

@@ -1,93 +1,93 @@
 [
-"website": {
+{
     "name":"emmaclit",
     "url":"https://emmaclit.com/",
     "keywords": [
-        "keyword":"Charge mentale",
-        "keyword":"Un autre regard",
-        "keyword":"Un petit poutou",
-        "keyword":"solidarité",
-        "keyword":"dédicace"
-    },
-    "notable_pages": [
-        "webpage": "https://emmaclit.com/2017/05/09/repartition-des-taches-hommes-femmes/",
-        "webpage": "https://emmaclit.com/2016/12/01/une-famille-parmi-dautres/",
-        "webpage": "https://emmaclit.com/2017/09/11/travaille-pourquoi/"
-    ]
-},
-"website": {
-    "name":"paris-luttes info",
-    "url":"https//paris-luttes.info/",
-    "keywords": [
-        "keyword":"manifestations",
-        "keyword":"solidarité immigré·e·s",
-        "keyword":"grève salariés",
-        "keyword":"prison",
-        "keyword":"violence policère"
+        {"keyword":"Charge mentale"},
+        {"keyword":"Un autre regard"},
+        {"keyword":"Un petit poutou"},
+        {"keyword":"solidarité"},
+        {"keyword":"dédicace"}
     ],
     "notable_pages": [
-        "webpage": "https://paris-luttes.info/-analyse-et-reflexion-?lang=fr",
-        "webpage": "https://paris-luttes.info/comment-publier-sur-paris-luttes-134?lang=fr",
-        "webpage": "https://paris-luttes.info/pourquoi-et-comment-utiliser-tor-9013?lang=fr"
+        {"webpage": "https://emmaclit.com/2017/05/09/repartition-des-taches-hommes-femmes/"},
+        {"webpage": "https://emmaclit.com/2016/12/01/une-famille-parmi-dautres/"},
+        {"webpage": "https://emmaclit.com/2017/09/11/travaille-pourquoi/"}
     ]
 },
-"website": {
+{
+    "name":"paris-luttes info",
+    "url":"https://paris-luttes.info/",
+    "keywords": [
+        {"keyword":"manifestations"},
+        {"keyword":"solidarité immigré·e·s"},
+        {"keyword":"grève salariés"},
+        {"keyword":"prison"},
+        {"keyword":"violence policère"}
+    ],
+    "notable_pages": [
+        {"webpage": "https://paris-luttes.info/-analyse-et-reflexion-?lang=fr"},
+        {"webpage": "https://paris-luttes.info/comment-publier-sur-paris-luttes-134?lang=fr"},
+        {"webpage": "https://paris-luttes.info/pourquoi-et-comment-utiliser-tor-9013?lang=fr"}
+    ]
+},
+{
     "name":"zad nadir",
     "url":"http://zad.nadir.org/",
     "keywords": [
-        "keyword":"Écologie",
-        "keyword":"opération césar",
-        "keyword":"expulsion vinci",
-        "keyword":"adresse",
-        "keyword":"la wardine",
-        "keyword":"route des chicanes",
-        "keyword":"opposition à l'aéroport Grand Ouest"
+        {"keyword":"Écologie"},
+        {"keyword":"opération césar"},
+        {"keyword":"expulsion vinci"},
+        {"keyword":"adresse"},
+        {"keyword":"la wardine"},
+        {"keyword":"route des chicanes"},
+        {"keyword":"opposition à l'aéroport Grand Ouest"}
     ],
     "notable_pages": [
-        "webpage": "http://zad.nadir.org/spip.php?article86&lang=fr",
-        "webpage": "http://zad.nadir.org/spip.php?article515&lang=fr",
-        "webpage": "http://zad.nadir.org/spip.php?rubrique71",
-        "webpage": "https://zad.nadir.org/spip.php?rubrique70"
+        {"webpage": "http://zad.nadir.org/spip.php?article86&lang=fr"},
+        {"webpage": "http://zad.nadir.org/spip.php?article515&lang=fr"},
+        {"webpage": "http://zad.nadir.org/spip.php?rubrique71"},
+        {"webpage": "https://zad.nadir.org/spip.php?rubrique70"}
     ]
 },
-"website": {
+{
     "name":"Fnac",
     "url":"https://www.fnac.com/",
     "keywords": [
-        "keyword":"smartphone",
-        "keyword":"SAV",
-        "keyword":"Macbook",
-        "keyword":"TV",
-        "keyword":"PC Gaming",
-        "keyword":"DVD",
-        "keyword":"Home Cinema Philips",
-        "keyword":"Billeterie"
+        {"keyword":"smartphone"},
+        {"keyword":"SAV"},
+        {"keyword":"Macbook"},
+        {"keyword":"TV"},
+        {"keyword":"PC Gaming"},
+        {"keyword":"DVD"},
+        {"keyword":"Home Cinema Philips"},
+        {"keyword":"Billeterie"}
     ],
     "notable_pages": [
-        "webpage": "https://www.fnac.com/Informatique/shi48966/w-4#bl=MMinfo",
-        "webpage": "https://www.fnac.com/Service/default.aspx#bl=footer",
-        "webpage": "https://www.fnac.com/Ventes-Flash/shi42245/w-4#bl=marktlink1",
-        "webpage": "https://www.fnac.com/Home-cinema-barre-de-son-et-enceinte-TV/shi474916/w-4#bl=MMtvh"
+        {"webpage": "https://www.fnac.com/Informatique/shi48966/w-4#bl=MMinfo"},
+        {"webpage": "https://www.fnac.com/Service/default.aspx#bl=footer"},
+        {"webpage": "https://www.fnac.com/Ventes-Flash/shi42245/w-4#bl=marktlink1"},
+        {"webpage": "https://www.fnac.com/Home-cinema-barre-de-son-et-enceinte-TV/shi474916/w-4#bl=MMtvh"}
     ]
 },
-"website": {
+{
     "name":"Sea Shepherd",
     "url":"https://www.seashepherd.fr/",
     "keywords": [
-        "keyword":"pirates",
-        "keyword":"Phoques",
-        "keyword":"Paul Watson",
-        "keyword":"harponnage",
-        "keyword":"seal",
-        "keyword":"Chasse aux dauphins",
-        "keyword":"participation",
-        "keyword":"boutique"
+        {"keyword":"pirates"},
+        {"keyword":"Phoques"},
+        {"keyword":"Paul Watson"},
+        {"keyword":"harponnage"},
+        {"keyword":"seal"},
+        {"keyword":"Chasse aux dauphins"},
+        {"keyword":"participation"},
+        {"keyword":"boutique"}
     ],
     "notable_pages": [
-        "webpage": "http://www.seashepherd.fr/index.php/qui-sommes-nous",
-        "webpage": "http://nyamba.seashepherd.info/",
-        "webpage": "http://seashepherd-shop.com/en/",
-        "webpage": "http://seashepherd.fr/index.php/qui-sommes-nous/sea-shepherd-france"
+        {"webpage": "http://www.seashepherd.fr/index.php/qui-sommes-nous"},
+        {"webpage": "http://nyamba.seashepherd.info/"},
+        {"webpage": "http://seashepherd-shop.com/en/"},
+        {"webpage": "http://seashepherd.fr/index.php/qui-sommes-nous/sea-shepherd-france"}
     ]
 }
 ]

histories/migrations/0001_initial.py (new file, 34 lines)

# Generated by Django 2.0.1 on 2018-02-25 19:08

from django.db import migrations, models
import django.db.models.deletion


class Migration(migrations.Migration):

    initial = True

    dependencies = [
        ('profiles', '0001_initial'),
    ]

    operations = [
        migrations.CreateModel(
            name='History',
            fields=[
                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('start_ts', models.DateTimeField(help_text='The starting timestamp of the history. Useful for cron-like structure.')),
                ('played', models.BooleanField(default=False)),
                ('user', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='profiles.Profile')),
            ],
        ),
        migrations.CreateModel(
            name='HistoryEntry',
            fields=[
                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('search', models.URLField(help_text='The url to be searched')),
                ('timestamp', models.DateTimeField()),
                ('history', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='histories.History')),
            ],
        ),
    ]

@@ -3,13 +3,26 @@ entries, which looks like human-based browsing, according to a dedicated user
 interests, keywords...
 """

+from collections import namedtuple
 import random
 from math import floor
-from queue import Queue
+from xml.etree import ElementTree as ET
+from datetime import datetime
 from django.db import models
+from django.core.exceptions import ValidationError
 import profiles.models as profiles
 from crawl import crawl
 from pinocchio.settings import HISTORY_MIN
+from .tor_runner import TorInstance
+
+
+class InvalidXml(Exception):
+    def __init__(self, what='unexpected XML data.'):
+        super().__init__()
+        self.what = what
+
+    def __str__(self):
+        return "Invalid XML: " + self.what
+
 
 class HistoryEntry(models.Model):
@@ -27,14 +40,48 @@ class HistoryEntry(models.Model):
         """
         return "{} : {}".format(self.timestamp, self.search)

+    def to_xml(self, xml_root):
+        entry = ET.Element('history')
+        entry_url = ET.Element('url')
+        entry_url.text = self.search
+        entry_ts = ET.Element('timestamp')
+        entry_ts.text = self.timestamp.timestamp()
+        entry.append(entry_url)
+        entry.append(entry_ts)
+        xml_root.append(entry)
+
+    @staticmethod
+    def from_xml(xml_root, in_history):
+        if xml_root.tag != 'history':
+            raise InvalidXml("expected <history> tag here.")
+        url, timestamp = None, None
+
+        for child in xml_root:
+            if child.tag == 'url':
+                url = child.text
+            elif child.tag == 'timestamp':
+                try:
+                    timestamp = datetime.fromtimestamp(child.text)
+                except TypeError:
+                    raise InvalidXml("invalid timestamp {}".format(child.text))
+            else:
+                raise InvalidXml("unknown tag {} as child of <history>".format(
+                    child.tag))
+        output = HistoryEntry()
+        output.search = url
+        output.timestamp = timestamp
+        output.history = in_history
+
+        return output
+
+
 class History(models.Model):
     """ A history for a user, containing some web connections (http, https).
     Each history is timed, in a human-behaviour manner. """

     start_ts = models.DateTimeField(
-        help_text='The starting timestamp of the history. Useful for cron-like '
-        'structure.'
+        help_text=('The starting timestamp of the history. Useful for '
+                   'cron-like structure.')
     )
     played = models.BooleanField(default=False)
@@ -46,58 +93,134 @@ class History(models.Model):
     def return_history(self):
         """ Returns the history, sorted by increasing timestamps
         """
-        history_set = self.history_set.order_by('timestamp')
-        return history_set
+        output_history = self.historyentry_set.order_by('timestamp')
+        output_history = [(item.search, item.timestamp.date())
+                          for item in output_history]
+        return output_history

     def __str__(self):
         """ Returns the string representation of a history.
         """
-        history_set = self.history_set.order_by('timestamp')
-        header = "[History]:\n"
-        return header + "\n".join(history_set)
+        entries = self.historyentry_set.order_by('timestamp')
+        output = "[History]:\n"
+        for entry in entries:
+            output += str(entry) + '\n'
+        return output

-    def play_history(self):
+    def play_histories(self):
         """ Actually plays the history.
         """
         self.played = True
+        runner = TorInstance(self.return_history())
+        runnner.run()
         self.save()

+    def to_xml(self, xml_root):
+        ''' Exports the current history to xml '''
+        hist_node = ET.Element("history", attrib={
+            'start-ts': self.start_ts,
+            'played': 1 if self.played else 0,
+            'user': self.user.pk,
+        })
+        xml_root.append(hist_node)
+        for entry in self.historyentry_set:
+            entry.to_xml(hist_node)
+
+    @staticmethod
+    def from_xml(xml_root):
+        ''' Loads an history from an XML file '''
+
+        REQUIRED_ATTR = ['start-ts', 'played', 'user']
+
+        if xml_root.tag != 'history':
+            raise InvalidXml('unexpected node {} as root of an history'.format(
+                xml_root.tag))
+        for attr in REQUIRED_ATTR:
+            if attr not in xml_root.attrib:
+                raise InvalidXml(('missing attribute "{}" for tag of type '
+                                  'history').format(attr))
+        start_ts = xml_root.attrib['start-ts']
+        played = xml_root.attrib['played']
+        user_pk = xml_root.attrib['user']
+        users = History.objects.filter(pk=1)
+        if len(users) != 1:
+            raise InvalidXml('primary key for History {} is invalid'.format(
+                user_pk))
+
+        output = History()
+        output.start_ts = start_ts
+        output.played = played > 0
+        output.user = users[0]
+
+        for child in xml_root:
+            HistoryEntry.from_xml(child, output)
+
+        return output
+
+
+PartialHistoryEntry = namedtuple('PartialHistoryEntry',
+                                 ['url', 'timestamp'])
+
+
 def generate_partial_history(user, t_start):
     """ Generate the part of the history resulting from the crawl starting at
     the given url.
     """
+    timestamp = t_start
     result = []
     basis = generate_first_url(user)
-    result.append((basis, t_start))
-    t_start += 5* random.weibullvariate(1, 1.5)
-    queue = Queue()
-    crawler = crawl.CrawlingThread(user, basis, queue)
+    result.append(PartialHistoryEntry(basis, timestamp))
+    t_start += 5 * random.weibullvariate(1, 1.5)
+    crawler = crawl.CrawlingThread(basis)
     crawler.start()
     crawler.join()
-    urls = queue.get()
-    for url in urls:
-        t_start += 5* random.weibullvariate(1, 1.5)
-        result.append((url, t_start)
+    urls_tree = crawler.output_tree
+
+    open_time = {}
+    for elem in urls_tree:
+        url, parent = elem.url, elem.parent
+        timestamp = 0
+        if parent is None:
+            timestamp = t_start
+        else:
+            timestamp = open_time[parent] + 5 * random.weibullvariate(1, 1.5)
+        open_time[elem] = timestamp
+        result.append(PartialHistoryEntry(url, timestamp))
     return result


 def generate_first_url(user):
     """ Generate the first url of a partial history, based on the user
     information. """
-    interest = random.choice(
-        [user.interests.keywords.all(), user.interests.places.all(),
-         user.interests.websites.all(), user.interests.events.all()
-         ]
-    )
+
+    def nonempty(seq):
+        out = []
+        for elt in seq:
+            if elt:
+                out.append(elt)
+        return out
+
+    all_keywords = profiles.Keyword.objects.filter(
+        interest__profile__in=[user])
+    all_websites = profiles.Website.objects.filter(
+        interest__profile__in=[user])
+    all_places = profiles.Place.objects.filter(
+        interest__profile__in=[user])
+    all_events = profiles.Event.objects.filter(
+        interest__profile__in=[user])
+
+    interest = random.choice(nonempty([
+        all_keywords,
+        all_websites,
+        all_places,
+        all_events,
+    ]))
     search_term = random.choice(interest)
     url = search_term.generate_url(user)
     return url


-def generate_history(user, ts_start):
+def generate_history(user, start_time):
     """ Generate a new history for the user `user`, starting from timestamp
     `ts_start`.
     A few heuristics are used in order to give the impression that the history
@@ -105,19 +228,32 @@ def generate_history(user, ts_start):
     """

     # let's define a new history object.
-    history = History(start_ts=ts_start, user=user)
+    history = History(start_ts=start_time, user=user)
     length = HISTORY_MIN + floor(10 * random.weibullvariate(1, 1.5))
+    history.full_clean()
+    history.save()

-    history_line = 0
+    current_timestamp = start_time.timestamp()

-    while history_line < length:
-        ts_start += 5 * random.weibullvariate(1, 2.8)
-        history_list = generate_partial_history(user, ts_start)
-        ts_start = history_list[-1].timestamp + 5 * random.weibullvariate(1, 5)
+    hist_size = 0
+
+    while hist_size < length:
+        current_timestamp += 5 * random.weibullvariate(1, 2.8)
+        history_list = generate_partial_history(user, current_timestamp)
+        current_timestamp = \
+            history_list[-1].timestamp + 5 * random.weibullvariate(1, 5)
         for (url, timestamp) in history_list:
-            new_line = HistoryEntry(
-                search=url,
-                timestamp=timestamp,
-                history=history
-            )
-            new_line.save()
+            if len(url) < 200:
+                new_line = HistoryEntry(
+                    search=url,
+                    timestamp=datetime.fromtimestamp(timestamp),
+                    history=history
+                )
+                try:
+                    new_line.full_clean()
+                    new_line.save()
+                    hist_size += 1
+                except ValidationError:
+                    continue
+
+    return history

histories/tor_runner.py (new file, 120 lines)

"""
Modules that handles tor instaces creations in order to safely run histories
"""

import shutil
import datetime as dt
from time import sleep
import asyncio
import aiohttp
from aiosocks.connector import ProxyConnector, ProxyClientRequest
import async_timeout
import stem.process as tor


class TorInstance():
    """
    A tor instance object, with some useful information.
    It is designed to be used as a worker in order to replay an history.
    """
    BASE_SOCKS_PORT = 40000
    BASE_CONTROL_PORT = 20000
    BASE_DATA_DIR = "/tmp/tor{}/"
    TOR_RUNNER = 0

    @classmethod
    async def create(cls, history, headers):
        """ Factory creation of tor processes"""
        socks_port = cls.BASE_SOCKS_PORT + cls.TOR_RUNNER
        control_port = cls.BASE_CONTROL_PORT + cls.TOR_RUNNER
        data_dir = cls.BASE_DATA_DIR.format(cls.TOR_RUNNER)
        TorInstance.TOR_RUNNER += 1
        self = TorInstance()
        self.socks_port = socks_port
        self.control_port = control_port
        self.data_dir = data_dir
        self.history = history
        self.headers = headers
        self.proxy = "socks5://127.0.0.1:{}".format(self.socks_port)
        self.create_session()
        self.process = tor.launch_tor_with_config(
            config={
                'ControlPort' : str(control_port),
                'SocksPort' : str(socks_port),
                'DataDir' : data_dir
            }
        )
        return self

    def __init__(self):
        self.socks_port = 0
        self.control_port = 0
        self.data_dir = ""
        self.history = None
        self.proxy = ""
        self.headers = {}
        self.session = None
        self.process = None

    async def run(self):
        """ Runs the Tor Instance on the history.
        """
        while (self.history[0][1] - dt.datetime.now()).total_seconds >= 10:
            print("Sleeping")
            sleep(10)
        while self.history:
            item = self.history.pop(0)
            async with async_timeout.timeout(30):
                await(self.query(item[0]))
            now = dt.datetime.now()
            if now <= self.history[0][1]:
                sleep((self.history[0][1] - now).total_seconds())

    def create_session(self):
        """ Create a aiohttp session.
        """
        conn = ProxyConnector(remote_resolve=True)
        self.session = aiohttp.ClientSession(
            connector=conn,
            headers=self.headers,
            request_class=ProxyClientRequest
        )

    async def query(self, url):
        """ Performs a query.
        """
        async with async_timeout.timeout(30):
            async with self.session.get(
                    url,
                    proxy=self.proxy,
                    proxy_auth=None) as resp:
                try:
                    return await resp.text()
                except UnicodeDecodeError:
                    return None

    def __str__(self):
        """ Utility function """
        return ('[TOR] SOCKSPort: {0.socks_port}, ControlPort: '
                '{0.control_port}, DataDir: {0.data_dir}'.format(self))

    async def kill(self):
        """ Kills the process and remove the data dir"""
        self.process.kill()
        self.session.close()
        shutil.rmtree(self.data_dir)


async def main():
    """ Test function """
    for _ in range(3):
        instance = await TorInstance.create(None, {"user-agent" : "Blah"})
        await instance.query("https://python.org/")
        print("One page received")
        await instance.kill()

if __name__ == "__main__":
    LOOP = asyncio.get_event_loop()
    LOOP.run_until_complete(main())

@@ -97,7 +97,7 @@ USE_I18N = True

 USE_L10N = True

-USE_TZ = True
+USE_TZ = False  # We don't really care, we want POSIX timestamps


 # Static files (CSS, JavaScript, Images)

populate.sh (new file, 10 lines)

#!/bin/bash
# -*- coding: UTF8 -*-

python3 manage.py import_browser_fp
python3 manage.py import_search_engine
python3 manage.py import_keywords
python3 manage.py import_website
python3 manage.py import_places
python3 manage.py import_events
python3 manage.py import_interests

profiles/management/commands/exportrdf.py (new file, 27 lines)

from django.core.management.base import BaseCommand
from profiles.models_rdf import RdfProfile
from profiles import models


class Command(BaseCommand):
    ''' Exports database models to RDF '''

    def add_arguments(self, parser):
        pass

    def handle(self, *args, **kwargs):
        exported_models = [
            models.Keyword,
            models.Webpage,
            models.Website,
            models.Place,
            models.Event,
            models.BrowserFingerprint,
            models.SearchEngine,
            models.Interest,
            models.Profile,
        ]
        output_xml = RdfProfile().serialize(
            # models=exported_models,
        )
        self.stdout.write(output_xml)

profiles/management/commands/import_events.py (new file, 41 lines)

""" Small module that import events into the database.
"""

import json
from datetime import datetime
from django.core.management.base import BaseCommand
from django.db import models
from profiles.models import Place, Event

def import_file(filename):
    with open(filename, mode='r') as file:
        data = json.load(file)
        for event in data:
            import_event(event)

def import_place(_place):
    place = Place(
        name=_place.get("name", ""),
        address=_place.get("address", ""),
        lat=float(_place.get("lat", 0)),
        lon=float(_place.get("lon", 0))
    )
    place.save()
    return place

def import_event(_event):
    if isinstance(_event["place"], str):
        place = Place.objects.get(name=_event["place"])
    else:
        place = import_place(_event["place"])
    event = Event(
        name=_event.get("name", ""),
        date=datetime.strptime(_event.get("date", "01/01/1970 00:00 UTC"), "%d/%m/%Y %H:%M %Z"),
        place=place
    )
    #print(event)
    event.save()

class Command(BaseCommand):
    def handle(self, *args, **kwargs):
        import_file("data/events.json")

profiles/management/commands/import_interests.py (new file, 51 lines)

""" Small module that import interests into the database.
"""

import json
from datetime import datetime
from django.core.management.base import BaseCommand
from django.db import models
from django.core.exceptions import ObjectDoesNotExist
from profiles.models import Keyword, Interest, Place, Website, Event

def import_file(filename):
    with open(filename, mode='r') as file:
        data = json.load(file)
        for interest in data:
            import_interest(interest)


def import_interest(_interest):
    keywords = []
    places = []
    websites = []
    for keyword in _interest.get("keywords", []):
        try:
            stored = Keyword.objects.get(text=keyword["keyword"])
            keywords.append(stored)
        except ObjectDoesNotExist:
            new_keyword = Keyword(text=keyword["keyword"])
            new_keyword.save()
            keywords.append(new_keyword)
            print("New keyword %s" % new_keyword)
    for place in _interest.get("places", []):
        places.append(Place.objects.get(name=place["place"]))
    for website in _interest.get("websites", []):
        websites.append(Website.objects.get(name=website["website"]))

    interest = Interest(
        name=_interest.get("name", ""),
    )
    interest.save()
    for keyword in keywords:
        print(keyword)
        interest.keywords.add(keyword)
    for place in places:
        interest.places.add(place)
    for website in websites:
        interest.websites.add(website)
    interest.save()

class Command(BaseCommand):
    def handle(self, *args, **kwargs):
        import_file("data/interests.json")

46
profiles/management/commands/import_website.py
Normal file
46
profiles/management/commands/import_website.py
Normal file
|
@ -0,0 +1,46 @@
|
||||||
|
""" Small module that import events into the database.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
from datetime import datetime
|
||||||
|
from django.core.management.base import BaseCommand
|
||||||
|
from django.db import models
|
||||||
|
from profiles.models import Webpage, Website, Keyword
|
||||||
|
|
||||||
|
def import_file(filename):
|
||||||
|
with open(filename, mode='r') as file:
|
||||||
|
data = json.load(file)
|
||||||
|
for website in data:
|
||||||
|
import_website(website)
|
||||||
|
|
||||||
|
def import_website(_website):
|
||||||
|
keywords = []
|
||||||
|
webpages = []
|
||||||
|
for keyword in _website.get("keywords", []):
|
||||||
|
new_keyword = Keyword(
|
||||||
|
text=keyword.get("keyword", "")
|
||||||
|
)
|
||||||
|
new_keyword.save()
|
||||||
|
keywords.append(new_keyword)
|
||||||
|
for webpage in _website.get("notable_pages",[]):
|
||||||
|
new_webpage = Webpage(
|
||||||
|
url=webpage.get("keyword", "")
|
||||||
|
)
|
||||||
|
new_webpage.save()
|
||||||
|
webpages.append(new_webpage)
|
||||||
|
website = Website(
|
||||||
|
name=_website.get("name", ""),
|
||||||
|
url=_website.get("url", ""),
|
||||||
|
)
|
||||||
|
website.save()
|
||||||
|
for keyword in keywords:
|
||||||
|
website.keywords.add(keyword)
|
||||||
|
for webpage in webpages:
|
||||||
|
website.notable_pages.add(webpage)
|
||||||
|
print(website)
|
||||||
|
#website.save()
|
||||||
|
|
||||||
|
|
||||||
|
class Command(BaseCommand):
|
||||||
|
def handle(self, *args, **kwargs):
|
||||||
|
import_file("data/website.json")
profiles/models.py
@@ -6,9 +6,41 @@ the preferred search engin, and if the user is likely to directly use urls
 or to type in the search engine.
 """
 
+import os
 import random
 from django.db import models
+
+BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+
+NICKNAMES = None
+LASTNAMES = open(BASE_DIR + "/data/lastnames.txt").read().splitlines()
+FIRSTNAMES = open(BASE_DIR + "/data/firstnames.txt").read().splitlines()
+EMAIL_DOMAINS = open(BASE_DIR + "/data/email_domains.txt").read().splitlines()
+
+
+def require_nicknames(fct):
+    def read_file(path):
+        global NICKNAMES
+        print("Trying {}".format(path))
+        with open(path, 'r') as handle:
+            NICKNAMES = handle.read().splitlines()
+
+    nicknames_files = [
+        os.path.join(BASE_DIR, 'data/nicknames_dict'),
+        "/usr/share/dict/american-english",
+    ]
+    if NICKNAMES is None:
+        for nick_file in nicknames_files:
+            try:
+                read_file(nick_file)
+                break
+            except FileNotFoundError:
+                pass
+        if NICKNAMES is None:
+            raise FileNotFoundError
+
+    return fct
 
 
 class InvalidData(Exception):
     ''' Thrown when the DB contains invalid data, and cannot perform
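One note on require_nicknames: the loading code sits directly in the decorator body and fct is returned unchanged, so the nickname file is read once, when the decorated function is defined, not on every call. A minimal standalone sketch of the same pattern (file path and names here are assumptions, not project code):

# Standalone sketch of the decoration-time loading pattern used above;
# the word-list path is an assumption for illustration.
import random

WORDS = None

def require_words(fct):
    global WORDS
    if WORDS is None:
        with open("/usr/share/dict/american-english") as handle:
            WORDS = handle.read().splitlines()
    return fct           # no wrapper: fct itself is returned unchanged

@require_words           # the file is read here, at definition time
def pick_word():
    return random.choice(WORDS)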
@@ -59,13 +91,13 @@ class Website(models.Model):
         """ Generates the url in case the interest chosen is a website.
         """
         rand = random.random()
-        if user.uses_url:
+        if user.uses_urls:
             url = self.url
         elif rand <= 0.1:
-            url = random.choice(self.notable_pages).url
+            url = random.choice(self.notable_pages.all()).url
         elif rand <= 0.8:
             search_term_text = self.name + " " + \
-                random.choice(self.keywords)
+                random.choice(self.keywords.all())
             url = user.search_engine.search_url(search_term_text)
         else:
             url = user.search_engine.search_url(self.name)
@@ -114,7 +146,6 @@ class Event(models.Model):
         return user.search_engine.search_url(" ".join(possibilities))
 
 
-
 class BrowserFingerprint(models.Model):
     ''' A browser fingerprint, containing things like a user agent '''
 
@@ -139,11 +170,11 @@ class BrowserFingerprint(models.Model):
 
     def serialize_headers(self):
         return {
-            "Description" : str(self.description),
-            "User-Agent" : str(self.useragent),
-            "Accept-Encoding" : str(self.accept_encoding),
-            "Accept" : str(self.accept_default),
-            "Accept-Language" : str(self.accept_lang),
+            "Description": str(self.description),
+            "User-Agent": str(self.useragent),
+            "Accept-Encoding": str(self.accept_encoding),
+            "Accept": str(self.accept_default),
+            "Accept-Language": str(self.accept_lang),
         }
 
 
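serialize_headers() returns a plain dict, which can be handed to the crawler's aiohttp session as default headers. A sketch; the coroutine name and URL handling are illustrative, not code from this repository:

# Sketch: feeding a fingerprint's header dict to an aiohttp session.
import aiohttp

async def fetch(url, fingerprint):
    headers = fingerprint.serialize_headers()
    async with aiohttp.ClientSession(headers=headers) as session:
        async with session.get(url) as resp:
            return await resp.text()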
@@ -154,8 +185,8 @@ class SearchEngine(models.Model):
     url = models.URLField()
     query_pattern = models.CharField(max_length=256)  # This field is the
     # query pattern. It should contain a `{}`, which, when substituted with a
-    # search term (using `.format()`), must yield a URL that can be resolved to
-    # perform the search
+    # search term (using `.format()`), must yield a URL tail that can be
+    # concatenated with `url` to perform a search (eg. `?q={}` for ddg).
 
     def __str__(self):
         return self.name
@@ -163,9 +194,10 @@
     def search_url(self, search_term):
         ''' Obtain a url to search `search_term` with this search engine '''
         pattern = str(self.query_pattern)
+        search_term = str(search_term).replace(' ', '+')
         if '{}' not in pattern:
             raise InvalidData("Search engine {}: bad pattern".format(self))
-        return str(self.query_pattern).format(search_term)
+        return self.url + (str(self.query_pattern).format(search_term))
 
 
 class Interest(models.Model):
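To make the new behaviour concrete: with a pattern such as the `?q={}` mentioned in the comment above, search_url() now encodes spaces as `+` and prefixes the engine's base url. The DuckDuckGo values below are illustrative, not fixture data:

# Illustrative values only; shows what the reworked search_url() returns.
from profiles.models import SearchEngine

ddg = SearchEngine(name="DuckDuckGo",
                   url="https://duckduckgo.com/",
                   query_pattern="?q={}")

assert ddg.search_url("privacy search engine") == \
    "https://duckduckgo.com/?q=privacy+search+engine"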
@@ -199,3 +231,35 @@ class Profile(models.Model):
                                       on_delete=models.CASCADE)
     browser_fingerprint = models.ForeignKey(BrowserFingerprint,
                                             on_delete=models.CASCADE)
+
+
+def generate_email(nick, first_name, last_name):
+    domain = random.choice(EMAIL_DOMAINS)
+    if random.random() < 0.3:
+        email = first_name + "." + last_name + "@" + domain
+    else:
+        email = nick + "@" + domain
+    return email
+
+
+@require_nicknames
+def create_profile(nick=None):
+    nick = "".join(random.sample(NICKNAMES, random.randrange(2, 5)))
+    first_name = random.choice(FIRSTNAMES)
+    last_name = random.choice(LASTNAMES)
+    email = generate_email(nick, first_name, last_name)
+    profile = Profile(
+        nick=nick,
+        first_name=first_name,
+        last_name=last_name,
+        email=email,
+        uses_urls=(random.random() < 0.5),
+    )
+    profile.search_engine = random.choice(SearchEngine.objects.all())
+    profile.browser_fingerprint = random.choice(BrowserFingerprint.objects.all())
+
+    profile.full_clean()
+    profile.save()
+    profile.interests.add(random.choice(Interest.objects.all()))
+    profile.save()
+    return profile
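A quick way to exercise the new generator from a Django shell; it assumes the SearchEngine, BrowserFingerprint and Interest tables are already populated (however fixtures are loaded in this project), since create_profile draws from .objects.all():

# Assumes the database already contains at least one SearchEngine,
# BrowserFingerprint and Interest row; otherwise random.choice() fails.
from profiles.models import create_profile

profile = create_profile()
print(profile.nick, profile.email, profile.uses_urls)
print(profile.search_engine, profile.browser_fingerprint)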
131 profiles/models_rdf.py Normal file
@@ -0,0 +1,131 @@
""" RDF serialization class for profile models """

import rdfserializer as rdf
from rdfserializer import RDFModelSerialiser as RDFModelSerializer
# ^ This was hurting my eyes way too much
from rdfserializer import SCHEMA as schema
from rdflib.namespace import Namespace

import profiles.models as profile_models


LOCAL_NS = Namespace('local:')


class RdfWebpage(RDFModelSerializer):
    """ RDF serializer for Webpage """

    _type = schema.WebPage
    model = profile_models.Webpage
    entries = [
        rdf.RDFSimpleField(schema.url, 'url'),
    ]


class RdfWebsite(RDFModelSerializer):
    """ RDF serializer for Website """

    _type = schema.WebSite
    model = profile_models.Website
    entries = [
        rdf.RDFSimpleField(schema.name, 'name'),
        rdf.RDFSimpleField(schema.url, 'url'),
        rdf.RDFManyField(schema.keywords, 'keywords',
                         lambda keyword: keyword.text),
        rdf.RDFManyLinker(schema.hasPart, 'notable_pages', RdfWebpage),
    ]


class RdfPlace(RDFModelSerializer):
    """ RDF serializer for Place """

    _type = schema.Place
    model = profile_models.Place
    entries = [
        rdf.RDFSimpleField(schema.name, 'name'),
        rdf.RDFSimpleField(schema.address, 'address'),
        rdf.RDFSimpleField(schema.latitude, 'lat'),
        rdf.RDFSimpleField(schema.longitude, 'lon'),
    ]


class RdfEvent(RDFModelSerializer):
    """ RDF serializer for Event """

    _type = schema.Event
    model = profile_models.Event
    entries = [
        rdf.RDFSimpleField(schema.name, 'name'),
        rdf.RDFSimpleField(schema.startDate, 'date'),
        rdf.RDFLeftBinder(schema.location, 'place', RdfPlace),
    ]


class RdfBrowserFingerprint(RDFModelSerializer):
    """ RDF serializer for BrowserFingerprint """

    _type = schema.Intangible
    model = profile_models.BrowserFingerprint
    entries = [
        rdf.RDFSimpleField(schema.description, 'description'),
        rdf.RDFSimpleField(LOCAL_NS.useragent, 'useragent'),
        rdf.RDFSimpleField(LOCAL_NS.appname, 'appname'),
        rdf.RDFSimpleField(LOCAL_NS.appversion, 'appversion'),
        rdf.RDFSimpleField(LOCAL_NS.platform, 'platform'),
        rdf.RDFSimpleField(LOCAL_NS.vendor, 'vendor'),
        rdf.RDFSimpleField(LOCAL_NS.vendorsub, 'vendorsub'),
        rdf.RDFSimpleField(LOCAL_NS.buildID, 'buildID'),
        rdf.RDFSimpleField(LOCAL_NS.oscpu, 'oscpu'),
        rdf.RDFSimpleField(LOCAL_NS.accept_encoding, 'accept_encoding'),
        rdf.RDFSimpleField(LOCAL_NS.accept_default, 'accept_default'),
        rdf.RDFSimpleField(LOCAL_NS.accept_lang, 'accept_lang'),
        rdf.RDFSimpleField(LOCAL_NS.pixeldepth, 'pixeldepth'),
        rdf.RDFSimpleField(LOCAL_NS.colordepth, 'colordepth'),
        rdf.RDFSimpleField(LOCAL_NS.screens, 'screens'),
    ]


class RdfSearchEngine(RDFModelSerializer):
    """ RDF serializer for SearchEngine """

    _type = schema.WebSite
    model = profile_models.SearchEngine
    entries = [
        rdf.RDFSimpleField(schema.url, 'url'),
        rdf.RDFSimpleField(schema.name, 'name'),
        rdf.RDFSimpleField(LOCAL_NS.query_pattern, 'query_pattern'),
    ]


class RdfInterest(RDFModelSerializer):
    """ RDF serializer for Interest """

    Interesttype = 'interest'
    model = profile_models.Interest
    entries = [
        rdf.RDFSimpleField(schema.name, 'name'),
        rdf.RDFManyField(schema.keywords, 'keywords',
                         lambda keyword: keyword.text),
        rdf.RDFManyLinker(schema.location, 'places', RdfPlace),
        rdf.RDFManyLinker(schema.website, 'websites', RdfWebsite),
        rdf.RDFManyLinker(schema.event, 'events', RdfEvent),
    ]


class RdfProfile(RDFModelSerializer):
    """ RDF serializer for Profile """

    _type = schema.Person
    model = profile_models.Profile
    entries = [
        rdf.RDFSimpleField(LOCAL_NS.nickname, 'nick'),
        rdf.RDFSimpleField(schema.given_name, 'first_name'),
        rdf.RDFSimpleField(schema.family_name, 'last_name'),
        rdf.RDFSimpleField(schema.email, 'email'),
        rdf.RDFSimpleField(LOCAL_NS.uses_urls, 'uses_urls'),
        rdf.RDFManyLinker(LOCAL_NS.interest, 'interests', RdfInterest),
        rdf.RDFLeftBinder(LOCAL_NS.search_engine, 'search_engine',
                          RdfSearchEngine),
        rdf.RDFLeftBinder(LOCAL_NS.browser_fingerprint, 'browser_fingerprint',
                          RdfBrowserFingerprint)
    ]
requirements.txt
@@ -1,5 +1,5 @@
 aiodns==1.1.1
-aiohttp==3.0.1
+aiohttp==2.3.2
 async-timeout==2.0.0
 attrs==17.4.0
 cchardet==2.1.1
@@ -12,3 +12,7 @@ pycares==2.3.0
 pytz==2017.3
 yarl==1.1.1
 beautifulsoup4==4.6.0
+stem==1.6.0
+pycurl==7.43.0.1
+rdflib==4.2.2
+git+https://github.com/tobast/RDFSerializer.git