Compare commits

...

48 commits

Author SHA1 Message Date
Théophile Bastian 67ad232533 Add a timeout to a single page retrieval 2018-02-26 15:42:36 +01:00
Théophile Bastian e140d4a8a7 Fix merge leftovers 2018-02-26 15:37:05 +01:00
Théophile Bastian 98fe69ba62 Real async crawling 2018-02-26 15:30:38 +01:00
Théophile Bastian 968ff6d24c More robust crawling 2018-02-26 15:29:36 +01:00
Rémi Oudin 5d4bd30e20 Exception handling 2018-02-26 15:15:03 +01:00
Rémi Oudin bdfa285e6b We do not want to use settings 2018-02-26 15:14:53 +01:00
Rémi Oudin 65f777f00f Should get the objects and not the Manager 2018-02-26 15:04:26 +01:00
Rémi Oudin 236e40d359 Sanity check 2018-02-26 14:57:46 +01:00
Rémi Oudin 22017cea91 Typo in data u_u 2018-02-26 14:56:22 +01:00
Rémi Oudin 549c861908 Bug fixed 2018-02-26 14:38:26 +01:00
Rémi Oudin 517be1d822 Merge rdf branch 2018-02-26 14:11:06 +01:00
Rémi Oudin c4f63a92b2 Error in the merge, mea culpa 2018-02-26 14:01:29 +01:00
Rémi Oudin db067e56fc Typo 2018-02-26 13:59:34 +01:00
Rémi Oudin 33bdae96e4 merge commit from histories_tobast into histories_models 2018-02-26 12:59:38 +01:00
Rémi Oudin 526aad1364 Add interests 2018-02-26 12:33:23 +01:00
Théophile Bastian 02e91bb2b7 Fix function calls 2018-02-26 11:56:02 +01:00
Théophile Bastian 3e5fc2f9b3 Fix search engine URL generation 2018-02-26 11:49:24 +01:00
Théophile Bastian 45ddbff91a Crawling and histories: fix a lot of stuff 2018-02-26 11:49:24 +01:00
Théophile Bastian e6d587bffd Actually save to DB a created history 2018-02-26 11:49:24 +01:00
Théophile Bastian 8baf408e02 Use dict from data/nicknames_dict for nicknames 2018-02-26 11:49:24 +01:00
Théophile Bastian 6463e348ac Fix populate.sh exec path 2018-02-26 11:48:51 +01:00
Théophile Bastian 22064ebee3 Histories: xml import/export — untested
To be tested when history generation is available
2018-02-26 11:48:51 +01:00
Théophile Bastian a4de51b84a Crawl: do not use global SEARCH_ENGINES 2018-02-26 11:48:51 +01:00
Théophile Bastian 4f0148cb63 Crawler: use a random fingerprint 2018-02-26 11:48:51 +01:00
Théophile Bastian 4a8bd32516 Fix tor_runner import 2018-02-26 11:48:51 +01:00
Rémi Oudin 44cf26df8f It can be useful to save a new object 2018-02-26 11:42:45 +01:00
Rémi Oudin adb892ab7d Check if crawling a search engine 2018-02-26 11:12:36 +01:00
Rémi Oudin 15db8b4697 Change option name due to downgrade of aiohttp 2018-02-26 10:23:32 +01:00
Rémi Oudin d6b26c0a46 Better use of history 2018-02-26 10:05:33 +01:00
Rémi Oudin 8f5c4f3f0f Use datetimes 2018-02-26 09:49:24 +01:00
Rémi Oudin 71d9e18eec Add headers support 2018-02-25 23:56:51 +01:00
Rémi Oudin 8ad46c0481 Bug fix, syntax error 2018-02-25 21:59:29 +01:00
Rémi Oudin f66c978466 Tor runner has a run function to replay the history 2018-02-25 21:53:28 +01:00
Rémi Oudin 0a676a2f65 PEP8 2018-02-25 21:34:20 +01:00
Rémi Oudin e074d96f02 tor_runner can make requests 2018-02-25 21:27:15 +01:00
Rémi Oudin 93b235cb6c Fix interests import 2018-02-25 21:20:52 +01:00
Rémi Oudin ae5699c089 Basic tor runner 2018-02-25 19:42:58 +01:00
Rémi Oudin f7313ff659 Add populate.sh script 2018-02-25 16:16:04 +01:00
Rémi Oudin 0661fe0f01 Fix path 2018-02-25 16:10:38 +01:00
Rémi Oudin 4b19febdf6 Add interests 2018-02-25 16:10:22 +01:00
Théophile Bastian 15323c3465 [REBASE ME] Crawl: enhance efficiency and output a tree 2018-02-25 15:08:06 +01:00
Théophile Bastian c3bcdea1eb Add tentative export to RDF 2018-02-25 14:37:30 +01:00
Rémi Oudin 05a2e2ca3f Partial generation of profiles 2018-02-25 13:18:12 +01:00
Rémi Oudin d4aefb6bb7 Load the data 2018-02-25 13:17:44 +01:00
Rémi Oudin 3eb82a4a0b data for names and emails 2018-02-25 13:17:27 +01:00
Rémi Oudin 7c0fb7dda1 Better naming 2018-02-25 11:49:44 +01:00
Rémi Oudin ee32e5385b Finished data import 2018-02-25 11:49:11 +01:00
Théophile Bastian 2732e4115f Add RDF models export classes — untested
Also add a dependency to https://github.com/tobast/RDFSerializer/
2018-02-23 13:32:32 +01:00
21 changed files with 1427 additions and 200 deletions

.gitignore (vendored, 1 line changed)

@ -65,3 +65,4 @@ venv/
# Django stuff
db.sqlite3
_vimrc_local.vim

crawl/crawl.py

@ -1,11 +1,10 @@
from threading import Thread
from queue import Queue
from urllib.robotparser import RobotFileParser
from urllib.error import URLError
from urllib.parse import urlparse
from ssl import CertificateError
from random import sample, randrange
from random import sample, randrange, randint
import re
from datetime import datetime, timedelta
@ -15,6 +14,8 @@ import async_timeout
from bs4 import BeautifulSoup, Comment
from profiles.models import BrowserFingerprint, SearchEngine
# Ugly hack to use this module alone instead of integrating it with Django
# from django.conf import settings
@ -26,11 +27,11 @@ MAX_PER_PAGE = 10
FOOTER_URL = re.compile(".*footer.*")
class Settings:
USER_AGENT = 'Default User'
settings = Settings()
startup_time = datetime.min
def url_getter(html, current_page, root_url):
@ -72,7 +73,7 @@ def url_getter(html, current_page, root_url):
# Works only with python >= 3.6
links_list = list(dict.fromkeys(links_list))
forbidden_words = ['login', 'agreement', 'mailto']
forbidden_words = ['login', 'agreement', 'mailto', 'settings']
links_list = [link for link in links_list if not any(word in link.lower()
for word in
forbidden_words)]
@ -80,8 +81,6 @@ def url_getter(html, current_page, root_url):
return links_list
class WebsiteSchedulerMeta(type):
""" Meta-class for WebsiteScheduler, allowing a singleton class-like
interface, but spawning one instance per canonical website URL """
@ -104,36 +103,47 @@ class WebsiteSchedulerMeta(type):
class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
""" Schedule the accesses to a website as of robots.txt """
def __init__(self, name):
search_engines = [] # Must be set by CrawlingThread.__init__
def __init__(self, name, user_agent):
self.name = name
self.last_crawled = datetime.fromtimestamp(0)
self.dead = False
try:
robots_url = self.urlroot() + 'robots.txt'
self.robot_parser = RobotFileParser(robots_url)
self.robot_parser.read() # TODO async?
except (URLError, CertificateError):
self.can_fetch_b = False
self.user_agent = (user_agent if user_agent is not None
else settings.USER_AGENT)
if any(self.urlroot() in item for item in self.search_engines):
print("found a search engine for %s" % self.urlroot())
self.crawl_delay = timedelta(seconds=5)
self.can_fetch_b = True
else:
try:
robots_url = self.unsafe_urlroot() + 'robots.txt'
robots_url = self.urlroot() + 'robots.txt'
self.robot_parser = RobotFileParser(robots_url)
self.robot_parser.read()
except URLError: # Almost surely an offline website.
self.robot_parser.read() # TODO async?
except (URLError, CertificateError):
try:
robots_url = self.unsafe_urlroot() + 'robots.txt'
self.robot_parser = RobotFileParser(robots_url)
self.robot_parser.read()
except URLError: # Almost surely an offline website.
self.dead = True
self.crawl_delay = 0
except Exception as e:
print(e)
raise e
if not self.robot_parser.default_entry:
self.dead = True
self.crawl_delay = 0
except Exception as e:
print(e)
raise e
if not self.robot_parser.default_entry:
self.dead = True
if not self.dead:
delay = self.robot_parser.crawl_delay(settings.USER_AGENT)
if delay is None:
req_rate = self.robot_parser.request_rate(settings.USER_AGENT)
if req_rate is None:
delay = 5
else:
delay = req_rate.requests, req_rate.seconds
self.crawl_delay = timedelta(seconds=delay)
if not self.dead:
delay = self.robot_parser.crawl_delay(self.user_agent)
if delay is None:
req_rate = self.robot_parser.request_rate(self.user_agent)
if req_rate is None:
delay = 5
else:
delay = req_rate.requests, req_rate.seconds
self.crawl_delay = timedelta(seconds=delay)
def urlroot(self):
''' Get the root url for this website '''
@ -151,7 +161,9 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
def can_fetch(self, url):
''' Check whether this program can fetch a given page '''
return (not self.dead) and self.robot_parser.can_fetch(settings.USER_AGENT, url)
return ((self.can_fetch_b)
or ((not self.dead) and
self.robot_parser.can_fetch(self.user_agent, url)))
def fetching(self):
''' Tell the scheduler that a page is being fetched now '''
@ -162,38 +174,47 @@ class CrawlingThread(Thread):
""" A separate thread for the crawling task. This is needed to use asyncio,
since the thread will need its own event loop. """
def __init__(self, user, url, queue):
global settings
self.queue = queue
def __init__(self, url):
engine_list = [engine.url for engine in SearchEngine.objects.all()]
WebsiteScheduler.search_engines = engine_list
nb_fingerprint = len(BrowserFingerprint.objects.all())
fingerprint = BrowserFingerprint.objects.all()[
randint(0, nb_fingerprint - 1)]
self.headers = fingerprint.serialize_headers()
self.output_tree = []
super(CrawlingThread, self).__init__()
if user:
settings.USER_AGENT = user.serialize_headers()
self.url = url
def run(self):
global startup_time
tasks = []
#tasks.append(async_crawler("http://plus.google.com/+Python"))
#tasks.append(async_crawler('https://python.org/'))
tasks.append(async_crawler(self.url, self.queue))
tasks.append(run_crawl(self.url, self.output_tree, self.headers))
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
startup_time = datetime.now()
loop.run_until_complete(asyncio.wait(tasks))
loop.close()
try:
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
loop.run_until_complete(asyncio.wait(tasks))
finally:
loop.close()
class PageGetter:
""" Asynchronously get a webpage, abiding by robots.txt """
def __init__(self, session, url):
headers = None
def __init__(self, session, url, user_agent):
self.url = url
self.session = session
self.user_agent = user_agent
async def get(self, ssl=True):
""" Actually retrieve the webpage """
scheduler = WebsiteScheduler(self.url)
scheduler = WebsiteScheduler(self.url, self.user_agent)
if not scheduler.can_fetch(self.url):
return None
@ -203,7 +224,8 @@ class PageGetter:
delay = scheduler.fetch_delay()
scheduler.fetching()
async with async_timeout.timeout(10):
async with self.session.get(self.url, ssl=ssl) as resp:
async with self.session.get(self.url, verify_ssl=ssl) as resp:
print("Resp status %s" % resp.status)
try:
return await resp.text()
except UnicodeDecodeError:
@ -213,46 +235,89 @@ class PageGetter:
async def async_print(url):
""" Debug function to follow what's actually happening """
async with aiohttp.ClientSession() as session:
html = await PageGetter(session, url).get(ssl=False)
html = await PageGetter(session, url,
settings.USER_AGENT).get(ssl=False)
print('GOT {}HTML for {} at {}'.format(
print('GOT {}HTML for {}'.format(
'None ' if html is None else '',
url,
datetime.now() - startup_time))
))
async def async_crawler(url, queue):
queued = [url]
crawled = []
while queued and (len(crawled) < HARD_LIMIT):
async with aiohttp.ClientSession() as session:
try:
url = queued.pop(0)
except IndexError:
print("Error queue is empty")
return crawled
parsed_url = urlparse(url)
print("Crawling {}".format(url))
html = await PageGetter(session, url).get(ssl=False)
if html:
new_urls = url_getter(
html,
url,
parsed_url.scheme + "://" + parsed_url.netloc
)
crawled += [url]
if new_urls:
sampled = sample(
new_urls,
randrange(min(MAX_PER_PAGE, len(new_urls)))
)
queued += [sample_url for sample_url in sampled if
sample_url not in queued and sample_url not in
crawled]
print(crawled)
queue.put(crawled)
if __name__ == '__main__':
queue = Queue()
crawl = CrawlingThread(None, "https://python.org/", queue)
crawl.start()
crawl.join()
class CrawlElem:
''' Describes a crawled element, to be assembled into a tree '''
def __init__(self, url, parent):
self.url = url
self.parent = parent
async def run_crawl(url, output_tree, headers=None):
''' Starts a crawling session '''
if headers is None:
headers = {}
if 'User-Agent' not in headers:
headers['User-Agent'] = settings.USER_AGENT
user_agent = headers['User-Agent']
crawled = set()
async with aiohttp.ClientSession(headers=headers) as session:
await async_crawler(
url, output_tree, crawled, user_agent, session, None)
def simplify_url(url):
anchor = url.find('#')
if anchor >= 0:
url = url[:anchor]
prot = url.find('://')
if prot >= 0:
url = url[prot+3:]
if url.startswith('www.'):
url = url[4:]
return url
async def async_crawler(url, out_tree, crawled, user_agent, session, parent):
if len(crawled) >= HARD_LIMIT:
return
crawled.add(simplify_url(url))
parsed_url = urlparse(url)
print("Crawling {}".format(url))
try:
with async_timeout.timeout(3):
html = await PageGetter(session, url, user_agent).get(ssl=False)
except asyncio.TimeoutError:
return
new_tasks = []
if html:
this_elem = CrawlElem(url, parent)
out_tree.append(this_elem)
new_urls = url_getter(
html,
url,
parsed_url.scheme + "://" + parsed_url.netloc
)
if new_urls:
sampled = sample(
new_urls,
randrange(min(MAX_PER_PAGE, len(new_urls)))
)
for sample_url in sampled:
if simplify_url(sample_url) not in crawled:
new_tasks.append(async_crawler(
sample_url, out_tree, crawled, user_agent, session,
this_elem))
else:
print("No html received")
if len(crawled) >= HARD_LIMIT:
return
if new_tasks:
await asyncio.wait(new_tasks)
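
A minimal usage sketch of the reworked crawler, mirroring how histories/models.py drives it: build a CrawlingThread around a start URL, join it, then walk output_tree. It assumes a configured Django environment (e.g. python3 manage.py shell) with at least one SearchEngine and one BrowserFingerprint loaded via populate.sh; the start URL is only an example.

# Hedged usage sketch; run inside a Django shell, not standalone.
from crawl import crawl

crawler = crawl.CrawlingThread("https://python.org/")
crawler.start()
crawler.join()  # returns once HARD_LIMIT pages are reached or no links remain

# output_tree is a flat list of CrawlElem nodes; parent links encode the tree.
for elem in crawler.output_tree:
    parent_url = elem.parent.url if elem.parent is not None else None
    print(elem.url, "reached from", parent_url)

# Deduplication relies on simplify_url, e.g.:
# crawl.simplify_url("https://www.python.org/about/#history") == "python.org/about/"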

data/.gitignore (vendored, new file, 1 line)

@ -0,0 +1 @@
nicknames_dict

data/email_domains.txt (new file, 14 lines)

@ -0,0 +1,14 @@
gmail.com
protonmail.com
riseup.net
tutanoto.com
outlook.fr
fastmail.com
yandex.com
aim.com
icloud.com
yahoo.com
fmx.fr
mail.com
hushmail.com
inbox.com

data/events.json (new file, 27 lines)

@ -0,0 +1,27 @@
[
{
"name" : "Atelier Anti-Pub",
"date" : "07/03/2018 19:00 UTC",
"place" : {
"name" : "Centre Social Autogéré Vaydom",
"address" : "37 rue Marceau, Ivry-sur-Seine",
"lat" : "48.81787",
"lon" : "2.38032"
}
},
{
"name" : "Rassemblement de soutien pour Bure",
"date" : "27/02/2018 17:00 UTC",
"place" : {
"name" : "Place Saint-Michel",
"address" : "Place Saint-Michel, 75005 Paris",
"lat" : "48.85374",
"lon" : "2.34455"
}
},
{
"name" : "Création d'un serveur mail",
"date" : "15/02/2018 12:00 UTC",
"place" : "La Mutinerie"
}
]

data/firstnames.txt (new file, 200 lines)

@ -0,0 +1,200 @@
Jean
Marie
Philippe
Nathalie
Michel
Isabelle
Alain
Sylvie
Patrick
Catherine
Nicolas
Martine
Christophe
Christine
Pierre
Françoise
Christian
Valérie
Éric
Sandrine
Frédéric
Stéphanie
Laurent
Véronique
Stéphane
Sophie
David
Céline
Pascal
Chantal
Daniel
Patricia
Alexandre
Anne
Julien
Brigitte
Thierry
Julie
Olivier
Monique
Bernard
Aurélie
Thomas
Nicole
Sébastien
Laurence
Gérard
Annie
Didier
Émilie
Dominique
Dominique
Vincent
Virginie
François
Corinne
Bruno
Élodie
Guillaume
Christelle
Jérôme
Camille
Jacques
Caroline
Marc
Léa
Maxime
Sarah
Romain
Florence
Claude
Laetitia
Antoine
Audrey
Franck
Hélène
Jean-Pierre
Laura
Anthony
Manon
Kévin
Michèle
Gilles
Cécile
Cédric
Christiane
Serge
Béatrice
André
Claire
Mathieu
Nadine
Benjamin
Delphine
Patrice
Pauline
Fabrice
Karine
Joël
Mélanie
Jérémy
Marion
Clément
Chloe
Arnaud
Jacqueline
Denis
Elisabeth
Paul
Evelyne
Lucas
Marine
Hervé
Claudine
Jean-Claude
Anais
Sylvain
Lucie
Yves
Danielle
Ludovic
Carole
Guy
Fabienne
Florian
Mathilde
Damien
Sandra
Alexis
Pascale
Mickaël
Annick
Quentin
Charlotte
Emmanuel
Emma
Louis
Severine
Benoît
Sabrina
Jean-Luc
Amandine
Fabien
Myriam
Francis
Jocelyne
Hugo
Alexandra
Jonathan
Angelique
Loïc
Josiane
Xavier
Joelle
Théo
Agnes
Adrien
Mireille
Raphaël
Vanessa
Jean-François
Justine
Grégory
Sonia
Robert
Bernadette
Michaël
Emmanuelle
Valentin
Oceane
Cyril
Amelie
Jean-Marc
Clara
René
Maryse
Lionel
Anne-marie
Yannick
Fanny
Enzo
Magali
Yannis
Marie-christine
Jean-Michel
Morgane
Baptiste
Ines
Matthieu
Nadia
Rémi
Muriel
Georges
Jessica
Aurélien
Laure
Nathan
Genevieve
Jean-Paul
Estelle

data/interests.json (new file, 55 lines)

@ -0,0 +1,55 @@
[
{
"name": "occupation",
"keywords": [
{"keyword" : "ZAD NDDL"},
{"keyword" : "Organiser un squat"},
{"keyword" : "mobilisation et rassemblement"}
],
"places": [
{"place" : "Zad NDDL"},
{"place" : "Zad Bure"}
],
"websites": [
{"website": "zad nadir"}
],
"events": [
{"event": "Rassemblement de soutien pour Bure"}
]
},
{
"name": "LGBT",
"keywords": [
{"keyword" : "Discrimniation sexistes, quelles actions ?"},
{"keyword" : "gender queer Paris"},
{"keyword" : "Existrans Paris"}
],
"places": [
{"place" : "La Mutinerie"}
],
"websites": [
{"website": "emmaclit"},
{"website": "paris-luttes info"}
],
"events": [
{"event": "Création d'un serveur mail"}
]
},
{
"name": "Anti pub",
"keywords": [
{"keyword" : "Affichage JCDecaux"},
{"keyword" : "Anti-pub"},
{"keyword" : "Journée contre la publicité"}
],
"places": [
{"place" : "Centre Social Autogéré Vaydom"}
],
"websites": [
{"website": "paris-luttes info"}
],
"events": [
{"event": "Atelier Anti-Pub"}
]
}
]

data/lastnames.txt (new file, 200 lines)

@ -0,0 +1,200 @@
Martin
Bernard
Thomas
Petit
Robert
Richard
Durand
Dubois
Moreau
Laurent
Simon
Michel
Lefebvre
Leroy
Roux
David
Bertrand
Morel
Fournier
Girard
Bonnet
Dupont
Lambert
Fontaine
Rousseau
Vincent
Muller
Lefevre
Faure
Andre
Mercier
Blanc
Guerin
Boyer
Garnier
Chevalier
Francois
Legrand
Gauthier
Garcia
Perrin
Robin
Clement
Morin
Nicolas
Henry
Roussel
Mathieu
Gautier
Masson
Marchand
Duval
Denis
Dumont
Marie
Lemaire
Noel
Meyer
Dufour
Meunier
Brun
Blanchard
Giraud
Joly
Riviere
Lucas
Brunet
Gaillard
Barbier
Arnaud
Martinez
Gerard
Roche
Renard
Schmitt
Roy
Leroux
Colin
Vidal
Caron
Picard
Roger
Fabre
Aubert
Lemoine
Renaud
Dumas
Lacroix
Olivier
Philippe
Bourgeois
Pierre
Benoit
Rey
Leclerc
Payet
Rolland
Leclercq
Guillaume
Lecomte
Lopez
Jean
Dupuy
Guillot
Hubert
Berger
Carpentier
Sanchez
Dupuis
Moulin
Louis
Deschamps
Huet
Vasseur
Perez
Boucher
Fleury
Royer
Klein
Jacquet
Adam
Paris
Poirier
Marty
Aubry
Guyot
Carre
Charles
Renault
Charpentier
Menard
Maillard
Baron
Bertin
Bailly
Herve
Schneider
Fernandez
Le Gall
Collet
Leger
Bouvier
Julien
Prevost
Millet
Perrot
Daniel
Le Roux
Cousin
Germain
Breton
Besson
Langlois
Remy
Le Goff
Pelletier
Leveque
Perrier
Leblanc
Barre
Lebrun
Marchal
Weber
Mallet
Hamon
Boulanger
Jacob
Monnier
Michaud
Rodriguez
Guichard
Gillet
Etienne
Grondin
Poulain
Tessier
Chevallier
Collin
Chauvin
Da Silva
Bouchet
Gay
Lemaitre
Benard
Marechal
Humbert
Reynaud
Antoine
Hoarau
Perret
Barthelemy
Cordier
Pichon
Lejeune
Gilbert
Lamy
Delaunay
Pasquier
Carlier
Laporte

data/website.json

@ -1,93 +1,93 @@
[
"website": {
{
"name":"emmaclit",
"url":"https://emmaclit.com/",
"keywords": [
"keyword":"Charge mentale",
"keyword":"Un autre regard",
"keyword":"Un petit poutou",
"keyword":"solidarité",
"keyword":"dédicace"
},
"notable_pages": [
"webpage": "https://emmaclit.com/2017/05/09/repartition-des-taches-hommes-femmes/",
"webpage": "https://emmaclit.com/2016/12/01/une-famille-parmi-dautres/",
"webpage": "https://emmaclit.com/2017/09/11/travaille-pourquoi/"
]
},
"website": {
"name":"paris-luttes info",
"url":"https//paris-luttes.info/",
"keywords": [
"keyword":"manifestations",
"keyword":"solidarité immigré·e·s",
"keyword":"grève salariés",
"keyword":"prison",
"keyword":"violence policère"
{"keyword":"Charge mentale"},
{"keyword":"Un autre regard"},
{"keyword":"Un petit poutou"},
{"keyword":"solidarité"},
{"keyword":"dédicace"}
],
"notable_pages": [
"webpage": "https://paris-luttes.info/-analyse-et-reflexion-?lang=fr",
"webpage": "https://paris-luttes.info/comment-publier-sur-paris-luttes-134?lang=fr",
"webpage": "https://paris-luttes.info/pourquoi-et-comment-utiliser-tor-9013?lang=fr"
{"webpage": "https://emmaclit.com/2017/05/09/repartition-des-taches-hommes-femmes/"},
{"webpage": "https://emmaclit.com/2016/12/01/une-famille-parmi-dautres/"},
{"webpage": "https://emmaclit.com/2017/09/11/travaille-pourquoi/"}
]
},
"website": {
{
"name":"paris-luttes info",
"url":"https://paris-luttes.info/",
"keywords": [
{"keyword":"manifestations"},
{"keyword":"solidarité immigré·e·s"},
{"keyword":"grève salariés"},
{"keyword":"prison"},
{"keyword":"violence policère"}
],
"notable_pages": [
{"webpage": "https://paris-luttes.info/-analyse-et-reflexion-?lang=fr"},
{"webpage": "https://paris-luttes.info/comment-publier-sur-paris-luttes-134?lang=fr"},
{"webpage": "https://paris-luttes.info/pourquoi-et-comment-utiliser-tor-9013?lang=fr"}
]
},
{
"name":"zad nadir",
"url":"http://zad.nadir.org/",
"keywords": [
"keyword":"Écologie",
"keyword":"opération césar",
"keyword":"expulsion vinci",
"keyword":"adresse",
"keyword":"la wardine",
"keyword":"route des chicanes",
"keyword":"opposition à l'aéroport Grand Ouest"
{"keyword":"Écologie"},
{"keyword":"opération césar"},
{"keyword":"expulsion vinci"},
{"keyword":"adresse"},
{"keyword":"la wardine"},
{"keyword":"route des chicanes"},
{"keyword":"opposition à l'aéroport Grand Ouest"}
],
"notable_pages": [
"webpage": "http://zad.nadir.org/spip.php?article86&lang=fr",
"webpage": "http://zad.nadir.org/spip.php?article515&lang=fr",
"webpage": "http://zad.nadir.org/spip.php?rubrique71",
"webpage": "https://zad.nadir.org/spip.php?rubrique70"
{"webpage": "http://zad.nadir.org/spip.php?article86&lang=fr"},
{"webpage": "http://zad.nadir.org/spip.php?article515&lang=fr"},
{"webpage": "http://zad.nadir.org/spip.php?rubrique71"},
{"webpage": "https://zad.nadir.org/spip.php?rubrique70"}
]
},
"website": {
{
"name":"Fnac",
"url":"https://www.fnac.com/",
"keywords": [
"keyword":"smartphone",
"keyword":"SAV",
"keyword":"Macbook",
"keyword":"TV",
"keyword":"PC Gaming",
"keyword":"DVD",
"keyword":"Home Cinema Philips",
"keyword":"Billeterie"
{"keyword":"smartphone"},
{"keyword":"SAV"},
{"keyword":"Macbook"},
{"keyword":"TV"},
{"keyword":"PC Gaming"},
{"keyword":"DVD"},
{"keyword":"Home Cinema Philips"},
{"keyword":"Billeterie"}
],
"notable_pages": [
"webpage": "https://www.fnac.com/Informatique/shi48966/w-4#bl=MMinfo",
"webpage": "https://www.fnac.com/Service/default.aspx#bl=footer",
"webpage": "https://www.fnac.com/Ventes-Flash/shi42245/w-4#bl=marktlink1",
"webpage": "https://www.fnac.com/Home-cinema-barre-de-son-et-enceinte-TV/shi474916/w-4#bl=MMtvh"
{"webpage": "https://www.fnac.com/Informatique/shi48966/w-4#bl=MMinfo"},
{"webpage": "https://www.fnac.com/Service/default.aspx#bl=footer"},
{"webpage": "https://www.fnac.com/Ventes-Flash/shi42245/w-4#bl=marktlink1"},
{"webpage": "https://www.fnac.com/Home-cinema-barre-de-son-et-enceinte-TV/shi474916/w-4#bl=MMtvh"}
]
},
"website": {
{
"name":"Sea Shepherd",
"url":"https://www.seashepherd.fr/",
"keywords": [
"keyword":"pirates",
"keyword":"Phoques",
"keyword":"Paul Watson",
"keyword":"harponnage",
"keyword":"seal",
"keyword":"Chasse aux dauphins",
"keyword":"participation",
"keyword":"boutique"
{"keyword":"pirates"},
{"keyword":"Phoques"},
{"keyword":"Paul Watson"},
{"keyword":"harponnage"},
{"keyword":"seal"},
{"keyword":"Chasse aux dauphins"},
{"keyword":"participation"},
{"keyword":"boutique"}
],
"notable_pages": [
"webpage": "http://www.seashepherd.fr/index.php/qui-sommes-nous",
"webpage": "http://nyamba.seashepherd.info/",
"webpage": "http://seashepherd-shop.com/en/",
"webpage": "http://seashepherd.fr/index.php/qui-sommes-nous/sea-shepherd-france"
{"webpage": "http://www.seashepherd.fr/index.php/qui-sommes-nous"},
{"webpage": "http://nyamba.seashepherd.info/"},
{"webpage": "http://seashepherd-shop.com/en/"},
{"webpage": "http://seashepherd.fr/index.php/qui-sommes-nous/sea-shepherd-france"}
]
}
]

histories/migrations/0001_initial.py (new file, 34 lines)

@ -0,0 +1,34 @@
# Generated by Django 2.0.1 on 2018-02-25 19:08
from django.db import migrations, models
import django.db.models.deletion
class Migration(migrations.Migration):
initial = True
dependencies = [
('profiles', '0001_initial'),
]
operations = [
migrations.CreateModel(
name='History',
fields=[
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('start_ts', models.DateTimeField(help_text='The starting timestamp of the history. Useful for cron-like structure.')),
('played', models.BooleanField(default=False)),
('user', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='profiles.Profile')),
],
),
migrations.CreateModel(
name='HistoryEntry',
fields=[
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('search', models.URLField(help_text='The url to be searched')),
('timestamp', models.DateTimeField()),
('history', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='histories.History')),
],
),
]

histories/models.py

@ -3,13 +3,26 @@ entries, which looks like human-based browsing, according to a dedicated user
interests, keywords...
"""
from collections import namedtuple
import random
from math import floor
from queue import Queue
from xml.etree import ElementTree as ET
from datetime import datetime
from django.db import models
from django.core.exceptions import ValidationError
import profiles.models as profiles
from crawl import crawl
from pinocchio.settings import HISTORY_MIN
from .tor_runner import TorInstance
class InvalidXml(Exception):
def __init__(self, what='unexpected XML data.'):
super().__init__()
self.what = what
def __str__(self):
return "Invalid XML: " + self.what
class HistoryEntry(models.Model):
@ -27,14 +40,48 @@ class HistoryEntry(models.Model):
"""
return "{} : {}".format(self.timestamp, self.search)
def to_xml(self, xml_root):
entry = ET.Element('history')
entry_url = ET.Element('url')
entry_url.text = self.search
entry_ts = ET.Element('timestamp')
entry_ts.text = self.timestamp.timestamp()
entry.append(entry_url)
entry.append(entry_ts)
xml_root.append(entry)
@staticmethod
def from_xml(xml_root, in_history):
if xml_root.tag != 'history':
raise InvalidXml("expected <history> tag here.")
url, timestamp = None, None
for child in xml_root:
if child.tag == 'url':
url = child.text
elif child.tag == 'timestamp':
try:
timestamp = datetime.fromtimestamp(child.text)
except TypeError:
raise InvalidXml("invalid timestamp {}".format(child.text))
else:
raise InvalidXml("unknown tag {} as child of <history>".format(
child.tag))
output = HistoryEntry()
output.search = url
output.timestamp = timestamp
output.history = in_history
return output
class History(models.Model):
""" A history for a user, containing some web connections (http, https).
Each history is timed, in a human-behaviour manner. """
start_ts = models.DateTimeField(
help_text='The starting timestamp of the history. Useful for cron-like '
'structure.'
help_text=('The starting timestamp of the history. Useful for '
'cron-like structure.')
)
played = models.BooleanField(default=False)
@ -46,58 +93,134 @@ class History(models.Model):
def return_history(self):
""" Returns the history, sorted by increasing timestamps
"""
history_set = self.history_set.order_by('timestamp')
return history_set
output_history = self.historyentry_set.order_by('timestamp')
output_history = [(item.search, item.timestamp.date())
for item in output_history]
return output_history
def __str__(self):
""" Returns the string representation of a history.
"""
history_set = self.history_set.order_by('timestamp')
header = "[History]:\n"
return header + "\n".join(history_set)
entries = self.historyentry_set.order_by('timestamp')
output = "[History]:\n"
for entry in entries:
output += str(entry) + '\n'
return output
def play_history(self):
def play_histories(self):
""" Actually plays the history.
"""
self.played = True
runner = TorInstance(self.return_history())
runner.run()
self.save()
def to_xml(self, xml_root):
''' Exports the current history to xml '''
hist_node = ET.Element("history", attrib={
'start-ts': self.start_ts,
'played': 1 if self.played else 0,
'user': self.user.pk,
})
xml_root.append(hist_node)
for entry in self.historyentry_set:
entry.to_xml(hist_node)
@staticmethod
def from_xml(xml_root):
''' Loads an history from an XML file '''
REQUIRED_ATTR = ['start-ts', 'played', 'user']
if xml_root.tag != 'history':
raise InvalidXml('unexpected node {} as root of an history'.format(
xml_root.tag))
for attr in REQUIRED_ATTR:
if attr not in xml_root.attrib:
raise InvalidXml(('missing attribute "{}" for tag of type '
'history').format(attr))
start_ts = xml_root.attrib['start-ts']
played = xml_root.attrib['played']
user_pk = xml_root.attrib['user']
users = profiles.Profile.objects.filter(pk=user_pk)
if len(users) != 1:
raise InvalidXml('primary key for History {} is invalid'.format(
user_pk))
output = History()
output.start_ts = start_ts
output.played = played > 0
output.user = users[0]
for child in xml_root:
HistoryEntry.from_xml(child, output)
return output
PartialHistoryEntry = namedtuple('PartialHistoryEntry',
['url', 'timestamp'])
def generate_partial_history(user, t_start):
""" Generate the part of the history resulting from the crawl starting at
the given url.
"""
timestamp = t_start
result = []
basis = generate_first_url(user)
result.append((basis, t_start))
t_start += 5* random.weibullvariate(1, 1.5)
queue = Queue()
crawler = crawl.CrawlingThread(user, basis, queue)
result.append(PartialHistoryEntry(basis, timestamp))
t_start += 5 * random.weibullvariate(1, 1.5)
crawler = crawl.CrawlingThread(basis)
crawler.start()
crawler.join()
urls = queue.get()
for url in urls:
t_start += 5* random.weibullvariate(1, 1.5)
result.append((url, t_start)
urls_tree = crawler.output_tree
open_time = {}
for elem in urls_tree:
url, parent = elem.url, elem.parent
timestamp = 0
if parent is None:
timestamp = t_start
else:
timestamp = open_time[parent] + 5 * random.weibullvariate(1, 1.5)
open_time[elem] = timestamp
result.append(PartialHistoryEntry(url, timestamp))
return result
def generate_first_url(user):
""" Generate the first url of a partial history, based on the user
information. """
interest = random.choice(
[user.interests.keywords.all(), user.interests.places.all(),
user.interests.websites.all(), user.interests.events.all()
]
)
def nonempty(seq):
out = []
for elt in seq:
if elt:
out.append(elt)
return out
all_keywords = profiles.Keyword.objects.filter(
interest__profile__in=[user])
all_websites = profiles.Website.objects.filter(
interest__profile__in=[user])
all_places = profiles.Place.objects.filter(
interest__profile__in=[user])
all_events = profiles.Event.objects.filter(
interest__profile__in=[user])
interest = random.choice(nonempty([
all_keywords,
all_websites,
all_places,
all_events,
]))
search_term = random.choice(interest)
url = search_term.generate_url(user)
return url
def generate_history(user, ts_start):
def generate_history(user, start_time):
""" Generate a new history for the user `user`, starting from timestamp
`ts_start`.
A few heuristics are used in order to give the impression that the history
@ -105,19 +228,32 @@ def generate_history(user, ts_start):
"""
# let's define a new history object.
history = History(start_ts=ts_start, user=user)
history = History(start_ts=start_time, user=user)
length = HISTORY_MIN + floor(10 * random.weibullvariate(1, 1.5))
history.full_clean()
history.save()
history_line = 0
current_timestamp = start_time.timestamp()
while history_line < length:
ts_start += 5 * random.weibullvariate(1, 2.8)
history_list = generate_partial_history(user, ts_start)
ts_start = history_list[-1].timestamp + 5 * random.weibullvariate(1, 5)
hist_size = 0
while hist_size < length:
current_timestamp += 5 * random.weibullvariate(1, 2.8)
history_list = generate_partial_history(user, current_timestamp)
current_timestamp = \
history_list[-1].timestamp + 5 * random.weibullvariate(1, 5)
for (url, timestamp) in history_list:
new_line = HistoryEntry(
search=url,
timestamp=timestamp,
history=history
)
new_line.save()
if len(url) < 200:
new_line = HistoryEntry(
search=url,
timestamp=datetime.fromtimestamp(timestamp),
history=history
)
try:
new_line.full_clean()
new_line.save()
hist_size += 1
except ValidationError:
continue
return history
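
The pacing above relies on Weibull-distributed gaps between page loads. A small self-contained sketch of just that timing model, using the same 5 * random.weibullvariate(1, 1.5) gaps as generate_partial_history (the ten-entry length is arbitrary):

import random
from datetime import datetime

current_ts = datetime.now().timestamp()
simulated = []
for _ in range(10):
    # Each simulated page view happens 5 * Weibull(1, 1.5) seconds after the
    # previous one, which clusters most gaps around a few seconds.
    current_ts += 5 * random.weibullvariate(1, 1.5)
    simulated.append(datetime.fromtimestamp(current_ts))

for ts in simulated:
    print(ts.isoformat())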

histories/tor_runner.py (new file, 120 lines)

@ -0,0 +1,120 @@
"""
Module that handles tor instance creation in order to safely run histories
"""
import shutil
import datetime as dt
from time import sleep
import asyncio
import aiohttp
from aiosocks.connector import ProxyConnector, ProxyClientRequest
import async_timeout
import stem.process as tor
class TorInstance():
"""
A tor instance object, with some useful information.
It is designed to be used as a worker in order to replay an history.
"""
BASE_SOCKS_PORT = 40000
BASE_CONTROL_PORT = 20000
BASE_DATA_DIR = "/tmp/tor{}/"
TOR_RUNNER = 0
@classmethod
async def create(cls, history, headers):
""" Factory creation of tor processes"""
socks_port = cls.BASE_SOCKS_PORT + cls.TOR_RUNNER
control_port = cls.BASE_CONTROL_PORT + cls.TOR_RUNNER
data_dir = cls.BASE_DATA_DIR.format(cls.TOR_RUNNER)
TorInstance.TOR_RUNNER += 1
self = TorInstance()
self.socks_port = socks_port
self.control_port = control_port
self.data_dir = data_dir
self.history = history
self.headers = headers
self.proxy = "socks5://127.0.0.1:{}".format(self.socks_port)
self.create_session()
self.process = tor.launch_tor_with_config(
config={
'ControlPort' : str(control_port),
'SocksPort' : str(socks_port),
'DataDir' : data_dir
}
)
return self
def __init__(self):
self.socks_port = 0
self.control_port = 0
self.data_dir = ""
self.history = None
self.proxy = ""
self.headers = {}
self.session = None
self.process = None
async def run(self):
""" Runs the Tor Instance on the history.
"""
while (self.history[0][1] - dt.datetime.now()).total_seconds() >= 10:
print("Sleeping")
sleep(10)
while self.history:
item = self.history.pop(0)
async with async_timeout.timeout(30):
await(self.query(item[0]))
now = dt.datetime.now()
if self.history and now <= self.history[0][1]:
sleep((self.history[0][1] - now).total_seconds())
def create_session(self):
""" Create a aiohttp session.
"""
conn = ProxyConnector(remote_resolve=True)
self.session = aiohttp.ClientSession(
connector=conn,
headers=self.headers,
request_class=ProxyClientRequest
)
async def query(self, url):
""" Performs a query.
"""
async with async_timeout.timeout(30):
async with self.session.get(
url,
proxy=self.proxy,
proxy_auth=None) as resp:
try:
return await resp.text()
except UnicodeDecodeError:
return None
def __str__(self):
""" Utility function """
return ('[TOR] SOCKSPort: {0.socks_port}, ControlPort: '
'{0.control_port}, DataDir: {0.data_dir}'.format(self))
async def kill(self):
""" Kills the process and remove the data dir"""
self.process.kill()
self.session.close()
shutil.rmtree(self.data_dir)
async def main():
""" Test function """
for _ in range(3):
instance = await TorInstance.create(None, {"user-agent" : "Blah"})
await instance.query("https://python.org/")
print("One page received")
await instance.kill()
if __name__ == "__main__":
LOOP = asyncio.get_event_loop()
LOOP.run_until_complete(main())
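
Beyond the main() smoke test above, the intended use is to hand TorInstance a list of (url, datetime) pairs and let run() pace the requests. A sketch under those assumptions (a local tor binary plus the stem/aiosocks dependencies from requirements.txt; run from the project root; URLs and delays are placeholders):

import asyncio
import datetime as dt

from histories.tor_runner import TorInstance

async def replay_example():
    now = dt.datetime.now()
    history = [
        ("https://python.org/", now + dt.timedelta(seconds=15)),
        ("https://www.torproject.org/", now + dt.timedelta(seconds=45)),
    ]
    instance = await TorInstance.create(history, {"User-Agent": "Blah"})
    await instance.run()   # replays the history, pacing requests to the planned timestamps
    await instance.kill()  # kills the tor process and removes its DataDir

if __name__ == "__main__":
    asyncio.get_event_loop().run_until_complete(replay_example())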

pinocchio/settings.py

@ -97,7 +97,7 @@ USE_I18N = True
USE_L10N = True
USE_TZ = True
USE_TZ = False # We don't really care, we want POSIX timestamps
# Static files (CSS, JavaScript, Images)

populate.sh (new file, 10 lines)

@ -0,0 +1,10 @@
#!/bin/bash
# -*- coding: UTF8 -*-
python3 manage.py import_browser_fp
python3 manage.py import_search_engine
python3 manage.py import_keywords
python3 manage.py import_website
python3 manage.py import_places
python3 manage.py import_events
python3 manage.py import_interests

profiles/management/commands/ (new file, 27 lines: RDF export command)

@ -0,0 +1,27 @@
from django.core.management.base import BaseCommand
from profiles.models_rdf import RdfProfile
from profiles import models
class Command(BaseCommand):
''' Exports database models to RDF '''
def add_arguments(self, parser):
pass
def handle(self, *args, **kwargs):
exported_models = [
models.Keyword,
models.Webpage,
models.Website,
models.Place,
models.Event,
models.BrowserFingerprint,
models.SearchEngine,
models.Interest,
models.Profile,
]
output_xml = RdfProfile().serialize(
# models=exported_models,
)
self.stdout.write(output_xml)

profiles/management/commands/import_events.py (new file, 41 lines)

@ -0,0 +1,41 @@
""" Small module that import events into the database.
"""
import json
from datetime import datetime
from django.core.management.base import BaseCommand
from django.db import models
from profiles.models import Place, Event
def import_file(filename):
with open(filename, mode='r') as file:
data = json.load(file)
for event in data:
import_event(event)
def import_place(_place):
place = Place(
name=_place.get("name", ""),
address=_place.get("address", ""),
lat=float(_place.get("lat", 0)),
lon=float(_place.get("lon", 0))
)
place.save()
return place
def import_event(_event):
if isinstance(_event["place"], str):
place = Place.objects.get(name=_event["place"])
else:
place = import_place(_event["place"])
event = Event(
name=_event.get("name", ""),
date=datetime.strptime(_event.get("date", "01/01/1970 00:00 UTC"), "%d/%m/%Y %H:%M %Z"),
place=place
)
#print(event)
event.save()
class Command(BaseCommand):
def handle(self, *args, **kwargs):
import_file("data/events.json")

profiles/management/commands/import_interests.py (new file, 51 lines)

@ -0,0 +1,51 @@
""" Small module that import interests into the database.
"""
import json
from datetime import datetime
from django.core.management.base import BaseCommand
from django.db import models
from django.core.exceptions import ObjectDoesNotExist
from profiles.models import Keyword, Interest, Place, Website, Event
def import_file(filename):
with open(filename, mode='r') as file:
data = json.load(file)
for interest in data:
import_interest(interest)
def import_interest(_interest):
keywords = []
places = []
websites = []
for keyword in _interest.get("keywords", []):
try:
stored = Keyword.objects.get(text=keyword["keyword"])
keywords.append(stored)
except ObjectDoesNotExist:
new_keyword = Keyword(text=keyword["keyword"])
new_keyword.save()
keywords.append(new_keyword)
print("New keyword %s" % new_keyword)
for place in _interest.get("places", []):
places.append(Place.objects.get(name=place["place"]))
for website in _interest.get("websites", []):
websites.append(Website.objects.get(name=website["website"]))
interest = Interest(
name=_interest.get("name", ""),
)
interest.save()
for keyword in keywords:
print(keyword)
interest.keywords.add(keyword)
for place in places:
interest.places.add(place)
for website in websites:
interest.websites.add(website)
interest.save()
class Command(BaseCommand):
def handle(self, *args, **kwargs):
import_file("data/interests.json")

profiles/management/commands/import_website.py (new file, 46 lines)

@ -0,0 +1,46 @@
""" Small module that import events into the database.
"""
import json
from datetime import datetime
from django.core.management.base import BaseCommand
from django.db import models
from profiles.models import Webpage, Website, Keyword
def import_file(filename):
with open(filename, mode='r') as file:
data = json.load(file)
for website in data:
import_website(website)
def import_website(_website):
keywords = []
webpages = []
for keyword in _website.get("keywords", []):
new_keyword = Keyword(
text=keyword.get("keyword", "")
)
new_keyword.save()
keywords.append(new_keyword)
for webpage in _website.get("notable_pages", []):
new_webpage = Webpage(
url=webpage.get("webpage", "")
)
new_webpage.save()
webpages.append(new_webpage)
website = Website(
name=_website.get("name", ""),
url=_website.get("url", ""),
)
website.save()
for keyword in keywords:
website.keywords.add(keyword)
for webpage in webpages:
website.notable_pages.add(webpage)
print(website)
#website.save()
class Command(BaseCommand):
def handle(self, *args, **kwargs):
import_file("data/website.json")

profiles/models.py

@ -6,9 +6,41 @@ the preferred search engin, and if the user is likely to directly use urls
or to type in the search engine.
"""
import os
import random
from django.db import models
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
NICKNAMES = None
LASTNAMES = open(BASE_DIR + "/data/lastnames.txt").read().splitlines()
FIRSTNAMES = open(BASE_DIR + "/data/firstnames.txt").read().splitlines()
EMAIL_DOMAINS = open(BASE_DIR + "/data/email_domains.txt").read().splitlines()
def require_nicknames(fct):
def read_file(path):
global NICKNAMES
print("Trying {}".format(path))
with open(path, 'r') as handle:
NICKNAMES = handle.read().splitlines()
nicknames_files = [
os.path.join(BASE_DIR, 'data/nicknames_dict'),
"/usr/share/dict/american-english",
]
if NICKNAMES is None:
for nick_file in nicknames_files:
try:
read_file(nick_file)
break
except FileNotFoundError:
pass
if NICKNAMES is None:
raise FileNotFoundError
return fct
class InvalidData(Exception):
''' Thrown when the DB contains invalid data, and cannot perform
@ -59,13 +91,13 @@ class Website(models.Model):
""" Generates the url in case the interest chosen is a website.
"""
rand = random.random()
if user.uses_url:
if user.uses_urls:
url = self.url
elif rand <= 0.1:
url = random.choice(self.notable_pages).url
url = random.choice(self.notable_pages.all()).url
elif rand <= 0.8:
search_term_text = self.name + " " + \
random.choice(self.keywords)
search_term_text = self.name + " " + \
random.choice(self.keywords.all())
url = user.search_engine.search_url(search_term_text)
else:
url = user.search_engine.search_url(self.name)
@ -114,7 +146,6 @@ class Event(models.Model):
return user.search_engine.search_url(" ".join(possibilities))
class BrowserFingerprint(models.Model):
''' A browser fingerprint, containing things like a user agent '''
@ -139,11 +170,11 @@ class BrowserFingerprint(models.Model):
def serialize_headers(self):
return {
"Description" : str(self.description),
"User-Agent" : str(self.useragent),
"Accept-Encoding" : str(self.accept_encoding),
"Accept" : str(self.accept_default),
"Accept-Language" : str(self.accept_lang),
"Description": str(self.description),
"User-Agent": str(self.useragent),
"Accept-Encoding": str(self.accept_encoding),
"Accept": str(self.accept_default),
"Accept-Language": str(self.accept_lang),
}
@ -154,8 +185,8 @@ class SearchEngine(models.Model):
url = models.URLField()
query_pattern = models.CharField(max_length=256) # This field is the
# query pattern. It should contain a `{}`, which, when substituted with a
# search term (using `.format()`), must yield a URL that can be resolved to
# perform the search
# search term (using `.format()`), must yield a URL tail that can be
# concatenated with `url` to perform a search (eg. `?q={}` for ddg).
def __str__(self):
return self.name
@ -163,9 +194,10 @@ class SearchEngine(models.Model):
def search_url(self, search_term):
''' Obtain a url to search `search_term` with this search engine '''
pattern = str(self.query_pattern)
search_term = str(search_term).replace(' ', '+')
if '{}' not in pattern:
raise InvalidData("Search engine {}: bad pattern".format(self))
return str(self.query_pattern).format(search_term)
return self.url + (str(self.query_pattern).format(search_term))
class Interest(models.Model):
@ -199,3 +231,35 @@ class Profile(models.Model):
on_delete=models.CASCADE)
browser_fingerprint = models.ForeignKey(BrowserFingerprint,
on_delete=models.CASCADE)
def generate_email(nick, first_name, last_name):
domain = random.choice(EMAIL_DOMAINS)
if random.random() < 0.3:
email = first_name + "." + last_name + "@" + domain
else:
email = nick + "@" + domain
return email
@require_nicknames
def create_profile(nick=None):
nick = "".join(random.sample(NICKNAMES, random.randrange(2, 5)))
first_name = random.choice(FIRSTNAMES)
last_name = random.choice(LASTNAMES)
email = generate_email(nick, first_name, last_name)
profile = Profile(
nick=nick,
first_name=first_name,
last_name=last_name,
email=email,
uses_urls=(random.random() < 0.5),
)
profile.search_engine = random.choice(SearchEngine.objects.all())
profile.browser_fingerprint = random.choice(BrowserFingerprint.objects.all())
profile.full_clean()
profile.save()
profile.interests.add(random.choice(Interest.objects.all()))
profile.save()
return profile
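
To illustrate the reworked SearchEngine.search_url (base url plus a query_pattern tail), a sketch with a hypothetical DuckDuckGo-style engine in a Django shell, matching the ?q={} example mentioned in the query_pattern comment; the concrete values are assumptions, not fixture data.

from profiles.models import SearchEngine

# Hypothetical, unsaved engine record (not part of the project fixtures).
engine = SearchEngine(name="DuckDuckGo",
                      url="https://duckduckgo.com/",
                      query_pattern="?q={}")

# Spaces become '+', then the formatted pattern is appended to the base url.
print(engine.search_url("zad nddl"))
# -> https://duckduckgo.com/?q=zad+nddl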

profiles/models_rdf.py (new file, 131 lines)

@ -0,0 +1,131 @@
""" RDF serialization class for profile models """
import rdfserializer as rdf
from rdfserializer import RDFModelSerialiser as RDFModelSerializer
# ^ This was hurting my eyes way too much
from rdfserializer import SCHEMA as schema
from rdflib.namespace import Namespace
import profiles.models as profile_models
LOCAL_NS = Namespace('local:')
class RdfWebpage(RDFModelSerializer):
""" RDF serializer for Webpage """
_type = schema.WebPage
model = profile_models.Webpage
entries = [
rdf.RDFSimpleField(schema.url, 'url'),
]
class RdfWebsite(RDFModelSerializer):
""" RDF serializer for Website """
_type = schema.WebSite
model = profile_models.Website
entries = [
rdf.RDFSimpleField(schema.name, 'name'),
rdf.RDFSimpleField(schema.url, 'url'),
rdf.RDFManyField(schema.keywords, 'keywords',
lambda keyword: keyword.text),
rdf.RDFManyLinker(schema.hasPart, 'notable_pages', RdfWebpage),
]
class RdfPlace(RDFModelSerializer):
""" RDF serializer for Place """
_type = schema.Place
model = profile_models.Place
entries = [
rdf.RDFSimpleField(schema.name, 'name'),
rdf.RDFSimpleField(schema.address, 'address'),
rdf.RDFSimpleField(schema.latitude, 'lat'),
rdf.RDFSimpleField(schema.longitude, 'lon'),
]
class RdfEvent(RDFModelSerializer):
""" RDF serializer for Event """
_type = schema.Event
model = profile_models.Event
entries = [
rdf.RDFSimpleField(schema.name, 'name'),
rdf.RDFSimpleField(schema.startDate, 'date'),
rdf.RDFLeftBinder(schema.location, 'place', RdfPlace),
]
class RdfBrowserFingerprint(RDFModelSerializer):
""" RDF serializer for BrowserFingerprint """
_type = schema.Intangible
model = profile_models.BrowserFingerprint
entries = [
rdf.RDFSimpleField(schema.description, 'description'),
rdf.RDFSimpleField(LOCAL_NS.useragent, 'useragent'),
rdf.RDFSimpleField(LOCAL_NS.appname, 'appname'),
rdf.RDFSimpleField(LOCAL_NS.appversion, 'appversion'),
rdf.RDFSimpleField(LOCAL_NS.platform, 'platform'),
rdf.RDFSimpleField(LOCAL_NS.vendor, 'vendor'),
rdf.RDFSimpleField(LOCAL_NS.vendorsub, 'vendorsub'),
rdf.RDFSimpleField(LOCAL_NS.buildID, 'buildID'),
rdf.RDFSimpleField(LOCAL_NS.oscpu, 'oscpu'),
rdf.RDFSimpleField(LOCAL_NS.accept_encoding, 'accept_encoding'),
rdf.RDFSimpleField(LOCAL_NS.accept_default, 'accept_default'),
rdf.RDFSimpleField(LOCAL_NS.accept_lang, 'accept_lang'),
rdf.RDFSimpleField(LOCAL_NS.pixeldepth, 'pixeldepth'),
rdf.RDFSimpleField(LOCAL_NS.colordepth, 'colordepth'),
rdf.RDFSimpleField(LOCAL_NS.screens, 'screens'),
]
class RdfSearchEngine(RDFModelSerializer):
""" RDF serializer for SearchEngine """
_type = schema.WebSite
model = profile_models.SearchEngine
entries = [
rdf.RDFSimpleField(schema.url, 'url'),
rdf.RDFSimpleField(schema.name, 'name'),
rdf.RDFSimpleField(LOCAL_NS.query_pattern, 'query_pattern'),
]
class RdfInterest(RDFModelSerializer):
""" RDF serializer for Interest """
Interesttype = 'interest'
model = profile_models.Interest
entries = [
rdf.RDFSimpleField(schema.name, 'name'),
rdf.RDFManyField(schema.keywords, 'keywords',
lambda keyword: keyword.text),
rdf.RDFManyLinker(schema.location, 'places', RdfPlace),
rdf.RDFManyLinker(schema.website, 'websites', RdfWebsite),
rdf.RDFManyLinker(schema.event, 'events', RdfEvent),
]
class RdfProfile(RDFModelSerializer):
""" RDF serializer for Profile """
_type = schema.Person
model = profile_models.Profile
entries = [
rdf.RDFSimpleField(LOCAL_NS.nickname, 'nick'),
rdf.RDFSimpleField(schema.given_name, 'first_name'),
rdf.RDFSimpleField(schema.family_name, 'last_name'),
rdf.RDFSimpleField(schema.email, 'email'),
rdf.RDFSimpleField(LOCAL_NS.uses_urls, 'uses_urls'),
rdf.RDFManyLinker(LOCAL_NS.interest, 'interests', RdfInterest),
rdf.RDFLeftBinder(LOCAL_NS.search_engine, 'search_engine',
RdfSearchEngine),
rdf.RDFLeftBinder(LOCAL_NS.browser_fingerprint, 'browser_fingerprint',
RdfBrowserFingerprint)
]

requirements.txt

@ -1,5 +1,5 @@
aiodns==1.1.1
aiohttp==3.0.1
aiohttp==2.3.2
async-timeout==2.0.0
attrs==17.4.0
cchardet==2.1.1
@ -12,3 +12,7 @@ pycares==2.3.0
pytz==2017.3
yarl==1.1.1
beautifulsoup4==4.6.0
stem==1.6.0
pycurl==7.43.0.1
rdflib==4.2.2
git+https://github.com/tobast/RDFSerializer.git