Compare commits
49 Commits
histories_
...
master
Author | SHA1 | Date |
---|---|---|
Rémi Oudin | 89d1f8301a | |
Théophile Bastian | 379b53e6ce | |
Théophile Bastian | c94841c17b | |
Rémi Oudin | 97107d9bec | |
Rémi Oudin | dedc66bb9d | |
Théophile Bastian | d3d04739e7 | |
Théophile Bastian | b88aeffd5a | |
Rémi Oudin | 7c8ec7351c | |
Théophile Bastian | 2005c0f24f | |
Rémi Oudin | 392e16b797 | |
Théophile Bastian | 185c1cf8a4 | |
Rémi Oudin | 9dd1954067 | |
Rémi Oudin | 04270e88c0 | |
Théophile Bastian | 6bc64ceb7a | |
Rémi Oudin | 15e0c2a11c | |
Rémi Oudin | 2b07779f5c | |
Théophile Bastian | 8cdc50c04e | |
Rémi Oudin | 22fa039f1b | |
Théophile Bastian | e4ad8c7ce6 | |
Théophile Bastian | 67ad232533 | |
Théophile Bastian | e140d4a8a7 | |
Théophile Bastian | 98fe69ba62 | |
Théophile Bastian | 968ff6d24c | |
Rémi Oudin | 5d4bd30e20 | |
Rémi Oudin | bdfa285e6b | |
Rémi Oudin | 65f777f00f | |
Rémi Oudin | 236e40d359 | |
Rémi Oudin | 22017cea91 | |
Rémi Oudin | 549c861908 | |
Rémi Oudin | 517be1d822 | |
Rémi Oudin | c4f63a92b2 | |
Rémi Oudin | db067e56fc | |
Rémi Oudin | 33bdae96e4 | |
Rémi Oudin | 526aad1364 | |
Théophile Bastian | 02e91bb2b7 | |
Théophile Bastian | 3e5fc2f9b3 | |
Théophile Bastian | 45ddbff91a | |
Théophile Bastian | e6d587bffd | |
Théophile Bastian | 8baf408e02 | |
Théophile Bastian | 6463e348ac | |
Théophile Bastian | 22064ebee3 | |
Théophile Bastian | a4de51b84a | |
Théophile Bastian | 4f0148cb63 | |
Théophile Bastian | 4a8bd32516 | |
Rémi Oudin | 44cf26df8f | |
Rémi Oudin | 93b235cb6c | |
Théophile Bastian | 15323c3465 | |
Théophile Bastian | c3bcdea1eb | |
Théophile Bastian | 2732e4115f |
|
@ -65,3 +65,4 @@ venv/
|
|||
# Django stuff
|
||||
db.sqlite3
|
||||
|
||||
_vimrc_local.vim
|
||||
|
|
|
@ -1,3 +1,6 @@
|
|||
# mpri-webdam
|
||||
|
||||
Générer tout plein de faux historiques. Parce qu'il faut bien valider ce cours.
|
||||
Generate realistic fake browsing histories for borderline and/or activist
|
||||
users, to hide real traffic from global surveillance.
|
||||
|
||||
Lacks proper documentation at the moment `:(`
|
||||
|
|
193
crawl/crawl.py
193
crawl/crawl.py
|
@ -1,11 +1,10 @@
|
|||
from threading import Thread
|
||||
from queue import Queue
|
||||
from urllib.robotparser import RobotFileParser
|
||||
from urllib.error import URLError
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from ssl import CertificateError
|
||||
from random import sample, randrange
|
||||
from random import sample, randrange, randint
|
||||
import re
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
|
@ -15,6 +14,8 @@ import async_timeout
|
|||
|
||||
from bs4 import BeautifulSoup, Comment
|
||||
|
||||
from profiles.models import BrowserFingerprint, SearchEngine
|
||||
|
||||
# Ugly hack to use this module alone instead of integrating it with Django
|
||||
# from django.conf import settings
|
||||
|
||||
|
@ -26,13 +27,11 @@ MAX_PER_PAGE = 10
|
|||
|
||||
FOOTER_URL = re.compile(".*footer.*")
|
||||
|
||||
SEARCH_ENGINE = []
|
||||
|
||||
class Settings:
|
||||
USER_AGENT = 'Default User'
|
||||
|
||||
settings = Settings()
|
||||
startup_time = datetime.min
|
||||
|
||||
|
||||
def url_getter(html, current_page, root_url):
|
||||
|
@ -74,7 +73,7 @@ def url_getter(html, current_page, root_url):
|
|||
# Works only with python >= 3.6
|
||||
links_list = list(dict.fromkeys(links_list))
|
||||
|
||||
forbidden_words = ['login', 'agreement', 'mailto']
|
||||
forbidden_words = ['login', 'agreement', 'mailto', 'settings']
|
||||
links_list = [link for link in links_list if not any(word in link.lower()
|
||||
for word in
|
||||
forbidden_words)]
|
||||
|
@ -82,8 +81,6 @@ def url_getter(html, current_page, root_url):
|
|||
return links_list
|
||||
|
||||
|
||||
|
||||
|
||||
class WebsiteSchedulerMeta(type):
|
||||
""" Meta-class for WebsiteScheduler, allowing a singleton class-like
|
||||
interface, but spawning one instance per canonical website URL """
|
||||
|
@ -106,12 +103,17 @@ class WebsiteSchedulerMeta(type):
|
|||
|
||||
class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
|
||||
""" Schedule the accesses to a website as of robots.txt """
|
||||
def __init__(self, name):
|
||||
|
||||
search_engines = [] # Must be set by CrawlingThread.__init__
|
||||
|
||||
def __init__(self, name, user_agent):
|
||||
self.name = name
|
||||
self.last_crawled = datetime.fromtimestamp(0)
|
||||
self.dead = False
|
||||
self.can_fetch_b = False
|
||||
if any(self.urlroot() in item for item in SEARCH_ENGINE):
|
||||
self.user_agent = (user_agent if user_agent is not None
|
||||
else settings.USER_AGENT)
|
||||
if any(self.urlroot() in item for item in self.search_engines):
|
||||
print("found a search engine for %s" % self.urlroot())
|
||||
self.crawl_delay = timedelta(seconds=5)
|
||||
self.can_fetch_b = True
|
||||
|
@ -125,7 +127,7 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
|
|||
robots_url = self.unsafe_urlroot() + 'robots.txt'
|
||||
self.robot_parser = RobotFileParser(robots_url)
|
||||
self.robot_parser.read()
|
||||
except URLError: # Almost surely an offline website.
|
||||
except URLError: # Almost surely an offline website.
|
||||
self.dead = True
|
||||
self.crawl_delay = 0
|
||||
except Exception as e:
|
||||
|
@ -134,9 +136,9 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
|
|||
if not self.robot_parser.default_entry:
|
||||
self.dead = True
|
||||
if not self.dead:
|
||||
delay = self.robot_parser.crawl_delay(settings.USER_AGENT)
|
||||
delay = self.robot_parser.crawl_delay(self.user_agent)
|
||||
if delay is None:
|
||||
req_rate = self.robot_parser.request_rate(settings.USER_AGENT)
|
||||
req_rate = self.robot_parser.request_rate(self.user_agent)
|
||||
if req_rate is None:
|
||||
delay = 5
|
||||
else:
|
||||
|
@ -159,7 +161,9 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
|
|||
|
||||
def can_fetch(self, url):
|
||||
''' Check whether this program can fetch a given page '''
|
||||
return (self.can_fetch_b ) or ((not self.dead) and self.robot_parser.can_fetch(settings.USER_AGENT, url))
|
||||
return ((self.can_fetch_b)
|
||||
or ((not self.dead) and
|
||||
self.robot_parser.can_fetch(self.user_agent, url)))
|
||||
|
||||
def fetching(self):
|
||||
''' Tell the scheduler that a page is being fetched now '''
|
||||
|
@ -170,40 +174,47 @@ class CrawlingThread(Thread):
|
|||
""" A separate thread for the crawling task. This is needed to use asyncio,
|
||||
since the thread will need its own event loop. """
|
||||
|
||||
def __init__(self, user, url, engine_list, queue):
|
||||
global settings
|
||||
global SEARCH_ENGINE
|
||||
SEARCH_ENGINE = engine_list
|
||||
self.queue = queue
|
||||
def __init__(self, url):
|
||||
engine_list = [engine.url for engine in SearchEngine.objects.all()]
|
||||
WebsiteScheduler.search_engines = engine_list
|
||||
|
||||
nb_fingerprint = len(BrowserFingerprint.objects.all())
|
||||
fingerprint = BrowserFingerprint.objects.all()[
|
||||
randint(0, nb_fingerprint - 1)]
|
||||
self.headers = fingerprint.serialize_headers()
|
||||
|
||||
self.output_tree = []
|
||||
super(CrawlingThread, self).__init__()
|
||||
if user:
|
||||
settings.USER_AGENT = user.serialize_headers()
|
||||
self.url = url
|
||||
|
||||
def run(self):
|
||||
global startup_time
|
||||
tasks = []
|
||||
|
||||
#tasks.append(async_crawler("http://plus.google.com/+Python"))
|
||||
#tasks.append(async_crawler('https://python.org/'))
|
||||
tasks.append(async_crawler(self.url, self.queue))
|
||||
tasks.append(run_crawl(self.url, self.output_tree, self.headers))
|
||||
|
||||
loop = asyncio.new_event_loop()
|
||||
asyncio.set_event_loop(loop)
|
||||
startup_time = datetime.now()
|
||||
loop.run_until_complete(asyncio.wait(tasks))
|
||||
loop.close()
|
||||
try:
|
||||
loop = asyncio.new_event_loop()
|
||||
asyncio.set_event_loop(loop)
|
||||
loop.run_until_complete(asyncio.wait(tasks))
|
||||
finally:
|
||||
loop.close()
|
||||
|
||||
|
||||
class PageGetter:
|
||||
""" Asynchronously get a webpage, abiding by robots.txt """
|
||||
|
||||
def __init__(self, session, url):
|
||||
headers = None
|
||||
|
||||
def __init__(self, session, url, user_agent):
|
||||
self.url = url
|
||||
self.session = session
|
||||
self.user_agent = user_agent
|
||||
|
||||
async def get(self, ssl=True):
|
||||
""" Actually retrieve the webpage """
|
||||
scheduler = WebsiteScheduler(self.url)
|
||||
scheduler = WebsiteScheduler(self.url, self.user_agent)
|
||||
if not scheduler.can_fetch(self.url):
|
||||
return None
|
||||
|
||||
|
@ -214,7 +225,6 @@ class PageGetter:
|
|||
scheduler.fetching()
|
||||
async with async_timeout.timeout(10):
|
||||
async with self.session.get(self.url, verify_ssl=ssl) as resp:
|
||||
print("Resp status %s" % resp.status)
|
||||
try:
|
||||
return await resp.text()
|
||||
except UnicodeDecodeError:
|
||||
|
@ -224,50 +234,89 @@ class PageGetter:
|
|||
async def async_print(url):
|
||||
""" Debug function to follow what's actually happening """
|
||||
async with aiohttp.ClientSession() as session:
|
||||
html = await PageGetter(session, url).get(ssl=False)
|
||||
html = await PageGetter(session, url,
|
||||
settings.USER_AGENT).get(ssl=False)
|
||||
|
||||
print('GOT {}HTML for {} at {}'.format(
|
||||
print('GOT {}HTML for {}'.format(
|
||||
'None ' if html is None else '',
|
||||
url,
|
||||
datetime.now() - startup_time))
|
||||
))
|
||||
|
||||
async def async_crawler(url, queue):
|
||||
queued = [url]
|
||||
crawled = []
|
||||
while queued and (len(crawled) < HARD_LIMIT):
|
||||
async with aiohttp.ClientSession() as session:
|
||||
try:
|
||||
url = queued.pop(0)
|
||||
except IndexError:
|
||||
print("Error queue is empty")
|
||||
return crawled
|
||||
parsed_url = urlparse(url)
|
||||
print("Crawling {}".format(url))
|
||||
html = await PageGetter(session, url).get(ssl=False)
|
||||
if html:
|
||||
new_urls = url_getter(
|
||||
html,
|
||||
url,
|
||||
parsed_url.scheme + "://" + parsed_url.netloc
|
||||
)
|
||||
crawled += [url]
|
||||
if new_urls:
|
||||
sampled = sample(
|
||||
new_urls,
|
||||
randrange(min(MAX_PER_PAGE, len(new_urls)))
|
||||
)
|
||||
queued += [sample_url for sample_url in sampled if
|
||||
sample_url not in queued and sample_url not in
|
||||
crawled]
|
||||
else:
|
||||
print("No html received")
|
||||
print(crawled)
|
||||
queue.put(crawled)
|
||||
|
||||
if __name__ == '__main__':
|
||||
queue = Queue()
|
||||
crawl = CrawlingThread(None,
|
||||
"https://google.com/search?q=fabriquer+masque+manif",
|
||||
["https://google.com/search/"], queue)
|
||||
crawl.start()
|
||||
crawl.join()
|
||||
class CrawlElem:
    ''' One crawled page, linked to the element it was reached from so that
    the crawl can be assembled into a tree '''

    def __init__(self, url, parent):
        # `parent` is another CrawlElem, or None for the first page of a crawl
        # (run_crawl passes None for the starting URL).
        self.parent = parent
        self.url = url
|
||||
|
||||
|
||||
async def run_crawl(url, output_tree, headers=None):
    ''' Starts a crawling session

    `url` is the first page to crawl, `output_tree` the list that
    async_crawler fills with the crawled elements, and `headers` the HTTP
    headers used for the whole aiohttp session. When `headers` has no
    'User-Agent' entry, settings.USER_AGENT is used.
    '''

    # Work on a copy: the caller's dict must not gain a 'User-Agent' key as a
    # side effect of starting a crawl.
    headers = {} if headers is None else dict(headers)
    headers.setdefault('User-Agent', settings.USER_AGENT)

    user_agent = headers['User-Agent']
    # Set of simplified URLs already visited, shared by the whole recursion.
    crawled = set()

    async with aiohttp.ClientSession(headers=headers) as session:
        await async_crawler(
            url, output_tree, crawled, user_agent, session, None)
|
||||
|
||||
|
||||
def simplify_url(url):
    ''' Reduce `url` to a canonical-ish form used to detect already crawled
    pages: the `#fragment`, the leading `scheme://` and a leading `www.` are
    removed.
    '''
    # Drop the fragment: it designates a position inside the same page.
    url = url.partition('#')[0]

    # Only strip a scheme located at the very beginning of the URL. A '://'
    # found after a '/' or a '?' belongs to the path or query string (e.g.
    # `example.com/r?to=http://other.org`) and must be kept.
    head, sep, tail = url.partition('://')
    if sep and '/' not in head and '?' not in head:
        url = tail

    if url.startswith('www.'):
        url = url[4:]

    return url
|
||||
|
||||
|
||||
async def async_crawler(url, out_tree, crawled, user_agent, session, parent):
    ''' Crawl `url`, then recursively a random sample of the links it contains

    `out_tree` is the list receiving one CrawlElem per successfully fetched
    page, `crawled` the shared set of simplified URLs already visited,
    `session` the HTTP session handed to PageGetter and `parent` the
    CrawlElem of the referring page (None for the first page).
    The recursion stops once `crawled` holds HARD_LIMIT entries.
    '''
    if len(crawled) >= HARD_LIMIT:
        return
    crawled.add(simplify_url(url))
    parsed_url = urlparse(url)
    print("Crawling {}".format(url))
    try:
        # Entered with `async with`, like the timeout in PageGetter.get.
        async with async_timeout.timeout(3):
            html = await PageGetter(session, url, user_agent).get(ssl=False)
    except asyncio.TimeoutError:
        return

    # Only URLs are collected here; the coroutines are created right before
    # being awaited, so that returning early never leaves a coroutine
    # un-awaited.
    next_urls = []

    if html:
        this_elem = CrawlElem(url, parent)
        out_tree.append(this_elem)
        new_urls = url_getter(
            html,
            url,
            parsed_url.scheme + "://" + parsed_url.netloc
        )
        if new_urls:
            # NOTE(review): randrange excludes its bound, so between 0 and
            # min(MAX_PER_PAGE, len(new_urls)) - 1 links are followed --
            # confirm that following no link at all is intended.
            sampled = sample(
                new_urls,
                randrange(min(MAX_PER_PAGE, len(new_urls)))
            )
            next_urls = [sample_url for sample_url in sampled
                         if simplify_url(sample_url) not in crawled]
    else:
        print("No html received")

    if len(crawled) >= HARD_LIMIT or not next_urls:
        return

    # gather() accepts coroutines directly. return_exceptions=True keeps a
    # failing branch from aborting its siblings.
    await asyncio.gather(
        *(async_crawler(next_url, out_tree, crawled, user_agent, session,
                        this_elem)
          for next_url in next_urls),
        return_exceptions=True)
|
||||
|
|
|
@ -0,0 +1 @@
|
|||
nicknames_dict
|
|
@ -13,6 +13,13 @@
|
|||
"query_pattern":"?q={}"
|
||||
}
|
||||
},
|
||||
{
|
||||
"searchengine": {
|
||||
"name":"Duckduckgo Lite",
|
||||
"url":"https://duckduckgo.com/lite/",
|
||||
"query_pattern":"?q={}"
|
||||
}
|
||||
},
|
||||
{
|
||||
"searchengine": {
|
||||
"name":"Qwant",
|
||||
|
|
|
@ -17,7 +17,7 @@
|
|||
},
|
||||
{
|
||||
"name":"paris-luttes info",
|
||||
"url":"https//paris-luttes.info/",
|
||||
"url":"https://paris-luttes.info/",
|
||||
"keywords": [
|
||||
{"keyword":"manifestations"},
|
||||
{"keyword":"solidarité immigré·e·s"},
|
||||
|
|
|
@ -0,0 +1,16 @@
|
|||
from django.core.management.base import BaseCommand
|
||||
from profiles import models as profiles
|
||||
from histories.models import generate_history
|
||||
from datetime import datetime
|
||||
|
||||
|
||||
class Command(BaseCommand):
    ''' Generates a history and prints the related XML '''

    def add_arguments(self, parser):
        # This command takes no argument.
        pass

    def handle(self, *args, **kwargs):
        ''' Generate a history starting now for the first stored profile,
        and print its XML export. '''
        from django.core.management.base import CommandError

        try:
            prof = profiles.Profile.objects.all()[0]
        except IndexError:
            # Empty table: report it as a command error instead of a raw
            # traceback.
            raise CommandError(
                "no profile found in the database, cannot generate a history")

        history = generate_history(prof, datetime.now())
        xml = history.to_xml_string()
        # ElementTree.tostring() returns bytes unless encoding="unicode" is
        # requested; decode so the XML itself is printed, not its b'...' repr.
        if isinstance(xml, bytes):
            xml = xml.decode()
        print(xml)
|
|
@ -0,0 +1,34 @@
|
|||
# Generated by Django 2.0.1 on 2018-02-25 19:08
|
||||
|
||||
from django.db import migrations, models
|
||||
import django.db.models.deletion
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
initial = True
|
||||
|
||||
dependencies = [
|
||||
('profiles', '0001_initial'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.CreateModel(
|
||||
name='History',
|
||||
fields=[
|
||||
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
|
||||
('start_ts', models.DateTimeField(help_text='The starting timestamp of the history. Useful for cron-like structure.')),
|
||||
('played', models.BooleanField(default=False)),
|
||||
('user', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='profiles.Profile')),
|
||||
],
|
||||
),
|
||||
migrations.CreateModel(
|
||||
name='HistoryEntry',
|
||||
fields=[
|
||||
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
|
||||
('search', models.URLField(help_text='The url to be searched')),
|
||||
('timestamp', models.DateTimeField()),
|
||||
('history', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='histories.History')),
|
||||
],
|
||||
),
|
||||
]
|
|
@ -3,14 +3,27 @@ entries, which looks like human-based browsing, according to a dedicated user
|
|||
interests, keywords...
|
||||
"""
|
||||
|
||||
from collections import namedtuple
|
||||
import random
|
||||
import asyncio
|
||||
from math import floor
|
||||
from queue import Queue
|
||||
from xml.etree import ElementTree as ET
|
||||
from datetime import datetime
|
||||
from django.db import models
|
||||
from django.core.exceptions import ValidationError
|
||||
import profiles.models as profiles
|
||||
from tor_runner import TorInstance
|
||||
from crawl import crawl
|
||||
from pinocchio.settings import HISTORY_MIN
|
||||
from .tor_runner import TorInstance
|
||||
|
||||
|
||||
class InvalidXml(Exception):
    """ Raised when XML data does not have the expected structure.

    `what` is a short description of the problem; str() of the exception is
    that description prefixed by "Invalid XML: ".
    """

    def __init__(self, what='unexpected XML data.'):
        # Forward the description to Exception so that `args` and repr()
        # carry it too.
        super().__init__(what)
        self.what = what

    def __str__(self):
        return "Invalid XML: " + self.what
|
||||
|
||||
|
||||
class HistoryEntry(models.Model):
|
||||
|
@ -28,14 +41,48 @@ class HistoryEntry(models.Model):
|
|||
"""
|
||||
return "{} : {}".format(self.timestamp, self.search)
|
||||
|
||||
def to_xml(self, xml_root):
    """ Append this entry to `xml_root` as a <history> element with two
    children: <url> (the searched url) and <timestamp> (the value of
    `self.timestamp.timestamp()`).
    """
    fields = (
        ('url', self.search),
        ('timestamp', self.timestamp.timestamp()),
    )
    entry = ET.Element('history')
    for tag, value in fields:
        node = ET.Element(tag)
        node.text = str(value)
        entry.append(node)
    xml_root.append(entry)
|
||||
|
||||
@staticmethod
def from_xml(xml_root, in_history):
    """ Build a HistoryEntry from a <history> XML element.

    `xml_root` must be a <history> element whose children are <url> and
    <timestamp> (a POSIX timestamp, as written by `to_xml`); `in_history` is
    the History the new entry is attached to. The entry is returned without
    being saved.
    Raises InvalidXml on an unexpected tag or an unparsable timestamp.
    """
    if xml_root.tag != 'history':
        raise InvalidXml("expected <history> tag here.")
    url, timestamp = None, None

    for child in xml_root:
        if child.tag == 'url':
            url = child.text
        elif child.tag == 'timestamp':
            try:
                # Element text is a string (or None): convert it to a number
                # first, fromtimestamp() rejects str.
                timestamp = datetime.fromtimestamp(float(child.text))
            except (TypeError, ValueError, OverflowError, OSError):
                raise InvalidXml("invalid timestamp {}".format(child.text))
        else:
            raise InvalidXml("unknown tag {} as child of <history>".format(
                child.tag))
    output = HistoryEntry()
    output.search = url
    output.timestamp = timestamp
    output.history = in_history

    return output
|
||||
|
||||
|
||||
class History(models.Model):
|
||||
""" A history for a user, containing some web connections (http, https).
|
||||
Each history is timed, in a human-behaviour manner. """
|
||||
|
||||
start_ts = models.DateTimeField(
|
||||
help_text='The starting timestamp of the history. Useful for cron-like '
|
||||
'structure.'
|
||||
help_text=('The starting timestamp of the history. Useful for '
|
||||
'cron-like structure.')
|
||||
|
||||
)
|
||||
played = models.BooleanField(default=False)
|
||||
|
@ -47,24 +94,90 @@ class History(models.Model):
|
|||
def return_history(self):
|
||||
""" Returns the history, sorted by increasing timestamps
|
||||
"""
|
||||
history_set = self.history_set.order_by('timestamp')
|
||||
history_set = [(item.search, item.timestamp.date()) for item in history_set]
|
||||
return history_set
|
||||
output_history = self.historyentry_set.order_by('timestamp')
|
||||
output_history = [(item.search, item.timestamp.date())
|
||||
for item in output_history]
|
||||
return output_history
|
||||
|
||||
def __str__(self):
|
||||
""" Returns the string representation of a history.
|
||||
"""
|
||||
history_set = self.history_set.order_by('timestamp')
|
||||
header = "[History]:\n"
|
||||
return header + "\n".join(history_set)
|
||||
entries = self.historyentry_set.order_by('timestamp')
|
||||
output = "[History]:\n"
|
||||
for entry in entries:
|
||||
output += str(entry) + '\n'
|
||||
return output
|
||||
|
||||
async def _handler(self):
    """ Replay this history through a TorInstance, then mark it as played
    and save it. """
    entries = self.return_history()
    headers = self.user.browser_fingerprint.serialize_headers()
    runner = await TorInstance.create(entries, headers)
    await runner.run()
    # Only reached once the run completed without raising.
    self.played = True
    self.save()
|
||||
|
||||
def play_histories(self):
|
||||
""" Actually plays the history.
|
||||
"""
|
||||
self.played = True
|
||||
runner = TorInstance(self.history)
|
||||
self.save()
|
||||
loop = asyncio.new_event_loop()
|
||||
asyncio.set_event_loop(loop)
|
||||
loop.run_until_complete(asyncio.wait([self._handler()]))
|
||||
|
||||
def to_xml(self, xml_root=None):
    ''' Exports the current history to xml

    A <history> node (attributes `start-ts`, `played`, `user`) is appended
    to `xml_root`, and every entry of the history is exported below it.
    When `xml_root` is None, a fresh <root> element is created and returned;
    otherwise nothing is returned.
    '''
    standalone = xml_root is None
    if standalone:
        xml_root = ET.Element('root')

    attributes = {
        'start-ts': str(self.start_ts),
        'played': '1' if self.played else '0',
        'user': str(self.user.pk),
    }
    hist_node = ET.Element("history", attrib=attributes)
    xml_root.append(hist_node)
    for entry in self.historyentry_set.all():
        entry.to_xml(hist_node)

    return xml_root if standalone else None
|
||||
|
||||
def to_xml_string(self):
    ''' Returns the standalone XML export of this history, serialized with
    ElementTree.tostring '''
    return ET.tostring(self.to_xml())
|
||||
|
||||
@staticmethod
def from_xml(xml_root):
    ''' Loads an history from an XML file

    `xml_root` must be a <history> element carrying the `start-ts`, `played`
    and `user` attributes written by `to_xml`. Returns an unsaved History.
    Raises InvalidXml when the node, an attribute or the referenced user is
    invalid.
    '''

    REQUIRED_ATTR = ['start-ts', 'played', 'user']

    if xml_root.tag != 'history':
        raise InvalidXml('unexpected node {} as root of an history'.format(
            xml_root.tag))
    for attr in REQUIRED_ATTR:
        if attr not in xml_root.attrib:
            raise InvalidXml(('missing attribute "{}" for tag of type '
                              'history').format(attr))
    start_ts = xml_root.attrib['start-ts']
    user_pk = xml_root.attrib['user']
    try:
        # XML attributes are strings ('1' / '0' as written by to_xml):
        # convert before comparing.
        played = int(xml_root.attrib['played']) > 0
    except ValueError:
        raise InvalidXml('invalid value "{}" for attribute played'.format(
            xml_root.attrib['played']))

    # The `user` attribute holds the primary key of the profile owning the
    # history (to_xml writes `self.user.pk`).
    users = profiles.Profile.objects.filter(pk=user_pk)
    if len(users) != 1:
        raise InvalidXml('primary key for user {} is invalid'.format(
            user_pk))

    output = History()
    output.start_ts = start_ts
    output.played = played
    output.user = users[0]

    # NOTE(review): the entries are parsed (and so validated) but neither
    # kept nor saved here -- confirm how callers are expected to persist
    # them.
    for child in xml_root:
        HistoryEntry.from_xml(child, output)

    return output
|
||||
|
||||
|
||||
PartialHistoryEntry = namedtuple('PartialHistoryEntry',
|
||||
['url', 'timestamp'])
|
||||
|
||||
|
||||
def generate_partial_history(user, t_start):
|
||||
|
@ -74,36 +187,57 @@ def generate_partial_history(user, t_start):
|
|||
timestamp = t_start
|
||||
result = []
|
||||
basis = generate_first_url(user)
|
||||
result.append((basis, timestamp))
|
||||
timestamp += 5* random.weibullvariate(1, 1.5)
|
||||
queue = Queue()
|
||||
search_engine_query = profiles.SearchEngine.objects.all()
|
||||
search_engine_list = [item.url for item in search_engine_query]
|
||||
crawler = crawl.CrawlingThread(user, basis, search_engine_list, queue)
|
||||
t_start += 5 * random.weibullvariate(1, 1.5)
|
||||
crawler = crawl.CrawlingThread(basis)
|
||||
crawler.start()
|
||||
crawler.join()
|
||||
urls = queue.get()
|
||||
for url in urls:
|
||||
timestamp += 5* random.weibullvariate(1, 1.5)
|
||||
result.append((url, timestamp))
|
||||
urls_tree = crawler.output_tree
|
||||
|
||||
open_time = {}
|
||||
for elem in urls_tree:
|
||||
url, parent = elem.url, elem.parent
|
||||
timestamp = 0
|
||||
if parent is None:
|
||||
timestamp = t_start
|
||||
else:
|
||||
timestamp = open_time[parent] + 5 * random.weibullvariate(1, 1.5)
|
||||
open_time[elem] = timestamp
|
||||
result.append(PartialHistoryEntry(url, timestamp))
|
||||
return result
|
||||
|
||||
|
||||
def generate_first_url(user):
|
||||
""" Generate the first url of a partial history, based on the user
|
||||
information. """
|
||||
interest = random.choice(
|
||||
[user.interests.keywords.all(), user.interests.places.all(),
|
||||
user.interests.websites.all(), user.interests.events.all()
|
||||
]
|
||||
)
|
||||
|
||||
def nonempty(seq):
|
||||
out = []
|
||||
for elt in seq:
|
||||
if elt:
|
||||
out.append(elt)
|
||||
return out
|
||||
|
||||
all_keywords = profiles.Keyword.objects.filter(
|
||||
interest__profile__in=[user])
|
||||
all_websites = profiles.Website.objects.filter(
|
||||
interest__profile__in=[user])
|
||||
all_places = profiles.Place.objects.filter(
|
||||
interest__profile__in=[user])
|
||||
all_events = profiles.Event.objects.filter(
|
||||
interest__profile__in=[user])
|
||||
|
||||
interest = random.choice(nonempty([
|
||||
all_keywords,
|
||||
all_websites,
|
||||
all_places,
|
||||
all_events,
|
||||
]))
|
||||
search_term = random.choice(interest)
|
||||
url = search_term.generate_url(user)
|
||||
return url
|
||||
|
||||
|
||||
|
||||
|
||||
def generate_history(user, ts_start):
|
||||
def generate_history(user, start_time):
|
||||
""" Generate a new history for the user `user`, starting from timestamp
|
||||
`ts_start`.
|
||||
A few heuristics are used in order to give the impression that the history
|
||||
|
@ -111,19 +245,32 @@ def generate_history(user, ts_start):
|
|||
"""
|
||||
|
||||
# let's define a new history object.
|
||||
history = History(start_ts=ts_start, user=user)
|
||||
history = History(start_ts=start_time, user=user)
|
||||
length = HISTORY_MIN + floor(10 * random.weibullvariate(1, 1.5))
|
||||
history.full_clean()
|
||||
history.save()
|
||||
|
||||
history_line = 0
|
||||
current_timestamp = start_time.timestamp()
|
||||
|
||||
while history_line < length:
|
||||
ts_start += 5 * random.weibullvariate(1, 2.8)
|
||||
history_list = generate_partial_history(user, ts_start)
|
||||
ts_start = history_list[-1].timestamp + 5 * random.weibullvariate(1, 5)
|
||||
hist_size = 0
|
||||
|
||||
while hist_size < length:
|
||||
current_timestamp += 5 * random.weibullvariate(1, 2.8)
|
||||
history_list = generate_partial_history(user, current_timestamp)
|
||||
current_timestamp = \
|
||||
history_list[-1].timestamp + 5 * random.weibullvariate(1, 5)
|
||||
for (url, timestamp) in history_list:
|
||||
new_line = HistoryEntry(
|
||||
search=url,
|
||||
timestamp=timestamp,
|
||||
history=history
|
||||
)
|
||||
new_line.save()
|
||||
if len(url) < 200:
|
||||
new_line = HistoryEntry(
|
||||
search=url,
|
||||
timestamp=datetime.fromtimestamp(timestamp),
|
||||
history=history
|
||||
)
|
||||
try:
|
||||
new_line.full_clean()
|
||||
new_line.save()
|
||||
hist_size += 1
|
||||
except ValidationError:
|
||||
continue
|
||||
|
||||
return history
|
||||
|
|
|
@ -58,7 +58,9 @@ class TorInstance():
|
|||
async def run(self):
|
||||
""" Runs the Tor Instance on the history.
|
||||
"""
|
||||
while (self.history[0][1] - dt.datetime.now()).total_seconds >= 10:
|
||||
while (self.history) and (dt.datetime.combine(self.history[0][1],
|
||||
dt.datetime.min.time()) -
|
||||
dt.datetime.now()).total_seconds() >= 10:
|
||||
print("Sleeping")
|
||||
sleep(10)
|
||||
while self.history:
|
||||
|
@ -66,8 +68,9 @@ class TorInstance():
|
|||
async with async_timeout.timeout(30):
|
||||
await(self.query(item[0]))
|
||||
now = dt.datetime.now()
|
||||
if now <= self.history[0][1]:
|
||||
sleep((self.history[0][1] - now).total_seconds())
|
||||
print(self.history[0])
|
||||
if now <= dt.datetime.combine(self.history[0][1], dt.datetime.min.time()):
|
||||
sleep((dt.datetime.combine(self.history[0][1], dt.datetime.min.time()) - now).total_seconds())
|
||||
|
||||
|
||||
def create_session(self):
|
||||
|
|
|
@ -97,7 +97,7 @@ USE_I18N = True
|
|||
|
||||
USE_L10N = True
|
||||
|
||||
USE_TZ = True
|
||||
USE_TZ = False # We don't really care, we want POSIX timestamps
|
||||
|
||||
|
||||
# Static files (CSS, JavaScript, Images)
|
||||
|
|
15
populate.sh
15
populate.sh
|
@ -1,11 +1,10 @@
|
|||
#!/bin/bash
|
||||
# -*- coding: UTF8 -*-
|
||||
|
||||
/usr/bin/python3 manage.py import_browser_fp
|
||||
/usr/bin/python3 manage.py import_search_engine
|
||||
/usr/bin/python3 manage.py import_keywords
|
||||
/usr/bin/python3 manage.py import_website
|
||||
/usr/bin/python3 manage.py import_places
|
||||
/usr/bin/python3 manage.py import_events
|
||||
/usr/bin/python3 manage.py import_interests
|
||||
|
||||
python3 manage.py import_browser_fp
|
||||
python3 manage.py import_search_engine
|
||||
python3 manage.py import_keywords
|
||||
python3 manage.py import_website
|
||||
python3 manage.py import_places
|
||||
python3 manage.py import_events
|
||||
python3 manage.py import_interests
|
||||
|
|
|
@ -0,0 +1,27 @@
|
|||
from django.core.management.base import BaseCommand
|
||||
from profiles.models_rdf import RdfProfile
|
||||
from profiles import models
|
||||
|
||||
|
||||
class Command(BaseCommand):
    ''' Exports database models to RDF '''

    def add_arguments(self, parser):
        # This command takes no argument.
        pass

    def handle(self, *args, **kwargs):
        ''' Serialize the RDF profile and write it to the command's
        standard output. '''
        # NOTE(review): this list is currently unused, as the `models=`
        # argument below is commented out.
        exported_models = [
            models.Keyword,
            models.Webpage,
            models.Website,
            models.Place,
            models.Event,
            models.BrowserFingerprint,
            models.SearchEngine,
            models.Interest,
            models.Profile,
        ]
        rdf_profile = RdfProfile()
        output_xml = rdf_profile.serialize(
            # models=exported_models,
        )
        self.stdout.write(output_xml)
|
|
@ -5,6 +5,7 @@ import json
|
|||
from datetime import datetime
|
||||
from django.core.management.base import BaseCommand
|
||||
from django.db import models
|
||||
from django.core.exceptions import ObjectDoesNotExist
|
||||
from profiles.models import Keyword, Interest, Place, Website, Event
|
||||
|
||||
def import_file(filename):
|
||||
|
@ -19,15 +20,14 @@ def import_interest(_interest):
|
|||
places = []
|
||||
websites = []
|
||||
for keyword in _interest.get("keywords", []):
|
||||
if not Keyword.objects.get(keyword["keyword"]):
|
||||
keywords.append(
|
||||
Keyword(
|
||||
text=keyword["keyword"]
|
||||
)
|
||||
)
|
||||
print("New keyword %s" % new_keywords)
|
||||
else:
|
||||
keywords.append(Keyword.objects.get(text=keyword["keyword"]))
|
||||
try:
|
||||
stored = Keyword.objects.get(text=keyword["keyword"])
|
||||
keywords.append(stored)
|
||||
except ObjectDoesNotExist:
|
||||
new_keyword = Keyword(text=keyword["keyword"])
|
||||
new_keyword.save()
|
||||
keywords.append(new_keyword)
|
||||
print("New keyword %s" % new_keyword)
|
||||
for place in _interest.get("places", []):
|
||||
places.append(Place.objects.get(name=place["place"]))
|
||||
for website in _interest.get("websites", []):
|
||||
|
@ -36,7 +36,9 @@ def import_interest(_interest):
|
|||
interest = Interest(
|
||||
name=_interest.get("name", ""),
|
||||
)
|
||||
interest.save()
|
||||
for keyword in keywords:
|
||||
print(keyword)
|
||||
interest.keywords.add(keyword)
|
||||
for place in places:
|
||||
interest.places.add(place)
|
||||
|
@ -46,4 +48,4 @@ def import_interest(_interest):
|
|||
|
||||
class Command(BaseCommand):
|
||||
def handle(self, *args, **kwargs):
|
||||
import_file("data/events.json")
|
||||
import_file("data/interests.json")
|
||||
|
|
|
@ -12,12 +12,36 @@ from django.db import models
|
|||
|
||||
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||
|
||||
NICKNAMES = open("/usr/share/dict/american-english").read().splitlines()
|
||||
NICKNAMES = None
|
||||
LASTNAMES = open(BASE_DIR + "/data/lastnames.txt").read().splitlines()
|
||||
FIRSTNAMES = open(BASE_DIR + "/data/firstnames.txt").read().splitlines()
|
||||
EMAIL_DOMAINS = open(BASE_DIR + "/data/email_domains.txt").read().splitlines()
|
||||
|
||||
|
||||
def require_nicknames(fct):
    ''' Decorator ensuring that NICKNAMES is loaded.

    When NICKNAMES is still None at decoration time, the candidate files are
    tried in order and the first one found is read, one nickname per line.
    `fct` is returned unchanged.
    Raises FileNotFoundError when none of the candidate files exists.
    '''
    def read_file(path):
        global NICKNAMES
        print("Trying {}".format(path))
        with open(path, 'r') as handle:
            NICKNAMES = handle.read().splitlines()

    nicknames_files = [
        os.path.join(BASE_DIR, 'data/nicknames_dict'),
        "/usr/share/dict/american-english",
    ]
    if NICKNAMES is None:
        for nick_file in nicknames_files:
            try:
                read_file(nick_file)
                break
            except FileNotFoundError:
                # Missing candidate: fall back to the next one.
                pass
    if NICKNAMES is None:
        # Name the files that were tried, so the failure can be acted upon.
        raise FileNotFoundError(
            "no nicknames dictionary found, tried: {}".format(
                ", ".join(nicknames_files)))

    return fct
|
||||
|
||||
|
||||
class InvalidData(Exception):
|
||||
''' Thrown when the DB contains invalid data, and cannot perform
|
||||
something '''
|
||||
|
@ -67,13 +91,13 @@ class Website(models.Model):
|
|||
""" Generates the url in case the interest chosen is a website.
|
||||
"""
|
||||
rand = random.random()
|
||||
if user.uses_url:
|
||||
if user.uses_urls:
|
||||
url = self.url
|
||||
elif rand <= 0.1:
|
||||
url = random.choice(self.notable_pages).url
|
||||
url = random.choice(self.notable_pages.all()).url
|
||||
elif rand <= 0.8:
|
||||
search_term_text = self.name + " " + \
|
||||
random.choice(self.keywords)
|
||||
search_term_text = self.name + " " + \
|
||||
str(random.choice(self.keywords.all()))
|
||||
url = user.search_engine.search_url(search_term_text)
|
||||
else:
|
||||
url = user.search_engine.search_url(self.name)
|
||||
|
@ -122,7 +146,6 @@ class Event(models.Model):
|
|||
return user.search_engine.search_url(" ".join(possibilities))
|
||||
|
||||
|
||||
|
||||
class BrowserFingerprint(models.Model):
|
||||
''' A browser fingerprint, containing things like a user agent '''
|
||||
|
||||
|
@ -147,11 +170,11 @@ class BrowserFingerprint(models.Model):
|
|||
|
||||
def serialize_headers(self):
    """ Return this fingerprint's header-related fields as a dict mapping
    a header name to the `str()` of the matching attribute. """
    header_sources = (
        ("Description", self.description),
        ("User-Agent", self.useragent),
        ("Accept-Encoding", self.accept_encoding),
        ("Accept", self.accept_default),
        ("Accept-Language", self.accept_lang),
    )
    return {header: str(value) for header, value in header_sources}
|
||||
|
||||
|
||||
|
@ -162,8 +185,8 @@ class SearchEngine(models.Model):
|
|||
url = models.URLField()
|
||||
query_pattern = models.CharField(max_length=256) # This field is the
|
||||
# query pattern. It should contain a `{}`, which, when substituted with a
|
||||
# search term (using `.format()`), must yield a URL that can be resolved to
|
||||
# perform the search
|
||||
# search term (using `.format()`), must yield a URL tail that can be
|
||||
# concatenated with `url` to perform a search (eg. `?q={}` for ddg).
|
||||
|
||||
def __str__(self):
|
||||
return self.name
|
||||
|
@ -171,9 +194,10 @@ class SearchEngine(models.Model):
|
|||
def search_url(self, search_term):
    ''' Obtain a url to search `search_term` with this search engine.

    `query_pattern` is the URL tail holding a `{}` placeholder; the search
    term (spaces turned into `+`) is substituted into it and the result is
    appended to the engine's base `url`.

    Raises `InvalidData` if `query_pattern` contains no `{}` placeholder.
    '''
    pattern = str(self.query_pattern)
    if '{}' not in pattern:
        raise InvalidData("Search engine {}: bad pattern".format(self))

    search_term = str(search_term).replace(' ', '+')
    # Single return: the pattern is only the tail of the URL, so the base
    # `url` must be prepended (a stale early return used to skip this).
    return self.url + pattern.format(search_term)
|
||||
|
||||
|
||||
class Interest(models.Model):
|
||||
|
@ -214,11 +238,13 @@ def generate_email(nick, first_name, last_name):
|
|||
if random.random() < 0.3:
|
||||
email = first_name + "." + last_name + "@" + domain
|
||||
else:
|
||||
email = nick + "@" + domain
|
||||
email = nick + "@" + domain
|
||||
return email
|
||||
|
||||
|
||||
@require_nicknames
|
||||
def create_profile(nick=None):
|
||||
nick = "".join(random.sample(NICKNAMES, random.randrange(2,5)))
|
||||
nick = "".join(random.sample(NICKNAMES, random.randrange(2, 5)))
|
||||
first_name = random.choice(FIRSTNAMES)
|
||||
last_name = random.choice(LASTNAMES)
|
||||
email = generate_email(nick, first_name, last_name)
|
||||
|
@ -227,7 +253,13 @@ def create_profile(nick=None):
|
|||
first_name=first_name,
|
||||
last_name=last_name,
|
||||
email=email,
|
||||
uses_url=(random.random() < 0.5),
|
||||
uses_urls=(random.random() < 0.5),
|
||||
)
|
||||
profile.search_engine = random.choice(SearchEngine.objects.all())
|
||||
profile.browser_fingerprint = random.choice(BrowserFingerprint.objects.all())
|
||||
|
||||
profile.full_clean()
|
||||
profile.save()
|
||||
profile.interests.add(random.choice(Interest.objects.all()))
|
||||
profile.save()
|
||||
return profile
|
||||
|
|
|
@ -0,0 +1,131 @@
|
|||
""" RDF serialization class for profile models """
|
||||
|
||||
import rdfserializer as rdf
|
||||
from rdfserializer import RDFModelSerialiser as RDFModelSerializer
|
||||
# ^ This was hurting my eyes way too much
|
||||
from rdfserializer import SCHEMA as schema
|
||||
from rdflib.namespace import Namespace
|
||||
|
||||
import profiles.models as profile_models
|
||||
|
||||
|
||||
LOCAL_NS = Namespace('local:')
|
||||
|
||||
|
||||
class RdfWebpage(RDFModelSerializer):
    """ RDF serializer binding the `Webpage` model to `schema.WebPage` """

    _type = schema.WebPage
    model = profile_models.Webpage
    # The page URL is the only exported field.
    entries = [rdf.RDFSimpleField(schema.url, "url")]
|
||||
|
||||
|
||||
class RdfWebsite(RDFModelSerializer):
    """ RDF serializer binding the `Website` model to `schema.WebSite` """

    _type = schema.WebSite
    model = profile_models.Website
    entries = [
        rdf.RDFSimpleField(schema.name, "name"),
        rdf.RDFSimpleField(schema.url, "url"),
        # Each related keyword object is exported through its `text`.
        rdf.RDFManyField(schema.keywords, "keywords", lambda kw: kw.text),
        # Notable pages are handled by their own serializer class.
        rdf.RDFManyLinker(schema.hasPart, "notable_pages", RdfWebpage),
    ]
|
||||
|
||||
|
||||
class RdfPlace(RDFModelSerializer):
    """ RDF serializer binding the `Place` model to `schema.Place` """

    _type = schema.Place
    model = profile_models.Place
    entries = [
        rdf.RDFSimpleField(schema.name, "name"),
        rdf.RDFSimpleField(schema.address, "address"),
        # The model stores the coordinates under the short names lat/lon.
        rdf.RDFSimpleField(schema.latitude, "lat"),
        rdf.RDFSimpleField(schema.longitude, "lon"),
    ]
|
||||
|
||||
|
||||
class RdfEvent(RDFModelSerializer):
    """ RDF serializer binding the `Event` model to `schema.Event` """

    _type = schema.Event
    model = profile_models.Event
    entries = [
        rdf.RDFSimpleField(schema.name, "name"),
        # The model's `date` field is exported as the event start date.
        rdf.RDFSimpleField(schema.startDate, "date"),
        # The event's place is handled by the `Place` serializer.
        rdf.RDFLeftBinder(schema.location, "place", RdfPlace),
    ]
|
||||
|
||||
|
||||
class RdfBrowserFingerprint(RDFModelSerializer):
    """ RDF serializer binding the `BrowserFingerprint` model to
    `schema.Intangible` """

    _type = schema.Intangible
    model = profile_models.BrowserFingerprint
    # `description` uses a schema term; every other field is exported under
    # a `LOCAL_NS` term carrying the same name as the model field, in the
    # order listed below.
    entries = [rdf.RDFSimpleField(schema.description, "description")] + [
        rdf.RDFSimpleField(getattr(LOCAL_NS, field_name), field_name)
        for field_name in (
            "useragent",
            "appname",
            "appversion",
            "platform",
            "vendor",
            "vendorsub",
            "buildID",
            "oscpu",
            "accept_encoding",
            "accept_default",
            "accept_lang",
            "pixeldepth",
            "colordepth",
            "screens",
        )
    ]
|
||||
|
||||
|
||||
class RdfSearchEngine(RDFModelSerializer):
    """ RDF serializer binding the `SearchEngine` model to
    `schema.WebSite` """

    _type = schema.WebSite
    model = profile_models.SearchEngine
    entries = [
        rdf.RDFSimpleField(schema.url, "url"),
        rdf.RDFSimpleField(schema.name, "name"),
        # The query pattern is exported under a project-local term.
        rdf.RDFSimpleField(LOCAL_NS.query_pattern, "query_pattern"),
    ]
|
||||
|
||||
|
||||
class RdfInterest(RDFModelSerializer):
    """ RDF serializer for Interest.

    Exports the interest's name and keyword texts, and links its places,
    websites and events through their own serializer classes.
    """

    # NOTE(review): every other serializer in this module declares
    # `_type = schema.<Something>` at this position; this attribute name and
    # plain-string value look like a slipped rename. Confirm what
    # RDFModelSerialiser expects before changing it.
    Interesttype = 'interest'
    model = profile_models.Interest
    entries = [
        rdf.RDFSimpleField(schema.name, 'name'),
        # Each related keyword object is exported through its `text`.
        rdf.RDFManyField(schema.keywords, 'keywords',
                         lambda keyword: keyword.text),
        rdf.RDFManyLinker(schema.location, 'places', RdfPlace),
        rdf.RDFManyLinker(schema.website, 'websites', RdfWebsite),
        rdf.RDFManyLinker(schema.event, 'events', RdfEvent),
    ]
|
||||
|
||||
|
||||
class RdfProfile(RDFModelSerializer):
    """ RDF serializer binding the `Profile` model to `schema.Person` """

    _type = schema.Person
    model = profile_models.Profile
    entries = [
        rdf.RDFSimpleField(LOCAL_NS.nickname, "nick"),
        # NOTE(review): schema.org spells these properties `givenName` and
        # `familyName`; confirm whether the snake_case terms are intended.
        rdf.RDFSimpleField(schema.given_name, "first_name"),
        rdf.RDFSimpleField(schema.family_name, "last_name"),
        rdf.RDFSimpleField(schema.email, "email"),
        rdf.RDFSimpleField(LOCAL_NS.uses_urls, "uses_urls"),
        # Related objects are handled by their own serializer classes.
        rdf.RDFManyLinker(LOCAL_NS.interest, "interests", RdfInterest),
        rdf.RDFLeftBinder(
            LOCAL_NS.search_engine, "search_engine", RdfSearchEngine),
        rdf.RDFLeftBinder(
            LOCAL_NS.browser_fingerprint, "browser_fingerprint",
            RdfBrowserFingerprint),
    ]
|
|
@ -14,3 +14,6 @@ yarl==1.1.1
|
|||
beautifulsoup4==4.6.0
|
||||
stem==1.6.0
|
||||
pycurl==7.43.0.1
|
||||
rdflib==4.2.2
|
||||
git+https://github.com/tobast/RDFSerializer.git
|
||||
aiosocks==0.2.6
|
||||
|
|
Loading…
Reference in New Issue