Compare commits

...

49 Commits

Author SHA1 Message Date
Rémi Oudin 89d1f8301a Remove duplicated url in history 2018-02-26 17:46:49 +01:00
Théophile Bastian 379b53e6ce Fix printing in gen_history 2018-02-26 17:25:04 +01:00
Théophile Bastian c94841c17b Add gen_history django-admin command 2018-02-26 17:25:04 +01:00
Rémi Oudin 97107d9bec Merge branch 'master' of git.tobast.fr:tobast/mpri-webdam 2018-02-26 17:12:26 +01:00
Rémi Oudin dedc66bb9d Bug fix 2018-02-26 17:12:19 +01:00
Théophile Bastian d3d04739e7 Add DuckDuckGo lite search engine to stock data
This search engine works better than the others
2018-02-26 17:10:18 +01:00
Théophile Bastian b88aeffd5a Helpful README 2018-02-26 17:09:05 +01:00
Rémi Oudin 7c8ec7351c Merge branch 'master' of git.tobast.fr:tobast/mpri-webdam 2018-02-26 17:04:09 +01:00
Théophile Bastian 2005c0f24f Add xml string gen 2018-02-26 17:03:27 +01:00
Rémi Oudin 392e16b797 Merge branch 'histories_models' 2018-02-26 17:03:27 +01:00
Théophile Bastian 185c1cf8a4 Fix XML generation 2018-02-26 17:00:53 +01:00
Rémi Oudin 9dd1954067 Partial runner fix 2018-02-26 17:00:53 +01:00
Rémi Oudin 04270e88c0 Bug fix 2018-02-26 17:00:12 +01:00
Théophile Bastian 6bc64ceb7a Add requirement for aiohttp 2018-02-26 16:38:16 +01:00
Rémi Oudin 15e0c2a11c Partial runner fix 2018-02-26 16:37:51 +01:00
Rémi Oudin 2b07779f5c Bug fix 2018-02-26 16:37:32 +01:00
Théophile Bastian 8cdc50c04e Fix stupid typo 2018-02-26 16:34:43 +01:00
Rémi Oudin 22fa039f1b Remove debug print 2018-02-26 16:23:14 +01:00
Théophile Bastian e4ad8c7ce6 Towards a working XML export 2018-02-26 15:58:30 +01:00
Théophile Bastian 67ad232533 Add a timeout to a single page retrieval 2018-02-26 15:42:36 +01:00
Théophile Bastian e140d4a8a7 Fix merge remanences 2018-02-26 15:37:05 +01:00
Théophile Bastian 98fe69ba62 Real async crawling 2018-02-26 15:30:38 +01:00
Théophile Bastian 968ff6d24c More robust crawling 2018-02-26 15:29:36 +01:00
Rémi Oudin 5d4bd30e20 Exception handling 2018-02-26 15:15:03 +01:00
Rémi Oudin bdfa285e6b We do not want to use settings 2018-02-26 15:14:53 +01:00
Rémi Oudin 65f777f00f Should get the objects and not the Manager 2018-02-26 15:04:26 +01:00
Rémi Oudin 236e40d359 Sanity check 2018-02-26 14:57:46 +01:00
Rémi Oudin 22017cea91 Typo in data u_u 2018-02-26 14:56:22 +01:00
Rémi Oudin 549c861908 Bug fixé 2018-02-26 14:38:26 +01:00
Rémi Oudin 517be1d822 Merge rdf branch 2018-02-26 14:11:06 +01:00
Rémi Oudin c4f63a92b2 Error in the merge, mea culpa 2018-02-26 14:01:29 +01:00
Rémi Oudin db067e56fc Typo 2018-02-26 13:59:34 +01:00
Rémi Oudin 33bdae96e4 merge commit from histories_tobast into histories_models 2018-02-26 12:59:38 +01:00
Rémi Oudin 526aad1364 Add interests 2018-02-26 12:33:23 +01:00
Théophile Bastian 02e91bb2b7 Fix function calls 2018-02-26 11:56:02 +01:00
Théophile Bastian 3e5fc2f9b3 Fix search engine URL generation 2018-02-26 11:49:24 +01:00
Théophile Bastian 45ddbff91a Crawling and histories: fix a lot of stuff 2018-02-26 11:49:24 +01:00
Théophile Bastian e6d587bffd Actually save to DB a created history 2018-02-26 11:49:24 +01:00
Théophile Bastian 8baf408e02 Use dict from data/nicknames_dict for nicknames 2018-02-26 11:49:24 +01:00
Théophile Bastian 6463e348ac Fix populate.sh exec path 2018-02-26 11:48:51 +01:00
Théophile Bastian 22064ebee3 Histories: xml import/export — untested
To be tested when history generation is available
2018-02-26 11:48:51 +01:00
Théophile Bastian a4de51b84a Crawl: do not use global SEARCH_ENGINES 2018-02-26 11:48:51 +01:00
Théophile Bastian 4f0148cb63 Crawler: use a random fingerprint 2018-02-26 11:48:51 +01:00
Théophile Bastian 4a8bd32516 Fix tor_runner import 2018-02-26 11:48:51 +01:00
Rémi Oudin 44cf26df8f It can be useful to save a new object 2018-02-26 11:42:45 +01:00
Rémi Oudin 93b235cb6c Fix interests import 2018-02-25 21:20:52 +01:00
Théophile Bastian 15323c3465 [REBASE ME] Crawl: enhance efficiency and output a tree 2018-02-25 15:08:06 +01:00
Théophile Bastian c3bcdea1eb Add tentative export to RDF 2018-02-25 14:37:30 +01:00
Théophile Bastian 2732e4115f Add RDF models export classes — untested
Also add a dependency to https://github.com/tobast/RDFSerializer/
2018-02-23 13:32:32 +01:00
17 changed files with 611 additions and 156 deletions

.gitignore
View File

@ -65,3 +65,4 @@ venv/
# Django stuff
db.sqlite3
_vimrc_local.vim

View File

@ -1,3 +1,6 @@
# mpri-webdam
Générer tout plein de faux historiques. Parce qu'il faut bien valider ce cours.
Generate realistic fake browsing histories for borderline and/or activists
users, to hide real traffic from global surveillance.
Lacks proper documentation at the moment `:(`

View File

@ -1,11 +1,10 @@
from threading import Thread
from queue import Queue
from urllib.robotparser import RobotFileParser
from urllib.error import URLError
from urllib.parse import urlparse
from ssl import CertificateError
from random import sample, randrange
from random import sample, randrange, randint
import re
from datetime import datetime, timedelta
@ -15,6 +14,8 @@ import async_timeout
from bs4 import BeautifulSoup, Comment
from profiles.models import BrowserFingerprint, SearchEngine
# Ugly hack to use this module alone instead of integrating it with Django
# from django.conf import settings
@ -26,13 +27,11 @@ MAX_PER_PAGE = 10
FOOTER_URL = re.compile(".*footer.*")
SEARCH_ENGINE = []
class Settings:
USER_AGENT = 'Default User'
settings = Settings()
startup_time = datetime.min
def url_getter(html, current_page, root_url):
@ -74,7 +73,7 @@ def url_getter(html, current_page, root_url):
# Works only with python >= 3.6
links_list = list(dict.fromkeys(links_list))
forbidden_words = ['login', 'agreement', 'mailto']
forbidden_words = ['login', 'agreement', 'mailto', 'settings']
links_list = [link for link in links_list if not any(word in link.lower()
for word in
forbidden_words)]
@ -82,8 +81,6 @@ def url_getter(html, current_page, root_url):
return links_list
class WebsiteSchedulerMeta(type):
""" Meta-class for WebsiteScheduler, allowing a singleton class-like
interface, but spawning one instance per canonical website URL """
@ -106,12 +103,17 @@ class WebsiteSchedulerMeta(type):
class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
""" Schedule the accesses to a website as of robots.txt """
def __init__(self, name):
search_engines = [] # Must be set by CrawlingThread.__init__
def __init__(self, name, user_agent):
self.name = name
self.last_crawled = datetime.fromtimestamp(0)
self.dead = False
self.can_fetch_b = False
if any(self.urlroot() in item for item in SEARCH_ENGINE):
self.user_agent = (user_agent if user_agent is not None
else settings.USER_AGENT)
if any(self.urlroot() in item for item in self.search_engines):
print("found a search engine for %s" % self.urlroot())
self.crawl_delay = timedelta(seconds=5)
self.can_fetch_b = True
@ -125,7 +127,7 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
robots_url = self.unsafe_urlroot() + 'robots.txt'
self.robot_parser = RobotFileParser(robots_url)
self.robot_parser.read()
except URLError: # Almost surely an offline website.
except URLError: # Almost surely an offline website.
self.dead = True
self.crawl_delay = 0
except Exception as e:
@ -134,9 +136,9 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
if not self.robot_parser.default_entry:
self.dead = True
if not self.dead:
delay = self.robot_parser.crawl_delay(settings.USER_AGENT)
delay = self.robot_parser.crawl_delay(self.user_agent)
if delay is None:
req_rate = self.robot_parser.request_rate(settings.USER_AGENT)
req_rate = self.robot_parser.request_rate(self.user_agent)
if req_rate is None:
delay = 5
else:
@ -159,7 +161,9 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
def can_fetch(self, url):
''' Check whether this program can fetch a given page '''
return (self.can_fetch_b ) or ((not self.dead) and self.robot_parser.can_fetch(settings.USER_AGENT, url))
return ((self.can_fetch_b)
or ((not self.dead) and
self.robot_parser.can_fetch(self.user_agent, url)))
def fetching(self):
''' Tell the scheduler that a page is being fetched now '''
@ -170,40 +174,47 @@ class CrawlingThread(Thread):
""" A separate thread for the crawling task. This is needed to use asyncio,
since the thread will need its own event loop. """
def __init__(self, user, url, engine_list, queue):
global settings
global SEARCH_ENGINE
SEARCH_ENGINE = engine_list
self.queue = queue
def __init__(self, url):
engine_list = [engine.url for engine in SearchEngine.objects.all()]
WebsiteScheduler.search_engines = engine_list
nb_fingerprint = len(BrowserFingerprint.objects.all())
fingerprint = BrowserFingerprint.objects.all()[
randint(0, nb_fingerprint - 1)]
self.headers = fingerprint.serialize_headers()
self.output_tree = []
super(CrawlingThread, self).__init__()
if user:
settings.USER_AGENT = user.serialize_headers()
self.url = url
def run(self):
global startup_time
tasks = []
#tasks.append(async_crawler("http://plus.google.com/+Python"))
#tasks.append(async_crawler('https://python.org/'))
tasks.append(async_crawler(self.url, self.queue))
tasks.append(run_crawl(self.url, self.output_tree, self.headers))
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
startup_time = datetime.now()
loop.run_until_complete(asyncio.wait(tasks))
loop.close()
try:
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
loop.run_until_complete(asyncio.wait(tasks))
finally:
loop.close()
class PageGetter:
""" Asynchronously get a webpage, abiding by robots.txt """
def __init__(self, session, url):
headers = None
def __init__(self, session, url, user_agent):
self.url = url
self.session = session
self.user_agent = user_agent
async def get(self, ssl=True):
""" Actually retrieve the webpage """
scheduler = WebsiteScheduler(self.url)
scheduler = WebsiteScheduler(self.url, self.user_agent)
if not scheduler.can_fetch(self.url):
return None
@ -214,7 +225,6 @@ class PageGetter:
scheduler.fetching()
async with async_timeout.timeout(10):
async with self.session.get(self.url, verify_ssl=ssl) as resp:
print("Resp status %s" % resp.status)
try:
return await resp.text()
except UnicodeDecodeError:
@ -224,50 +234,89 @@ class PageGetter:
async def async_print(url):
""" Debug function to follow what's actually happening """
async with aiohttp.ClientSession() as session:
html = await PageGetter(session, url).get(ssl=False)
html = await PageGetter(session, url,
settings.USER_AGENT).get(ssl=False)
print('GOT {}HTML for {} at {}'.format(
print('GOT {}HTML for {}'.format(
'None ' if html is None else '',
url,
datetime.now() - startup_time))
))
async def async_crawler(url, queue):
queued = [url]
crawled = []
while queued and (len(crawled) < HARD_LIMIT):
async with aiohttp.ClientSession() as session:
try:
url = queued.pop(0)
except IndexError:
print("Error queue is empty")
return crawled
parsed_url = urlparse(url)
print("Crawling {}".format(url))
html = await PageGetter(session, url).get(ssl=False)
if html:
new_urls = url_getter(
html,
url,
parsed_url.scheme + "://" + parsed_url.netloc
)
crawled += [url]
if new_urls:
sampled = sample(
new_urls,
randrange(min(MAX_PER_PAGE, len(new_urls)))
)
queued += [sample_url for sample_url in sampled if
sample_url not in queued and sample_url not in
crawled]
else:
print("No html received")
print(crawled)
queue.put(crawled)
if __name__ == '__main__':
queue = Queue()
crawl = CrawlingThread(None,
"https://google.com/search?q=fabriquer+masque+manif",
["https://google.com/search/"], queue)
crawl.start()
crawl.join()
class CrawlElem:
''' Describes a crawled element, to be assembled into a tree '''
def __init__(self, url, parent):
self.url = url
self.parent = parent
async def run_crawl(url, output_tree, headers=None):
''' Starts a crawling session '''
if headers is None:
headers = {}
if 'User-Agent' not in headers:
headers['User-Agent'] = settings.USER_AGENT
user_agent = headers['User-Agent']
crawled = set()
async with aiohttp.ClientSession(headers=headers) as session:
await async_crawler(
url, output_tree, crawled, user_agent, session, None)
def simplify_url(url):
anchor = url.find('#')
if anchor >= 0:
url = url[:anchor]
prot = url.find('://')
if prot >= 0:
url = url[prot+3:]
if url.startswith('www.'):
url = url[4:]
return url
async def async_crawler(url, out_tree, crawled, user_agent, session, parent):
if len(crawled) >= HARD_LIMIT:
return
crawled.add(simplify_url(url))
parsed_url = urlparse(url)
print("Crawling {}".format(url))
try:
with async_timeout.timeout(3):
html = await PageGetter(session, url, user_agent).get(ssl=False)
except asyncio.TimeoutError:
return
new_tasks = []
if html:
this_elem = CrawlElem(url, parent)
out_tree.append(this_elem)
new_urls = url_getter(
html,
url,
parsed_url.scheme + "://" + parsed_url.netloc
)
if new_urls:
sampled = sample(
new_urls,
randrange(min(MAX_PER_PAGE, len(new_urls)))
)
for sample_url in sampled:
if simplify_url(sample_url) not in crawled:
new_tasks.append(async_crawler(
sample_url, out_tree, crawled, user_agent, session,
this_elem))
else:
print("No html received")
if len(crawled) >= HARD_LIMIT:
return
if new_tasks:
await asyncio.wait(new_tasks)
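
For orientation, not part of the diff itself: the refactored crawler above is now driven synchronously from histories/models.py, which reads the resulting CrawlElem tree instead of pulling URLs out of a queue. A minimal usage sketch (the start URL is illustrative, taken from the stock data):

    # Minimal sketch of how the refactored crawler is used; mirrors
    # generate_partial_history() further down in this compare.
    # Requires Django to be configured, since __init__ reads SearchEngine
    # and BrowserFingerprint objects from the database.
    from crawl import crawl

    crawler = crawl.CrawlingThread("https://paris-luttes.info/")
    crawler.start()
    crawler.join()                     # returns once HARD_LIMIT pages are reached or links run out
    for elem in crawler.output_tree:   # CrawlElem objects carry a url and a parent
        print(elem.url, elem.parent.url if elem.parent else "<root>")

The simplify_url() helper is what keeps the crawl from revisiting a page under different spellings: it drops the fragment, the scheme and a leading "www.", so "https://www.paris-luttes.info/article#top" and "http://paris-luttes.info/article" collapse to the same key in the crawled set.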

data/.gitignore
View File

@ -0,0 +1 @@
nicknames_dict

View File

@ -13,6 +13,13 @@
"query_pattern":"?q={}"
}
},
{
"searchengine": {
"name":"Duckduckgo Lite",
"url":"https://duckduckgo.com/lite/",
"query_pattern":"?q={}"
}
},
{
"searchengine": {
"name":"Qwant",

View File

@ -17,7 +17,7 @@
},
{
"name":"paris-luttes info",
"url":"https//paris-luttes.info/",
"url":"https://paris-luttes.info/",
"keywords": [
{"keyword":"manifestations"},
{"keyword":"solidarité immigré·e·s"},

View File

@ -0,0 +1,16 @@
from django.core.management.base import BaseCommand
from profiles import models as profiles
from histories.models import generate_history
from datetime import datetime
class Command(BaseCommand):
''' Generates an history and prints the related XML '''
def add_arguments(self, parser):
pass
def handle(self, *args, **kwargs):
prof = profiles.Profile.objects.all()[0]
history = generate_history(prof, datetime.now())
print(history.to_xml_string())
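
Assuming this command file is installed under a management/commands/ directory as gen_history.py (the commit message "Add gen_history django-admin command" suggests that name), it would be run as `python3 manage.py gen_history` after populate.sh has loaded at least one profile, since handle() picks profiles.Profile.objects.all()[0]. The XML it prints is sketched after the histories/models.py diff below.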

View File

@ -0,0 +1,34 @@
# Generated by Django 2.0.1 on 2018-02-25 19:08
from django.db import migrations, models
import django.db.models.deletion
class Migration(migrations.Migration):
initial = True
dependencies = [
('profiles', '0001_initial'),
]
operations = [
migrations.CreateModel(
name='History',
fields=[
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('start_ts', models.DateTimeField(help_text='The starting timestamp of the history. Useful for cron-like structure.')),
('played', models.BooleanField(default=False)),
('user', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='profiles.Profile')),
],
),
migrations.CreateModel(
name='HistoryEntry',
fields=[
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('search', models.URLField(help_text='The url to be searched')),
('timestamp', models.DateTimeField()),
('history', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='histories.History')),
],
),
]

View File

@ -3,14 +3,27 @@ entries, which looks like human-based browsing, according to a dedicated user
interests, keywords...
"""
from collections import namedtuple
import random
import asyncio
from math import floor
from queue import Queue
from xml.etree import ElementTree as ET
from datetime import datetime
from django.db import models
from django.core.exceptions import ValidationError
import profiles.models as profiles
from tor_runner import TorInstance
from crawl import crawl
from pinocchio.settings import HISTORY_MIN
from .tor_runner import TorInstance
class InvalidXml(Exception):
def __init__(self, what='unexpected XML data.'):
super().__init__()
self.what = what
def __str__(self):
return "Invalid XML: " + self.what
class HistoryEntry(models.Model):
@ -28,14 +41,48 @@ class HistoryEntry(models.Model):
"""
return "{} : {}".format(self.timestamp, self.search)
def to_xml(self, xml_root):
entry = ET.Element('history')
entry_url = ET.Element('url')
entry_url.text = str(self.search)
entry_ts = ET.Element('timestamp')
entry_ts.text = str(self.timestamp.timestamp())
entry.append(entry_url)
entry.append(entry_ts)
xml_root.append(entry)
@staticmethod
def from_xml(xml_root, in_history):
if xml_root.tag != 'history':
raise InvalidXml("expected <history> tag here.")
url, timestamp = None, None
for child in xml_root:
if child.tag == 'url':
url = child.text
elif child.tag == 'timestamp':
try:
timestamp = datetime.fromtimestamp(child.text)
except TypeError:
raise InvalidXml("invalid timestamp {}".format(child.text))
else:
raise InvalidXml("unknown tag {} as child of <history>".format(
child.tag))
output = HistoryEntry()
output.search = url
output.timestamp = timestamp
output.history = in_history
return output
class History(models.Model):
""" A history for a user, containing some web connections (http, https).
Each history is timed, in a human-behaviour manner. """
start_ts = models.DateTimeField(
help_text='The starting timestamp of the history. Useful for cron-like '
'structure.'
help_text=('The starting timestamp of the history. Useful for '
'cron-like structure.')
)
played = models.BooleanField(default=False)
@ -47,24 +94,90 @@ class History(models.Model):
def return_history(self):
""" Returns the history, sorted by increasing timestamps
"""
history_set = self.history_set.order_by('timestamp')
history_set = [(item.search, item.timestamp.date()) for item in history_set]
return history_set
output_history = self.historyentry_set.order_by('timestamp')
output_history = [(item.search, item.timestamp.date())
for item in output_history]
return output_history
def __str__(self):
""" Returns the string representation of a history.
"""
history_set = self.history_set.order_by('timestamp')
header = "[History]:\n"
return header + "\n".join(history_set)
entries = self.historyentry_set.order_by('timestamp')
output = "[History]:\n"
for entry in entries:
output += str(entry) + '\n'
return output
async def _handler(self):
runner = await TorInstance.create(self.return_history(), self.user.browser_fingerprint.serialize_headers())
await runner.run()
self.played = True
self.save()
def play_histories(self):
""" Actually plays the history.
"""
self.played = True
runner = TorInstance(self.history)
self.save()
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
loop.run_until_complete(asyncio.wait([self._handler()]))
def to_xml(self, xml_root=None):
''' Exports the current history to xml '''
standalone = False
if xml_root is None:
standalone = True
xml_root = ET.Element('root')
hist_node = ET.Element("history", attrib={
'start-ts': str(self.start_ts),
'played': '1' if self.played else '0',
'user': str(self.user.pk),
})
xml_root.append(hist_node)
for entry in self.historyentry_set.all():
entry.to_xml(hist_node)
if standalone:
return xml_root
def to_xml_string(self):
xml = self.to_xml()
return ET.tostring(xml)
@staticmethod
def from_xml(xml_root):
''' Loads an history from an XML file '''
REQUIRED_ATTR = ['start-ts', 'played', 'user']
if xml_root.tag != 'history':
raise InvalidXml('unexpected node {} as root of an history'.format(
xml_root.tag))
for attr in REQUIRED_ATTR:
if attr not in xml_root.attrib:
raise InvalidXml(('missing attribute "{}" for tag of type '
'history').format(attr))
start_ts = xml_root.attrib['start-ts']
played = xml_root.attrib['played']
user_pk = xml_root.attrib['user']
users = History.objects.filter(pk=1)
if len(users) != 1:
raise InvalidXml('primary key for History {} is invalid'.format(
user_pk))
output = History()
output.start_ts = start_ts
output.played = played > 0
output.user = users[0]
for child in xml_root:
HistoryEntry.from_xml(child, output)
return output
PartialHistoryEntry = namedtuple('PartialHistoryEntry',
['url', 'timestamp'])
def generate_partial_history(user, t_start):
@ -74,36 +187,57 @@ def generate_partial_history(user, t_start):
timestamp = t_start
result = []
basis = generate_first_url(user)
result.append((basis, timestamp))
timestamp += 5* random.weibullvariate(1, 1.5)
queue = Queue()
search_engine_query = profiles.SearchEngine.objects.all()
search_engine_list = [item.url for item in search_engine_query]
crawler = crawl.CrawlingThread(user, basis, search_engine_list, queue)
t_start += 5 * random.weibullvariate(1, 1.5)
crawler = crawl.CrawlingThread(basis)
crawler.start()
crawler.join()
urls = queue.get()
for url in urls:
timestamp += 5* random.weibullvariate(1, 1.5)
result.append((url, timestamp))
urls_tree = crawler.output_tree
open_time = {}
for elem in urls_tree:
url, parent = elem.url, elem.parent
timestamp = 0
if parent is None:
timestamp = t_start
else:
timestamp = open_time[parent] + 5 * random.weibullvariate(1, 1.5)
open_time[elem] = timestamp
result.append(PartialHistoryEntry(url, timestamp))
return result
def generate_first_url(user):
""" Generate the first url of a partial history, based on the user
information. """
interest = random.choice(
[user.interests.keywords.all(), user.interests.places.all(),
user.interests.websites.all(), user.interests.events.all()
]
)
def nonempty(seq):
out = []
for elt in seq:
if elt:
out.append(elt)
return out
all_keywords = profiles.Keyword.objects.filter(
interest__profile__in=[user])
all_websites = profiles.Website.objects.filter(
interest__profile__in=[user])
all_places = profiles.Place.objects.filter(
interest__profile__in=[user])
all_events = profiles.Event.objects.filter(
interest__profile__in=[user])
interest = random.choice(nonempty([
all_keywords,
all_websites,
all_places,
all_events,
]))
search_term = random.choice(interest)
url = search_term.generate_url(user)
return url
def generate_history(user, ts_start):
def generate_history(user, start_time):
""" Generate a new history for the user `user`, starting from timestamp
`ts_start`.
A few heuristics are used in order to give the impression that the history
@ -111,19 +245,32 @@ def generate_history(user, ts_start):
"""
# let's define a new history object.
history = History(start_ts=ts_start, user=user)
history = History(start_ts=start_time, user=user)
length = HISTORY_MIN + floor(10 * random.weibullvariate(1, 1.5))
history.full_clean()
history.save()
history_line = 0
current_timestamp = start_time.timestamp()
while history_line < length:
ts_start += 5 * random.weibullvariate(1, 2.8)
history_list = generate_partial_history(user, ts_start)
ts_start = history_list[-1].timestamp + 5 * random.weibullvariate(1, 5)
hist_size = 0
while hist_size < length:
current_timestamp += 5 * random.weibullvariate(1, 2.8)
history_list = generate_partial_history(user, current_timestamp)
current_timestamp = \
history_list[-1].timestamp + 5 * random.weibullvariate(1, 5)
for (url, timestamp) in history_list:
new_line = HistoryEntry(
search=url,
timestamp=timestamp,
history=history
)
new_line.save()
if len(url) < 200:
new_line = HistoryEntry(
search=url,
timestamp=datetime.fromtimestamp(timestamp),
history=history
)
try:
new_line.full_clean()
new_line.save()
hist_size += 1
except ValidationError:
continue
return history
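
To make the new export format concrete, here is a round-trip sketch based on the to_xml()/from_xml() methods above (illustrative values, not part of the diff). Note that the container and each entry both use a <history> tag; only the nesting distinguishes them.

    # Round-trip sketch for the XML import/export added above.
    # Requires a saved History with some HistoryEntry rows.
    from xml.etree import ElementTree as ET
    from histories.models import History

    hist = History.objects.first()
    data = hist.to_xml_string()
    # roughly: b'<root><history start-ts="..." played="0" user="1">
    #             <history><url>https://...</url><timestamp>1519657200.0</timestamp></history>
    #             ...</history></root>'
    restored = History.from_xml(ET.fromstring(data)[0])  # from_xml() expects the <history> node itself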

View File

@ -58,7 +58,9 @@ class TorInstance():
async def run(self):
""" Runs the Tor Instance on the history.
"""
while (self.history[0][1] - dt.datetime.now()).total_seconds >= 10:
while (self.history) and (dt.datetime.combine(self.history[0][1],
dt.datetime.min.time()) -
dt.datetime.now()).total_seconds() >= 10:
print("Sleeping")
sleep(10)
while self.history:
@ -66,8 +68,9 @@ class TorInstance():
async with async_timeout.timeout(30):
await(self.query(item[0]))
now = dt.datetime.now()
if now <= self.history[0][1]:
sleep((self.history[0][1] - now).total_seconds())
print(self.history[0])
if now <= dt.datetime.combine(self.history[0][1], dt.datetime.min.time()):
sleep((dt.datetime.combine(self.history[0][1], dt.datetime.min.time()) - now).total_seconds())
def create_session(self):
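
For readers unfamiliar with the idiom: dt.datetime.combine(d, dt.datetime.min.time()) lifts a date (which is what return_history() now yields as the second tuple element) to the corresponding midnight datetime, so it can be compared with and subtracted from dt.datetime.now(). A quick illustration with a made-up date:

    import datetime as dt
    d = dt.date(2018, 2, 26)                        # shape of history[0][1]
    dt.datetime.combine(d, dt.datetime.min.time())  # -> datetime.datetime(2018, 2, 26, 0, 0)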

View File

@ -97,7 +97,7 @@ USE_I18N = True
USE_L10N = True
USE_TZ = True
USE_TZ = False # We don't really care, we want POSIX timestamps
# Static files (CSS, JavaScript, Images)
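
Context for this switch (my reading, not stated in the diff): with USE_TZ = False, Django stores naive local datetimes, so values like datetime.fromtimestamp(timestamp) built in generate_history() above can be saved without the "received a naive datetime while time zone support is active" warning that USE_TZ = True would raise.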

View File

@ -1,11 +1,10 @@
#!/bin/bash
# -*- coding: UTF8 -*-
/usr/bin/python3 manage.py import_browser_fp
/usr/bin/python3 manage.py import_search_engine
/usr/bin/python3 manage.py import_keywords
/usr/bin/python3 manage.py import_website
/usr/bin/python3 manage.py import_places
/usr/bin/python3 manage.py import_events
/usr/bin/python3 manage.py import_interests
python3 manage.py import_browser_fp
python3 manage.py import_search_engine
python3 manage.py import_keywords
python3 manage.py import_website
python3 manage.py import_places
python3 manage.py import_events
python3 manage.py import_interests

View File

@ -0,0 +1,27 @@
from django.core.management.base import BaseCommand
from profiles.models_rdf import RdfProfile
from profiles import models
class Command(BaseCommand):
''' Exports database models to RDF '''
def add_arguments(self, parser):
pass
def handle(self, *args, **kwargs):
exported_models = [
models.Keyword,
models.Webpage,
models.Website,
models.Place,
models.Event,
models.BrowserFingerprint,
models.SearchEngine,
models.Interest,
models.Profile,
]
output_xml = RdfProfile().serialize(
# models=exported_models,
)
self.stdout.write(output_xml)

View File

@ -5,6 +5,7 @@ import json
from datetime import datetime
from django.core.management.base import BaseCommand
from django.db import models
from django.core.exceptions import ObjectDoesNotExist
from profiles.models import Keyword, Interest, Place, Website, Event
def import_file(filename):
@ -19,15 +20,14 @@ def import_interest(_interest):
places = []
websites = []
for keyword in _interest.get("keywords", []):
if not Keyword.objects.get(keyword["keyword"]):
keywords.append(
Keyword(
text=keyword["keyword"]
)
)
print("New keyword %s" % new_keywords)
else:
keywords.append(Keyword.objects.get(text=keyword["keyword"]))
try:
stored = Keyword.objects.get(text=keyword["keyword"])
keywords.append(stored)
except ObjectDoesNotExist:
new_keyword = Keyword(text=keyword["keyword"])
new_keyword.save()
keywords.append(new_keyword)
print("New keyword %s" % new_keyword)
for place in _interest.get("places", []):
places.append(Place.objects.get(name=place["place"]))
for website in _interest.get("websites", []):
@ -36,7 +36,9 @@ def import_interest(_interest):
interest = Interest(
name=_interest.get("name", ""),
)
interest.save()
for keyword in keywords:
print(keyword)
interest.keywords.add(keyword)
for place in places:
interest.places.add(place)
@ -46,4 +48,4 @@ def import_interest(_interest):
class Command(BaseCommand):
def handle(self, *args, **kwargs):
import_file("data/events.json")
import_file("data/interests.json")

View File

@ -12,12 +12,36 @@ from django.db import models
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
NICKNAMES = open("/usr/share/dict/american-english").read().splitlines()
NICKNAMES = None
LASTNAMES = open(BASE_DIR + "/data/lastnames.txt").read().splitlines()
FIRSTNAMES = open(BASE_DIR + "/data/firstnames.txt").read().splitlines()
EMAIL_DOMAINS = open(BASE_DIR + "/data/email_domains.txt").read().splitlines()
def require_nicknames(fct):
def read_file(path):
global NICKNAMES
print("Trying {}".format(path))
with open(path, 'r') as handle:
NICKNAMES = handle.read().splitlines()
nicknames_files = [
os.path.join(BASE_DIR, 'data/nicknames_dict'),
"/usr/share/dict/american-english",
]
if NICKNAMES is None:
for nick_file in nicknames_files:
try:
read_file(nick_file)
break
except FileNotFoundError:
pass
if NICKNAMES is None:
raise FileNotFoundError
return fct
class InvalidData(Exception):
''' Thrown when the DB contains invalid data, and cannot perform
something '''
@ -67,13 +91,13 @@ class Website(models.Model):
""" Generates the url in case the interest chosen is a website.
"""
rand = random.random()
if user.uses_url:
if user.uses_urls:
url = self.url
elif rand <= 0.1:
url = random.choice(self.notable_pages).url
url = random.choice(self.notable_pages.all()).url
elif rand <= 0.8:
search_term_text = self.name + " " + \
random.choice(self.keywords)
search_term_text = self.name + " " + \
str(random.choice(self.keywords.all()))
url = user.search_engine.search_url(search_term_text)
else:
url = user.search_engine.search_url(self.name)
@ -122,7 +146,6 @@ class Event(models.Model):
return user.search_engine.search_url(" ".join(possibilities))
class BrowserFingerprint(models.Model):
''' A browser fingerprint, containing things like a user agent '''
@ -147,11 +170,11 @@ class BrowserFingerprint(models.Model):
def serialize_headers(self):
return {
"Description" : str(self.description),
"User-Agent" : str(self.useragent),
"Accept-Encoding" : str(self.accept_encoding),
"Accept" : str(self.accept_default),
"Accept-Language" : str(self.accept_lang),
"Description": str(self.description),
"User-Agent": str(self.useragent),
"Accept-Encoding": str(self.accept_encoding),
"Accept": str(self.accept_default),
"Accept-Language": str(self.accept_lang),
}
@ -162,8 +185,8 @@ class SearchEngine(models.Model):
url = models.URLField()
query_pattern = models.CharField(max_length=256) # This field is the
# query pattern. It should contain a `{}`, which, when substituted with a
# search term (using `.format()`), must yield a URL that can be resolved to
# perform the search
# search term (using `.format()`), must yield a URL tail that can be
# concatenated with `url` to perform a search (eg. `?q={}` for ddg).
def __str__(self):
return self.name
@ -171,9 +194,10 @@ class SearchEngine(models.Model):
def search_url(self, search_term):
''' Obtain a url to search `search_term` with this search engine '''
pattern = str(self.query_pattern)
search_term = str(search_term).replace(' ', '+')
if '{}' not in pattern:
raise InvalidData("Search engine {}: bad pattern".format(self))
return str(self.query_pattern).format(search_term)
return self.url + (str(self.query_pattern).format(search_term))
class Interest(models.Model):
@ -214,11 +238,13 @@ def generate_email(nick, first_name, last_name):
if random.random() < 0.3:
email = first_name + "." + last_name + "@" + domain
else:
email = nick + "@" + domain
email = nick + "@" + domain
return email
@require_nicknames
def create_profile(nick=None):
nick = "".join(random.sample(NICKNAMES, random.randrange(2,5)))
nick = "".join(random.sample(NICKNAMES, random.randrange(2, 5)))
first_name = random.choice(FIRSTNAMES)
last_name = random.choice(LASTNAMES)
email = generate_email(nick, first_name, last_name)
@ -227,7 +253,13 @@ def create_profile(nick=None):
first_name=first_name,
last_name=last_name,
email=email,
uses_url=(random.random() < 0.5),
uses_urls=(random.random() < 0.5),
)
profile.search_engine = random.choice(SearchEngine.objects.all())
profile.browser_fingerprint = random.choice(BrowserFingerprint.objects.all())
profile.full_clean()
profile.save()
profile.interests.add(random.choice(Interest.objects.all()))
profile.save()
return profile
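
To see the new search-URL semantics end to end, here is a sketch using the Duckduckgo Lite engine added to data/search_engine.json earlier in this compare (the search term is the one from the old crawl.py test code):

    # search_url() now replaces spaces with '+' and appends the formatted
    # query_pattern to the engine's base url.
    engine = SearchEngine(name="Duckduckgo Lite",
                          url="https://duckduckgo.com/lite/",
                          query_pattern="?q={}")
    engine.search_url("fabriquer masque manif")
    # -> 'https://duckduckgo.com/lite/?q=fabriquer+masque+manif'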

profiles/models_rdf.py
View File

@ -0,0 +1,131 @@
""" RDF serialization class for profile models """
import rdfserializer as rdf
from rdfserializer import RDFModelSerialiser as RDFModelSerializer
# ^ This was hurting my eyes way too much
from rdfserializer import SCHEMA as schema
from rdflib.namespace import Namespace
import profiles.models as profile_models
LOCAL_NS = Namespace('local:')
class RdfWebpage(RDFModelSerializer):
""" RDF serializer for Webpage """
_type = schema.WebPage
model = profile_models.Webpage
entries = [
rdf.RDFSimpleField(schema.url, 'url'),
]
class RdfWebsite(RDFModelSerializer):
""" RDF serializer for Website """
_type = schema.WebSite
model = profile_models.Website
entries = [
rdf.RDFSimpleField(schema.name, 'name'),
rdf.RDFSimpleField(schema.url, 'url'),
rdf.RDFManyField(schema.keywords, 'keywords',
lambda keyword: keyword.text),
rdf.RDFManyLinker(schema.hasPart, 'notable_pages', RdfWebpage),
]
class RdfPlace(RDFModelSerializer):
""" RDF serializer for Place """
_type = schema.Place
model = profile_models.Place
entries = [
rdf.RDFSimpleField(schema.name, 'name'),
rdf.RDFSimpleField(schema.address, 'address'),
rdf.RDFSimpleField(schema.latitude, 'lat'),
rdf.RDFSimpleField(schema.longitude, 'lon'),
]
class RdfEvent(RDFModelSerializer):
""" RDF serializer for Event """
_type = schema.Event
model = profile_models.Event
entries = [
rdf.RDFSimpleField(schema.name, 'name'),
rdf.RDFSimpleField(schema.startDate, 'date'),
rdf.RDFLeftBinder(schema.location, 'place', RdfPlace),
]
class RdfBrowserFingerprint(RDFModelSerializer):
""" RDF serializer for BrowserFingerprint """
_type = schema.Intangible
model = profile_models.BrowserFingerprint
entries = [
rdf.RDFSimpleField(schema.description, 'description'),
rdf.RDFSimpleField(LOCAL_NS.useragent, 'useragent'),
rdf.RDFSimpleField(LOCAL_NS.appname, 'appname'),
rdf.RDFSimpleField(LOCAL_NS.appversion, 'appversion'),
rdf.RDFSimpleField(LOCAL_NS.platform, 'platform'),
rdf.RDFSimpleField(LOCAL_NS.vendor, 'vendor'),
rdf.RDFSimpleField(LOCAL_NS.vendorsub, 'vendorsub'),
rdf.RDFSimpleField(LOCAL_NS.buildID, 'buildID'),
rdf.RDFSimpleField(LOCAL_NS.oscpu, 'oscpu'),
rdf.RDFSimpleField(LOCAL_NS.accept_encoding, 'accept_encoding'),
rdf.RDFSimpleField(LOCAL_NS.accept_default, 'accept_default'),
rdf.RDFSimpleField(LOCAL_NS.accept_lang, 'accept_lang'),
rdf.RDFSimpleField(LOCAL_NS.pixeldepth, 'pixeldepth'),
rdf.RDFSimpleField(LOCAL_NS.colordepth, 'colordepth'),
rdf.RDFSimpleField(LOCAL_NS.screens, 'screens'),
]
class RdfSearchEngine(RDFModelSerializer):
""" RDF serializer for SearchEngine """
_type = schema.WebSite
model = profile_models.SearchEngine
entries = [
rdf.RDFSimpleField(schema.url, 'url'),
rdf.RDFSimpleField(schema.name, 'name'),
rdf.RDFSimpleField(LOCAL_NS.query_pattern, 'query_pattern'),
]
class RdfInterest(RDFModelSerializer):
""" RDF serializer for Interest """
Interesttype = 'interest'
model = profile_models.Interest
entries = [
rdf.RDFSimpleField(schema.name, 'name'),
rdf.RDFManyField(schema.keywords, 'keywords',
lambda keyword: keyword.text),
rdf.RDFManyLinker(schema.location, 'places', RdfPlace),
rdf.RDFManyLinker(schema.website, 'websites', RdfWebsite),
rdf.RDFManyLinker(schema.event, 'events', RdfEvent),
]
class RdfProfile(RDFModelSerializer):
""" RDF serializer for Profile """
_type = schema.Person
model = profile_models.Profile
entries = [
rdf.RDFSimpleField(LOCAL_NS.nickname, 'nick'),
rdf.RDFSimpleField(schema.given_name, 'first_name'),
rdf.RDFSimpleField(schema.family_name, 'last_name'),
rdf.RDFSimpleField(schema.email, 'email'),
rdf.RDFSimpleField(LOCAL_NS.uses_urls, 'uses_urls'),
rdf.RDFManyLinker(LOCAL_NS.interest, 'interests', RdfInterest),
rdf.RDFLeftBinder(LOCAL_NS.search_engine, 'search_engine',
RdfSearchEngine),
rdf.RDFLeftBinder(LOCAL_NS.browser_fingerprint, 'browser_fingerprint',
RdfBrowserFingerprint)
]

View File

@ -14,3 +14,6 @@ yarl==1.1.1
beautifulsoup4==4.6.0
stem==1.6.0
pycurl==7.43.0.1
rdflib==4.2.2
git+https://github.com/tobast/RDFSerializer.git
aiosocks==0.2.6