Compare commits

..

9 commits

14 changed files with 102 additions and 362 deletions

.gitignore (vendored): 1 line changed
View file

@@ -65,4 +65,3 @@ venv/
 # Django stuff
 db.sqlite3
-_vimrc_local.vim

View file

@@ -1,6 +1,3 @@
 # mpri-webdam
 
-Generate realistic fake browsing histories for borderline and/or activists
-users, to hide real traffic from global surveillance.
-
-Lacks proper documentation at the moment `:(`
+Générer tout plein de faux historiques. Parce qu'il faut bien valider ce cours.

View file

@@ -1,4 +1,5 @@
 from threading import Thread
+from queue import Queue
 from urllib.robotparser import RobotFileParser
 from urllib.error import URLError
 from urllib.parse import urlparse
@@ -73,7 +74,7 @@ def url_getter(html, current_page, root_url):
     # Works only with python >= 3.6
     links_list = list(dict.fromkeys(links_list))
 
-    forbidden_words = ['login', 'agreement', 'mailto', 'settings']
+    forbidden_words = ['login', 'agreement', 'mailto']
     links_list = [link for link in links_list if not any(word in link.lower()
                                                          for word in
                                                          forbidden_words)]
@@ -174,7 +175,7 @@ class CrawlingThread(Thread):
     """ A separate thread for the crawling task. This is needed to use asyncio,
     since the thread will need its own event loop. """
-    def __init__(self, url):
+    def __init__(self, url, queue):
 
         engine_list = [engine.url for engine in SearchEngine.objects.all()]
         WebsiteScheduler.search_engines = engine_list
@@ -183,7 +184,7 @@ class CrawlingThread(Thread):
             randint(0, nb_fingerprint - 1)]
         self.headers = fingerprint.serialize_headers()
 
-        self.output_tree = []
+        self.queue = queue
 
         super(CrawlingThread, self).__init__()
         self.url = url
@@ -192,14 +193,12 @@
         #tasks.append(async_crawler("http://plus.google.com/+Python"))
         #tasks.append(async_crawler('https://python.org/'))
-        tasks.append(run_crawl(self.url, self.output_tree, self.headers))
-        try:
-            loop = asyncio.new_event_loop()
-            asyncio.set_event_loop(loop)
-            loop.run_until_complete(asyncio.wait(tasks))
-        finally:
-            loop.close()
+        tasks.append(async_crawler(self.url, self.queue, self.headers))
+        loop = asyncio.new_event_loop()
+        asyncio.set_event_loop(loop)
+        loop.run_until_complete(asyncio.wait(tasks))
+        loop.close()
 
 
 class PageGetter:
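Aside: both versions of run() follow the standard recipe for driving asyncio from a worker thread, since each thread needs its own event loop; the '-' side only adds a try/finally so the loop is closed even when a task raises. A minimal, self-contained sketch of that recipe (class and coroutine names here are illustrative, not from the project):

    import asyncio
    from threading import Thread

    class LoopThread(Thread):
        """ Runs one coroutine on an event loop private to this thread. """
        def __init__(self, coro):
            super().__init__()
            self.coro = coro

        def run(self):
            loop = asyncio.new_event_loop()   # a loop owned by this thread only
            asyncio.set_event_loop(loop)
            try:
                loop.run_until_complete(self.coro)
            finally:
                loop.close()                  # release resources even on error

    async def demo():
        await asyncio.sleep(0.1)

    thread = LoopThread(demo())
    thread.start()
    thread.join()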
@@ -225,6 +224,7 @@ class PageGetter:
         scheduler.fetching()
         async with async_timeout.timeout(10):
             async with self.session.get(self.url, verify_ssl=ssl) as resp:
+                print("Resp status %s" % resp.status)
                 try:
                     return await resp.text()
                 except UnicodeDecodeError:
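The '+' line only logs resp.status, but the surrounding try/except is worth spelling out: aiohttp's resp.text() decodes the body with the charset the server declares and raises UnicodeDecodeError when that declaration is wrong, which is presumably why the fallback exists. A standalone sketch of the same defensive fetch (the URL is a placeholder):

    import asyncio
    import aiohttp

    async def fetch(url):
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as resp:
                print("Resp status %s" % resp.status)
                try:
                    return await resp.text()          # decode with the declared charset
                except UnicodeDecodeError:
                    raw = await resp.read()           # fall back to the raw bytes
                    return raw.decode('utf-8', errors='replace')

    loop = asyncio.new_event_loop()
    html = loop.run_until_complete(fetch("https://example.org/"))
    loop.close()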
@@ -234,8 +234,7 @@
 async def async_print(url):
     """ Debug function to follow what's actually happening """
     async with aiohttp.ClientSession() as session:
-        html = await PageGetter(session, url,
-                                settings.USER_AGENT).get(ssl=False)
+        html = await PageGetter(session, url).get(ssl=False)
 
     print('GOT {}HTML for {}'.format(
         'None ' if html is None else '',
@@ -243,80 +242,48 @@ async def async_print(url):
     ))
 
-class CrawlElem:
-    ''' Describes a crawled element, to be assembled into a tree '''
-
-    def __init__(self, url, parent):
-        self.url = url
-        self.parent = parent
-
-
-async def run_crawl(url, output_tree, headers=None):
-    ''' Starts a crawling session '''
-
-    if headers is None:
-        headers = {}
-    if 'User-Agent' not in headers:
-        headers['User-Agent'] = settings.USER_AGENT
-    user_agent = headers['User-Agent']
-
-    crawled = set()
-
-    async with aiohttp.ClientSession(headers=headers) as session:
-        await async_crawler(
-            url, output_tree, crawled, user_agent, session, None)
-
-
-def simplify_url(url):
-    anchor = url.find('#')
-    if anchor >= 0:
-        url = url[:anchor]
-    prot = url.find('://')
-    if prot >= 0:
-        url = url[prot+3:]
-    if url.startswith('www.'):
-        url = url[4:]
-    return url
-
-
-async def async_crawler(url, out_tree, crawled, user_agent, session, parent):
-    if len(crawled) >= HARD_LIMIT:
-        return
-    crawled.add(simplify_url(url))
-    parsed_url = urlparse(url)
-    print("Crawling {}".format(url))
-    try:
-        with async_timeout.timeout(3):
-            html = await PageGetter(session, url, user_agent).get(ssl=False)
-    except asyncio.TimeoutError:
-        return
-
-    new_tasks = []
-
-    if html:
-        this_elem = CrawlElem(url, parent)
-        out_tree.append(this_elem)
-        new_urls = url_getter(
-            html,
-            url,
-            parsed_url.scheme + "://" + parsed_url.netloc
-        )
-        if new_urls:
-            sampled = sample(
-                new_urls,
-                randrange(min(MAX_PER_PAGE, len(new_urls)))
-            )
-            for sample_url in sampled:
-                if simplify_url(sample_url) not in crawled:
-                    new_tasks.append(async_crawler(
-                        sample_url, out_tree, crawled, user_agent, session,
-                        this_elem))
-    else:
-        print("No html received")
-    if len(crawled) >= HARD_LIMIT:
-        return
-    if new_tasks:
-        await asyncio.wait(new_tasks)
+async def async_crawler(url, queue, headers=None):
+    if headers is None:
+        headers = {
+            'User-Agent': settings.USER_AGENT,
+        }
+    queued = [url]
+    crawled = []
+    while queued and (len(crawled) < HARD_LIMIT):
+        async with aiohttp.ClientSession(headers=headers) as session:
+            try:
+                url = queued.pop(0)
+            except IndexError:
+                print("Error queue is empty")
+                return crawled
+            parsed_url = urlparse(url)
+            print("Crawling {}".format(url))
+            html = await PageGetter(session, url).get(ssl=False)
+            if html:
+                new_urls = url_getter(
+                    html,
+                    url,
+                    parsed_url.scheme + "://" + parsed_url.netloc
+                )
+                crawled += [url]
+                if new_urls:
+                    sampled = sample(
+                        new_urls,
+                        randrange(min(MAX_PER_PAGE, len(new_urls)))
+                    )
+                    queued += [sample_url for sample_url in sampled if
+                               sample_url not in queued and sample_url not in
+                               crawled]
+            else:
+                print("No html received")
+    print(crawled)
+    queue.put(crawled)
+
+
+if __name__ == '__main__':
+    queue = Queue()
+    crawl = CrawlingThread(None,
+                           "https://google.com/search?q=fabriquer+masque+manif",
+                           ["https://google.com/search/"], queue)
+    crawl.start()
+    crawl.join()
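On the '+' side the crawl results travel through a queue.Queue instead of the shared output_tree list, so the caller gets a single synchronisation point. A sketch of the intended calling pattern, assuming the two-argument CrawlingThread(url, queue) constructor of that side (it mirrors the generate_partial_history change further down in this compare):

    from queue import Queue

    queue = Queue()
    crawler = CrawlingThread("https://example.org/", queue)  # class shown above
    crawler.start()
    crawler.join()        # run() ends with queue.put(crawled)
    urls = queue.get()    # list of crawled URLs, at most HARD_LIMIT entries
    for url in urls:
        print(url)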

View file

@@ -13,13 +13,6 @@
             "query_pattern":"?q={}"
         }
     },
-    {
-        "searchengine": {
-            "name":"Duckduckgo Lite",
-            "url":"https://duckduckgo.com/lite/",
-            "query_pattern":"?q={}"
-        }
-    },
     {
         "searchengine": {
             "name":"Qwant",

View file

@@ -17,7 +17,7 @@
     },
     {
         "name":"paris-luttes info",
-        "url":"https://paris-luttes.info/",
+        "url":"https//paris-luttes.info/",
         "keywords": [
             {"keyword":"manifestations"},
             {"keyword":"solidarité immigré·e·s"},

View file

@@ -1,16 +0,0 @@
-from django.core.management.base import BaseCommand
-from profiles import models as profiles
-from histories.models import generate_history
-from datetime import datetime
-
-
-class Command(BaseCommand):
-    ''' Generates an history and prints the related XML '''
-
-    def add_arguments(self, parser):
-        pass
-
-    def handle(self, *args, **kwargs):
-        prof = profiles.Profile.objects.all()[0]
-        history = generate_history(prof, datetime.now())
-        print(history.to_xml_string())

View file

@@ -5,12 +5,11 @@ interests, keywords...
 from collections import namedtuple
 import random
-import asyncio
 from math import floor
+from queue import Queue
 from xml.etree import ElementTree as ET
 from datetime import datetime
 
 from django.db import models
-from django.core.exceptions import ValidationError
 import profiles.models as profiles
 from crawl import crawl
 from pinocchio.settings import HISTORY_MIN
@@ -44,9 +43,9 @@ class HistoryEntry(models.Model):
     def to_xml(self, xml_root):
         entry = ET.Element('history')
         entry_url = ET.Element('url')
-        entry_url.text = str(self.search)
+        entry_url.text = self.search
         entry_ts = ET.Element('timestamp')
-        entry_ts.text = str(self.timestamp.timestamp())
+        entry_ts.text = self.timestamp.timestamp()
         entry.append(entry_url)
         entry.append(entry_ts)
         xml_root.append(entry)
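The str() wrappers on the '-' lines are not cosmetic: ElementTree accepts any object as .text at assignment time but refuses to serialize non-string values, so the '+' variant only fails later, when the history is dumped. A quick illustration:

    from xml.etree import ElementTree as ET

    entry_ts = ET.Element('timestamp')
    entry_ts.text = 1520000000.25          # accepted silently here...
    try:
        ET.tostring(entry_ts)
    except TypeError as err:               # ...rejected at serialization time
        print("cannot dump:", err)

    entry_ts.text = str(1520000000.25)     # what the '-' side does
    print(ET.tostring(entry_ts))           # b'<timestamp>1520000000.25</timestamp>'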
@@ -102,48 +101,28 @@ class History(models.Model):
     def __str__(self):
         """ Returns the string representation of a history.
         """
-        entries = self.historyentry_set.order_by('timestamp')
-        output = "[History]:\n"
-        for entry in entries:
-            output += str(entry) + '\n'
-        return output
-
-    async def _handler(self):
-        runner = await TorInstance.create(self.return_history(), self.user.browser_fingerprint.serialize_headers())
-        await runner.run()
-        self.played = True
-        self.save()
+        history_set = self.historyentry_set.order_by('timestamp')
+        header = "[History]:\n"
+        return header + "\n".join(history_set)
 
     def play_histories(self):
         """ Actually plays the history.
         """
-        loop = asyncio.new_event_loop()
-        asyncio.set_event_loop(loop)
-        loop.run_until_complete(asyncio.wait([self._handler()]))
+        self.played = True
+        runner = TorInstance(self.history)
+        self.save()
 
-    def to_xml(self, xml_root=None):
+    def to_xml(self, xml_root):
         ''' Exports the current history to xml '''
-        standalone = False
-        if xml_root is None:
-            standalone = True
-            xml_root = ET.Element('root')
         hist_node = ET.Element("history", attrib={
-            'start-ts': str(self.start_ts),
-            'played': '1' if self.played else '0',
-            'user': str(self.user.pk),
+            'start-ts': self.start_ts,
+            'played': 1 if self.played else 0,
+            'user': self.user.pk,
         })
         xml_root.append(hist_node)
-        for entry in self.historyentry_set.all():
+        for entry in self.historyentry_set:
             entry.to_xml(hist_node)
-        if standalone:
-            return xml_root
-
-    def to_xml_string(self):
-        xml = self.to_xml()
-        return ET.tostring(xml)
 
     @staticmethod
     def from_xml(xml_root):
         ''' Loads an history from an XML file '''
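The standalone branch added on the '-' side is what lets to_xml_string() work: when no parent element is supplied, to_xml() fabricates its own root and returns it so the result can be fed to ET.tostring(). The pattern in isolation:

    from xml.etree import ElementTree as ET

    def to_xml(xml_root=None):
        standalone = xml_root is None
        if standalone:
            xml_root = ET.Element('root')      # throwaway root for standalone export
        xml_root.append(ET.Element('history'))
        if standalone:
            return xml_root

    print(ET.tostring(to_xml()))               # b'<root><history /></root>'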
@@ -187,21 +166,15 @@ def generate_partial_history(user, t_start):
     timestamp = t_start
     result = []
     basis = generate_first_url(user)
-    result.append(PartialHistoryEntry(basis, timestamp))
     t_start += 5 * random.weibullvariate(1, 1.5)
-    crawler = crawl.CrawlingThread(basis)
+    queue = Queue()
+    crawler = crawl.CrawlingThread(basis, queue)
     crawler.start()
     crawler.join()
-    urls_tree = crawler.output_tree
-
-    open_time = {}
-    for elem in urls_tree:
-        url, parent = elem.url, elem.parent
-        timestamp = 0
-        if parent is None:
-            timestamp = t_start
-        else:
-            timestamp = open_time[parent] + 5 * random.weibullvariate(1, 1.5)
-        open_time[elem] = timestamp
+    urls = queue.get()
+    for url in urls:
+        t_start += 5 * random.weibullvariate(1, 1.5)
         result.append(PartialHistoryEntry(url, timestamp))
     return result
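Both sides space visits with Weibull-distributed think times (5 * random.weibullvariate(1, k)); the '-' side additionally anchors each page on the moment its parent was opened, so a branch of the crawl tree reads like someone following links rather than a flat list. A toy version of that bookkeeping, with a made-up tree:

    import random

    # toy crawl tree as (url, parent) pairs; the start page has no parent
    tree = [("a", None), ("b", "a"), ("c", "a"), ("d", "b")]

    t_start = 1_520_000_000.0
    open_time = {}
    for url, parent in tree:
        if parent is None:
            open_time[url] = t_start
        else:
            open_time[url] = open_time[parent] + 5 * random.weibullvariate(1, 1.5)

    for url, ts in sorted(open_time.items(), key=lambda kv: kv[1]):
        print(url, ts)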
@@ -250,27 +223,22 @@ def generate_history(user, start_time):
     history.full_clean()
     history.save()
 
+    history_line = 0
     current_timestamp = start_time.timestamp()
 
-    hist_size = 0
-    while hist_size < length:
+    while history_line < length:
         current_timestamp += 5 * random.weibullvariate(1, 2.8)
         history_list = generate_partial_history(user, current_timestamp)
         current_timestamp = \
             history_list[-1].timestamp + 5 * random.weibullvariate(1, 5)
         for (url, timestamp) in history_list:
-            if len(url) < 200:
-                new_line = HistoryEntry(
-                    search=url,
-                    timestamp=datetime.fromtimestamp(timestamp),
-                    history=history
-                )
-                try:
-                    new_line.full_clean()
-                    new_line.save()
-                    hist_size += 1
-                except ValidationError:
-                    continue
+            new_line = HistoryEntry(
+                search=url,
+                timestamp=datetime.fromtimestamp(timestamp),
+                history=history
+            )
+            new_line.full_clean()
+            new_line.save()
 
     return history
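The try/except on the '-' lines exists because Model.save() does not run validation on its own: full_clean() is what raises ValidationError (for instance when a crawled URL exceeds the field's max_length), and catching it lets one bad entry be dropped without losing the rest of the history. Schematically, with the model and variables of the hunk above, so this is a fragment rather than a standalone script:

    from django.core.exceptions import ValidationError

    kept = 0
    for (url, timestamp) in history_list:        # from generate_partial_history
        new_line = HistoryEntry(search=url,
                                timestamp=datetime.fromtimestamp(timestamp),
                                history=history)
        try:
            new_line.full_clean()                # validates max_length, required fields, ...
            new_line.save()
            kept += 1                            # only count entries that were stored
        except ValidationError:
            continue                             # skip the invalid entry, keep the rest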

View file

@@ -58,9 +58,7 @@ class TorInstance():
     async def run(self):
         """ Runs the Tor Instance on the history.
         """
-        while (self.history) and (dt.datetime.combine(self.history[0][1],
-                                                      dt.datetime.min.time()) -
-                                  dt.datetime.now()).total_seconds() >= 10:
+        while (self.history[0][1] - dt.datetime.now()).total_seconds >= 10:
             print("Sleeping")
             sleep(10)
         while self.history:
@@ -68,9 +66,8 @@ class TorInstance():
             async with async_timeout.timeout(30):
                 await(self.query(item[0]))
             now = dt.datetime.now()
-            print(self.history[0])
-            if now <= dt.datetime.combine(self.history[0][1], dt.datetime.min.time()):
-                sleep((dt.datetime.combine(self.history[0][1], dt.datetime.min.time()) - now).total_seconds())
+            if now <= self.history[0][1]:
+                sleep((self.history[0][1] - now).total_seconds())
 
     def create_session(self):
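Two separate fixes sit on the '-' lines of these two hunks: total_seconds is actually called (the '+' side compares the bound method itself against 10, which raises TypeError under Python 3), and the stored value, apparently a datetime.date, is promoted with datetime.combine() before being subtracted from datetime.now(), since date and datetime cannot be mixed in arithmetic. An illustration of the second point, with a placeholder date:

    import datetime as dt

    visit_day = dt.date(2018, 3, 1)        # stands in for self.history[0][1]
    now = dt.datetime.now()

    # visit_day - now                      # TypeError: date and datetime don't mix
    visit_dt = dt.datetime.combine(visit_day, dt.datetime.min.time())
    print((visit_dt - now).total_seconds())   # note the call: the () matters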

View file

@@ -97,7 +97,7 @@ USE_I18N = True
 USE_L10N = True
 
-USE_TZ = False  # We don't really care, we want POSIX timestamps
+USE_TZ = True
 
 # Static files (CSS, JavaScript, Images)
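The dropped comment explains the '-' choice: the history generator works with naive datetimes and POSIX timestamps (datetime.fromtimestamp, .timestamp()), while with USE_TZ = True Django expects timezone-aware values, warns when naive ones are stored, and comparisons between the two kinds fail outright. For example:

    import datetime as dt

    naive = dt.datetime.fromtimestamp(1520000000)     # what the generator produces
    aware = dt.datetime.now(dt.timezone.utc)          # what USE_TZ = True expects
    try:
        naive < aware
    except TypeError as err:                          # mixing the two kinds fails
        print(err)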

View file

@@ -1,27 +0,0 @@
-from django.core.management.base import BaseCommand
-from profiles.models_rdf import RdfProfile
-from profiles import models
-
-
-class Command(BaseCommand):
-    ''' Exports database models to RDF '''
-
-    def add_arguments(self, parser):
-        pass
-
-    def handle(self, *args, **kwargs):
-        exported_models = [
-            models.Keyword,
-            models.Webpage,
-            models.Website,
-            models.Place,
-            models.Event,
-            models.BrowserFingerprint,
-            models.SearchEngine,
-            models.Interest,
-            models.Profile,
-        ]
-
-        output_xml = RdfProfile().serialize(
-            # models=exported_models,
-        )
-
-        self.stdout.write(output_xml)

View file

@@ -5,7 +5,6 @@ import json
 from datetime import datetime
 
 from django.core.management.base import BaseCommand
 from django.db import models
-from django.core.exceptions import ObjectDoesNotExist
 
 from profiles.models import Keyword, Interest, Place, Website, Event
@@ -20,14 +19,15 @@ def import_interest(_interest):
     places = []
     websites = []
     for keyword in _interest.get("keywords", []):
-        try:
-            stored = Keyword.objects.get(text=keyword["keyword"])
-            keywords.append(stored)
-        except ObjectDoesNotExist:
-            new_keyword = Keyword(text=keyword["keyword"])
-            new_keyword.save()
-            keywords.append(new_keyword)
-            print("New keyword %s" % new_keyword)
+        if not Keyword.objects.get(keyword["keyword"]):
+            keywords.append(
+                Keyword(
+                    text=keyword["keyword"]
+                )
+            )
+            print("New keyword %s" % new_keywords)
+        else:
+            keywords.append(Keyword.objects.get(text=keyword["keyword"]))
     for place in _interest.get("places", []):
         places.append(Place.objects.get(name=place["place"]))
     for website in _interest.get("websites", []):
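The '+' condition cannot work as written: Keyword.objects.get(keyword["keyword"]) passes a bare string where a lookup such as text=... is expected, get() raises DoesNotExist rather than returning something falsy, and new_keywords is never defined, which is why the '-' side switches to try/except ObjectDoesNotExist. Django's get_or_create covers the same need in one call; a sketch of that alternative, not what either revision does:

    for keyword in _interest.get("keywords", []):
        keyword_obj, created = Keyword.objects.get_or_create(text=keyword["keyword"])
        if created:
            print("New keyword %s" % keyword_obj)   # first time this text is seen
        keywords.append(keyword_obj)                # keywords: list built above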
@@ -36,9 +36,7 @@ def import_interest(_interest):
     interest = Interest(
         name=_interest.get("name", ""),
     )
-    interest.save()
     for keyword in keywords:
-        print(keyword)
         interest.keywords.add(keyword)
     for place in places:
         interest.places.add(place)
@@ -48,4 +46,4 @@ def import_interest(_interest):
 class Command(BaseCommand):
     def handle(self, *args, **kwargs):
-        import_file("data/interests.json")
+        import_file("data/events.json")

View file

@@ -91,13 +91,13 @@ class Website(models.Model):
         """ Generates the url in case the interest chosen is a website.
         """
         rand = random.random()
-        if user.uses_urls:
+        if user.uses_url:
             url = self.url
         elif rand <= 0.1:
-            url = random.choice(self.notable_pages.all()).url
+            url = random.choice(self.notable_pages).url
         elif rand <= 0.8:
             search_term_text = self.name + " " + \
-                str(random.choice(self.keywords.all()))
+                random.choice(self.keywords)
             url = user.search_engine.search_url(search_term_text)
         else:
             url = user.search_engine.search_url(self.name)
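The .all() calls on the '-' lines are what make random.choice() valid here: a many-to-many manager such as self.notable_pages or self.keywords supports neither len() nor indexing, while the QuerySet returned by .all() supports both, and the added str() is needed before concatenating the chosen Keyword into a search string. Roughly, as a fragment of the same method rather than standalone code:

    import random

    # random.choice(self.keywords)                   # TypeError: a manager has no len()
    keyword = random.choice(self.keywords.all())     # QuerySet: len() and indexing work
    search_term_text = self.name + " " + str(keyword)  # str() renders the Keyword for concatenation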
@@ -260,6 +260,4 @@ def create_profile(nick=None):
     profile.full_clean()
     profile.save()
-    profile.interests.add(random.choice(Interest.objects.all()))
-    profile.save()
 
     return profile

View file

@@ -1,131 +0,0 @@
-""" RDF serialization class for profile models """
-
-import rdfserializer as rdf
-from rdfserializer import RDFModelSerialiser as RDFModelSerializer
-# ^ This was hurting my eyes way too much
-from rdfserializer import SCHEMA as schema
-from rdflib.namespace import Namespace
-
-import profiles.models as profile_models
-
-LOCAL_NS = Namespace('local:')
-
-
-class RdfWebpage(RDFModelSerializer):
-    """ RDF serializer for Webpage """
-    _type = schema.WebPage
-    model = profile_models.Webpage
-
-    entries = [
-        rdf.RDFSimpleField(schema.url, 'url'),
-    ]
-
-
-class RdfWebsite(RDFModelSerializer):
-    """ RDF serializer for Website """
-    _type = schema.WebSite
-    model = profile_models.Website
-
-    entries = [
-        rdf.RDFSimpleField(schema.name, 'name'),
-        rdf.RDFSimpleField(schema.url, 'url'),
-        rdf.RDFManyField(schema.keywords, 'keywords',
-                         lambda keyword: keyword.text),
-        rdf.RDFManyLinker(schema.hasPart, 'notable_pages', RdfWebpage),
-    ]
-
-
-class RdfPlace(RDFModelSerializer):
-    """ RDF serializer for Place """
-    _type = schema.Place
-    model = profile_models.Place
-
-    entries = [
-        rdf.RDFSimpleField(schema.name, 'name'),
-        rdf.RDFSimpleField(schema.address, 'address'),
-        rdf.RDFSimpleField(schema.latitude, 'lat'),
-        rdf.RDFSimpleField(schema.longitude, 'lon'),
-    ]
-
-
-class RdfEvent(RDFModelSerializer):
-    """ RDF serializer for Event """
-    _type = schema.Event
-    model = profile_models.Event
-
-    entries = [
-        rdf.RDFSimpleField(schema.name, 'name'),
-        rdf.RDFSimpleField(schema.startDate, 'date'),
-        rdf.RDFLeftBinder(schema.location, 'place', RdfPlace),
-    ]
-
-
-class RdfBrowserFingerprint(RDFModelSerializer):
-    """ RDF serializer for BrowserFingerprint """
-    _type = schema.Intangible
-    model = profile_models.BrowserFingerprint
-
-    entries = [
-        rdf.RDFSimpleField(schema.description, 'description'),
-        rdf.RDFSimpleField(LOCAL_NS.useragent, 'useragent'),
-        rdf.RDFSimpleField(LOCAL_NS.appname, 'appname'),
-        rdf.RDFSimpleField(LOCAL_NS.appversion, 'appversion'),
-        rdf.RDFSimpleField(LOCAL_NS.platform, 'platform'),
-        rdf.RDFSimpleField(LOCAL_NS.vendor, 'vendor'),
-        rdf.RDFSimpleField(LOCAL_NS.vendorsub, 'vendorsub'),
-        rdf.RDFSimpleField(LOCAL_NS.buildID, 'buildID'),
-        rdf.RDFSimpleField(LOCAL_NS.oscpu, 'oscpu'),
-        rdf.RDFSimpleField(LOCAL_NS.accept_encoding, 'accept_encoding'),
-        rdf.RDFSimpleField(LOCAL_NS.accept_default, 'accept_default'),
-        rdf.RDFSimpleField(LOCAL_NS.accept_lang, 'accept_lang'),
-        rdf.RDFSimpleField(LOCAL_NS.pixeldepth, 'pixeldepth'),
-        rdf.RDFSimpleField(LOCAL_NS.colordepth, 'colordepth'),
-        rdf.RDFSimpleField(LOCAL_NS.screens, 'screens'),
-    ]
-
-
-class RdfSearchEngine(RDFModelSerializer):
-    """ RDF serializer for SearchEngine """
-    _type = schema.WebSite
-    model = profile_models.SearchEngine
-
-    entries = [
-        rdf.RDFSimpleField(schema.url, 'url'),
-        rdf.RDFSimpleField(schema.name, 'name'),
-        rdf.RDFSimpleField(LOCAL_NS.query_pattern, 'query_pattern'),
-    ]
-
-
-class RdfInterest(RDFModelSerializer):
-    """ RDF serializer for Interest """
-    Interesttype = 'interest'
-    model = profile_models.Interest
-
-    entries = [
-        rdf.RDFSimpleField(schema.name, 'name'),
-        rdf.RDFManyField(schema.keywords, 'keywords',
-                         lambda keyword: keyword.text),
-        rdf.RDFManyLinker(schema.location, 'places', RdfPlace),
-        rdf.RDFManyLinker(schema.website, 'websites', RdfWebsite),
-        rdf.RDFManyLinker(schema.event, 'events', RdfEvent),
-    ]
-
-
-class RdfProfile(RDFModelSerializer):
-    """ RDF serializer for Profile """
-    _type = schema.Person
-    model = profile_models.Profile
-
-    entries = [
-        rdf.RDFSimpleField(LOCAL_NS.nickname, 'nick'),
-        rdf.RDFSimpleField(schema.given_name, 'first_name'),
-        rdf.RDFSimpleField(schema.family_name, 'last_name'),
-        rdf.RDFSimpleField(schema.email, 'email'),
-        rdf.RDFSimpleField(LOCAL_NS.uses_urls, 'uses_urls'),
-        rdf.RDFManyLinker(LOCAL_NS.interest, 'interests', RdfInterest),
-        rdf.RDFLeftBinder(LOCAL_NS.search_engine, 'search_engine',
-                          RdfSearchEngine),
-        rdf.RDFLeftBinder(LOCAL_NS.browser_fingerprint, 'browser_fingerprint',
-                          RdfBrowserFingerprint)
-    ]

View file

@@ -14,6 +14,3 @@ yarl==1.1.1
 beautifulsoup4==4.6.0
 stem==1.6.0
 pycurl==7.43.0.1
-rdflib==4.2.2
-git+https://github.com/tobast/RDFSerializer.git
-aiosocks==0.2.6