Compare commits

9 Commits

14 changed files with 102 additions and 362 deletions

.gitignore vendored
View File

@@ -65,4 +65,3 @@ venv/
# Django stuff
db.sqlite3
_vimrc_local.vim

View File

@@ -1,6 +1,3 @@
# mpri-webdam
Generate realistic fake browsing histories for borderline and/or activist
users, to hide real traffic from global surveillance.
Lacks proper documentation at the moment `:(`
Generate a whole lot of fake histories. Because this course has to be validated, after all.

View File

@@ -1,4 +1,5 @@
from threading import Thread
from queue import Queue
from urllib.robotparser import RobotFileParser
from urllib.error import URLError
from urllib.parse import urlparse
@@ -73,7 +74,7 @@ def url_getter(html, current_page, root_url):
# Works only with python >= 3.6
links_list = list(dict.fromkeys(links_list))
forbidden_words = ['login', 'agreement', 'mailto', 'settings']
forbidden_words = ['login', 'agreement', 'mailto']
links_list = [link for link in links_list if not any(word in link.lower()
for word in
forbidden_words)]
@@ -174,7 +175,7 @@ class CrawlingThread(Thread):
""" A separate thread for the crawling task. This is needed to use asyncio,
since the thread will need its own event loop. """
def __init__(self, url):
def __init__(self, url, queue):
engine_list = [engine.url for engine in SearchEngine.objects.all()]
WebsiteScheduler.search_engines = engine_list
@@ -183,7 +184,7 @@
randint(0, nb_fingerprint - 1)]
self.headers = fingerprint.serialize_headers()
self.output_tree = []
self.queue = queue
super(CrawlingThread, self).__init__()
self.url = url
@@ -192,14 +193,12 @@
#tasks.append(async_crawler("http://plus.google.com/+Python"))
#tasks.append(async_crawler('https://python.org/'))
tasks.append(run_crawl(self.url, self.output_tree, self.headers))
tasks.append(async_crawler(self.url, self.queue, self.headers))
try:
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
loop.run_until_complete(asyncio.wait(tasks))
finally:
loop.close()
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
loop.run_until_complete(asyncio.wait(tasks))
loop.close()
class PageGetter:
@@ -225,6 +224,7 @@ class PageGetter:
scheduler.fetching()
async with async_timeout.timeout(10):
async with self.session.get(self.url, verify_ssl=ssl) as resp:
print("Resp status %s" % resp.status)
try:
return await resp.text()
except UnicodeDecodeError:
@@ -234,8 +234,7 @@ class PageGetter:
async def async_print(url):
""" Debug function to follow what's actually happening """
async with aiohttp.ClientSession() as session:
html = await PageGetter(session, url,
settings.USER_AGENT).get(ssl=False)
html = await PageGetter(session, url).get(ssl=False)
print('GOT {}HTML for {}'.format(
'None ' if html is None else '',
@@ -243,80 +242,48 @@ async def async_print(url):
))
class CrawlElem:
''' Describes a crawled element, to be assembled into a tree '''
def __init__(self, url, parent):
self.url = url
self.parent = parent
async def run_crawl(url, output_tree, headers=None):
''' Starts a crawling session '''
async def async_crawler(url, queue, headers=None):
if headers is None:
headers = {}
if 'User-Agent' not in headers:
headers['User-Agent'] = settings.USER_AGENT
headers = {
'User-Agent': settings.USER_AGENT,
}
user_agent = headers['User-Agent']
crawled = set()
queued = [url]
crawled = []
while queued and (len(crawled) < HARD_LIMIT):
async with aiohttp.ClientSession(headers=headers) as session:
try:
url = queued.pop(0)
except IndexError:
print("Error queue is empty")
return crawled
parsed_url = urlparse(url)
print("Crawling {}".format(url))
html = await PageGetter(session, url).get(ssl=False)
if html:
new_urls = url_getter(
html,
url,
parsed_url.scheme + "://" + parsed_url.netloc
)
crawled += [url]
if new_urls:
sampled = sample(
new_urls,
randrange(min(MAX_PER_PAGE, len(new_urls)))
)
queued += [sample_url for sample_url in sampled if
sample_url not in queued and sample_url not in
crawled]
else:
print("No html received")
print(crawled)
queue.put(crawled)
async with aiohttp.ClientSession(headers=headers) as session:
await async_crawler(
url, output_tree, crawled, user_agent, session, None)
def simplify_url(url):
anchor = url.find('#')
if anchor >= 0:
url = url[:anchor]
prot = url.find('://')
if prot >= 0:
url = url[prot+3:]
if url.startswith('www.'):
url = url[4:]
return url
async def async_crawler(url, out_tree, crawled, user_agent, session, parent):
if len(crawled) >= HARD_LIMIT:
return
crawled.add(simplify_url(url))
parsed_url = urlparse(url)
print("Crawling {}".format(url))
try:
with async_timeout.timeout(3):
html = await PageGetter(session, url, user_agent).get(ssl=False)
except asyncio.TimeoutError:
return
new_tasks = []
if html:
this_elem = CrawlElem(url, parent)
out_tree.append(this_elem)
new_urls = url_getter(
html,
url,
parsed_url.scheme + "://" + parsed_url.netloc
)
if new_urls:
sampled = sample(
new_urls,
randrange(min(MAX_PER_PAGE, len(new_urls)))
)
for sample_url in sampled:
if simplify_url(sample_url) not in crawled:
new_tasks.append(async_crawler(
sample_url, out_tree, crawled, user_agent, session,
this_elem))
else:
print("No html received")
if len(crawled) >= HARD_LIMIT:
return
if new_tasks:
await asyncio.wait(new_tasks)
if __name__ == '__main__':
queue = Queue()
crawl = CrawlingThread(None,
"https://google.com/search?q=fabriquer+masque+manif",
["https://google.com/search/"], queue)
crawl.start()
crawl.join()
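For reference, a minimal sketch of how the reworked crawler is meant to be driven after this change (to be run inside the Django project; the import path is assumed from the project layout): the thread is handed a standard-library Queue and put()s the final list of crawled URLs onto it once it finishes.

from queue import Queue
from crawl.crawl import CrawlingThread   # assumed import path

queue = Queue()
crawler = CrawlingThread("https://example.org/", queue)
crawler.start()
crawler.join()                # wait for the crawling thread to finish
crawled_urls = queue.get()    # async_crawler put()s the full URL list at the end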

View File

@@ -13,13 +13,6 @@
"query_pattern":"?q={}"
}
},
{
"searchengine": {
"name":"Duckduckgo Lite",
"url":"https://duckduckgo.com/lite/",
"query_pattern":"?q={}"
}
},
{
"searchengine": {
"name":"Qwant",

View File

@@ -17,7 +17,7 @@
},
{
"name":"paris-luttes info",
"url":"https://paris-luttes.info/",
"url":"https//paris-luttes.info/",
"keywords": [
{"keyword":"manifestations"},
{"keyword":"solidarité immigré·e·s"},

View File

@@ -1,16 +0,0 @@
from django.core.management.base import BaseCommand
from profiles import models as profiles
from histories.models import generate_history
from datetime import datetime
class Command(BaseCommand):
''' Generates an history and prints the related XML '''
def add_arguments(self, parser):
pass
def handle(self, *args, **kwargs):
prof = profiles.Profile.objects.all()[0]
history = generate_history(prof, datetime.now())
print(history.to_xml_string())

View File

@@ -5,12 +5,11 @@ interests, keywords...
from collections import namedtuple
import random
import asyncio
from math import floor
from queue import Queue
from xml.etree import ElementTree as ET
from datetime import datetime
from django.db import models
from django.core.exceptions import ValidationError
import profiles.models as profiles
from crawl import crawl
from pinocchio.settings import HISTORY_MIN
@@ -44,9 +43,9 @@ class HistoryEntry(models.Model):
def to_xml(self, xml_root):
entry = ET.Element('history')
entry_url = ET.Element('url')
entry_url.text = str(self.search)
entry_url.text = self.search
entry_ts = ET.Element('timestamp')
entry_ts.text = str(self.timestamp.timestamp())
entry_ts.text = self.timestamp.timestamp()
entry.append(entry_url)
entry.append(entry_ts)
xml_root.append(entry)
@@ -102,48 +101,28 @@ class History(models.Model):
def __str__(self):
""" Returns the string representation of a history.
"""
entries = self.historyentry_set.order_by('timestamp')
output = "[History]:\n"
for entry in entries:
output += str(entry) + '\n'
return output
async def _handler(self):
runner = await TorInstance.create(self.return_history(), self.user.browser_fingerprint.serialize_headers())
await runner.run()
self.played = True
self.save()
history_set = self.historyentry_set.order_by('timestamp')
header = "[History]:\n"
return header + "\n".join(history_set)
def play_histories(self):
""" Actually plays the history.
"""
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
loop.run_until_complete(asyncio.wait([self._handler()]))
self.played = True
runner = TorInstance(self.history)
self.save()
def to_xml(self, xml_root=None):
def to_xml(self, xml_root):
''' Exports the current history to xml '''
standalone = False
if xml_root is None:
standalone = True
xml_root = ET.Element('root')
hist_node = ET.Element("history", attrib={
'start-ts': str(self.start_ts),
'played': '1' if self.played else '0',
'user': str(self.user.pk),
'start-ts': self.start_ts,
'played': 1 if self.played else 0,
'user': self.user.pk,
})
xml_root.append(hist_node)
for entry in self.historyentry_set.all():
for entry in self.historyentry_set:
entry.to_xml(hist_node)
if standalone:
return xml_root
def to_xml_string(self):
xml = self.to_xml()
return ET.tostring(xml)
@staticmethod
def from_xml(xml_root):
''' Loads an history from an XML file '''
@@ -187,21 +166,15 @@ def generate_partial_history(user, t_start):
timestamp = t_start
result = []
basis = generate_first_url(user)
result.append(PartialHistoryEntry(basis, timestamp))
t_start += 5 * random.weibullvariate(1, 1.5)
crawler = crawl.CrawlingThread(basis)
queue = Queue()
crawler = crawl.CrawlingThread(basis, queue)
crawler.start()
crawler.join()
urls_tree = crawler.output_tree
open_time = {}
for elem in urls_tree:
url, parent = elem.url, elem.parent
timestamp = 0
if parent is None:
timestamp = t_start
else:
timestamp = open_time[parent] + 5 * random.weibullvariate(1, 1.5)
open_time[elem] = timestamp
urls = queue.get()
for url in urls:
t_start += 5 * random.weibullvariate(1, 1.5)
result.append(PartialHistoryEntry(url, timestamp))
return result
@@ -250,27 +223,22 @@ def generate_history(user, start_time):
history.full_clean()
history.save()
history_line = 0
current_timestamp = start_time.timestamp()
hist_size = 0
while hist_size < length:
while history_line < length:
current_timestamp += 5 * random.weibullvariate(1, 2.8)
history_list = generate_partial_history(user, current_timestamp)
current_timestamp = \
history_list[-1].timestamp + 5 * random.weibullvariate(1, 5)
for (url, timestamp) in history_list:
if len(url) < 200:
new_line = HistoryEntry(
search=url,
timestamp=datetime.fromtimestamp(timestamp),
history=history
)
try:
new_line.full_clean()
new_line.save()
hist_size += 1
except ValidationError:
continue
new_line = HistoryEntry(
search=url,
timestamp=datetime.fromtimestamp(timestamp),
history=history
)
new_line.full_clean()
new_line.save()
return history
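Side note on the to_xml hunks above: ElementTree only serialises string .text values, which is presumably what the str(...) wrappers on the other side of this diff are for. A minimal standalone sketch with made-up values:

from xml.etree import ElementTree as ET
from datetime import datetime

entry = ET.Element('history')
entry_url = ET.Element('url')
entry_url.text = "https://example.org/"            # .text must be a str
entry_ts = ET.Element('timestamp')
entry_ts.text = str(datetime.now().timestamp())    # a bare float here makes ET.tostring raise TypeError
entry.append(entry_url)
entry.append(entry_ts)
print(ET.tostring(entry).decode())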

View File

@@ -58,9 +58,7 @@ class TorInstance():
async def run(self):
""" Runs the Tor Instance on the history.
"""
while (self.history) and (dt.datetime.combine(self.history[0][1],
dt.datetime.min.time()) -
dt.datetime.now()).total_seconds() >= 10:
while (self.history[0][1] - dt.datetime.now()).total_seconds >= 10:
print("Sleeping")
sleep(10)
while self.history:
@@ -68,9 +66,8 @@ class TorInstance():
async with async_timeout.timeout(30):
await(self.query(item[0]))
now = dt.datetime.now()
print(self.history[0])
if now <= dt.datetime.combine(self.history[0][1], dt.datetime.min.time()):
sleep((dt.datetime.combine(self.history[0][1], dt.datetime.min.time()) - now).total_seconds())
if now <= self.history[0][1]:
sleep((self.history[0][1] - now).total_seconds())
def create_session(self):
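For context on the two hunks above: one side stores history timestamps as plain dates and lifts them to datetimes with datetime.combine before subtracting, while the other compares datetimes directly (note that total_seconds is a method, so it needs call parentheses). A minimal sketch of the combine idiom, with a made-up date:

import datetime as dt

visit_date = dt.date(2018, 4, 2)                                     # date-only value
visit_dt = dt.datetime.combine(visit_date, dt.datetime.min.time())   # midnight of that day
seconds_until = (visit_dt - dt.datetime.now()).total_seconds()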

View File

@@ -97,7 +97,7 @@ USE_I18N = True
USE_L10N = True
USE_TZ = False # We don't really care, we want POSIX timestamps
USE_TZ = True
# Static files (CSS, JavaScript, Images)

View File

@@ -1,27 +0,0 @@
from django.core.management.base import BaseCommand
from profiles.models_rdf import RdfProfile
from profiles import models
class Command(BaseCommand):
''' Exports database models to RDF '''
def add_arguments(self, parser):
pass
def handle(self, *args, **kwargs):
exported_models = [
models.Keyword,
models.Webpage,
models.Website,
models.Place,
models.Event,
models.BrowserFingerprint,
models.SearchEngine,
models.Interest,
models.Profile,
]
output_xml = RdfProfile().serialize(
# models=exported_models,
)
self.stdout.write(output_xml)

View File

@@ -5,7 +5,6 @@ import json
from datetime import datetime
from django.core.management.base import BaseCommand
from django.db import models
from django.core.exceptions import ObjectDoesNotExist
from profiles.models import Keyword, Interest, Place, Website, Event
def import_file(filename):
@@ -20,14 +19,15 @@ def import_interest(_interest):
places = []
websites = []
for keyword in _interest.get("keywords", []):
try:
stored = Keyword.objects.get(text=keyword["keyword"])
keywords.append(stored)
except ObjectDoesNotExist:
new_keyword = Keyword(text=keyword["keyword"])
new_keyword.save()
keywords.append(new_keyword)
print("New keyword %s" % new_keyword)
if not Keyword.objects.get(keyword["keyword"]):
keywords.append(
Keyword(
text=keyword["keyword"]
)
)
print("New keyword %s" % new_keywords)
else:
keywords.append(Keyword.objects.get(text=keyword["keyword"]))
for place in _interest.get("places", []):
places.append(Place.objects.get(name=place["place"]))
for website in _interest.get("websites", []):
@@ -36,9 +36,7 @@ def import_interest(_interest):
interest = Interest(
name=_interest.get("name", ""),
)
interest.save()
for keyword in keywords:
print(keyword)
interest.keywords.add(keyword)
for place in places:
interest.places.add(place)
@@ -48,4 +46,4 @@ def import_interest(_interest):
class Command(BaseCommand):
def handle(self, *args, **kwargs):
import_file("data/interests.json")
import_file("data/events.json")

View File

@@ -91,13 +91,13 @@ class Website(models.Model):
""" Generates the url in case the interest chosen is a website.
"""
rand = random.random()
if user.uses_urls:
if user.uses_url:
url = self.url
elif rand <= 0.1:
url = random.choice(self.notable_pages.all()).url
url = random.choice(self.notable_pages).url
elif rand <= 0.8:
search_term_text = self.name + " " + \
str(random.choice(self.keywords.all()))
random.choice(self.keywords)
url = user.search_engine.search_url(search_term_text)
else:
url = user.search_engine.search_url(self.name)
@@ -260,6 +260,4 @@ def create_profile(nick=None):
profile.full_clean()
profile.save()
profile.interests.add(random.choice(Interest.objects.all()))
profile.save()
return profile
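One detail worth noting about the Website hunk: random.choice needs a sequence, so it works on a QuerySet (which supports len() and indexing) but not on a bare related manager, which is presumably why the .all() calls appear on one side of the diff. A minimal standalone sketch:

import random

pages = ["https://example.org/a", "https://example.org/b"]   # stand-in for notable_pages.all()
print(random.choice(pages))
# random.choice on a bare related manager would raise TypeError, since a
# manager has no __len__/__getitem__; hence the .all() calls above.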

View File

@@ -1,131 +0,0 @@
""" RDF serialization class for profile models """
import rdfserializer as rdf
from rdfserializer import RDFModelSerialiser as RDFModelSerializer
# ^ This was hurting my eyes way too much
from rdfserializer import SCHEMA as schema
from rdflib.namespace import Namespace
import profiles.models as profile_models
LOCAL_NS = Namespace('local:')
class RdfWebpage(RDFModelSerializer):
""" RDF serializer for Webpage """
_type = schema.WebPage
model = profile_models.Webpage
entries = [
rdf.RDFSimpleField(schema.url, 'url'),
]
class RdfWebsite(RDFModelSerializer):
""" RDF serializer for Website """
_type = schema.WebSite
model = profile_models.Website
entries = [
rdf.RDFSimpleField(schema.name, 'name'),
rdf.RDFSimpleField(schema.url, 'url'),
rdf.RDFManyField(schema.keywords, 'keywords',
lambda keyword: keyword.text),
rdf.RDFManyLinker(schema.hasPart, 'notable_pages', RdfWebpage),
]
class RdfPlace(RDFModelSerializer):
""" RDF serializer for Place """
_type = schema.Place
model = profile_models.Place
entries = [
rdf.RDFSimpleField(schema.name, 'name'),
rdf.RDFSimpleField(schema.address, 'address'),
rdf.RDFSimpleField(schema.latitude, 'lat'),
rdf.RDFSimpleField(schema.longitude, 'lon'),
]
class RdfEvent(RDFModelSerializer):
""" RDF serializer for Event """
_type = schema.Event
model = profile_models.Event
entries = [
rdf.RDFSimpleField(schema.name, 'name'),
rdf.RDFSimpleField(schema.startDate, 'date'),
rdf.RDFLeftBinder(schema.location, 'place', RdfPlace),
]
class RdfBrowserFingerprint(RDFModelSerializer):
""" RDF serializer for BrowserFingerprint """
_type = schema.Intangible
model = profile_models.BrowserFingerprint
entries = [
rdf.RDFSimpleField(schema.description, 'description'),
rdf.RDFSimpleField(LOCAL_NS.useragent, 'useragent'),
rdf.RDFSimpleField(LOCAL_NS.appname, 'appname'),
rdf.RDFSimpleField(LOCAL_NS.appversion, 'appversion'),
rdf.RDFSimpleField(LOCAL_NS.platform, 'platform'),
rdf.RDFSimpleField(LOCAL_NS.vendor, 'vendor'),
rdf.RDFSimpleField(LOCAL_NS.vendorsub, 'vendorsub'),
rdf.RDFSimpleField(LOCAL_NS.buildID, 'buildID'),
rdf.RDFSimpleField(LOCAL_NS.oscpu, 'oscpu'),
rdf.RDFSimpleField(LOCAL_NS.accept_encoding, 'accept_encoding'),
rdf.RDFSimpleField(LOCAL_NS.accept_default, 'accept_default'),
rdf.RDFSimpleField(LOCAL_NS.accept_lang, 'accept_lang'),
rdf.RDFSimpleField(LOCAL_NS.pixeldepth, 'pixeldepth'),
rdf.RDFSimpleField(LOCAL_NS.colordepth, 'colordepth'),
rdf.RDFSimpleField(LOCAL_NS.screens, 'screens'),
]
class RdfSearchEngine(RDFModelSerializer):
""" RDF serializer for SearchEngine """
_type = schema.WebSite
model = profile_models.SearchEngine
entries = [
rdf.RDFSimpleField(schema.url, 'url'),
rdf.RDFSimpleField(schema.name, 'name'),
rdf.RDFSimpleField(LOCAL_NS.query_pattern, 'query_pattern'),
]
class RdfInterest(RDFModelSerializer):
""" RDF serializer for Interest """
Interesttype = 'interest'
model = profile_models.Interest
entries = [
rdf.RDFSimpleField(schema.name, 'name'),
rdf.RDFManyField(schema.keywords, 'keywords',
lambda keyword: keyword.text),
rdf.RDFManyLinker(schema.location, 'places', RdfPlace),
rdf.RDFManyLinker(schema.website, 'websites', RdfWebsite),
rdf.RDFManyLinker(schema.event, 'events', RdfEvent),
]
class RdfProfile(RDFModelSerializer):
""" RDF serializer for Profile """
_type = schema.Person
model = profile_models.Profile
entries = [
rdf.RDFSimpleField(LOCAL_NS.nickname, 'nick'),
rdf.RDFSimpleField(schema.given_name, 'first_name'),
rdf.RDFSimpleField(schema.family_name, 'last_name'),
rdf.RDFSimpleField(schema.email, 'email'),
rdf.RDFSimpleField(LOCAL_NS.uses_urls, 'uses_urls'),
rdf.RDFManyLinker(LOCAL_NS.interest, 'interests', RdfInterest),
rdf.RDFLeftBinder(LOCAL_NS.search_engine, 'search_engine',
RdfSearchEngine),
rdf.RDFLeftBinder(LOCAL_NS.browser_fingerprint, 'browser_fingerprint',
RdfBrowserFingerprint)
]

View File

@@ -14,6 +14,3 @@ yarl==1.1.1
beautifulsoup4==4.6.0
stem==1.6.0
pycurl==7.43.0.1
rdflib==4.2.2
git+https://github.com/tobast/RDFSerializer.git
aiosocks==0.2.6