Compare commits
9 commits
master
...
histories_
Author | SHA1 | Date | |
---|---|---|---|
Théophile Bastian | 90a6164861 | ||
Théophile Bastian | b7be4f4df4 | ||
Théophile Bastian | a6d7d6b62b | ||
Théophile Bastian | f33820a4dc | ||
Théophile Bastian | 04fcc2b324 | ||
Théophile Bastian | 6e4709ac91 | ||
Théophile Bastian | fd4e1d35c7 | ||
Théophile Bastian | 8f1d69bc41 | ||
Théophile Bastian | 38ccd04d31 |
1
.gitignore
vendored
1
.gitignore
vendored
|
@ -65,4 +65,3 @@ venv/
|
||||||
# Django stuff
|
# Django stuff
|
||||||
db.sqlite3
|
db.sqlite3
|
||||||
|
|
||||||
_vimrc_local.vim
|
|
||||||
|
|
|
@ -1,6 +1,3 @@
|
||||||
# mpri-webdam
|
# mpri-webdam
|
||||||
|
|
||||||
Generate realistic fake browsing histories for borderline and/or activists
|
Générer tout plein de faux historiques. Parce qu'il faut bien valider ce cours.
|
||||||
users, to hide real traffic from global surveillance.
|
|
||||||
|
|
||||||
Lacks proper documentation at the moment `:(`
|
|
||||||
|
|
139
crawl/crawl.py
139
crawl/crawl.py
|
@ -1,4 +1,5 @@
|
||||||
from threading import Thread
|
from threading import Thread
|
||||||
|
from queue import Queue
|
||||||
from urllib.robotparser import RobotFileParser
|
from urllib.robotparser import RobotFileParser
|
||||||
from urllib.error import URLError
|
from urllib.error import URLError
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
|
@ -73,7 +74,7 @@ def url_getter(html, current_page, root_url):
|
||||||
# Works only with python >= 3.6
|
# Works only with python >= 3.6
|
||||||
links_list = list(dict.fromkeys(links_list))
|
links_list = list(dict.fromkeys(links_list))
|
||||||
|
|
||||||
forbidden_words = ['login', 'agreement', 'mailto', 'settings']
|
forbidden_words = ['login', 'agreement', 'mailto']
|
||||||
links_list = [link for link in links_list if not any(word in link.lower()
|
links_list = [link for link in links_list if not any(word in link.lower()
|
||||||
for word in
|
for word in
|
||||||
forbidden_words)]
|
forbidden_words)]
|
||||||
|
@ -174,7 +175,7 @@ class CrawlingThread(Thread):
|
||||||
""" A separate thread for the crawling task. This is needed to use asyncio,
|
""" A separate thread for the crawling task. This is needed to use asyncio,
|
||||||
since the thread will need its own event loop. """
|
since the thread will need its own event loop. """
|
||||||
|
|
||||||
def __init__(self, url):
|
def __init__(self, url, queue):
|
||||||
engine_list = [engine.url for engine in SearchEngine.objects.all()]
|
engine_list = [engine.url for engine in SearchEngine.objects.all()]
|
||||||
WebsiteScheduler.search_engines = engine_list
|
WebsiteScheduler.search_engines = engine_list
|
||||||
|
|
||||||
|
@ -183,7 +184,7 @@ class CrawlingThread(Thread):
|
||||||
randint(0, nb_fingerprint - 1)]
|
randint(0, nb_fingerprint - 1)]
|
||||||
self.headers = fingerprint.serialize_headers()
|
self.headers = fingerprint.serialize_headers()
|
||||||
|
|
||||||
self.output_tree = []
|
self.queue = queue
|
||||||
super(CrawlingThread, self).__init__()
|
super(CrawlingThread, self).__init__()
|
||||||
self.url = url
|
self.url = url
|
||||||
|
|
||||||
|
@ -192,14 +193,12 @@ class CrawlingThread(Thread):
|
||||||
|
|
||||||
#tasks.append(async_crawler("http://plus.google.com/+Python"))
|
#tasks.append(async_crawler("http://plus.google.com/+Python"))
|
||||||
#tasks.append(async_crawler('https://python.org/'))
|
#tasks.append(async_crawler('https://python.org/'))
|
||||||
tasks.append(run_crawl(self.url, self.output_tree, self.headers))
|
tasks.append(async_crawler(self.url, self.queue, self.headers))
|
||||||
|
|
||||||
try:
|
loop = asyncio.new_event_loop()
|
||||||
loop = asyncio.new_event_loop()
|
asyncio.set_event_loop(loop)
|
||||||
asyncio.set_event_loop(loop)
|
loop.run_until_complete(asyncio.wait(tasks))
|
||||||
loop.run_until_complete(asyncio.wait(tasks))
|
loop.close()
|
||||||
finally:
|
|
||||||
loop.close()
|
|
||||||
|
|
||||||
|
|
||||||
class PageGetter:
|
class PageGetter:
|
||||||
|
@ -225,6 +224,7 @@ class PageGetter:
|
||||||
scheduler.fetching()
|
scheduler.fetching()
|
||||||
async with async_timeout.timeout(10):
|
async with async_timeout.timeout(10):
|
||||||
async with self.session.get(self.url, verify_ssl=ssl) as resp:
|
async with self.session.get(self.url, verify_ssl=ssl) as resp:
|
||||||
|
print("Resp status %s" % resp.status)
|
||||||
try:
|
try:
|
||||||
return await resp.text()
|
return await resp.text()
|
||||||
except UnicodeDecodeError:
|
except UnicodeDecodeError:
|
||||||
|
@ -234,8 +234,7 @@ class PageGetter:
|
||||||
async def async_print(url):
|
async def async_print(url):
|
||||||
""" Debug function to follow what's actually happening """
|
""" Debug function to follow what's actually happening """
|
||||||
async with aiohttp.ClientSession() as session:
|
async with aiohttp.ClientSession() as session:
|
||||||
html = await PageGetter(session, url,
|
html = await PageGetter(session, url).get(ssl=False)
|
||||||
settings.USER_AGENT).get(ssl=False)
|
|
||||||
|
|
||||||
print('GOT {}HTML for {}'.format(
|
print('GOT {}HTML for {}'.format(
|
||||||
'None ' if html is None else '',
|
'None ' if html is None else '',
|
||||||
|
@ -243,80 +242,48 @@ async def async_print(url):
|
||||||
))
|
))
|
||||||
|
|
||||||
|
|
||||||
class CrawlElem:
|
async def async_crawler(url, queue, headers=None):
|
||||||
''' Describes a crawled element, to be assembled into a tree '''
|
|
||||||
|
|
||||||
def __init__(self, url, parent):
|
|
||||||
self.url = url
|
|
||||||
self.parent = parent
|
|
||||||
|
|
||||||
|
|
||||||
async def run_crawl(url, output_tree, headers=None):
|
|
||||||
''' Starts a crawling session '''
|
|
||||||
|
|
||||||
if headers is None:
|
if headers is None:
|
||||||
headers = {}
|
headers = {
|
||||||
if 'User-Agent' not in headers:
|
'User-Agent': settings.USER_AGENT,
|
||||||
headers['User-Agent'] = settings.USER_AGENT
|
}
|
||||||
|
|
||||||
user_agent = headers['User-Agent']
|
queued = [url]
|
||||||
crawled = set()
|
crawled = []
|
||||||
|
while queued and (len(crawled) < HARD_LIMIT):
|
||||||
|
async with aiohttp.ClientSession(headers=headers) as session:
|
||||||
|
try:
|
||||||
|
url = queued.pop(0)
|
||||||
|
except IndexError:
|
||||||
|
print("Error queue is empty")
|
||||||
|
return crawled
|
||||||
|
parsed_url = urlparse(url)
|
||||||
|
print("Crawling {}".format(url))
|
||||||
|
html = await PageGetter(session, url).get(ssl=False)
|
||||||
|
if html:
|
||||||
|
new_urls = url_getter(
|
||||||
|
html,
|
||||||
|
url,
|
||||||
|
parsed_url.scheme + "://" + parsed_url.netloc
|
||||||
|
)
|
||||||
|
crawled += [url]
|
||||||
|
if new_urls:
|
||||||
|
sampled = sample(
|
||||||
|
new_urls,
|
||||||
|
randrange(min(MAX_PER_PAGE, len(new_urls)))
|
||||||
|
)
|
||||||
|
queued += [sample_url for sample_url in sampled if
|
||||||
|
sample_url not in queued and sample_url not in
|
||||||
|
crawled]
|
||||||
|
else:
|
||||||
|
print("No html received")
|
||||||
|
print(crawled)
|
||||||
|
queue.put(crawled)
|
||||||
|
|
||||||
async with aiohttp.ClientSession(headers=headers) as session:
|
if __name__ == '__main__':
|
||||||
await async_crawler(
|
queue = Queue()
|
||||||
url, output_tree, crawled, user_agent, session, None)
|
crawl = CrawlingThread(None,
|
||||||
|
"https://google.com/search?q=fabriquer+masque+manif",
|
||||||
|
["https://google.com/search/"], queue)
|
||||||
def simplify_url(url):
|
crawl.start()
|
||||||
anchor = url.find('#')
|
crawl.join()
|
||||||
if anchor >= 0:
|
|
||||||
url = url[:anchor]
|
|
||||||
|
|
||||||
prot = url.find('://')
|
|
||||||
if prot >= 0:
|
|
||||||
url = url[prot+3:]
|
|
||||||
|
|
||||||
if url.startswith('www.'):
|
|
||||||
url = url[4:]
|
|
||||||
|
|
||||||
return url
|
|
||||||
|
|
||||||
|
|
||||||
async def async_crawler(url, out_tree, crawled, user_agent, session, parent):
|
|
||||||
if len(crawled) >= HARD_LIMIT:
|
|
||||||
return
|
|
||||||
crawled.add(simplify_url(url))
|
|
||||||
parsed_url = urlparse(url)
|
|
||||||
print("Crawling {}".format(url))
|
|
||||||
try:
|
|
||||||
with async_timeout.timeout(3):
|
|
||||||
html = await PageGetter(session, url, user_agent).get(ssl=False)
|
|
||||||
except asyncio.TimeoutError:
|
|
||||||
return
|
|
||||||
|
|
||||||
new_tasks = []
|
|
||||||
|
|
||||||
if html:
|
|
||||||
this_elem = CrawlElem(url, parent)
|
|
||||||
out_tree.append(this_elem)
|
|
||||||
new_urls = url_getter(
|
|
||||||
html,
|
|
||||||
url,
|
|
||||||
parsed_url.scheme + "://" + parsed_url.netloc
|
|
||||||
)
|
|
||||||
if new_urls:
|
|
||||||
sampled = sample(
|
|
||||||
new_urls,
|
|
||||||
randrange(min(MAX_PER_PAGE, len(new_urls)))
|
|
||||||
)
|
|
||||||
for sample_url in sampled:
|
|
||||||
if simplify_url(sample_url) not in crawled:
|
|
||||||
new_tasks.append(async_crawler(
|
|
||||||
sample_url, out_tree, crawled, user_agent, session,
|
|
||||||
this_elem))
|
|
||||||
else:
|
|
||||||
print("No html received")
|
|
||||||
if len(crawled) >= HARD_LIMIT:
|
|
||||||
return
|
|
||||||
if new_tasks:
|
|
||||||
await asyncio.wait(new_tasks)
|
|
||||||
|
|
|
@ -13,13 +13,6 @@
|
||||||
"query_pattern":"?q={}"
|
"query_pattern":"?q={}"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{
|
|
||||||
"searchengine": {
|
|
||||||
"name":"Duckduckgo Lite",
|
|
||||||
"url":"https://duckduckgo.com/lite/",
|
|
||||||
"query_pattern":"?q={}"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
"searchengine": {
|
"searchengine": {
|
||||||
"name":"Qwant",
|
"name":"Qwant",
|
||||||
|
|
|
@ -17,7 +17,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name":"paris-luttes info",
|
"name":"paris-luttes info",
|
||||||
"url":"https://paris-luttes.info/",
|
"url":"https//paris-luttes.info/",
|
||||||
"keywords": [
|
"keywords": [
|
||||||
{"keyword":"manifestations"},
|
{"keyword":"manifestations"},
|
||||||
{"keyword":"solidarité immigré·e·s"},
|
{"keyword":"solidarité immigré·e·s"},
|
||||||
|
|
|
@ -1,16 +0,0 @@
|
||||||
from django.core.management.base import BaseCommand
|
|
||||||
from profiles import models as profiles
|
|
||||||
from histories.models import generate_history
|
|
||||||
from datetime import datetime
|
|
||||||
|
|
||||||
|
|
||||||
class Command(BaseCommand):
|
|
||||||
''' Generates an history and prints the related XML '''
|
|
||||||
|
|
||||||
def add_arguments(self, parser):
|
|
||||||
pass
|
|
||||||
|
|
||||||
def handle(self, *args, **kwargs):
|
|
||||||
prof = profiles.Profile.objects.all()[0]
|
|
||||||
history = generate_history(prof, datetime.now())
|
|
||||||
print(history.to_xml_string())
|
|
|
@ -5,12 +5,11 @@ interests, keywords...
|
||||||
|
|
||||||
from collections import namedtuple
|
from collections import namedtuple
|
||||||
import random
|
import random
|
||||||
import asyncio
|
|
||||||
from math import floor
|
from math import floor
|
||||||
|
from queue import Queue
|
||||||
from xml.etree import ElementTree as ET
|
from xml.etree import ElementTree as ET
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from django.db import models
|
from django.db import models
|
||||||
from django.core.exceptions import ValidationError
|
|
||||||
import profiles.models as profiles
|
import profiles.models as profiles
|
||||||
from crawl import crawl
|
from crawl import crawl
|
||||||
from pinocchio.settings import HISTORY_MIN
|
from pinocchio.settings import HISTORY_MIN
|
||||||
|
@ -44,9 +43,9 @@ class HistoryEntry(models.Model):
|
||||||
def to_xml(self, xml_root):
|
def to_xml(self, xml_root):
|
||||||
entry = ET.Element('history')
|
entry = ET.Element('history')
|
||||||
entry_url = ET.Element('url')
|
entry_url = ET.Element('url')
|
||||||
entry_url.text = str(self.search)
|
entry_url.text = self.search
|
||||||
entry_ts = ET.Element('timestamp')
|
entry_ts = ET.Element('timestamp')
|
||||||
entry_ts.text = str(self.timestamp.timestamp())
|
entry_ts.text = self.timestamp.timestamp()
|
||||||
entry.append(entry_url)
|
entry.append(entry_url)
|
||||||
entry.append(entry_ts)
|
entry.append(entry_ts)
|
||||||
xml_root.append(entry)
|
xml_root.append(entry)
|
||||||
|
@ -102,48 +101,28 @@ class History(models.Model):
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
""" Returns the string representation of a history.
|
""" Returns the string representation of a history.
|
||||||
"""
|
"""
|
||||||
entries = self.historyentry_set.order_by('timestamp')
|
history_set = self.historyentry_set.order_by('timestamp')
|
||||||
output = "[History]:\n"
|
header = "[History]:\n"
|
||||||
for entry in entries:
|
return header + "\n".join(history_set)
|
||||||
output += str(entry) + '\n'
|
|
||||||
return output
|
|
||||||
|
|
||||||
async def _handler(self):
|
|
||||||
runner = await TorInstance.create(self.return_history(), self.user.browser_fingerprint.serialize_headers())
|
|
||||||
await runner.run()
|
|
||||||
self.played = True
|
|
||||||
self.save()
|
|
||||||
|
|
||||||
def play_histories(self):
|
def play_histories(self):
|
||||||
""" Actually plays the history.
|
""" Actually plays the history.
|
||||||
"""
|
"""
|
||||||
loop = asyncio.new_event_loop()
|
self.played = True
|
||||||
asyncio.set_event_loop(loop)
|
runner = TorInstance(self.history)
|
||||||
loop.run_until_complete(asyncio.wait([self._handler()]))
|
self.save()
|
||||||
|
|
||||||
def to_xml(self, xml_root=None):
|
def to_xml(self, xml_root):
|
||||||
''' Exports the current history to xml '''
|
''' Exports the current history to xml '''
|
||||||
standalone = False
|
|
||||||
if xml_root is None:
|
|
||||||
standalone = True
|
|
||||||
xml_root = ET.Element('root')
|
|
||||||
|
|
||||||
hist_node = ET.Element("history", attrib={
|
hist_node = ET.Element("history", attrib={
|
||||||
'start-ts': str(self.start_ts),
|
'start-ts': self.start_ts,
|
||||||
'played': '1' if self.played else '0',
|
'played': 1 if self.played else 0,
|
||||||
'user': str(self.user.pk),
|
'user': self.user.pk,
|
||||||
})
|
})
|
||||||
xml_root.append(hist_node)
|
xml_root.append(hist_node)
|
||||||
for entry in self.historyentry_set.all():
|
for entry in self.historyentry_set:
|
||||||
entry.to_xml(hist_node)
|
entry.to_xml(hist_node)
|
||||||
|
|
||||||
if standalone:
|
|
||||||
return xml_root
|
|
||||||
|
|
||||||
def to_xml_string(self):
|
|
||||||
xml = self.to_xml()
|
|
||||||
return ET.tostring(xml)
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def from_xml(xml_root):
|
def from_xml(xml_root):
|
||||||
''' Loads an history from an XML file '''
|
''' Loads an history from an XML file '''
|
||||||
|
@ -187,21 +166,15 @@ def generate_partial_history(user, t_start):
|
||||||
timestamp = t_start
|
timestamp = t_start
|
||||||
result = []
|
result = []
|
||||||
basis = generate_first_url(user)
|
basis = generate_first_url(user)
|
||||||
|
result.append(PartialHistoryEntry(basis, timestamp))
|
||||||
t_start += 5 * random.weibullvariate(1, 1.5)
|
t_start += 5 * random.weibullvariate(1, 1.5)
|
||||||
crawler = crawl.CrawlingThread(basis)
|
queue = Queue()
|
||||||
|
crawler = crawl.CrawlingThread(basis, queue)
|
||||||
crawler.start()
|
crawler.start()
|
||||||
crawler.join()
|
crawler.join()
|
||||||
urls_tree = crawler.output_tree
|
urls = queue.get()
|
||||||
|
for url in urls:
|
||||||
open_time = {}
|
t_start += 5 * random.weibullvariate(1, 1.5)
|
||||||
for elem in urls_tree:
|
|
||||||
url, parent = elem.url, elem.parent
|
|
||||||
timestamp = 0
|
|
||||||
if parent is None:
|
|
||||||
timestamp = t_start
|
|
||||||
else:
|
|
||||||
timestamp = open_time[parent] + 5 * random.weibullvariate(1, 1.5)
|
|
||||||
open_time[elem] = timestamp
|
|
||||||
result.append(PartialHistoryEntry(url, timestamp))
|
result.append(PartialHistoryEntry(url, timestamp))
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
@ -250,27 +223,22 @@ def generate_history(user, start_time):
|
||||||
history.full_clean()
|
history.full_clean()
|
||||||
history.save()
|
history.save()
|
||||||
|
|
||||||
|
history_line = 0
|
||||||
|
|
||||||
current_timestamp = start_time.timestamp()
|
current_timestamp = start_time.timestamp()
|
||||||
|
|
||||||
hist_size = 0
|
while history_line < length:
|
||||||
|
|
||||||
while hist_size < length:
|
|
||||||
current_timestamp += 5 * random.weibullvariate(1, 2.8)
|
current_timestamp += 5 * random.weibullvariate(1, 2.8)
|
||||||
history_list = generate_partial_history(user, current_timestamp)
|
history_list = generate_partial_history(user, current_timestamp)
|
||||||
current_timestamp = \
|
current_timestamp = \
|
||||||
history_list[-1].timestamp + 5 * random.weibullvariate(1, 5)
|
history_list[-1].timestamp + 5 * random.weibullvariate(1, 5)
|
||||||
for (url, timestamp) in history_list:
|
for (url, timestamp) in history_list:
|
||||||
if len(url) < 200:
|
new_line = HistoryEntry(
|
||||||
new_line = HistoryEntry(
|
search=url,
|
||||||
search=url,
|
timestamp=datetime.fromtimestamp(timestamp),
|
||||||
timestamp=datetime.fromtimestamp(timestamp),
|
history=history
|
||||||
history=history
|
)
|
||||||
)
|
new_line.full_clean()
|
||||||
try:
|
new_line.save()
|
||||||
new_line.full_clean()
|
|
||||||
new_line.save()
|
|
||||||
hist_size += 1
|
|
||||||
except ValidationError:
|
|
||||||
continue
|
|
||||||
|
|
||||||
return history
|
return history
|
||||||
|
|
|
@ -58,9 +58,7 @@ class TorInstance():
|
||||||
async def run(self):
|
async def run(self):
|
||||||
""" Runs the Tor Instance on the history.
|
""" Runs the Tor Instance on the history.
|
||||||
"""
|
"""
|
||||||
while (self.history) and (dt.datetime.combine(self.history[0][1],
|
while (self.history[0][1] - dt.datetime.now()).total_seconds >= 10:
|
||||||
dt.datetime.min.time()) -
|
|
||||||
dt.datetime.now()).total_seconds() >= 10:
|
|
||||||
print("Sleeping")
|
print("Sleeping")
|
||||||
sleep(10)
|
sleep(10)
|
||||||
while self.history:
|
while self.history:
|
||||||
|
@ -68,9 +66,8 @@ class TorInstance():
|
||||||
async with async_timeout.timeout(30):
|
async with async_timeout.timeout(30):
|
||||||
await(self.query(item[0]))
|
await(self.query(item[0]))
|
||||||
now = dt.datetime.now()
|
now = dt.datetime.now()
|
||||||
print(self.history[0])
|
if now <= self.history[0][1]:
|
||||||
if now <= dt.datetime.combine(self.history[0][1], dt.datetime.min.time()):
|
sleep((self.history[0][1] - now).total_seconds())
|
||||||
sleep((dt.datetime.combine(self.history[0][1], dt.datetime.min.time()) - now).total_seconds())
|
|
||||||
|
|
||||||
|
|
||||||
def create_session(self):
|
def create_session(self):
|
||||||
|
|
|
@ -97,7 +97,7 @@ USE_I18N = True
|
||||||
|
|
||||||
USE_L10N = True
|
USE_L10N = True
|
||||||
|
|
||||||
USE_TZ = False # We don't really care, we want POSIX timestamps
|
USE_TZ = True
|
||||||
|
|
||||||
|
|
||||||
# Static files (CSS, JavaScript, Images)
|
# Static files (CSS, JavaScript, Images)
|
||||||
|
|
|
@ -1,27 +0,0 @@
|
||||||
from django.core.management.base import BaseCommand
|
|
||||||
from profiles.models_rdf import RdfProfile
|
|
||||||
from profiles import models
|
|
||||||
|
|
||||||
|
|
||||||
class Command(BaseCommand):
|
|
||||||
''' Exports database models to RDF '''
|
|
||||||
|
|
||||||
def add_arguments(self, parser):
|
|
||||||
pass
|
|
||||||
|
|
||||||
def handle(self, *args, **kwargs):
|
|
||||||
exported_models = [
|
|
||||||
models.Keyword,
|
|
||||||
models.Webpage,
|
|
||||||
models.Website,
|
|
||||||
models.Place,
|
|
||||||
models.Event,
|
|
||||||
models.BrowserFingerprint,
|
|
||||||
models.SearchEngine,
|
|
||||||
models.Interest,
|
|
||||||
models.Profile,
|
|
||||||
]
|
|
||||||
output_xml = RdfProfile().serialize(
|
|
||||||
# models=exported_models,
|
|
||||||
)
|
|
||||||
self.stdout.write(output_xml)
|
|
|
@ -5,7 +5,6 @@ import json
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from django.core.management.base import BaseCommand
|
from django.core.management.base import BaseCommand
|
||||||
from django.db import models
|
from django.db import models
|
||||||
from django.core.exceptions import ObjectDoesNotExist
|
|
||||||
from profiles.models import Keyword, Interest, Place, Website, Event
|
from profiles.models import Keyword, Interest, Place, Website, Event
|
||||||
|
|
||||||
def import_file(filename):
|
def import_file(filename):
|
||||||
|
@ -20,14 +19,15 @@ def import_interest(_interest):
|
||||||
places = []
|
places = []
|
||||||
websites = []
|
websites = []
|
||||||
for keyword in _interest.get("keywords", []):
|
for keyword in _interest.get("keywords", []):
|
||||||
try:
|
if not Keyword.objects.get(keyword["keyword"]):
|
||||||
stored = Keyword.objects.get(text=keyword["keyword"])
|
keywords.append(
|
||||||
keywords.append(stored)
|
Keyword(
|
||||||
except ObjectDoesNotExist:
|
text=keyword["keyword"]
|
||||||
new_keyword = Keyword(text=keyword["keyword"])
|
)
|
||||||
new_keyword.save()
|
)
|
||||||
keywords.append(new_keyword)
|
print("New keyword %s" % new_keywords)
|
||||||
print("New keyword %s" % new_keyword)
|
else:
|
||||||
|
keywords.append(Keyword.objects.get(text=keyword["keyword"]))
|
||||||
for place in _interest.get("places", []):
|
for place in _interest.get("places", []):
|
||||||
places.append(Place.objects.get(name=place["place"]))
|
places.append(Place.objects.get(name=place["place"]))
|
||||||
for website in _interest.get("websites", []):
|
for website in _interest.get("websites", []):
|
||||||
|
@ -36,9 +36,7 @@ def import_interest(_interest):
|
||||||
interest = Interest(
|
interest = Interest(
|
||||||
name=_interest.get("name", ""),
|
name=_interest.get("name", ""),
|
||||||
)
|
)
|
||||||
interest.save()
|
|
||||||
for keyword in keywords:
|
for keyword in keywords:
|
||||||
print(keyword)
|
|
||||||
interest.keywords.add(keyword)
|
interest.keywords.add(keyword)
|
||||||
for place in places:
|
for place in places:
|
||||||
interest.places.add(place)
|
interest.places.add(place)
|
||||||
|
@ -48,4 +46,4 @@ def import_interest(_interest):
|
||||||
|
|
||||||
class Command(BaseCommand):
|
class Command(BaseCommand):
|
||||||
def handle(self, *args, **kwargs):
|
def handle(self, *args, **kwargs):
|
||||||
import_file("data/interests.json")
|
import_file("data/events.json")
|
||||||
|
|
|
@ -91,13 +91,13 @@ class Website(models.Model):
|
||||||
""" Generates the url in case the interest chosen is a website.
|
""" Generates the url in case the interest chosen is a website.
|
||||||
"""
|
"""
|
||||||
rand = random.random()
|
rand = random.random()
|
||||||
if user.uses_urls:
|
if user.uses_url:
|
||||||
url = self.url
|
url = self.url
|
||||||
elif rand <= 0.1:
|
elif rand <= 0.1:
|
||||||
url = random.choice(self.notable_pages.all()).url
|
url = random.choice(self.notable_pages).url
|
||||||
elif rand <= 0.8:
|
elif rand <= 0.8:
|
||||||
search_term_text = self.name + " " + \
|
search_term_text = self.name + " " + \
|
||||||
str(random.choice(self.keywords.all()))
|
random.choice(self.keywords)
|
||||||
url = user.search_engine.search_url(search_term_text)
|
url = user.search_engine.search_url(search_term_text)
|
||||||
else:
|
else:
|
||||||
url = user.search_engine.search_url(self.name)
|
url = user.search_engine.search_url(self.name)
|
||||||
|
@ -260,6 +260,4 @@ def create_profile(nick=None):
|
||||||
|
|
||||||
profile.full_clean()
|
profile.full_clean()
|
||||||
profile.save()
|
profile.save()
|
||||||
profile.interests.add(random.choice(Interest.objects.all()))
|
|
||||||
profile.save()
|
|
||||||
return profile
|
return profile
|
||||||
|
|
|
@ -1,131 +0,0 @@
|
||||||
""" RDF serialization class for profile models """
|
|
||||||
|
|
||||||
import rdfserializer as rdf
|
|
||||||
from rdfserializer import RDFModelSerialiser as RDFModelSerializer
|
|
||||||
# ^ This was hurting my eyes way too much
|
|
||||||
from rdfserializer import SCHEMA as schema
|
|
||||||
from rdflib.namespace import Namespace
|
|
||||||
|
|
||||||
import profiles.models as profile_models
|
|
||||||
|
|
||||||
|
|
||||||
LOCAL_NS = Namespace('local:')
|
|
||||||
|
|
||||||
|
|
||||||
class RdfWebpage(RDFModelSerializer):
|
|
||||||
""" RDF serializer for Webpage """
|
|
||||||
|
|
||||||
_type = schema.WebPage
|
|
||||||
model = profile_models.Webpage
|
|
||||||
entries = [
|
|
||||||
rdf.RDFSimpleField(schema.url, 'url'),
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
class RdfWebsite(RDFModelSerializer):
|
|
||||||
""" RDF serializer for Website """
|
|
||||||
|
|
||||||
_type = schema.WebSite
|
|
||||||
model = profile_models.Website
|
|
||||||
entries = [
|
|
||||||
rdf.RDFSimpleField(schema.name, 'name'),
|
|
||||||
rdf.RDFSimpleField(schema.url, 'url'),
|
|
||||||
rdf.RDFManyField(schema.keywords, 'keywords',
|
|
||||||
lambda keyword: keyword.text),
|
|
||||||
rdf.RDFManyLinker(schema.hasPart, 'notable_pages', RdfWebpage),
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
class RdfPlace(RDFModelSerializer):
|
|
||||||
""" RDF serializer for Place """
|
|
||||||
|
|
||||||
_type = schema.Place
|
|
||||||
model = profile_models.Place
|
|
||||||
entries = [
|
|
||||||
rdf.RDFSimpleField(schema.name, 'name'),
|
|
||||||
rdf.RDFSimpleField(schema.address, 'address'),
|
|
||||||
rdf.RDFSimpleField(schema.latitude, 'lat'),
|
|
||||||
rdf.RDFSimpleField(schema.longitude, 'lon'),
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
class RdfEvent(RDFModelSerializer):
|
|
||||||
""" RDF serializer for Event """
|
|
||||||
|
|
||||||
_type = schema.Event
|
|
||||||
model = profile_models.Event
|
|
||||||
entries = [
|
|
||||||
rdf.RDFSimpleField(schema.name, 'name'),
|
|
||||||
rdf.RDFSimpleField(schema.startDate, 'date'),
|
|
||||||
rdf.RDFLeftBinder(schema.location, 'place', RdfPlace),
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
class RdfBrowserFingerprint(RDFModelSerializer):
|
|
||||||
""" RDF serializer for BrowserFingerprint """
|
|
||||||
|
|
||||||
_type = schema.Intangible
|
|
||||||
model = profile_models.BrowserFingerprint
|
|
||||||
entries = [
|
|
||||||
rdf.RDFSimpleField(schema.description, 'description'),
|
|
||||||
rdf.RDFSimpleField(LOCAL_NS.useragent, 'useragent'),
|
|
||||||
rdf.RDFSimpleField(LOCAL_NS.appname, 'appname'),
|
|
||||||
rdf.RDFSimpleField(LOCAL_NS.appversion, 'appversion'),
|
|
||||||
rdf.RDFSimpleField(LOCAL_NS.platform, 'platform'),
|
|
||||||
rdf.RDFSimpleField(LOCAL_NS.vendor, 'vendor'),
|
|
||||||
rdf.RDFSimpleField(LOCAL_NS.vendorsub, 'vendorsub'),
|
|
||||||
rdf.RDFSimpleField(LOCAL_NS.buildID, 'buildID'),
|
|
||||||
rdf.RDFSimpleField(LOCAL_NS.oscpu, 'oscpu'),
|
|
||||||
rdf.RDFSimpleField(LOCAL_NS.accept_encoding, 'accept_encoding'),
|
|
||||||
rdf.RDFSimpleField(LOCAL_NS.accept_default, 'accept_default'),
|
|
||||||
rdf.RDFSimpleField(LOCAL_NS.accept_lang, 'accept_lang'),
|
|
||||||
rdf.RDFSimpleField(LOCAL_NS.pixeldepth, 'pixeldepth'),
|
|
||||||
rdf.RDFSimpleField(LOCAL_NS.colordepth, 'colordepth'),
|
|
||||||
rdf.RDFSimpleField(LOCAL_NS.screens, 'screens'),
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
class RdfSearchEngine(RDFModelSerializer):
|
|
||||||
""" RDF serializer for SearchEngine """
|
|
||||||
|
|
||||||
_type = schema.WebSite
|
|
||||||
model = profile_models.SearchEngine
|
|
||||||
entries = [
|
|
||||||
rdf.RDFSimpleField(schema.url, 'url'),
|
|
||||||
rdf.RDFSimpleField(schema.name, 'name'),
|
|
||||||
rdf.RDFSimpleField(LOCAL_NS.query_pattern, 'query_pattern'),
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
class RdfInterest(RDFModelSerializer):
|
|
||||||
""" RDF serializer for Interest """
|
|
||||||
|
|
||||||
Interesttype = 'interest'
|
|
||||||
model = profile_models.Interest
|
|
||||||
entries = [
|
|
||||||
rdf.RDFSimpleField(schema.name, 'name'),
|
|
||||||
rdf.RDFManyField(schema.keywords, 'keywords',
|
|
||||||
lambda keyword: keyword.text),
|
|
||||||
rdf.RDFManyLinker(schema.location, 'places', RdfPlace),
|
|
||||||
rdf.RDFManyLinker(schema.website, 'websites', RdfWebsite),
|
|
||||||
rdf.RDFManyLinker(schema.event, 'events', RdfEvent),
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
class RdfProfile(RDFModelSerializer):
|
|
||||||
""" RDF serializer for Profile """
|
|
||||||
|
|
||||||
_type = schema.Person
|
|
||||||
model = profile_models.Profile
|
|
||||||
entries = [
|
|
||||||
rdf.RDFSimpleField(LOCAL_NS.nickname, 'nick'),
|
|
||||||
rdf.RDFSimpleField(schema.given_name, 'first_name'),
|
|
||||||
rdf.RDFSimpleField(schema.family_name, 'last_name'),
|
|
||||||
rdf.RDFSimpleField(schema.email, 'email'),
|
|
||||||
rdf.RDFSimpleField(LOCAL_NS.uses_urls, 'uses_urls'),
|
|
||||||
rdf.RDFManyLinker(LOCAL_NS.interest, 'interests', RdfInterest),
|
|
||||||
rdf.RDFLeftBinder(LOCAL_NS.search_engine, 'search_engine',
|
|
||||||
RdfSearchEngine),
|
|
||||||
rdf.RDFLeftBinder(LOCAL_NS.browser_fingerprint, 'browser_fingerprint',
|
|
||||||
RdfBrowserFingerprint)
|
|
||||||
]
|
|
|
@ -14,6 +14,3 @@ yarl==1.1.1
|
||||||
beautifulsoup4==4.6.0
|
beautifulsoup4==4.6.0
|
||||||
stem==1.6.0
|
stem==1.6.0
|
||||||
pycurl==7.43.0.1
|
pycurl==7.43.0.1
|
||||||
rdflib==4.2.2
|
|
||||||
git+https://github.com/tobast/RDFSerializer.git
|
|
||||||
aiosocks==0.2.6
|
|
||||||
|
|
Loading…
Reference in a new issue