mpri-webdam/histories/models.py

270 lines
8.3 KiB
Python
Raw Normal View History

2018-01-24 16:07:33 +01:00
""" Models for the history. This history should be able to generate history
entries that look like human browsing, according to a given user's
interests, keywords...
"""
from collections import namedtuple
2018-02-20 23:42:21 +01:00
import random
2018-02-19 22:56:16 +01:00
from math import floor
from xml.etree import ElementTree as ET
from datetime import datetime
2018-01-23 18:12:47 +01:00
from django.db import models
2018-02-26 15:15:03 +01:00
from django.core.exceptions import ValidationError
2018-02-20 23:42:21 +01:00
import profiles.models as profiles
from crawl import crawl
2018-02-19 22:56:16 +01:00
from pinocchio.settings import HISTORY_MIN
2018-02-26 11:12:16 +01:00
from .tor_runner import TorInstance
2018-01-24 16:07:33 +01:00
2018-02-22 11:06:45 +01:00
class InvalidXml(Exception):
    """ Raised when an XML document does not have the expected structure.

    The human-readable reason is kept in the `what` attribute; `str()` on the
    exception returns it prefixed with "Invalid XML: ".
    """

    def __init__(self, what='unexpected XML data.'):
        # Forward the reason to `Exception` so that `args` and `repr()` carry
        # it too, instead of being empty.
        super().__init__(what)
        self.what = what

    def __str__(self):
        return "Invalid XML: " + self.what
2018-01-24 16:07:33 +01:00
class HistoryEntry(models.Model):
    """ A history entry, aka a url, and a timestamp.

    Each entry references the `History` it belongs to and is deleted along
    with it (`on_delete=models.CASCADE`).
    """
    search = models.URLField(help_text="The url to be searched")
    timestamp = models.DateTimeField()
    history = models.ForeignKey(
        'History',
        on_delete=models.CASCADE
    )

    def __str__(self):
        """ Returns the string representation of a history entry.
        """
        return "{} : {}".format(self.timestamp, self.search)

    def to_xml(self, xml_root):
        """ Appends this entry to `xml_root` as a <history> element holding a
        <url> child and a <timestamp> child (POSIX timestamp, as text).
        """
        entry = ET.Element('history')
        entry_url = ET.Element('url')
        entry_url.text = str(self.search)
        entry_ts = ET.Element('timestamp')
        entry_ts.text = str(self.timestamp.timestamp())
        entry.append(entry_url)
        entry.append(entry_ts)
        xml_root.append(entry)

    @staticmethod
    def from_xml(xml_root, in_history):
        """ Builds an (unsaved) entry from a <history> element, as produced by
        `to_xml`, and attaches it to the history `in_history`.

        Raises `InvalidXml` if the element is not a <history> tag, contains an
        unknown child, has an unparsable timestamp, or lacks its <url> or
        <timestamp> child.
        """
        if xml_root.tag != 'history':
            raise InvalidXml("expected <history> tag here.")
        url, timestamp = None, None

        for child in xml_root:
            if child.tag == 'url':
                url = child.text
            elif child.tag == 'timestamp':
                try:
                    # `to_xml` stores the POSIX timestamp as text: it has to
                    # be converted back to a number before being parsed.
                    timestamp = datetime.fromtimestamp(float(child.text))
                except (TypeError, ValueError, OverflowError, OSError) as exc:
                    raise InvalidXml(
                        "invalid timestamp {}".format(child.text)) from exc
            else:
                raise InvalidXml("unknown tag {} as child of <history>".format(
                    child.tag))

        # Both fields are mandatory on the model; refuse half-filled entries
        # here rather than failing later on validation or save.
        if url is None:
            raise InvalidXml("missing <url> child in <history>")
        if timestamp is None:
            raise InvalidXml("missing <timestamp> child in <history>")

        output = HistoryEntry()
        output.search = url
        output.timestamp = timestamp
        output.history = in_history

        return output
2018-01-24 16:07:33 +01:00
class History(models.Model):
    """ A history for a user, containing some web connections (http, https).
    Each history is timed, in a human-behaviour manner. """

    start_ts = models.DateTimeField(
        help_text=('The starting timestamp of the history. Useful for '
                   'cron-like structure.')
    )
    played = models.BooleanField(default=False)
    user = models.ForeignKey(
        profiles.Profile,
        on_delete=models.CASCADE
    )

    def return_history(self):
        """ Returns the history, sorted by increasing timestamps, as a list of
        `(url, date)` pairs.
        """
        entries = self.historyentry_set.order_by('timestamp')
        # NOTE(review): `.date()` drops the time of day of each entry --
        # confirm this is what the consumer of this list expects.
        return [(item.search, item.timestamp.date()) for item in entries]

    def __str__(self):
        """ Returns the string representation of a history.
        """
        entries = self.historyentry_set.order_by('timestamp')
        return "[History]:\n" + "".join(
            str(entry) + '\n' for entry in entries)

    async def play_histories(self):
        """ Actually plays the history.

        Marks the history as played, hands its entries and the user's browser
        fingerprint headers to a `TorInstance`, runs it, then saves the
        history.
        """
        self.played = True
        runner = await TorInstance.create(
            self.return_history(),
            self.user.browser_fingerprint.serialize_headers())
        # NOTE(review): the result of `run()` is not awaited; if
        # `TorInstance.run` is a coroutine function this needs an `await`.
        runner.run()
        self.save()

    def to_xml(self, xml_root=None):
        """ Exports the current history to xml.

        The history becomes a <history> element (attributes `start-ts`,
        `played`, `user`) holding one child per entry. If `xml_root` is given,
        the element is appended to it and nothing is returned; otherwise a
        fresh <root> element is created, filled and returned.
        """
        standalone = False
        if xml_root is None:
            standalone = True
            xml_root = ET.Element('root')

        hist_node = ET.Element("history", attrib={
            'start-ts': str(self.start_ts),
            'played': '1' if self.played else '0',
            'user': str(self.user.pk),
        })
        xml_root.append(hist_node)
        for entry in self.historyentry_set.all():
            entry.to_xml(hist_node)

        if standalone:
            return xml_root

    @staticmethod
    def from_xml(xml_root):
        """ Loads an history from an XML node, as produced by `to_xml`.

        Returns an unsaved `History`. Raises `InvalidXml` if the node is not a
        <history> tag, lacks a required attribute, has a non-numeric `played`
        attribute, references an unknown user, or contains an invalid entry.
        """
        REQUIRED_ATTR = ['start-ts', 'played', 'user']

        if xml_root.tag != 'history':
            raise InvalidXml('unexpected node {} as root of an history'.format(
                xml_root.tag))
        for attr in REQUIRED_ATTR:
            if attr not in xml_root.attrib:
                raise InvalidXml(('missing attribute "{}" for tag of type '
                                  'history').format(attr))

        start_ts = xml_root.attrib['start-ts']
        played_attr = xml_root.attrib['played']
        user_pk = xml_root.attrib['user']

        # `to_xml` writes '1' or '0': the text must be converted to a number
        # before being compared (str > int raises TypeError).
        try:
            played = int(played_attr) > 0
        except ValueError as exc:
            raise InvalidXml(
                'invalid value "{}" for attribute "played"'.format(
                    played_attr)) from exc

        # The `user` attribute holds the primary key of a Profile (see
        # `to_xml`), so the lookup must target Profile with that very key.
        try:
            user = profiles.Profile.objects.get(pk=user_pk)
        except (profiles.Profile.DoesNotExist, ValueError) as exc:
            raise InvalidXml('primary key for Profile {} is invalid'.format(
                user_pk)) from exc

        output = History()
        output.start_ts = start_ts
        output.played = played
        output.user = user

        # NOTE(review): the entries are parsed (hence validated) but neither
        # kept nor saved, since `output` itself is not saved yet -- callers
        # only get the bare history back. Confirm the intended persistence.
        for child in xml_root:
            HistoryEntry.from_xml(child, output)

        return output
2018-02-19 22:56:16 +01:00
# Lightweight (url, timestamp) pair used while a history is being generated.
PartialHistoryEntry = namedtuple('PartialHistoryEntry', 'url timestamp')
def generate_partial_history(user, t_start):
    """ Generate a part of a history for `user`, beginning at `t_start`.

    A first url is drawn from the user's interests and recorded at `t_start`;
    a crawl is then started from that url, and every element of the resulting
    tree is recorded with a timestamp equal to the one of its parent plus a
    random delay (elements without parent get the start time plus a random
    delay).

    Returns the list of `PartialHistoryEntry` in the order they were
    produced. `t_start` is a number (presumably seconds -- confirm against
    the caller).
    """
    first_url = generate_first_url(user)
    entries = [PartialHistoryEntry(first_url, t_start)]
    # Pages reached by the crawl are opened a little after the first one.
    t_start += 5 * random.weibullvariate(1, 1.5)

    # Run the crawl to completion before reading its result.
    crawler = crawl.CrawlingThread(first_url)
    crawler.start()
    crawler.join()

    # Timestamp at which each crawled element was "opened".
    # Assumes parents are iterated before their children -- TODO confirm.
    opened_at = {}
    for node in crawler.output_tree:
        page_url, parent_node = node.url, node.parent
        if parent_node is None:
            opened = t_start
        else:
            delay = 5 * random.weibullvariate(1, 1.5)
            opened = opened_at[parent_node] + delay
        opened_at[node] = opened
        entries.append(PartialHistoryEntry(page_url, opened))

    return entries
2018-02-20 23:42:21 +01:00
2018-02-20 23:42:21 +01:00
def generate_first_url(user):
    """ Generate the first url of a partial history, based on the user
    information.

    One non-empty category of interests of `user` (keywords, websites, places
    or events) is picked at random, then one item of that category, whose
    `generate_url(user)` result is returned. `random.choice` raises
    `IndexError` if the user has no interest at all.
    """
    candidates = (
        profiles.Keyword.objects.filter(interest__profile__in=[user]),
        profiles.Website.objects.filter(interest__profile__in=[user]),
        profiles.Place.objects.filter(interest__profile__in=[user]),
        profiles.Event.objects.filter(interest__profile__in=[user]),
    )
    # Only categories actually holding something can be drawn from.
    usable = [category for category in candidates if category]

    chosen_category = random.choice(usable)
    chosen_item = random.choice(chosen_category)
    return chosen_item.generate_url(user)
2018-02-20 23:42:21 +01:00
def generate_history(user, start_time):
    """ Generate a new history for the user `user`, starting from the
    datetime `start_time`.

    A few heuristics are used in order to give the impression that the history
    is actually played by a user: the target length and the delays between
    page openings are drawn from Weibull distributions.

    The history is saved, then filled with saved `HistoryEntry` rows until at
    least the target number of entries is reached. Returns the history.
    """
    # The history has to exist in database before entries can refer to it.
    history = History(start_ts=start_time, user=user)
    target_length = HISTORY_MIN + floor(10 * random.weibullvariate(1, 1.5))
    history.full_clean()
    history.save()

    clock = start_time.timestamp()
    saved_entries = 0
    while saved_entries < target_length:
        # Pause before starting a new browsing session...
        clock += 5 * random.weibullvariate(1, 2.8)
        partial = generate_partial_history(user, clock)
        # ...and resume after the last entry of the session just generated.
        clock = partial[-1].timestamp + 5 * random.weibullvariate(1, 5)

        for item in partial:
            # Overly long urls are skipped.
            if len(item.url) >= 200:
                continue
            entry = HistoryEntry(
                search=item.url,
                timestamp=datetime.fromtimestamp(item.timestamp),
                history=history
            )
            try:
                entry.full_clean()
                entry.save()
            except ValidationError:
                # Invalid entries are dropped and do not count.
                continue
            saved_entries += 1

    return history