2018-01-24 16:07:33 +01:00
|
|
|
""" Models for the history. This history should be able to generate history
|
|
|
|
entries that look like human browsing, according to a given user's
interests, keywords...
|
|
|
|
"""
|
|
|
|
|
2018-02-26 00:24:54 +01:00
|
|
|
from collections import namedtuple
|
2018-02-20 23:42:21 +01:00
|
|
|
import random
|
2018-02-19 22:56:16 +01:00
|
|
|
from math import floor
|
2018-02-25 18:06:59 +01:00
|
|
|
from xml.etree import ElementTree as ET
|
|
|
|
from datetime import datetime
|
2018-01-23 18:12:47 +01:00
|
|
|
from django.db import models
|
2018-02-26 15:15:03 +01:00
|
|
|
from django.core.exceptions import ValidationError
|
2018-02-20 23:42:21 +01:00
|
|
|
import profiles.models as profiles
|
2018-02-24 23:17:24 +01:00
|
|
|
from crawl import crawl
|
2018-02-19 22:56:16 +01:00
|
|
|
from pinocchio.settings import HISTORY_MIN
|
2018-02-26 11:12:16 +01:00
|
|
|
from .tor_runner import TorInstance
|
2018-01-24 16:07:33 +01:00
|
|
|
|
2018-02-22 11:06:45 +01:00
|
|
|
|
2018-02-25 18:06:59 +01:00
|
|
|
class InvalidXml(Exception):
    """ Raised when XML data does not match the expected history format.

    The human-readable reason is stored in `what`; `str()` of the exception
    is that reason prefixed with "Invalid XML: ".
    """

    def __init__(self, what='unexpected XML data.'):
        # Forward the reason to Exception so that `args` (and therefore
        # `repr()` and tracebacks built from `args`) is not left empty.
        super().__init__(what)
        self.what = what

    def __str__(self):
        return "Invalid XML: " + self.what
|
|
|
|
|
|
|
|
|
2018-01-24 16:07:33 +01:00
|
|
|
class HistoryEntry(models.Model):
    """ A history entry, aka a url, and a timestamp.
    """
    search = models.URLField(help_text="The url to be searched")
    timestamp = models.DateTimeField()
    history = models.ForeignKey(
        'History',
        on_delete=models.CASCADE
    )

    def __str__(self):
        """ Returns the string representation of a history entry.
        """
        return "{} : {}".format(self.timestamp, self.search)

    def to_xml(self, xml_root):
        """ Appends this entry to `xml_root`.

        The entry is written as a <history> element with two children:
        <url> (the searched url) and <timestamp> (the POSIX timestamp of
        the entry, as text).
        """
        entry = ET.Element('history')
        entry_url = ET.Element('url')
        entry_url.text = self.search
        entry_ts = ET.Element('timestamp')
        # ElementTree can only serialize string text, so the float returned
        # by datetime.timestamp() has to be converted explicitly.
        entry_ts.text = str(self.timestamp.timestamp())
        entry.append(entry_url)
        entry.append(entry_ts)
        xml_root.append(entry)

    @staticmethod
    def from_xml(xml_root, in_history):
        """ Builds an (unsaved) HistoryEntry from a <history> element.

        `xml_root` is the element produced by `to_xml`; `in_history` is the
        History the new entry is attached to. A missing <url> or
        <timestamp> child leaves the corresponding field set to None.

        Raises InvalidXml if the element is not a <history> tag, if it has
        an unknown child, or if the timestamp is not a valid number.
        """
        if xml_root.tag != 'history':
            raise InvalidXml("expected <history> tag here.")
        url, timestamp = None, None

        for child in xml_root:
            if child.tag == 'url':
                url = child.text
            elif child.tag == 'timestamp':
                try:
                    # child.text is a string (or None for an empty tag);
                    # fromtimestamp() needs a number.
                    timestamp = datetime.fromtimestamp(float(child.text))
                except (TypeError, ValueError, OverflowError,
                        OSError) as err:
                    raise InvalidXml(
                        "invalid timestamp {}".format(child.text)) from err
            else:
                raise InvalidXml("unknown tag {} as child of <history>".format(
                    child.tag))

        output = HistoryEntry()
        output.search = url
        output.timestamp = timestamp
        output.history = in_history

        return output
|
|
|
|
|
2018-01-24 16:07:33 +01:00
|
|
|
|
|
|
|
class History(models.Model):
    """ A history for a user, containing some web connections (http, https).
    Each history is timed, in a human-behaviour manner. """

    start_ts = models.DateTimeField(
        help_text=('The starting timestamp of the history. Useful for '
                   'cron-like structure.')
    )
    played = models.BooleanField(default=False)
    user = models.ForeignKey(
        profiles.Profile,
        on_delete=models.CASCADE
    )

    def return_history(self):
        """ Returns the history, sorted by increasing timestamps, as a list
        of (url, date) tuples.
        """
        entries = self.historyentry_set.order_by('timestamp')
        return [(item.search, item.timestamp.date()) for item in entries]

    def __str__(self):
        """ Returns the string representation of a history.
        """
        entries = self.historyentry_set.order_by('timestamp')
        return "[History]:\n" + "".join(
            str(entry) + '\n' for entry in entries)

    def play_histories(self):
        """ Actually plays the history: hands the sorted entries to a
        TorInstance, runs it, then saves the history as played.
        """
        self.played = True
        runner = TorInstance(self.return_history())
        runner.run()
        self.save()

    def to_xml(self, xml_root=None):
        ''' Exports the current history to xml.

        The history is appended to `xml_root` as a <history> element whose
        attributes are `start-ts` (POSIX timestamp), `played` ("1" or "0")
        and `user` (primary key of the profile), followed by one child per
        entry. When `xml_root` is None, a fresh <root> element is created
        and returned; otherwise nothing is returned.
        '''
        standalone = xml_root is None
        if standalone:
            xml_root = ET.Element('root')

        # ElementTree only serializes string attribute values.
        hist_node = ET.Element("history", attrib={
            'start-ts': str(self.start_ts.timestamp()),
            'played': '1' if self.played else '0',
            'user': str(self.user.pk),
        })
        xml_root.append(hist_node)
        for entry in self.historyentry_set.all():
            entry.to_xml(hist_node)

        if standalone:
            return xml_root
        return None

    @staticmethod
    def from_xml(xml_root):
        ''' Loads an (unsaved) history from a <history> XML element, as
        written by `to_xml`.

        Raises InvalidXml if the element is not a <history> tag, if a
        required attribute is missing or malformed, if the referenced
        profile does not exist, or if a child entry is invalid.
        '''

        REQUIRED_ATTR = ['start-ts', 'played', 'user']

        if xml_root.tag != 'history':
            raise InvalidXml('unexpected node {} as root of an history'.format(
                xml_root.tag))
        for attr in REQUIRED_ATTR:
            if attr not in xml_root.attrib:
                raise InvalidXml(('missing attribute "{}" for tag of type '
                                  'history').format(attr))

        raw_start_ts = xml_root.attrib['start-ts']
        raw_played = xml_root.attrib['played']
        user_pk = xml_root.attrib['user']

        # Attribute values are always strings: convert them explicitly.
        try:
            start_ts = datetime.fromtimestamp(float(raw_start_ts))
        except (TypeError, ValueError, OverflowError, OSError) as err:
            raise InvalidXml(
                'invalid start-ts {}'.format(raw_start_ts)) from err
        try:
            played = int(raw_played) > 0
        except ValueError as err:
            raise InvalidXml(
                'invalid played flag {}'.format(raw_played)) from err

        # The `user` attribute holds the primary key of a Profile (see
        # to_xml), so this is the model that has to be queried.
        try:
            users = list(profiles.Profile.objects.filter(pk=user_pk))
        except ValueError as err:
            raise InvalidXml('primary key for Profile {} is invalid'.format(
                user_pk)) from err
        if len(users) != 1:
            raise InvalidXml('primary key for Profile {} is invalid'.format(
                user_pk))

        output = History()
        output.start_ts = start_ts
        output.played = played
        output.user = users[0]

        # NOTE(review): the parsed entries are validated but not kept, as
        # neither `output` nor the entries are saved here. Persisting them
        # would require saving `output` first -- confirm the intended
        # behaviour with the callers.
        for child in xml_root:
            HistoryEntry.from_xml(child, output)

        return output
|
|
|
|
|
2018-02-19 22:56:16 +01:00
|
|
|
|
2018-02-26 00:24:54 +01:00
|
|
|
# Lightweight (url, timestamp) record used while a history is being built,
# before the entries are turned into HistoryEntry rows.
PartialHistoryEntry = namedtuple(
    'PartialHistoryEntry',
    ('url', 'timestamp'),
)
|
|
|
|
|
|
|
|
|
2018-02-24 23:17:24 +01:00
|
|
|
def generate_partial_history(user, t_start):
    """ Generate one part of a history for `user`: a first url derived from
    the user's information, followed by the urls found by crawling from it.

    Returns a list of PartialHistoryEntry. The first url is stamped with
    `t_start`; the root of the crawl tree is stamped a random delay later,
    and every other url a random delay after its parent.
    """
    first_url = generate_first_url(user)
    entries = [PartialHistoryEntry(first_url, t_start)]
    crawl_start = t_start + 5 * random.weibullvariate(1, 1.5)

    crawler = crawl.CrawlingThread(first_url)
    crawler.start()
    crawler.join()

    # Maps each node of the crawl tree to the time its url was opened, so
    # that children can be timed relatively to their parent.
    opened_at = {}
    for node in crawler.output_tree:
        node_url, node_parent = node.url, node.parent
        if node_parent is None:
            opened = crawl_start
        else:
            opened = (opened_at[node_parent]
                      + 5 * random.weibullvariate(1, 1.5))
        opened_at[node] = opened
        entries.append(PartialHistoryEntry(node_url, opened))
    return entries
|
2018-02-20 23:42:21 +01:00
|
|
|
|
2018-02-25 18:06:59 +01:00
|
|
|
|
2018-02-20 23:42:21 +01:00
|
|
|
def generate_first_url(user):
    """ Generate the first url of a partial history, based on the user
    information.

    One kind of interest (keywords, websites, places or events) is picked
    at random among those that are not empty for `user`, then one item of
    that kind is picked at random and asked to generate the url.
    """
    interest_models = (
        profiles.Keyword,
        profiles.Website,
        profiles.Place,
        profiles.Event,
    )
    candidates = [
        model.objects.filter(interest__profile__in=[user])
        for model in interest_models
    ]
    # Empty kinds are dropped so that the second choice always has
    # something to pick from.
    available = [queryset for queryset in candidates if queryset]

    chosen_kind = random.choice(available)
    chosen_item = random.choice(chosen_kind)
    return chosen_item.generate_url(user)
|
|
|
|
|
2018-02-20 23:42:21 +01:00
|
|
|
|
2018-02-26 00:24:54 +01:00
|
|
|
def generate_history(user, start_time):
    """ Generate a new history for the user `user`, starting from the
    datetime `start_time`.
    A few heuristics are used in order to give the impression that the history
    is actually played by a user.

    The history is saved, then filled with successive partial histories
    until it holds at least a randomly drawn number of entries (at least
    HISTORY_MIN). Entries whose url is too long or that fail validation are
    skipped. Returns the saved History.
    """

    # let's define a new history object.
    history = History(start_ts=start_time, user=user)
    length = HISTORY_MIN + floor(10 * random.weibullvariate(1, 1.5))
    history.full_clean()
    history.save()

    current_timestamp = start_time.timestamp()

    hist_size = 0

    while hist_size < length:
        current_timestamp += 5 * random.weibullvariate(1, 2.8)
        history_list = generate_partial_history(user, current_timestamp)
        # Entries come in crawl order, not in chronological order, so the
        # last one is not necessarily the latest: restart from the latest
        # timestamp to keep successive partial histories from overlapping.
        current_timestamp = (
            max(entry.timestamp for entry in history_list)
            + 5 * random.weibullvariate(1, 5)
        )
        for (url, timestamp) in history_list:
            if len(url) >= 200:
                continue
            new_line = HistoryEntry(
                search=url,
                timestamp=datetime.fromtimestamp(timestamp),
                history=history
            )
            try:
                new_line.full_clean()
                new_line.save()
            except ValidationError:
                # Invalid entries are deliberately dropped.
                continue
            else:
                hist_size += 1

    return history
|