Histories: xml import/export — untested

To be tested when history generation is available
This commit is contained in:
Théophile Bastian 2018-02-25 18:06:59 +01:00
parent fd4e1d35c7
commit 6e4709ac91

View file

@ -6,6 +6,8 @@ interests, keywords...
import random import random
from math import floor from math import floor
from queue import Queue from queue import Queue
from xml.etree import ElementTree as ET
from datetime import datetime
from django.db import models from django.db import models
import profiles.models as profiles import profiles.models as profiles
from crawl import crawl from crawl import crawl
@ -13,6 +15,15 @@ from pinocchio.settings import HISTORY_MIN
from .tor_runner import TorInstance from .tor_runner import TorInstance
class InvalidXml(Exception):
    """ Raised when an XML tree does not have the structure expected by the
    history import code.

    `what` is a human-readable description of the problem. It is stored on the
    instance and rendered by `__str__` after an "Invalid XML: " prefix. """

    def __init__(self, what='unexpected XML data.'):
        # Forward the message to Exception so that `args`, `repr()` and
        # pickling carry it instead of an empty tuple.
        super().__init__(what)
        self.what = what

    def __str__(self):
        return "Invalid XML: " + self.what
class HistoryEntry(models.Model): class HistoryEntry(models.Model):
""" A history entry, aka a url, and a timestamp. """ A history entry, aka a url, and a timestamp.
""" """
@ -28,14 +39,48 @@ class HistoryEntry(models.Model):
""" """
return "{} : {}".format(self.timestamp, self.search) return "{} : {}".format(self.timestamp, self.search)
def to_xml(self, xml_root):
    """ Append this entry to `xml_root` as a <history> element.

    The new element has two children: <url>, holding `self.search`, and
    <timestamp>, holding `self.timestamp` converted to a POSIX timestamp.
    Nothing is returned; `xml_root` is modified in place. """
    entry = ET.SubElement(xml_root, 'history')

    entry_url = ET.SubElement(entry, 'url')
    entry_url.text = self.search

    entry_ts = ET.SubElement(entry, 'timestamp')
    # ElementTree can only serialize string text, and `timestamp()` returns
    # a float: convert it explicitly.
    entry_ts.text = str(self.timestamp.timestamp())
@staticmethod
def from_xml(xml_root, in_history):
    """ Build a HistoryEntry from a <history> XML element.

    `xml_root` must be a <history> element whose children are only <url>
    and <timestamp> (a POSIX timestamp). `in_history` is assigned to the
    `history` attribute of the new entry. The entry is returned as built
    here; this method does not save it.

    Raises InvalidXml if the tag is not <history>, if an unknown child tag
    is found, or if the timestamp cannot be parsed. """
    if xml_root.tag != 'history':
        raise InvalidXml("expected <history> tag here.")

    url, timestamp = None, None
    for child in xml_root:
        if child.tag == 'url':
            url = child.text
        elif child.tag == 'timestamp':
            try:
                # `child.text` is a string (None for an empty tag), whereas
                # `fromtimestamp` needs a number: convert it first.
                timestamp = datetime.fromtimestamp(float(child.text))
            except (TypeError, ValueError, OverflowError, OSError):
                # TypeError: empty tag; ValueError: not a number;
                # OverflowError / OSError: out of the supported range.
                raise InvalidXml("invalid timestamp {}".format(child.text))
        else:
            raise InvalidXml("unknown tag {} as child of <history>".format(
                child.tag))

    output = HistoryEntry()
    output.search = url
    output.timestamp = timestamp
    output.history = in_history
    return output
class History(models.Model): class History(models.Model):
""" A history for a user, containing some web connections (http, https). """ A history for a user, containing some web connections (http, https).
Each history is timed, in a human-behaviour manner. """ Each history is timed, in a human-behaviour manner. """
start_ts = models.DateTimeField( start_ts = models.DateTimeField(
help_text='The starting timestamp of the history. Useful for cron-like ' help_text=('The starting timestamp of the history. Useful for '
'structure.' 'cron-like structure.')
) )
played = models.BooleanField(default=False) played = models.BooleanField(default=False)
@ -58,7 +103,6 @@ class History(models.Model):
header = "[History]:\n" header = "[History]:\n"
return header + "\n".join(history_set) return header + "\n".join(history_set)
def play_histories(self): def play_histories(self):
""" Actually plays the history. """ Actually plays the history.
""" """
@ -66,6 +110,48 @@ class History(models.Model):
runner = TorInstance(self.history) runner = TorInstance(self.history)
self.save() self.save()
def to_xml(self, xml_root):
    """ Export the current history to XML.

    A <history> element is appended to `xml_root`, with the attributes
    `start-ts`, `played` ("1" or "0") and `user` (the primary key of
    `self.user`). Every entry of the history then appends itself to that
    element through its own `to_xml`. """
    # ElementTree can only serialize string attribute values, so each value
    # is converted explicitly.
    hist_node = ET.SubElement(xml_root, "history", attrib={
        'start-ts': str(self.start_ts),
        'played': '1' if self.played else '0',
        'user': str(self.user.pk),
    })

    # NOTE(review): `history_set` is not defined in the visible part of this
    # class. If it is a Django related manager, it must be iterated through
    # `.all()` -- confirm against the HistoryEntry foreign key.
    for entry in self.history_set:
        entry.to_xml(hist_node)
@staticmethod
def from_xml(xml_root):
    """ Load a history from an XML tree.

    `xml_root` must be a <history> element carrying the attributes
    `start-ts`, `played` (an integer, positive meaning played) and `user`
    (the primary key of the owner of the history). Each child element is
    parsed with `HistoryEntry.from_xml`.

    Returns the new History; this method does not save it.

    Raises InvalidXml if the root tag or one of its children is malformed,
    if a required attribute is missing or invalid, or if no user matches
    the `user` attribute. """
    REQUIRED_ATTR = ['start-ts', 'played', 'user']

    if xml_root.tag != 'history':
        raise InvalidXml('unexpected node {} as root of an history'.format(
            xml_root.tag))
    for attr in REQUIRED_ATTR:
        if attr not in xml_root.attrib:
            raise InvalidXml(('missing attribute "{}" for tag of type '
                              'history').format(attr))

    # Kept as the raw attribute string.
    # NOTE(review): presumably coerced by the model field when the history
    # is saved -- confirm the format accepted there.
    start_ts = xml_root.attrib['start-ts']

    # XML attributes are strings: convert before comparing with an int.
    try:
        played = int(xml_root.attrib['played']) > 0
    except ValueError:
        raise InvalidXml('invalid value "{}" for attribute "played"'.format(
            xml_root.attrib['played']))

    user_pk = xml_root.attrib['user']
    # Look the user up by the key found in the XML, on the model targeted by
    # the `user` field rather than on History itself.
    # NOTE(review): assumes `user` is a relational field of History, as
    # suggested by the use of `self.user.pk` in `to_xml` -- confirm.
    user_model = History._meta.get_field('user').related_model
    try:
        users = list(user_model.objects.filter(pk=user_pk))
    except (TypeError, ValueError):
        # The attribute cannot even be converted to a primary key.
        users = []
    if len(users) != 1:
        raise InvalidXml('primary key for user {} is invalid'.format(
            user_pk))

    output = History()
    output.start_ts = start_ts
    output.played = played
    output.user = users[0]

    # NOTE(review): the parsed entries are validated but neither saved nor
    # kept anywhere -- as in the original code. Persisting them requires
    # `output` to be saved first.
    for child in xml_root:
        HistoryEntry.from_xml(child, output)

    return output
def generate_partial_history(user, t_start): def generate_partial_history(user, t_start):
""" Generate the part of the history resulting from the crawl starting at """ Generate the part of the history resulting from the crawl starting at
@ -75,32 +161,30 @@ def generate_partial_history(user, t_start):
result = [] result = []
basis = generate_first_url(user) basis = generate_first_url(user)
result.append((basis, timestamp)) result.append((basis, timestamp))
timestamp += 5* random.weibullvariate(1, 1.5) t_start += 5 * random.weibullvariate(1, 1.5)
queue = Queue() queue = Queue()
crawler = crawl.CrawlingThread(user, basis, queue) crawler = crawl.CrawlingThread(user, basis, queue)
crawler.start() crawler.start()
crawler.join() crawler.join()
urls = queue.get() urls = queue.get()
for url in urls: for url in urls:
timestamp += 5* random.weibullvariate(1, 1.5) t_start += 5 * random.weibullvariate(1, 1.5)
result.append((url, timestamp)) result.append((url, timestamp))
return result return result
def generate_first_url(user):
    """ Generate the first url of a partial history, based on the user
    information.

    One of the interest categories of the user (keywords, places, websites,
    events) is picked at random among the non-empty ones, then a random item
    of that category builds the url through its `generate_url` method.

    Raises IndexError if every category is empty. """
    categories = [
        user.interests.keywords.all(), user.interests.places.all(),
        user.interests.websites.all(), user.interests.events.all(),
    ]
    # Walk the categories in random order and stop at the first non-empty
    # one: this is a uniform choice among the non-empty categories, and an
    # empty category no longer makes `random.choice` fail.
    random.shuffle(categories)
    for interest in categories:
        if interest:
            search_term = random.choice(interest)
            return search_term.generate_url(user)
    raise IndexError("user has no interest to generate a url from")
def generate_history(user, ts_start): def generate_history(user, ts_start):
""" Generate a new history for the user `user`, starting from timestamp """ Generate a new history for the user `user`, starting from timestamp
`ts_start`. `ts_start`.