diff --git a/histories/models.py b/histories/models.py
index 7fd0ae6..f386eb3 100644
--- a/histories/models.py
+++ b/histories/models.py
@@ -6,6 +6,8 @@ interests, keywords...
 import random
 from math import floor
 from queue import Queue
+from xml.etree import ElementTree as ET
+from datetime import datetime
 from django.db import models
 import profiles.models as profiles
 from crawl import crawl
@@ -13,6 +15,17 @@ from pinocchio.settings import HISTORY_MIN
 from .tor_runner import TorInstance
 
 
+class InvalidXml(Exception):
+    """ Raised when imported XML does not have the expected structure.
+    """
+    def __init__(self, what='unexpected XML data.'):
+        super().__init__()
+        self.what = what
+
+    def __str__(self):
+        return "Invalid XML: " + self.what
+
+
 class HistoryEntry(models.Model):
     """ A history entry, aka a url, and a timestamp.
     """
@@ -28,14 +41,54 @@ class HistoryEntry(models.Model):
         """
         return "{} : {}".format(self.timestamp, self.search)
 
+    def to_xml(self, xml_root):
+        ''' Appends this entry to `xml_root` as a 'history' node with a
+            'url' and a 'timestamp' (POSIX seconds) child. '''
+        entry = ET.Element('history')
+        entry_url = ET.Element('url')
+        entry_url.text = self.search
+        entry_ts = ET.Element('timestamp')
+        # ElementTree can only serialize str text, not a float.
+        entry_ts.text = str(self.timestamp.timestamp())
+        entry.append(entry_url)
+        entry.append(entry_ts)
+        xml_root.append(entry)
+
+    @staticmethod
+    def from_xml(xml_root, in_history):
+        ''' Builds a HistoryEntry bound to `in_history` from a 'history'
+            node; raises InvalidXml on unexpected tags or timestamps. '''
+        if xml_root.tag != 'history':
+            raise InvalidXml("expected 'history' tag here.")
+        url, timestamp = None, None
+
+        for child in xml_root:
+            if child.tag == 'url':
+                url = child.text
+            elif child.tag == 'timestamp':
+                try:
+                    # The node text is a str (or None), not a number.
+                    timestamp = datetime.fromtimestamp(float(child.text))
+                except (TypeError, ValueError, OverflowError, OSError):
+                    raise InvalidXml("invalid timestamp {}".format(child.text))
+            else:
+                raise InvalidXml("unknown tag {} as child of 'history'".format(
+                    child.tag))
+        output = HistoryEntry()
+        output.search = url
+        output.timestamp = timestamp
+        output.history = in_history
+
+        return output
+
 
 class History(models.Model):
     """ A history for a user, containing some web connections (http, https).
         Each history is timed, in a human-behaviour manner.
     """
     start_ts = models.DateTimeField(
-        help_text='The starting timestamp of the history. Useful for cron-like '
-                  'structure.'
+        help_text=('The starting timestamp of the history. Useful for '
+                   'cron-like structure.')
     )
     played = models.BooleanField(default=False)
 
@@ -58,7 +111,6 @@ class History(models.Model):
         header = "[History]:\n"
         return header + "\n".join(history_set)
 
-
     def play_histories(self):
         """ Actually plays the history.
         """
@@ -66,6 +118,59 @@ class History(models.Model):
         runner = TorInstance(self.history)
         self.save()
 
+    def to_xml(self, xml_root):
+        ''' Exports the current history to xml '''
+        # XML attribute values have to be str to be serializable.
+        hist_node = ET.Element("history", attrib={
+            'start-ts': self.start_ts.isoformat(),
+            'played': '1' if self.played else '0',
+            'user': str(self.user.pk),
+        })
+        xml_root.append(hist_node)
+        # NOTE(review): if `history_set` is a Django related manager, this
+        # must iterate `self.history_set.all()` -- confirm against the model.
+        for entry in self.history_set:
+            entry.to_xml(hist_node)
+
+    @staticmethod
+    def from_xml(xml_root):
+        ''' Loads an history from an XML file '''
+
+        REQUIRED_ATTR = ['start-ts', 'played', 'user']
+
+        if xml_root.tag != 'history':
+            raise InvalidXml('unexpected node {} as root of an history'.format(
+                xml_root.tag))
+        for attr in REQUIRED_ATTR:
+            if attr not in xml_root.attrib:
+                raise InvalidXml(('missing attribute "{}" for tag of type '
+                                  'history').format(attr))
+        start_ts = xml_root.attrib['start-ts']
+        played = xml_root.attrib['played']
+        user_pk = xml_root.attrib['user']
+        # Look the user up in the model behind the `user` relation (not in
+        # History), using the key read from the XML instead of a constant.
+        user_model = History._meta.get_field('user').related_model
+        users = user_model.objects.filter(pk=user_pk)
+        if len(users) != 1:
+            raise InvalidXml('primary key for user {} is invalid'.format(
+                user_pk))
+
+        output = History()
+        output.start_ts = start_ts
+        try:
+            # Attribute values are str: compare the parsed integer.
+            output.played = int(played) > 0
+        except ValueError:
+            raise InvalidXml('invalid "played" value {}'.format(played))
+        output.user = users[0]
+
+        # NOTE(review): the parsed entries are neither kept nor saved here.
+        for child in xml_root:
+            HistoryEntry.from_xml(child, output)
+
+        return output
+
 
 def generate_partial_history(user, t_start):
     """ Generate the part of the history resulting from the crawl starting at
@@ -75,32 +180,30 @@ def generate_partial_history(user, t_start):
     result = []
     basis = generate_first_url(user)
     result.append((basis, timestamp))
-    timestamp += 5* random.weibullvariate(1, 1.5)
+    timestamp += 5 * random.weibullvariate(1, 1.5)
     queue = Queue()
     crawler = crawl.CrawlingThread(user, basis, queue)
     crawler.start()
     crawler.join()
     urls = queue.get()
     for url in urls:
-        timestamp += 5* random.weibullvariate(1, 1.5)
+        timestamp += 5 * random.weibullvariate(1, 1.5)
         result.append((url, timestamp))
     return result
 
+
 def generate_first_url(user):
     """ Generate the first url of a partial history, based on the user
         information. """
-    interest = random.choice(
-        [user.interests.keywords.all(), user.interests.places.all(),
-         user.interests.websites.all(), user.interests.events.all()
-        ]
-    )
+    interest = random.choice([
+        user.interests.keywords.all(), user.interests.places.all(),
+        user.interests.websites.all(), user.interests.events.all()
+    ])
     search_term = random.choice(interest)
     url = search_term.generate_url(user)
     return url
 
 
-
-
 def generate_history(user, ts_start):
     """ Generate a new history for the user `user`, starting from timestamp
         `ts_start`.