Histories: xml import/export — untested
To be tested when history generation is available
This commit is contained in:
parent
a4de51b84a
commit
22064ebee3
1 changed files with 96 additions and 12 deletions
|
@ -6,6 +6,8 @@ interests, keywords...
|
||||||
import random
|
import random
|
||||||
from math import floor
|
from math import floor
|
||||||
from queue import Queue
|
from queue import Queue
|
||||||
|
from xml.etree import ElementTree as ET
|
||||||
|
from datetime import datetime
|
||||||
from django.db import models
|
from django.db import models
|
||||||
import profiles.models as profiles
|
import profiles.models as profiles
|
||||||
from crawl import crawl
|
from crawl import crawl
|
||||||
|
@ -13,6 +15,15 @@ from pinocchio.settings import HISTORY_MIN
|
||||||
from .tor_runner import TorInstance
|
from .tor_runner import TorInstance
|
||||||
|
|
||||||
|
|
||||||
|
class InvalidXml(Exception):
    """ Raised when XML data being imported does not have the expected shape.

    The human-readable reason is stored in ``what``. It is also forwarded to
    ``Exception`` so that ``args`` and ``repr()`` carry the reason instead of
    being empty.
    """

    def __init__(self, what='unexpected XML data.'):
        super().__init__(what)
        self.what = what

    def __str__(self):
        return "Invalid XML: " + self.what
|
||||||
|
|
||||||
|
|
||||||
class HistoryEntry(models.Model):
|
class HistoryEntry(models.Model):
|
||||||
""" A history entry, aka a url, and a timestamp.
|
""" A history entry, aka a url, and a timestamp.
|
||||||
"""
|
"""
|
||||||
|
@ -28,14 +39,48 @@ class HistoryEntry(models.Model):
|
||||||
"""
|
"""
|
||||||
return "{} : {}".format(self.timestamp, self.search)
|
return "{} : {}".format(self.timestamp, self.search)
|
||||||
|
|
||||||
|
def to_xml(self, xml_root):
    """ Append this entry to `xml_root` as a ``<history>`` element.

    The new element gets two children: ``<url>``, holding ``self.search``,
    and ``<timestamp>``, holding ``self.timestamp`` as a POSIX timestamp.

    Args:
        xml_root: the ElementTree element receiving the new child.
    """
    # ElementTree can only serialize string text: a raw float here makes
    # ET.tostring()/ElementTree.write() fail with a TypeError.
    timestamp_text = str(self.timestamp.timestamp())

    entry = ET.Element('history')
    entry_url = ET.SubElement(entry, 'url')
    entry_url.text = self.search
    entry_ts = ET.SubElement(entry, 'timestamp')
    entry_ts.text = timestamp_text
    xml_root.append(entry)
|
||||||
|
|
||||||
|
@staticmethod
def from_xml(xml_root, in_history):
    """ Build a `HistoryEntry` from a ``<history>`` XML element.

    Args:
        xml_root: element whose tag must be ``history``; its children must
            be ``<url>`` and ``<timestamp>`` (a POSIX timestamp).
        in_history: value assigned to the ``history`` attribute of the new
            entry.

    Returns:
        The new `HistoryEntry`. This function does not call ``save()``.

    Raises:
        InvalidXml: wrong root tag, unknown child tag, unparsable
            timestamp, or missing ``<url>`` / ``<timestamp>`` child.
    """
    if xml_root.tag != 'history':
        raise InvalidXml("expected <history> tag here.")
    url, timestamp = None, None

    for child in xml_root:
        if child.tag == 'url':
            url = child.text
        elif child.tag == 'timestamp':
            try:
                # Element text is a string (or None), which fromtimestamp()
                # refuses: convert it to a number first.
                timestamp = datetime.fromtimestamp(float(child.text))
            except (TypeError, ValueError, OverflowError, OSError) as err:
                raise InvalidXml(
                    "invalid timestamp {}".format(child.text)) from err
        else:
            raise InvalidXml("unknown tag {} as child of <history>".format(
                child.tag))

    # Without this check an incomplete entry would be built with None fields.
    if url is None:
        raise InvalidXml("missing <url> as child of <history>")
    if timestamp is None:
        raise InvalidXml("missing <timestamp> as child of <history>")

    output = HistoryEntry()
    output.search = url
    output.timestamp = timestamp
    output.history = in_history

    return output
|
||||||
|
|
||||||
|
|
||||||
class History(models.Model):
|
class History(models.Model):
|
||||||
""" A history for a user, containing some web connections (http, https).
|
""" A history for a user, containing some web connections (http, https).
|
||||||
Each history is timed, in a human-behaviour manner. """
|
Each history is timed, in a human-behaviour manner. """
|
||||||
|
|
||||||
start_ts = models.DateTimeField(
|
start_ts = models.DateTimeField(
|
||||||
help_text='The starting timestamp of the history. Useful for cron-like '
|
help_text=('The starting timestamp of the history. Useful for '
|
||||||
'structure.'
|
'cron-like structure.')
|
||||||
|
|
||||||
)
|
)
|
||||||
played = models.BooleanField(default=False)
|
played = models.BooleanField(default=False)
|
||||||
|
@ -58,7 +103,6 @@ class History(models.Model):
|
||||||
header = "[History]:\n"
|
header = "[History]:\n"
|
||||||
return header + "\n".join(history_set)
|
return header + "\n".join(history_set)
|
||||||
|
|
||||||
|
|
||||||
def play_histories(self):
|
def play_histories(self):
|
||||||
""" Actually plays the history.
|
""" Actually plays the history.
|
||||||
"""
|
"""
|
||||||
|
@ -66,6 +110,48 @@ class History(models.Model):
|
||||||
runner = TorInstance(self.history)
|
runner = TorInstance(self.history)
|
||||||
self.save()
|
self.save()
|
||||||
|
|
||||||
|
def to_xml(self, xml_root):
    """ Export the current history to XML.

    A ``<history>`` element is appended to `xml_root`, carrying the
    attributes ``start-ts`` (POSIX timestamp of ``self.start_ts``),
    ``played`` (``1`` or ``0``) and ``user`` (primary key of ``self.user``).
    Each entry of the history then adds itself below that element through
    its own ``to_xml``.

    Args:
        xml_root: the ElementTree element receiving the new child.
    """
    # XML attribute values must be strings, or serialization fails.
    hist_node = ET.Element("history", attrib={
        'start-ts': str(self.start_ts.timestamp()),
        'played': '1' if self.played else '0',
        'user': str(self.user.pk),
    })
    xml_root.append(hist_node)
    # NOTE(review): `history_set` is presumably the reverse relation manager
    # of the entries (confirm the accessor name against the model). A Django
    # related manager is not iterable by itself, hence `.all()`.
    for entry in self.history_set.all():
        entry.to_xml(hist_node)
|
||||||
|
|
||||||
|
@staticmethod
def from_xml(xml_root):
    """ Load a history from a ``<history>`` XML element.

    Args:
        xml_root: element whose tag must be ``history``, with the attributes
            ``start-ts`` (POSIX timestamp), ``played`` (integer, played if
            positive) and ``user`` (primary key of the owning user).

    Returns:
        The new `History`. This function does not call ``save()``.

    Raises:
        InvalidXml: wrong root tag, missing or unparsable attribute, user
            primary key matching no user, or invalid child entry.
    """
    required_attrs = ('start-ts', 'played', 'user')

    if xml_root.tag != 'history':
        raise InvalidXml('unexpected node {} as root of an history'.format(
            xml_root.tag))
    for attr in required_attrs:
        if attr not in xml_root.attrib:
            raise InvalidXml(('missing attribute "{}" for tag of type '
                              'history').format(attr))

    # Attribute values are strings: convert them before using them.
    raw_start_ts = xml_root.attrib['start-ts']
    try:
        start_ts = datetime.fromtimestamp(float(raw_start_ts))
    except (ValueError, OverflowError, OSError) as err:
        raise InvalidXml(
            'invalid start-ts {}'.format(raw_start_ts)) from err

    raw_played = xml_root.attrib['played']
    try:
        played = int(raw_played)
    except ValueError as err:
        raise InvalidXml(
            'invalid played value {}'.format(raw_played)) from err

    # Look the user up by the key found in the XML, in the model targeted
    # by the `user` relation (not in History with a hard-coded key).
    user_pk = xml_root.attrib['user']
    user_model = History._meta.get_field('user').related_model
    try:
        users = list(user_model.objects.filter(pk=user_pk))
    except (TypeError, ValueError) as err:
        raise InvalidXml('primary key for user {} is invalid'.format(
            user_pk)) from err
    if len(users) != 1:
        raise InvalidXml('primary key for user {} is invalid'.format(
            user_pk))

    output = History()
    output.start_ts = start_ts
    output.played = played > 0
    output.user = users[0]

    # NOTE(review): the entries built here are validated but neither kept
    # nor saved, and `output` itself is unsaved -- confirm who is expected
    # to persist the history and its entries.
    for child in xml_root:
        HistoryEntry.from_xml(child, output)

    return output
|
||||||
|
|
||||||
|
|
||||||
def generate_partial_history(user, t_start):
|
def generate_partial_history(user, t_start):
|
||||||
""" Generate the part of the history resulting from the crawl starting at
|
""" Generate the part of the history resulting from the crawl starting at
|
||||||
|
@ -75,32 +161,30 @@ def generate_partial_history(user, t_start):
|
||||||
result = []
|
result = []
|
||||||
basis = generate_first_url(user)
|
basis = generate_first_url(user)
|
||||||
result.append((basis, timestamp))
|
result.append((basis, timestamp))
|
||||||
timestamp += 5* random.weibullvariate(1, 1.5)
|
t_start += 5 * random.weibullvariate(1, 1.5)
|
||||||
queue = Queue()
|
queue = Queue()
|
||||||
crawler = crawl.CrawlingThread(user, basis, queue)
|
crawler = crawl.CrawlingThread(user, basis, queue)
|
||||||
crawler.start()
|
crawler.start()
|
||||||
crawler.join()
|
crawler.join()
|
||||||
urls = queue.get()
|
urls = queue.get()
|
||||||
for url in urls:
|
for url in urls:
|
||||||
timestamp += 5* random.weibullvariate(1, 1.5)
|
t_start += 5 * random.weibullvariate(1, 1.5)
|
||||||
result.append((url, timestamp))
|
result.append((url, timestamp))
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
def generate_first_url(user):
    """ Generate the first url of a partial history, based on the user
    information.

    One interest category (keywords, places, websites or events) is drawn at
    random among those having at least one element, then one element of that
    category is drawn and asked to generate the url.

    Args:
        user: object exposing ``interests.keywords``, ``interests.places``,
            ``interests.websites`` and ``interests.events``.

    Returns:
        Whatever ``generate_url(user)`` returns for the drawn element.

    Raises:
        IndexError: if the user has no interest at all.
    """
    interests = user.interests
    categories = [
        interests.keywords.all(), interests.places.all(),
        interests.websites.all(), interests.events.all(),
    ]
    # Drawing an empty category would make the second choice fail at random
    # for users who only filled some of the categories.
    non_empty = [category for category in categories if category]
    interest = random.choice(non_empty)
    search_term = random.choice(interest)
    return search_term.generate_url(user)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def generate_history(user, ts_start):
|
def generate_history(user, ts_start):
|
||||||
""" Generate a new history for the user `user`, starting from timestamp
|
""" Generate a new history for the user `user`, starting from timestamp
|
||||||
`ts_start`.
|
`ts_start`.
|
||||||
|
|
Loading…
Reference in a new issue