mpri-webdam/histories/models.py

118 lines
3.7 KiB
Python
Raw Normal View History

2018-01-24 16:07:33 +01:00
""" Models for the history. This history should be able to generate history
entries, which look like human-based browsing, according to a dedicated user's
interests, keywords...
"""
2018-02-20 23:42:21 +01:00
import random
2018-02-19 22:56:16 +01:00
from math import floor
2018-01-23 18:12:47 +01:00
from django.db import models
2018-02-20 23:42:21 +01:00
import profiles.models as profiles
from crawl import crawl
2018-02-19 22:56:16 +01:00
from pinocchio.settings import HISTORY_MIN
2018-01-24 16:07:33 +01:00
class HistoryEntry(models.Model):
    """ One line of a history: a url together with the time it is visited.

    Each entry points to its owning ``History`` and is removed along with it
    (``on_delete=models.CASCADE``).
    """

    search = models.URLField(help_text="The url to be searched")
    timestamp = models.DateTimeField()
    history = models.ForeignKey('History', on_delete=models.CASCADE)

    def __str__(self):
        """ Render the entry as ``<timestamp> : <url>``.
        """
        template = "{} : {}"
        return template.format(self.timestamp, self.search)
class History(models.Model):
    """ A history for a user, containing some web connections (http, https).

    Each history is timed, in a human-behaviour manner. The individual
    connections are stored as ``HistoryEntry`` rows pointing back to this
    object through their ``history`` foreign key.
    """

    start_ts = models.DateTimeField(
        help_text='The starting timestamp of the history. Useful for cron-like '
                  'structure.'
    )
    # Set to True by `play_history`.
    played = models.BooleanField(default=False)
    user = models.ForeignKey(
        profiles.Profile,
        on_delete=models.CASCADE
    )

    def return_history(self):
        """ Returns the entries of this history, sorted by increasing
        timestamps.
        """
        # `HistoryEntry.history` declares no `related_name`, so Django names
        # the reverse accessor after the lowercased model: `historyentry_set`
        # (there is no `history_set` attribute on a History instance).
        return self.historyentry_set.order_by('timestamp')

    def __str__(self):
        """ Returns the string representation of a history: a header line
        followed by one line per entry, in chronological order.
        """
        header = "[History]:\n"
        # str.join only accepts strings, so render every entry explicitly.
        return header + "\n".join(
            str(entry) for entry in self.return_history()
        )

    def play_history(self):
        """ Marks the history as played and saves that flag.

        No entry is replayed here yet; only the `played` field changes.
        """
        self.played = True
        self.save()
2018-02-20 23:42:21 +01:00
def generate_partial_history(user, t_start, url, history=None):
    """ Generate the part of the history resulting from the crawl starting at
    the given url.

    This is currently a placeholder: the crawl is not wired in, every
    argument is ignored and an empty list is always returned.

    Args:
        user: the profile the history is generated for (unused for now).
        t_start: the time at which this part of the history starts (unused
            for now).
        url: the url the crawl should start from (unused for now).
        history: the history being built (unused for now). Optional, so the
            function can be called with or without it.

    Returns:
        list: always empty until the crawl is implemented.
    """
    # TODO: run the crawl (see `crawl.CrawlingThread`) from `url` and turn
    # its result into history entries.
    return []
def generate_first_url(user):
    """ Generate the first url of a partial history, based on the user
    information.

    One of the user's interest categories (keywords, places, websites,
    events) is picked at random among the non-empty ones, then one item of
    that category is picked at random. Only website items are turned into a
    url for now.

    Args:
        user: the profile to generate a url for. The code reads
            `user.interests`, `user.uses_url` and `user.search_engine`.

    Returns:
        The generated url, or ``None`` when the user has no interest at all
        or when no url could be produced for the chosen item.
    """
    categories = [
        user.interests.keywords.all(),
        user.interests.places.all(),
        user.interests.websites.all(),
        user.interests.events.all(),
    ]
    # random.choice raises IndexError on an empty sequence, so only keep the
    # categories that actually contain something.
    candidates = [category for category in categories if category]
    if not candidates:
        return None
    interest = random.choice(candidates)
    search_term = random.choice(interest)

    # Stays None on every path that does not build a url.
    url = None
    if isinstance(search_term, profiles.Website):
        if user.uses_url:
            url = search_term.url
        elif random.random() <= 0.1:
            # NOTE(review): if `notable_pages` is a related manager rather
            # than a sequence, this needs `.all()` -- confirm against
            # profiles.Website.
            url = random.choice(search_term.notable_pages).url
        elif random.random() >= 0.3:
            # NOTE(review): this is a second, independent random draw, so
            # some calls fall through every branch and return None --
            # confirm this is intended. The same `.all()` question applies
            # to `keywords`.
            search_term_text = (
                search_term.name + " " + random.choice(search_term.keywords)
            )
            url = user.search_engine.search_url(search_term_text)
    # TODO: build a url for the other kinds of interests (keywords, places,
    # events), e.g. through the user's search engine.
    return url
2018-02-19 22:56:16 +01:00
def generate_history(user, ts_start):
    """ Generate a new history for the user `user`, starting from timestamp
    `ts_start`.

    A few heuristics are used in order to give the impression that the history
    is actually played by a user: the target length is drawn at random (at
    least `HISTORY_MIN`), and random pauses are inserted before and after
    every partial history.

    Args:
        user: the profile the history is generated for.
        ts_start: the starting time of the history. It is stored in a
            `DateTimeField`, so it is handled as a `datetime` here.

    Returns:
        History: the saved history object the generated entries point to.
    """
    # Imported locally so that the function does not rely on the module's
    # import block being extended.
    from datetime import timedelta

    # let's define a new history object.
    history = History(start_ts=ts_start, user=user)
    # Entries reference the history through a foreign key, so the history has
    # to exist in the database before any entry can be saved.
    history.save()

    length = HISTORY_MIN + floor(10 * random.weibullvariate(1, 1.5))

    current_ts = ts_start
    history_line = 0
    while history_line < length:
        # NOTE(review): the random delays are interpreted as seconds --
        # confirm the intended unit.
        current_ts += timedelta(seconds=random.uniform(1, 10))

        url = generate_first_url(user)
        partial_history = generate_partial_history(
            user, current_ts, url, history
        )
        if not partial_history:
            # Nothing was produced, so the line counter cannot progress:
            # stop instead of looping forever.
            break

        # NOTE(review): assumes each item is a `(url, timestamp)` pair with a
        # datetime timestamp -- confirm against generate_partial_history.
        for (entry_url, entry_ts) in partial_history:
            new_line = HistoryEntry(
                search=entry_url,
                timestamp=entry_ts,
                history=history
            )
            new_line.save()
            history_line += 1

        # Resume a little while after the last generated entry.
        last_ts = partial_history[-1][1]
        current_ts = last_ts + timedelta(
            seconds=5 * random.weibullvariate(1, 5)
        )

    return history