mpri-webdam/histories/models.py

129 lines
4 KiB
Python

""" Models for the history. This history should be able to generate history
entries which look like human browsing, according to a given user's
interests, keywords...
"""
import random
from math import floor
from queue import Queue
from django.db import models
import profiles.models as profiles
from crawl import crawl
from pinocchio.settings import HISTORY_MIN
from .tor_runner import TorInstance
class HistoryEntry(models.Model):
    """A single visited url together with the moment it was visited.

    Every entry belongs to exactly one `History`; the entry is removed when
    that history is deleted (``on_delete=models.CASCADE``).
    """

    search = models.URLField(help_text="The url to be searched")
    timestamp = models.DateTimeField()
    history = models.ForeignKey('History', on_delete=models.CASCADE)

    def __str__(self):
        """Render the entry as ``<timestamp> : <url>``."""
        rendered_fields = (str(self.timestamp), str(self.search))
        return " : ".join(rendered_fields)
class History(models.Model):
    """A browsing history for one user profile.

    A history groups `HistoryEntry` rows (http/https connections), each one
    carrying its own timestamp. ``played`` records whether `play_histories`
    has already been called on this history.
    """

    start_ts = models.DateTimeField(
        help_text='The starting timestamp of the history. Useful for cron-like '
                  'structure.'
    )
    played = models.BooleanField(default=False)
    user = models.ForeignKey(
        profiles.Profile,
        on_delete=models.CASCADE
    )

    def return_history(self):
        """Return the entries as ``(url, date)`` tuples, oldest first.

        Only the date part of each entry's timestamp is kept.
        """
        # `HistoryEntry.history` declares no `related_name`, so Django exposes
        # the reverse relation as `historyentry_set`; `history_set` does not
        # exist on this model.
        entries = self.historyentry_set.order_by('timestamp')
        return [(entry.search, entry.timestamp.date()) for entry in entries]

    def __str__(self):
        """Return a header line followed by one line per entry, oldest first."""
        entries = self.historyentry_set.order_by('timestamp')
        header = "[History]:\n"
        # str.join only accepts strings, so each entry is rendered explicitly.
        return header + "\n".join(str(entry) for entry in entries)

    def play_histories(self):
        """Mark the history as played, hand it to a `TorInstance`, and save."""
        self.played = True
        # NOTE(review): the model has no `history` attribute, so the previous
        # `TorInstance(self.history)` raised AttributeError. The sorted entry
        # list is passed instead -- confirm this is the shape `TorInstance`
        # expects, and whether the instance must be started explicitly.
        TorInstance(self.return_history())
        self.save()
def generate_partial_history(user, t_start):
    """Generate one chunk of history from a single crawl.

    A first url is derived from the user's interests, a crawler is run from
    that url, and every url it reports is appended with an increasing
    timestamp.

    Args:
        user: the profile whose interests seed the crawl.
        t_start: timestamp of the first entry. Float offsets are added to it
            directly, so it has to support ``+ float`` (presumably a number
            of seconds -- confirm against callers).

    Returns:
        A list of ``(url, timestamp)`` tuples in visiting order. It always
        contains at least the first url.
    """
    # Local import: the module itself only imports `Queue`.
    from queue import Empty

    timestamp = t_start
    basis = generate_first_url(user)
    result = [(basis, timestamp)]
    timestamp += 5 * random.weibullvariate(1, 1.5)

    queue = Queue()
    search_engine_list = [
        engine.url for engine in profiles.SearchEngine.objects.all()
    ]
    crawler = crawl.CrawlingThread(user, basis, search_engine_list, queue)
    crawler.start()
    crawler.join()

    # The crawler has already finished here, so nothing can arrive later: a
    # blocking get() on an empty queue would hang forever. Treat "nothing
    # reported" as an empty crawl instead.
    try:
        urls = queue.get_nowait()
    except Empty:
        urls = []

    for url in urls:
        timestamp += 5 * random.weibullvariate(1, 1.5)
        result.append((url, timestamp))
    return result
def generate_first_url(user):
    """Generate the first url of a partial history from the user's interests.

    One interest category (keywords, places, websites or events) is drawn at
    random among the categories that actually contain items, then one item
    of that category is drawn and asked to build its url.

    Args:
        user: object exposing ``interests.keywords``, ``.places``,
            ``.websites`` and ``.events``, each with an ``all()`` method.

    Returns:
        Whatever ``generate_url(user)`` returns for the drawn item.

    Raises:
        IndexError: if every interest category is empty.
    """
    interests = user.interests
    categories = [
        list(interests.keywords.all()),
        list(interests.places.all()),
        list(interests.websites.all()),
        list(interests.events.all()),
    ]
    # Drawing an empty category would make the second random.choice fail
    # even though other categories have items, so only non-empty ones are
    # eligible.
    non_empty = [category for category in categories if category]
    # random.choice raises IndexError on an empty list, i.e. when the user
    # has no interest at all.
    interest = random.choice(non_empty)
    search_term = random.choice(interest)
    return search_term.generate_url(user)
def generate_history(user, ts_start):
    """Generate and store a new history for `user`, starting at `ts_start`.

    The history is built from successive partial histories (one crawl each)
    separated by random pauses, to give the impression that it is actually
    played by a user. Generation stops once at least
    ``HISTORY_MIN + floor(10 * weibull)`` entries have been stored.

    Args:
        user: the profile the history belongs to.
        ts_start: start of the history. A ``datetime`` is converted to epoch
            seconds for the arithmetic; any other value is used as a number
            of epoch seconds directly (the random offsets are assumed to be
            seconds -- confirm).

    Returns:
        The saved `History` instance.
    """
    # Local import: the module does not import `datetime` at file level.
    from datetime import datetime

    # let's define a new history object.
    history = History(start_ts=ts_start, user=user)
    # Entries point to the history through a ForeignKey, so the history must
    # be saved before any entry referencing it can be saved.
    history.save()

    length = HISTORY_MIN + floor(10 * random.weibullvariate(1, 1.5))

    # Random offsets are floats and cannot be added to a datetime, so the
    # arithmetic is done on epoch seconds and converted back per entry.
    if isinstance(ts_start, datetime):
        tzinfo = ts_start.tzinfo
        current = ts_start.timestamp()
    else:
        tzinfo = None
        current = ts_start

    history_line = 0
    while history_line < length:
        current += 5 * random.weibullvariate(1, 2.8)
        history_list = generate_partial_history(user, current)
        # Entries are (url, timestamp) tuples; resume after the last one.
        current = history_list[-1][1] + 5 * random.weibullvariate(1, 5)
        for (url, timestamp) in history_list:
            new_line = HistoryEntry(
                search=url,
                timestamp=datetime.fromtimestamp(timestamp, tz=tzinfo),
                history=history
            )
            new_line.save()
        # Without this the loop condition never changes and the loop never
        # terminates.
        history_line += len(history_list)
    return history