2018-01-24 16:07:33 +01:00
|
|
|
""" Models for the history. This history should be able to generate history
|
|
|
|
entries, which looks like human-based browsing, according to a dedicated user
|
|
|
|
interests, keywords...
|
|
|
|
"""
|
|
|
|
|
2018-02-20 23:42:21 +01:00
|
|
|
import random
|
2018-02-19 22:56:16 +01:00
|
|
|
from math import floor
|
2018-02-24 23:17:24 +01:00
|
|
|
from queue import Queue
|
2018-01-23 18:12:47 +01:00
|
|
|
from django.db import models
|
2018-02-20 23:42:21 +01:00
|
|
|
import profiles.models as profiles
|
2018-02-24 23:17:24 +01:00
|
|
|
from crawl import crawl
|
2018-02-19 22:56:16 +01:00
|
|
|
from pinocchio.settings import HISTORY_MIN
|
2018-01-24 16:07:33 +01:00
|
|
|
|
2018-02-22 11:06:45 +01:00
|
|
|
|
2018-01-24 16:07:33 +01:00
|
|
|
class HistoryEntry(models.Model):
    """ One line of a history: a url paired with a timestamp, attached to the
    `History` it belongs to.
    """
    # The url stored for this entry.
    search = models.URLField(help_text="The url to be searched")
    # The timestamp stored alongside the url.
    timestamp = models.DateTimeField()
    # Owning history; CASCADE removes the entry when that history is deleted.
    history = models.ForeignKey('History', on_delete=models.CASCADE)

    def __str__(self):
        """ Returns the entry rendered as ``<timestamp> : <url>``.
        """
        template = "{} : {}"
        return template.format(self.timestamp, self.search)
|
|
|
|
|
|
|
|
|
|
|
|
class History(models.Model):
    """ A history for a user, containing some web connections (http, https).
    Each history is timed, in a human-behaviour manner. """

    start_ts = models.DateTimeField(
        help_text='The starting timestamp of the history. Useful for cron-like '
                  'structure.'
    )
    # Flag set to True by `play_history`.
    played = models.BooleanField(default=False)
    # Profile owning this history; CASCADE removes the history with it.
    user = models.ForeignKey(
        profiles.Profile,
        on_delete=models.CASCADE
    )

    def return_history(self):
        """ Returns the entries of this history, sorted by increasing
        timestamps.
        """
        # `HistoryEntry.history` declares no `related_name`, so the reverse
        # accessor on History is `historyentry_set` (not `history_set`).
        return self.historyentry_set.order_by('timestamp')

    def __str__(self):
        """ Returns the string representation of a history: a header line
        followed by one line per entry, in increasing timestamp order.
        """
        header = "[History]:\n"
        # str.join only accepts strings, so each entry is rendered explicitly.
        return header + "\n".join(
            str(entry) for entry in self.return_history()
        )

    def play_history(self):
        """ Marks the history as played and persists that flag.
        """
        self.played = True
        self.save()
|
|
|
|
|
|
|
|
|
2018-02-24 23:17:24 +01:00
|
|
|
def generate_partial_history(user, t_start):
    """ Generate the part of the history resulting from one crawl for `user`.

    The starting url is obtained from `generate_first_url(user)` and is
    stamped with `t_start`; every url handed back by the crawler is stamped
    with a strictly later time, each step adding a random Weibull-distributed
    delay.

    `t_start` must support `+=` with a float (the delays are floats).
    Returns a list of `(url, timestamp)` tuples, the starting url first.
    """
    # Only `Queue` is imported at file level, hence the local import.
    from queue import Empty

    timestamp = t_start
    basis = generate_first_url(user)
    result = [(basis, timestamp)]
    timestamp += 5 * random.weibullvariate(1, 1.5)

    queue = Queue()
    crawler = crawl.CrawlingThread(user, basis, queue)
    crawler.start()
    crawler.join()

    # The crawler has finished at this point, so anything it posted is
    # already in the queue. A blocking get() would hang forever if it ended
    # without posting (e.g. after an error), so do not wait.
    # NOTE(review): assumes the crawler posts a single iterable of urls
    # before terminating -- confirm against crawl.CrawlingThread.
    try:
        urls = queue.get_nowait()
    except Empty:
        urls = []

    for url in urls:
        timestamp += 5 * random.weibullvariate(1, 1.5)
        result.append((url, timestamp))
    return result
|
2018-02-20 23:42:21 +01:00
|
|
|
|
|
|
|
def generate_first_url(user):
    """ Generate the first url of a partial history, based on the user
    information.

    One of the user's interest categories (keywords, places, websites,
    events) is picked at random among those that are not empty, then one item
    of that category is picked at random and its `generate_url(user)` result
    is returned.

    Raises IndexError when the user has no interest in any category.
    """
    interests = user.interests
    categories = [
        list(category.all())
        for category in (
            interests.keywords,
            interests.places,
            interests.websites,
            interests.events,
        )
    ]
    # Picking a category first and an item second fails on an empty category
    # even if the others are populated, so empty ones are dropped up front.
    non_empty = [category for category in categories if category]
    if not non_empty:
        raise IndexError("Cannot choose a first url: the user has no interests")

    search_term = random.choice(random.choice(non_empty))
    return search_term.generate_url(user)
|
|
|
|
|
2018-02-20 23:42:21 +01:00
|
|
|
|
|
|
|
|
2018-02-19 22:56:16 +01:00
|
|
|
|
|
|
|
def generate_history(user, ts_start):
    """ Generate a new history for the user `user`, starting from timestamp
    `ts_start`.
    A few heuristics are used in order to give the impression that the history
    is actually played by a user: the number of entries and the delays between
    them are drawn from Weibull distributions.

    `ts_start` may be a `datetime` or a number of seconds since the epoch.
    The history and all its entries are saved; the saved `History` is
    returned.
    """
    # Not imported at file level, hence the local import.
    from datetime import datetime

    # Delays are float seconds, which cannot be added to a datetime, while the
    # model fields are DateTimeFields: do the arithmetic in epoch seconds and
    # convert back when storing.
    if isinstance(ts_start, datetime):
        tzinfo = ts_start.tzinfo
        current_ts = ts_start.timestamp()
        start_dt = ts_start
    else:
        tzinfo = None
        current_ts = float(ts_start)
        start_dt = datetime.fromtimestamp(current_ts)

    # let's define a new history object.
    history = History(start_ts=start_dt, user=user)
    # The entries reference the history through a foreign key, so it has to
    # exist in the database before any of them is saved.
    history.save()

    length = HISTORY_MIN + floor(10 * random.weibullvariate(1, 1.5))

    history_line = 0
    while history_line < length:
        current_ts += 5 * random.weibullvariate(1, 2.8)
        history_list = generate_partial_history(user, current_ts)
        # Entries are (url, timestamp) tuples; resume after the last one.
        current_ts = history_list[-1][1] + 5 * random.weibullvariate(1, 5)
        for url, timestamp in history_list:
            new_line = HistoryEntry(
                search=url,
                timestamp=datetime.fromtimestamp(timestamp, tz=tzinfo),
                history=history
            )
            new_line.save()
        # A partial history always holds at least its starting url, so this
        # makes progress on every pass and the loop terminates.
        history_line += len(history_list)

    return history
|