Crawling and histories: fix a lot of stuff

This commit is contained in:
Théophile Bastian 2018-02-26 00:24:54 +01:00
parent e6d587bffd
commit 45ddbff91a
3 changed files with 81 additions and 19 deletions

View File

@ -175,7 +175,7 @@ class CrawlingThread(Thread):
""" A separate thread for the crawling task. This is needed to use asyncio, """ A separate thread for the crawling task. This is needed to use asyncio,
since the thread will need its own event loop. """ since the thread will need its own event loop. """
def __init__(self, user, url, queue): def __init__(self, url, queue):
engine_list = [engine.url for engine in SearchEngine.objects.all()] engine_list = [engine.url for engine in SearchEngine.objects.all()]
WebsiteScheduler.search_engines = engine_list WebsiteScheduler.search_engines = engine_list

View File

@ -0,0 +1,34 @@
# Generated by Django 2.0.1 on 2018-02-25 19:08
from django.db import migrations, models
import django.db.models.deletion
class Migration(migrations.Migration):
    """Initial schema migration creating the two history tables.

    ``History`` holds one generated browsing history per profile;
    ``HistoryEntry`` holds one searched URL per row, attached to a history.
    """

    initial = True

    # History.user is a foreign key to profiles.Profile, so the first
    # migration of the profiles app has to be applied before this one.
    dependencies = [
        ('profiles', '0001_initial'),
    ]

    operations = [
        migrations.CreateModel(
            name='History',
            fields=[
                (
                    'id',
                    models.AutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name='ID',
                    ),
                ),
                (
                    'start_ts',
                    models.DateTimeField(
                        help_text='The starting timestamp of the history. Useful for cron-like structure.',
                    ),
                ),
                (
                    'played',
                    models.BooleanField(default=False),
                ),
                (
                    'user',
                    models.ForeignKey(
                        on_delete=django.db.models.deletion.CASCADE,
                        to='profiles.Profile',
                    ),
                ),
            ],
        ),
        migrations.CreateModel(
            name='HistoryEntry',
            fields=[
                (
                    'id',
                    models.AutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name='ID',
                    ),
                ),
                (
                    'search',
                    models.URLField(help_text='The url to be searched'),
                ),
                (
                    'timestamp',
                    models.DateTimeField(),
                ),
                (
                    'history',
                    models.ForeignKey(
                        on_delete=django.db.models.deletion.CASCADE,
                        to='histories.History',
                    ),
                ),
            ],
        ),
    ]

View File

@ -3,6 +3,7 @@ entries, which looks like human-based browsing, according to a dedicated user
interests, keywords... interests, keywords...
""" """
from collections import namedtuple
import random import random
from math import floor from math import floor
from queue import Queue from queue import Queue
@ -92,14 +93,15 @@ class History(models.Model):
def return_history(self):
    """ Return the entries of this history as ``(search, date)`` pairs,
    ordered by increasing timestamp.
    """
    ordered_entries = self.historyentry_set.order_by('timestamp')
    return [
        (entry.search, entry.timestamp.date())
        for entry in ordered_entries
    ]
def __str__(self):
    """ Returns the string representation of a history.

    The result is the ``[History]:`` header followed by one line per
    entry, ordered by increasing timestamp.
    """
    entries = self.historyentry_set.order_by('timestamp')
    header = "[History]:\n"
    # str.join only accepts strings, while the related set yields entry
    # objects: convert each entry explicitly to avoid a TypeError.
    return header + "\n".join(str(entry) for entry in entries)
@ -118,7 +120,7 @@ class History(models.Model):
'user': self.user.pk, 'user': self.user.pk,
}) })
xml_root.append(hist_node) xml_root.append(hist_node)
for entry in self.history_set: for entry in self.historyentry_set:
entry.to_xml(hist_node) entry.to_xml(hist_node)
@staticmethod @staticmethod
@ -153,6 +155,10 @@ class History(models.Model):
return output return output
# Lightweight (url, timestamp) record for one step of a partial history.
PartialHistoryEntry = namedtuple('PartialHistoryEntry', ('url', 'timestamp'))
def generate_partial_history(user, t_start):
    """ Generate the part of the history resulting from the crawl starting at
    the given url.

    `user` is the profile the history is generated for; `t_start` is the
    numeric timestamp of the first entry (presumably POSIX seconds -- the
    caller passes the result of `datetime.timestamp()`).

    Returns a list of `PartialHistoryEntry`, the first one being the start
    url, with strictly increasing timestamps.
    """
    timestamp = t_start
    result = []
    basis = generate_first_url(user)
    result.append(PartialHistoryEntry(basis, timestamp))
    # Advance the very clock that is recorded in the entries; bumping a
    # separate variable would leave every entry stamped with `t_start`.
    timestamp += 5 * random.weibullvariate(1, 1.5)
    queue = Queue()
    crawler = crawl.CrawlingThread(basis, queue)
    crawler.start()
    # Wait for the crawl to finish before reading its result.
    crawler.join()
    # NOTE(review): assumes the crawler puts one iterable of urls on the
    # queue before terminating -- otherwise this call blocks forever.
    urls = queue.get()
    for url in urls:
        # Random delay between two consecutive page visits.
        timestamp += 5 * random.weibullvariate(1, 1.5)
        result.append(PartialHistoryEntry(url, timestamp))
    return result
def generate_first_url(user):
    """ Pick the url a partial history starts from, based on the interests
    attached to `user`.

    One non-empty interest category (keywords, websites, places or events)
    is drawn at random, then one item of that category, and that item is
    asked to generate its url for the user.
    """
    candidate_sets = [
        profiles.Keyword.objects.filter(interest__profile__in=[user]),
        profiles.Website.objects.filter(interest__profile__in=[user]),
        profiles.Place.objects.filter(interest__profile__in=[user]),
        profiles.Event.objects.filter(interest__profile__in=[user]),
    ]
    # Only categories that actually contain something can be drawn from.
    usable_sets = [items for items in candidate_sets if items]
    chosen_set = random.choice(usable_sets)
    chosen_item = random.choice(chosen_set)
    return chosen_item.generate_url(user)
def generate_history(user, ts_start): def generate_history(user, start_time):
""" Generate a new history for the user `user`, starting from timestamp """ Generate a new history for the user `user`, starting from timestamp
`ts_start`. `ts_start`.
A few heuristics are used in order to give the impression that the history A few heuristics are used in order to give the impression that the history
@ -193,21 +218,24 @@ def generate_history(user, ts_start):
""" """
# let's define a new history object. # let's define a new history object.
history = History(start_ts=ts_start, user=user) history = History(start_ts=start_time, user=user)
length = HISTORY_MIN + floor(10 * random.weibullvariate(1, 1.5)) length = HISTORY_MIN + floor(10 * random.weibullvariate(1, 1.5))
history.full_clean() history.full_clean()
history.save() history.save()
history_line = 0 history_line = 0
current_timestamp = start_time.timestamp()
while history_line < length: while history_line < length:
ts_start += 5 * random.weibullvariate(1, 2.8) current_timestamp += 5 * random.weibullvariate(1, 2.8)
history_list = generate_partial_history(user, ts_start) history_list = generate_partial_history(user, current_timestamp)
ts_start = history_list[-1].timestamp + 5 * random.weibullvariate(1, 5) current_timestamp = \
history_list[-1].timestamp + 5 * random.weibullvariate(1, 5)
for (url, timestamp) in history_list: for (url, timestamp) in history_list:
new_line = HistoryEntry( new_line = HistoryEntry(
search=url, search=url,
timestamp=timestamp, timestamp=datetime.fromtimestamp(timestamp),
history=history history=history
) )
new_line.full_clean() new_line.full_clean()