diff --git a/crawl/crawl.py b/crawl/crawl.py index d005d20..9d266f3 100644 --- a/crawl/crawl.py +++ b/crawl/crawl.py @@ -175,7 +175,7 @@ class CrawlingThread(Thread): """ A separate thread for the crawling task. This is needed to use asyncio, since the thread will need its own event loop. """ - def __init__(self, user, url, queue): + def __init__(self, url, queue): engine_list = [engine.url for engine in SearchEngine.objects.all()] WebsiteScheduler.search_engines = engine_list diff --git a/histories/migrations/0001_initial.py b/histories/migrations/0001_initial.py new file mode 100644 index 0000000..f6eac9a --- /dev/null +++ b/histories/migrations/0001_initial.py @@ -0,0 +1,34 @@ +# Generated by Django 2.0.1 on 2018-02-25 19:08 + +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + + initial = True + + dependencies = [ + ('profiles', '0001_initial'), + ] + + operations = [ + migrations.CreateModel( + name='History', + fields=[ + ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('start_ts', models.DateTimeField(help_text='The starting timestamp of the history. Useful for cron-like structure.')), + ('played', models.BooleanField(default=False)), + ('user', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='profiles.Profile')), + ], + ), + migrations.CreateModel( + name='HistoryEntry', + fields=[ + ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('search', models.URLField(help_text='The url to be searched')), + ('timestamp', models.DateTimeField()), + ('history', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='histories.History')), + ], + ), + ] diff --git a/histories/models.py b/histories/models.py index 94ddf28..47abe56 100644 --- a/histories/models.py +++ b/histories/models.py @@ -3,6 +3,7 @@ entries, which looks like human-based browsing, according to a dedicated user interests, keywords... """ +from collections import namedtuple import random from math import floor from queue import Queue @@ -92,14 +93,15 @@ class History(models.Model): def return_history(self): """ Returns the history, sorted by increasing timestamps """ - history_set = self.history_set.order_by('timestamp') - history_set = [(item.search, item.timestamp.date()) for item in history_set] - return history_set + output_history = self.historyentry_set.order_by('timestamp') + output_history = [(item.search, item.timestamp.date()) + for item in output_history] + return output_history def __str__(self): """ Returns the string representation of a history. """ - history_set = self.history_set.order_by('timestamp') + history_set = self.historyentry_set.order_by('timestamp') header = "[History]:\n" return header + "\n".join(history_set) @@ -118,7 +120,7 @@ class History(models.Model): 'user': self.user.pk, }) xml_root.append(hist_node) - for entry in self.history_set: + for entry in self.historyentry_set: entry.to_xml(hist_node) @staticmethod @@ -153,6 +155,10 @@ class History(models.Model): return output +PartialHistoryEntry = namedtuple('PartialHistoryEntry', + ['url', 'timestamp']) + + def generate_partial_history(user, t_start): """ Generate the part of the history resulting from the crawl starting at the given url. @@ -160,32 +166,51 @@ def generate_partial_history(user, t_start): timestamp = t_start result = [] basis = generate_first_url(user) - result.append((basis, timestamp)) + result.append(PartialHistoryEntry(basis, timestamp)) t_start += 5 * random.weibullvariate(1, 1.5) queue = Queue() - crawler = crawl.CrawlingThread(user, basis, queue) + crawler = crawl.CrawlingThread(basis, queue) crawler.start() crawler.join() urls = queue.get() for url in urls: t_start += 5 * random.weibullvariate(1, 1.5) - result.append((url, timestamp)) + result.append(PartialHistoryEntry(url, timestamp)) return result def generate_first_url(user): """ Generate the first url of a partial history, based on the user information. """ - interest = random.choice([ - user.interests.keywords.all(), user.interests.places.all(), - user.interests.websites.all(), user.interests.events.all() - ]) + + def nonempty(seq): + out = [] + for elt in seq: + if elt: + out.append(elt) + return out + + all_keywords = profiles.Keyword.objects.filter( + interest__profile__in=[user]) + all_websites = profiles.Website.objects.filter( + interest__profile__in=[user]) + all_places = profiles.Place.objects.filter( + interest__profile__in=[user]) + all_events = profiles.Event.objects.filter( + interest__profile__in=[user]) + + interest = random.choice(nonempty([ + all_keywords, + all_websites, + all_places, + all_events, + ])) search_term = random.choice(interest) url = search_term.generate_url(user) return url -def generate_history(user, ts_start): +def generate_history(user, start_time): """ Generate a new history for the user `user`, starting from timestamp `ts_start`. A few heuristics are used in order to give the impression that the history @@ -193,21 +218,24 @@ def generate_history(user, ts_start): """ # let's define a new history object. - history = History(start_ts=ts_start, user=user) + history = History(start_ts=start_time, user=user) length = HISTORY_MIN + floor(10 * random.weibullvariate(1, 1.5)) history.full_clean() history.save() history_line = 0 + current_timestamp = start_time.timestamp() + while history_line < length: - ts_start += 5 * random.weibullvariate(1, 2.8) - history_list = generate_partial_history(user, ts_start) - ts_start = history_list[-1].timestamp + 5 * random.weibullvariate(1, 5) + current_timestamp += 5 * random.weibullvariate(1, 2.8) + history_list = generate_partial_history(user, current_timestamp) + current_timestamp = \ + history_list[-1].timestamp + 5 * random.weibullvariate(1, 5) for (url, timestamp) in history_list: new_line = HistoryEntry( search=url, - timestamp=timestamp, + timestamp=datetime.fromtimestamp(timestamp), history=history ) new_line.full_clean()