Crawling and histories: fix a lot of stuff

2018-02-26 00:24:54 +01:00 · 2018-02-26 00:24:54 +01:00 · 45ddbff91a
commit 45ddbff91a
parent e6d587bffd
3 changed files with 81 additions and 19 deletions
--- a/crawl/crawl.py
+++ b/crawl/crawl.py
@ -175,7 +175,7 @@ class CrawlingThread(Thread):
    """ A separate thread for the crawling task. This is needed to use asyncio,
    since the thread will need its own event loop. """

-    def __init__(self, user, url, queue):
+    def __init__(self, url, queue):
        engine_list = [engine.url for engine in SearchEngine.objects.all()]
        WebsiteScheduler.search_engines = engine_list

--- a/histories/migrations/0001_initial.py
+++ b/histories/migrations/0001_initial.py
@ -0,0 +1,34 @@
+# Generated by Django 2.0.1 on 2018-02-25 19:08
+
+from django.db import migrations, models
+import django.db.models.deletion
+
+
+class Migration(migrations.Migration):  # creates History and HistoryEntry
+
+    initial = True  # first migration of the `histories` app
+
+    dependencies = [
+        ('profiles', '0001_initial'),  # History.user targets profiles.Profile
+    ]
+
+    operations = [
+        migrations.CreateModel(
+            name='History',  # owned by a Profile, deleted with it (CASCADE)
+            fields=[
+                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
+                ('start_ts', models.DateTimeField(help_text='The starting timestamp of the history. Useful for cron-like structure.')),
+                ('played', models.BooleanField(default=False)),
+                ('user', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='profiles.Profile')),
+            ],
+        ),
+        migrations.CreateModel(
+            name='HistoryEntry',  # a searched URL, deleted with its History
+            fields=[
+                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
+                ('search', models.URLField(help_text='The url to be searched')),
+                ('timestamp', models.DateTimeField()),
+                ('history', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='histories.History')),
+            ],
+        ),
+    ]
--- a/histories/models.py
+++ b/histories/models.py
@ -3,6 +3,7 @@ entries, which looks like human-based browsing, according to a dedicated user
 interests, keywords...
 """

+from collections import namedtuple
 import random
 from math import floor
 from queue import Queue
@ -92,14 +93,15 @@ class History(models.Model):
    def return_history(self):
        """ Returns the history, sorted by increasing timestamps
        """
-        history_set = self.history_set.order_by('timestamp')
-        history_set = [(item.search, item.timestamp.date()) for item in history_set]
-        return history_set
+        entries = self.historyentry_set.order_by('timestamp')
+        return [
+            (entry.search, entry.timestamp.date()) for entry in entries
+        ]

    def __str__(self):
        """ Returns the string representation of a history.
        """
-        history_set = self.history_set.order_by('timestamp')
+        history_set = map(str, self.historyentry_set.order_by('timestamp'))
        header = "[History]:\n"
        return header + "\n".join(history_set)

@ -118,7 +120,7 @@ class History(models.Model):
            'user': self.user.pk,
        })
        xml_root.append(hist_node)
-        for entry in self.history_set:
+        for entry in self.historyentry_set:
            entry.to_xml(hist_node)

    @staticmethod
@ -153,6 +155,10 @@ class History(models.Model):
        return output


+PartialHistoryEntry = namedtuple('PartialHistoryEntry',
+                                 ['url', 'timestamp'])
+
+
 def generate_partial_history(user, t_start):
    """ Generate the part of the history resulting from the crawl starting at
    the given url.
@ -160,32 +166,51 @@ def generate_partial_history(user, t_start):
    timestamp = t_start
    result = []
    basis = generate_first_url(user)
-    result.append((basis, timestamp))
+    result.append(PartialHistoryEntry(basis, timestamp))
    t_start += 5 * random.weibullvariate(1, 1.5)
    queue = Queue()
-    crawler = crawl.CrawlingThread(user, basis, queue)
+    crawler = crawl.CrawlingThread(basis, queue)
    crawler.start()
    crawler.join()
    urls = queue.get()
    for url in urls:
        t_start += 5 * random.weibullvariate(1, 1.5)
-        result.append((url, timestamp))
+        result.append(PartialHistoryEntry(url, t_start))  # advanced clock
    return result


 def generate_first_url(user):
    """ Generate the first url of a partial history, based on the user
    information. """
-    interest = random.choice([
-        user.interests.keywords.all(), user.interests.places.all(),
-        user.interests.websites.all(), user.interests.events.all()
-    ])
+
+    def interests_of_kind(model):
+        """ Returns the `model` rows matching the lookup
+        `interest__profile__in=[user]`.
+        """
+        return model.objects.filter(interest__profile__in=[user])
+
+    # One queryset per kind of search term, in this order: keywords,
+    # websites, places, events.
+    pools = (
+        interests_of_kind(profiles.Keyword),
+        interests_of_kind(profiles.Website),
+        interests_of_kind(profiles.Place),
+        interests_of_kind(profiles.Event),
+    )
+
+    # Empty querysets are discarded up front, so that a search term can
+    # always be drawn from whichever pool gets picked.
+    usable_pools = [pool for pool in pools if pool]
+
+    # NOTE: random.choice raises IndexError when every pool is empty,
+    # i.e. when no search term at all matches the user.
+    interest = random.choice(usable_pools)
    search_term = random.choice(interest)
    url = search_term.generate_url(user)
    return url


-def generate_history(user, ts_start):
+def generate_history(user, start_time):
    """ Generate a new history for the user `user`, starting from timestamp
    `ts_start`.
    A few heuristics are used in order to give the impression that the history
@ -193,21 +218,24 @@ def generate_history(user, ts_start):
    """

    # let's define a new history object.
-    history = History(start_ts=ts_start, user=user)
+    history = History(start_ts=start_time, user=user)
    length = HISTORY_MIN + floor(10 * random.weibullvariate(1, 1.5))
    history.full_clean()
    history.save()

    history_line = 0

+    current_timestamp = start_time.timestamp()
+
    while history_line < length:
-        ts_start += 5 * random.weibullvariate(1, 2.8)
-        history_list = generate_partial_history(user, ts_start)
-        ts_start = history_list[-1].timestamp + 5 * random.weibullvariate(1, 5)
+        current_timestamp += 5 * random.weibullvariate(1, 2.8)
+        history_list = generate_partial_history(user, current_timestamp)
+        current_timestamp = \
+            history_list[-1].timestamp + 5 * random.weibullvariate(1, 5)
        for (url, timestamp) in history_list:
            new_line = HistoryEntry(
                search=url,
-                timestamp=timestamp,
+                timestamp=datetime.fromtimestamp(timestamp),
                history=history
            )
            new_line.full_clean()