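Crawling and histories: fix history entry related-set lookups, crawler thread arguments and timestamp handling; add the initial histories migration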
This commit is contained in:
parent
e6d587bffd
commit
45ddbff91a
3 changed files with 81 additions and 19 deletions
|
@ -175,7 +175,7 @@ class CrawlingThread(Thread):
|
||||||
""" A separate thread for the crawling task. This is needed to use asyncio,
|
""" A separate thread for the crawling task. This is needed to use asyncio,
|
||||||
since the thread will need its own event loop. """
|
since the thread will need its own event loop. """
|
||||||
|
|
||||||
def __init__(self, user, url, queue):
|
def __init__(self, url, queue):
|
||||||
engine_list = [engine.url for engine in SearchEngine.objects.all()]
|
engine_list = [engine.url for engine in SearchEngine.objects.all()]
|
||||||
WebsiteScheduler.search_engines = engine_list
|
WebsiteScheduler.search_engines = engine_list
|
||||||
|
|
||||||
|
|
34
histories/migrations/0001_initial.py
Normal file
34
histories/migrations/0001_initial.py
Normal file
|
@ -0,0 +1,34 @@
|
||||||
|
# Generated by Django 2.0.1 on 2018-02-25 19:08
|
||||||
|
|
||||||
|
from django.db import migrations, models
|
||||||
|
import django.db.models.deletion
|
||||||
|
|
||||||
|
|
||||||
|
class Migration(migrations.Migration):
|
||||||
|
|
||||||
|
initial = True
|
||||||
|
|
||||||
|
dependencies = [
|
||||||
|
('profiles', '0001_initial'),
|
||||||
|
]
|
||||||
|
|
||||||
|
operations = [
|
||||||
|
migrations.CreateModel(
|
||||||
|
name='History',
|
||||||
|
fields=[
|
||||||
|
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
|
||||||
|
('start_ts', models.DateTimeField(help_text='The starting timestamp of the history. Useful for cron-like structure.')),
|
||||||
|
('played', models.BooleanField(default=False)),
|
||||||
|
('user', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='profiles.Profile')),
|
||||||
|
],
|
||||||
|
),
|
||||||
|
migrations.CreateModel(
|
||||||
|
name='HistoryEntry',
|
||||||
|
fields=[
|
||||||
|
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
|
||||||
|
('search', models.URLField(help_text='The url to be searched')),
|
||||||
|
('timestamp', models.DateTimeField()),
|
||||||
|
('history', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='histories.History')),
|
||||||
|
],
|
||||||
|
),
|
||||||
|
]
|
|
@ -3,6 +3,7 @@ entries, which looks like human-based browsing, according to a dedicated user
|
||||||
interests, keywords...
|
interests, keywords...
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
from collections import namedtuple
|
||||||
import random
|
import random
|
||||||
from math import floor
|
from math import floor
|
||||||
from queue import Queue
|
from queue import Queue
|
||||||
|
@ -92,14 +93,15 @@ class History(models.Model):
|
||||||
def return_history(self):
|
def return_history(self):
|
||||||
""" Returns the history, sorted by increasing timestamps
|
""" Returns the history, sorted by increasing timestamps
|
||||||
"""
|
"""
|
||||||
history_set = self.history_set.order_by('timestamp')
|
output_history = self.historyentry_set.order_by('timestamp')
|
||||||
history_set = [(item.search, item.timestamp.date()) for item in history_set]
|
output_history = [(item.search, item.timestamp.date())
|
||||||
return history_set
|
for item in output_history]
|
||||||
|
return output_history
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
""" Returns the string representation of a history.
|
""" Returns the string representation of a history.
|
||||||
"""
|
"""
|
||||||
history_set = self.history_set.order_by('timestamp')
|
history_set = self.historyentry_set.order_by('timestamp')
|
||||||
header = "[History]:\n"
|
header = "[History]:\n"
|
||||||
return header + "\n".join(history_set)
|
return header + "\n".join(history_set)
|
||||||
|
|
||||||
|
@ -118,7 +120,7 @@ class History(models.Model):
|
||||||
'user': self.user.pk,
|
'user': self.user.pk,
|
||||||
})
|
})
|
||||||
xml_root.append(hist_node)
|
xml_root.append(hist_node)
|
||||||
for entry in self.history_set:
|
for entry in self.historyentry_set:
|
||||||
entry.to_xml(hist_node)
|
entry.to_xml(hist_node)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
|
@ -153,6 +155,10 @@ class History(models.Model):
|
||||||
return output
|
return output
|
||||||
|
|
||||||
|
|
||||||
|
PartialHistoryEntry = namedtuple('PartialHistoryEntry',
|
||||||
|
['url', 'timestamp'])
|
||||||
|
|
||||||
|
|
||||||
def generate_partial_history(user, t_start):
|
def generate_partial_history(user, t_start):
|
||||||
""" Generate the part of the history resulting from the crawl starting at
|
""" Generate the part of the history resulting from one crawl for the given
|
||||||
the given url.
|
user. The crawl's first url is generated from the user, and the first entry is timestamped `t_start`.
|
||||||
|
@ -160,32 +166,51 @@ def generate_partial_history(user, t_start):
|
||||||
timestamp = t_start
|
timestamp = t_start
|
||||||
result = []
|
result = []
|
||||||
basis = generate_first_url(user)
|
basis = generate_first_url(user)
|
||||||
result.append((basis, timestamp))
|
result.append(PartialHistoryEntry(basis, timestamp))
|
||||||
t_start += 5 * random.weibullvariate(1, 1.5)
|
t_start += 5 * random.weibullvariate(1, 1.5)
|
||||||
queue = Queue()
|
queue = Queue()
|
||||||
crawler = crawl.CrawlingThread(user, basis, queue)
|
crawler = crawl.CrawlingThread(basis, queue)
|
||||||
crawler.start()
|
crawler.start()
|
||||||
crawler.join()
|
crawler.join()
|
||||||
urls = queue.get()
|
urls = queue.get()
|
||||||
for url in urls:
|
for url in urls:
|
||||||
t_start += 5 * random.weibullvariate(1, 1.5)
|
t_start += 5 * random.weibullvariate(1, 1.5)
|
||||||
result.append((url, timestamp))
|
result.append(PartialHistoryEntry(url, timestamp))
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
def generate_first_url(user):
|
def generate_first_url(user):
|
||||||
""" Generate the first url of a partial history, based on the user
|
""" Generate the first url of a partial history, based on the user
|
||||||
information. """
|
information. """
|
||||||
interest = random.choice([
|
|
||||||
user.interests.keywords.all(), user.interests.places.all(),
|
def nonempty(seq):
|
||||||
user.interests.websites.all(), user.interests.events.all()
|
out = []
|
||||||
])
|
for elt in seq:
|
||||||
|
if elt:
|
||||||
|
out.append(elt)
|
||||||
|
return out
|
||||||
|
|
||||||
|
all_keywords = profiles.Keyword.objects.filter(
|
||||||
|
interest__profile__in=[user])
|
||||||
|
all_websites = profiles.Website.objects.filter(
|
||||||
|
interest__profile__in=[user])
|
||||||
|
all_places = profiles.Place.objects.filter(
|
||||||
|
interest__profile__in=[user])
|
||||||
|
all_events = profiles.Event.objects.filter(
|
||||||
|
interest__profile__in=[user])
|
||||||
|
|
||||||
|
interest = random.choice(nonempty([
|
||||||
|
all_keywords,
|
||||||
|
all_websites,
|
||||||
|
all_places,
|
||||||
|
all_events,
|
||||||
|
]))
|
||||||
search_term = random.choice(interest)
|
search_term = random.choice(interest)
|
||||||
url = search_term.generate_url(user)
|
url = search_term.generate_url(user)
|
||||||
return url
|
return url
|
||||||
|
|
||||||
|
|
||||||
def generate_history(user, ts_start):
|
def generate_history(user, start_time):
|
||||||
""" Generate a new history for the user `user`, starting from timestamp
|
""" Generate a new history for the user `user`, starting from timestamp
|
||||||
`ts_start`.
|
`start_time`.
|
||||||
A few heuristics are used in order to give the impression that the history
|
A few heuristics are used in order to give the impression that the history
|
||||||
|
@ -193,21 +218,24 @@ def generate_history(user, ts_start):
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# let's define a new history object.
|
# let's define a new history object.
|
||||||
history = History(start_ts=ts_start, user=user)
|
history = History(start_ts=start_time, user=user)
|
||||||
length = HISTORY_MIN + floor(10 * random.weibullvariate(1, 1.5))
|
length = HISTORY_MIN + floor(10 * random.weibullvariate(1, 1.5))
|
||||||
history.full_clean()
|
history.full_clean()
|
||||||
history.save()
|
history.save()
|
||||||
|
|
||||||
history_line = 0
|
history_line = 0
|
||||||
|
|
||||||
|
current_timestamp = start_time.timestamp()
|
||||||
|
|
||||||
while history_line < length:
|
while history_line < length:
|
||||||
ts_start += 5 * random.weibullvariate(1, 2.8)
|
current_timestamp += 5 * random.weibullvariate(1, 2.8)
|
||||||
history_list = generate_partial_history(user, ts_start)
|
history_list = generate_partial_history(user, current_timestamp)
|
||||||
ts_start = history_list[-1].timestamp + 5 * random.weibullvariate(1, 5)
|
current_timestamp = \
|
||||||
|
history_list[-1].timestamp + 5 * random.weibullvariate(1, 5)
|
||||||
for (url, timestamp) in history_list:
|
for (url, timestamp) in history_list:
|
||||||
new_line = HistoryEntry(
|
new_line = HistoryEntry(
|
||||||
search=url,
|
search=url,
|
||||||
timestamp=timestamp,
|
timestamp=datetime.fromtimestamp(timestamp),
|
||||||
history=history
|
history=history
|
||||||
)
|
)
|
||||||
new_line.full_clean()
|
new_line.full_clean()
|
||||||
|
|
Loading…
Reference in a new issue