Compare commits: master...histories_ (9 commits)

Commits (SHA1):
90a6164861
b7be4f4df4
a6d7d6b62b
f33820a4dc
04fcc2b324
6e4709ac91
fd4e1d35c7
8f1d69bc41
38ccd04d31

6 changed files with 271 additions and 76 deletions
@@ -5,7 +5,7 @@ from urllib.error import URLError
 from urllib.parse import urlparse
 from ssl import CertificateError
-from random import sample, randrange
+from random import sample, randrange, randint
 import re
 from datetime import datetime, timedelta
 
@@ -15,6 +15,8 @@ import async_timeout
 
 from bs4 import BeautifulSoup, Comment
 
+from profiles.models import BrowserFingerprint, SearchEngine
+
 # Ugly hack to use this module alone instead of integrating it with Django
 # from django.conf import settings
 
@@ -26,13 +28,11 @@ MAX_PER_PAGE = 10
 
 FOOTER_URL = re.compile(".*footer.*")
 
-SEARCH_ENGINE = []
 
 class Settings:
     USER_AGENT = 'Default User'
 
 settings = Settings()
-startup_time = datetime.min
 
 
 def url_getter(html, current_page, root_url):
@@ -82,8 +82,6 @@ def url_getter(html, current_page, root_url):
     return links_list
 
 
-
-
 class WebsiteSchedulerMeta(type):
     """ Meta-class for WebsiteScheduler, allowing a singleton class-like
    interface, but spawning one instance per canonical website URL """
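The docstring above describes a singleton-like metaclass keyed on the canonical website URL; the metaclass body itself is not part of this hunk. As a hedged, minimal sketch of how such a one-instance-per-host pattern is usually written (the names PerNetlocSingleton and Scheduler are illustrative, not taken from the repository):

from urllib.parse import urlparse


class PerNetlocSingleton(type):
    """Illustrative metaclass: one instance per canonical website URL."""
    _instances = {}  # maps netloc -> existing instance

    def __call__(cls, url, *args, **kwargs):
        key = urlparse(url).netloc or url
        if key not in cls._instances:
            cls._instances[key] = super().__call__(url, *args, **kwargs)
        return cls._instances[key]


class Scheduler(metaclass=PerNetlocSingleton):
    def __init__(self, url):
        self.url = url


a = Scheduler("https://example.com/page1")
b = Scheduler("https://example.com/page2")
assert a is b  # same host, same scheduler object

Sharing one object per host is what lets the per-site crawl delay below apply to every request against that site.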
@@ -106,12 +104,17 @@ class WebsiteSchedulerMeta(type):
 
 class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
     """ Schedule the accesses to a website as of robots.txt """
-    def __init__(self, name):
+    search_engines = []  # Must be set by CrawlingThread.__init__
+
+    def __init__(self, name, user_agent):
         self.name = name
         self.last_crawled = datetime.fromtimestamp(0)
         self.dead = False
         self.can_fetch_b = False
-        if any(self.urlroot() in item for item in SEARCH_ENGINE):
+        self.user_agent = (user_agent if user_agent is not None
+                           else settings.USER_AGENT)
+
+        if any(self.urlroot() in item for item in self.search_engines):
             print("found a search engine for %s" % self.urlroot())
             self.crawl_delay = timedelta(seconds=5)
             self.can_fetch_b = True
@@ -125,7 +128,7 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
             robots_url = self.unsafe_urlroot() + 'robots.txt'
             self.robot_parser = RobotFileParser(robots_url)
             self.robot_parser.read()
         except URLError:  # Almost surely an offline website.
             self.dead = True
             self.crawl_delay = 0
         except Exception as e:
@@ -134,9 +137,9 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
         if not self.robot_parser.default_entry:
             self.dead = True
         if not self.dead:
-            delay = self.robot_parser.crawl_delay(settings.USER_AGENT)
+            delay = self.robot_parser.crawl_delay(self.user_agent)
             if delay is None:
-                req_rate = self.robot_parser.request_rate(settings.USER_AGENT)
+                req_rate = self.robot_parser.request_rate(self.user_agent)
                 if req_rate is None:
                     delay = 5
                 else:
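The hunk above swaps the global settings.USER_AGENT for the per-instance self.user_agent when querying robots.txt rules. As a reminder of the standard-library API being called, here is a minimal sketch of urllib.robotparser usage (the URL and agent string are placeholders, and read() performs a network fetch); crawl_delay() and request_rate() both return None when robots.txt gives no directive, which is why the code falls back to a 5-second default:

from urllib.robotparser import RobotFileParser

parser = RobotFileParser("https://example.com/robots.txt")
parser.read()  # fetches and parses robots.txt

agent = "Default User"
delay = parser.crawl_delay(agent)          # None if no Crawl-delay directive
if delay is None:
    rate = parser.request_rate(agent)      # named tuple (requests, seconds) or None
    delay = rate.seconds / rate.requests if rate else 5

print(delay, parser.can_fetch(agent, "https://example.com/some/page"))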
@@ -159,7 +162,9 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
 
     def can_fetch(self, url):
         ''' Check whether this program can fetch a given page '''
-        return (self.can_fetch_b ) or ((not self.dead) and self.robot_parser.can_fetch(settings.USER_AGENT, url))
+        return ((self.can_fetch_b)
+                or ((not self.dead) and
+                    self.robot_parser.can_fetch(self.user_agent, url)))
 
     def fetching(self):
         ''' Tell the scheduler that a page is being fetched now '''
@@ -170,26 +175,28 @@ class CrawlingThread(Thread):
     """ A separate thread for the crawling task. This is needed to use asyncio,
     since the thread will need its own event loop. """
 
-    def __init__(self, user, url, engine_list, queue):
-        global settings
-        global SEARCH_ENGINE
-        SEARCH_ENGINE = engine_list
+    def __init__(self, url, queue):
+        engine_list = [engine.url for engine in SearchEngine.objects.all()]
+        WebsiteScheduler.search_engines = engine_list
+
+        nb_fingerprint = len(BrowserFingerprint.objects.all())
+        fingerprint = BrowserFingerprint.objects.all()[
+            randint(0, nb_fingerprint - 1)]
+        self.headers = fingerprint.serialize_headers()
+
         self.queue = queue
         super(CrawlingThread, self).__init__()
-        if user:
-            settings.USER_AGENT = user.serialize_headers()
         self.url = url
 
     def run(self):
-        global startup_time
         tasks = []
 
         #tasks.append(async_crawler("http://plus.google.com/+Python"))
         #tasks.append(async_crawler('https://python.org/'))
-        tasks.append(async_crawler(self.url, self.queue))
+        tasks.append(async_crawler(self.url, self.queue, self.headers))
 
         loop = asyncio.new_event_loop()
         asyncio.set_event_loop(loop)
-        startup_time = datetime.now()
         loop.run_until_complete(asyncio.wait(tasks))
         loop.close()
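CrawlingThread.run() drives asyncio from a worker thread, which requires creating and installing a fresh event loop in that thread rather than reusing the main thread's loop. A self-contained sketch of the same pattern (the coroutine and class names are illustrative):

import asyncio
from threading import Thread


async def fake_crawl(name):
    await asyncio.sleep(0.1)   # stands in for the real crawling coroutine
    return name


class WorkerThread(Thread):
    def run(self):
        # Each thread needs its own event loop, as in CrawlingThread.run().
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        results = loop.run_until_complete(
            asyncio.gather(fake_crawl("a"), fake_crawl("b")))
        print(results)
        loop.close()


thread = WorkerThread()
thread.start()
thread.join()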
@@ -197,13 +204,16 @@ class CrawlingThread(Thread):
 class PageGetter:
     """ Asynchronously get a webpage, abiding by robots.txt """
 
-    def __init__(self, session, url):
+    headers = None
+
+    def __init__(self, session, url, user_agent):
         self.url = url
         self.session = session
+        self.user_agent = user_agent
 
     async def get(self, ssl=True):
         """ Actually retrieve the webpage """
-        scheduler = WebsiteScheduler(self.url)
+        scheduler = WebsiteScheduler(self.url, self.user_agent)
         if not scheduler.can_fetch(self.url):
             return None
@@ -226,16 +236,22 @@ async def async_print(url):
     async with aiohttp.ClientSession() as session:
         html = await PageGetter(session, url).get(ssl=False)
 
-        print('GOT {}HTML for {} at {}'.format(
+        print('GOT {}HTML for {}'.format(
             'None ' if html is None else '',
             url,
-            datetime.now() - startup_time))
+        ))
 
 
-async def async_crawler(url, queue):
+async def async_crawler(url, queue, headers=None):
+    if headers is None:
+        headers = {
+            'User-Agent': settings.USER_AGENT,
+        }
+
     queued = [url]
     crawled = []
     while queued and (len(crawled) < HARD_LIMIT):
-        async with aiohttp.ClientSession() as session:
+        async with aiohttp.ClientSession(headers=headers) as session:
             try:
                 url = queued.pop(0)
             except IndexError:
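async_crawler now hands the fingerprint headers to aiohttp.ClientSession(headers=...), so every request made through that session carries them by default. A minimal sketch with placeholder header values (needs network access to actually run):

import asyncio
import aiohttp

HEADERS = {
    "User-Agent": "Default User",           # placeholder fingerprint values
    "Accept-Language": "en-US,en;q=0.5",
}


async def main():
    async with aiohttp.ClientSession(headers=HEADERS) as session:
        async with session.get("https://example.com/") as resp:
            # The session-wide headers were applied to this request.
            print(resp.status, resp.request_info.headers.get("User-Agent"))


asyncio.run(main())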
data/.gitignore (vendored, new file, 1 line)
@@ -0,0 +1 @@
+nicknames_dict
histories/migrations/0001_initial.py (new file, 34 lines)
@@ -0,0 +1,34 @@
+# Generated by Django 2.0.1 on 2018-02-25 19:08
+
+from django.db import migrations, models
+import django.db.models.deletion
+
+
+class Migration(migrations.Migration):
+
+    initial = True
+
+    dependencies = [
+        ('profiles', '0001_initial'),
+    ]
+
+    operations = [
+        migrations.CreateModel(
+            name='History',
+            fields=[
+                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
+                ('start_ts', models.DateTimeField(help_text='The starting timestamp of the history. Useful for cron-like structure.')),
+                ('played', models.BooleanField(default=False)),
+                ('user', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='profiles.Profile')),
+            ],
+        ),
+        migrations.CreateModel(
+            name='HistoryEntry',
+            fields=[
+                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
+                ('search', models.URLField(help_text='The url to be searched')),
+                ('timestamp', models.DateTimeField()),
+                ('history', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='histories.History')),
+            ],
+        ),
+    ]
@@ -3,14 +3,26 @@ entries, which looks like human-based browsing, according to a dedicated user
 interests, keywords...
 """
 
+from collections import namedtuple
 import random
 from math import floor
 from queue import Queue
+from xml.etree import ElementTree as ET
+from datetime import datetime
 from django.db import models
 import profiles.models as profiles
-from tor_runner import TorInstance
 from crawl import crawl
 from pinocchio.settings import HISTORY_MIN
+from .tor_runner import TorInstance
+
+
+class InvalidXml(Exception):
+    def __init__(self, what='unexpected XML data.'):
+        super().__init__()
+        self.what = what
+
+    def __str__(self):
+        return "Invalid XML: " + self.what
 
 
 class HistoryEntry(models.Model):
@@ -28,14 +40,48 @@ class HistoryEntry(models.Model):
         """
         return "{} : {}".format(self.timestamp, self.search)
 
+    def to_xml(self, xml_root):
+        entry = ET.Element('history')
+        entry_url = ET.Element('url')
+        entry_url.text = self.search
+        entry_ts = ET.Element('timestamp')
+        entry_ts.text = self.timestamp.timestamp()
+        entry.append(entry_url)
+        entry.append(entry_ts)
+        xml_root.append(entry)
+
+    @staticmethod
+    def from_xml(xml_root, in_history):
+        if xml_root.tag != 'history':
+            raise InvalidXml("expected <history> tag here.")
+        url, timestamp = None, None
+
+        for child in xml_root:
+            if child.tag == 'url':
+                url = child.text
+            elif child.tag == 'timestamp':
+                try:
+                    timestamp = datetime.fromtimestamp(child.text)
+                except TypeError:
+                    raise InvalidXml("invalid timestamp {}".format(child.text))
+            else:
+                raise InvalidXml("unknown tag {} as child of <history>".format(
+                    child.tag))
+        output = HistoryEntry()
+        output.search = url
+        output.timestamp = timestamp
+        output.history = in_history
+
+        return output
+
+
 class History(models.Model):
     """ A history for a user, containing some web connections (http, https).
     Each history is timed, in a human-behaviour manner. """
 
     start_ts = models.DateTimeField(
-        help_text='The starting timestamp of the history. Useful for cron-like '
-        'structure.'
+        help_text=('The starting timestamp of the history. Useful for '
+                   'cron-like structure.')
     )
     played = models.BooleanField(default=False)
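HistoryEntry.to_xml()/from_xml() build and read xml.etree.ElementTree elements with <url> and <timestamp> children under a <history> node. A small self-contained sketch of that round trip (values are illustrative; note that Element.text must be a string, so the POSIX timestamp is stringified explicitly here):

from datetime import datetime
from xml.etree import ElementTree as ET

root = ET.Element('root')

entry = ET.Element('history')
url_node = ET.Element('url')
url_node.text = 'https://example.com/'
ts_node = ET.Element('timestamp')
# Element.text must hold a string, hence str(...) around the timestamp.
ts_node.text = str(datetime(2018, 2, 25, 19, 8).timestamp())
entry.append(url_node)
entry.append(ts_node)
root.append(entry)

print(ET.tostring(root, encoding='unicode'))

for child in entry:
    if child.tag == 'timestamp':
        print(datetime.fromtimestamp(float(child.text)))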
@@ -47,18 +93,18 @@ class History(models.Model):
     def return_history(self):
         """ Returns the history, sorted by increasing timestamps
         """
-        history_set = self.history_set.order_by('timestamp')
-        history_set = [(item.search, item.timestamp.date()) for item in history_set]
-        return history_set
+        output_history = self.historyentry_set.order_by('timestamp')
+        output_history = [(item.search, item.timestamp.date())
+                          for item in output_history]
+        return output_history
 
     def __str__(self):
         """ Returns the string representation of a history.
         """
-        history_set = self.history_set.order_by('timestamp')
+        history_set = self.historyentry_set.order_by('timestamp')
         header = "[History]:\n"
         return header + "\n".join(history_set)
 
 
     def play_histories(self):
         """ Actually plays the history.
         """
@@ -66,6 +112,52 @@ class History(models.Model):
         runner = TorInstance(self.history)
         self.save()
 
+    def to_xml(self, xml_root):
+        ''' Exports the current history to xml '''
+        hist_node = ET.Element("history", attrib={
+            'start-ts': self.start_ts,
+            'played': 1 if self.played else 0,
+            'user': self.user.pk,
+        })
+        xml_root.append(hist_node)
+        for entry in self.historyentry_set:
+            entry.to_xml(hist_node)
+
+    @staticmethod
+    def from_xml(xml_root):
+        ''' Loads an history from an XML file '''
+
+        REQUIRED_ATTR = ['start-ts', 'played', 'user']
+
+        if xml_root.tag != 'history':
+            raise InvalidXml('unexpected node {} as root of an history'.format(
+                xml_root.tag))
+        for attr in REQUIRED_ATTR:
+            if attr not in xml_root.attrib:
+                raise InvalidXml(('missing attribute "{}" for tag of type '
+                                  'history').format(attr))
+        start_ts = xml_root.attrib['start-ts']
+        played = xml_root.attrib['played']
+        user_pk = xml_root.attrib['user']
+        users = History.objects.filter(pk=1)
+        if len(users) != 1:
+            raise InvalidXml('primary key for History {} is invalid'.format(
+                user_pk))
+
+        output = History()
+        output.start_ts = start_ts
+        output.played = played > 0
+        output.user = users[0]
+
+        for child in xml_root:
+            HistoryEntry.from_xml(child, output)
+
+        return output
+
+
+PartialHistoryEntry = namedtuple('PartialHistoryEntry',
+                                 ['url', 'timestamp'])
+
 
 def generate_partial_history(user, t_start):
     """ Generate the part of the history resulting from the crawl starting at
@@ -74,36 +166,51 @@ def generate_partial_history(user, t_start):
     timestamp = t_start
     result = []
     basis = generate_first_url(user)
-    result.append((basis, timestamp))
-    timestamp += 5* random.weibullvariate(1, 1.5)
+    result.append(PartialHistoryEntry(basis, timestamp))
+    t_start += 5 * random.weibullvariate(1, 1.5)
     queue = Queue()
-    search_engine_query = profiles.SearchEngine.objects.all()
-    search_engine_list = [item.url for item in search_engine_query]
-    crawler = crawl.CrawlingThread(user, basis, search_engine_list, queue)
+    crawler = crawl.CrawlingThread(basis, queue)
     crawler.start()
     crawler.join()
     urls = queue.get()
     for url in urls:
-        timestamp += 5* random.weibullvariate(1, 1.5)
-        result.append((url, timestamp))
+        t_start += 5 * random.weibullvariate(1, 1.5)
+        result.append(PartialHistoryEntry(url, timestamp))
     return result
 
 
 def generate_first_url(user):
     """ Generate the first url of a partial history, based on the user
     information. """
-    interest = random.choice(
-        [user.interests.keywords.all(), user.interests.places.all(),
-         user.interests.websites.all(), user.interests.events.all()
-         ]
-    )
+    def nonempty(seq):
+        out = []
+        for elt in seq:
+            if elt:
+                out.append(elt)
+        return out
+
+    all_keywords = profiles.Keyword.objects.filter(
+        interest__profile__in=[user])
+    all_websites = profiles.Website.objects.filter(
+        interest__profile__in=[user])
+    all_places = profiles.Place.objects.filter(
+        interest__profile__in=[user])
+    all_events = profiles.Event.objects.filter(
+        interest__profile__in=[user])
+
+    interest = random.choice(nonempty([
+        all_keywords,
+        all_websites,
+        all_places,
+        all_events,
+    ]))
     search_term = random.choice(interest)
     url = search_term.generate_url(user)
     return url
 
 
-def generate_history(user, ts_start):
+def generate_history(user, start_time):
     """ Generate a new history for the user `user`, starting from timestamp
     `ts_start`.
     A few heuristics are used in order to give the impression that the history
@@ -111,19 +218,27 @@ def generate_history(user, ts_start):
     """
 
     # let's define a new history object.
-    history = History(start_ts=ts_start, user=user)
+    history = History(start_ts=start_time, user=user)
     length = HISTORY_MIN + floor(10 * random.weibullvariate(1, 1.5))
+    history.full_clean()
+    history.save()
 
     history_line = 0
 
+    current_timestamp = start_time.timestamp()
+
     while history_line < length:
-        ts_start += 5 * random.weibullvariate(1, 2.8)
-        history_list = generate_partial_history(user, ts_start)
-        ts_start = history_list[-1].timestamp + 5 * random.weibullvariate(1, 5)
+        current_timestamp += 5 * random.weibullvariate(1, 2.8)
+        history_list = generate_partial_history(user, current_timestamp)
+        current_timestamp = \
+            history_list[-1].timestamp + 5 * random.weibullvariate(1, 5)
         for (url, timestamp) in history_list:
             new_line = HistoryEntry(
                 search=url,
-                timestamp=timestamp,
+                timestamp=datetime.fromtimestamp(timestamp),
                 history=history
             )
+            new_line.full_clean()
             new_line.save()
 
+    return history
populate.sh (15 lines changed)
@@ -1,11 +1,10 @@
 #!/bin/bash
 # -*- coding: UTF8 -*-
 
-/usr/bin/python3 manage.py import_browser_fp
-/usr/bin/python3 manage.py import_search_engine
-/usr/bin/python3 manage.py import_keywords
-/usr/bin/python3 manage.py import_website
-/usr/bin/python3 manage.py import_places
-/usr/bin/python3 manage.py import_events
-/usr/bin/python3 manage.py import_interests
-
+python3 manage.py import_browser_fp
+python3 manage.py import_search_engine
+python3 manage.py import_keywords
+python3 manage.py import_website
+python3 manage.py import_places
+python3 manage.py import_events
+python3 manage.py import_interests
@@ -12,12 +12,36 @@ from django.db import models
 
 BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 
-NICKNAMES = open("/usr/share/dict/american-english").read().splitlines()
+NICKNAMES = None
 LASTNAMES = open(BASE_DIR + "/data/lastnames.txt").read().splitlines()
 FIRSTNAMES = open(BASE_DIR + "/data/firstnames.txt").read().splitlines()
 EMAIL_DOMAINS = open(BASE_DIR + "/data/email_domains.txt").read().splitlines()
 
 
+def require_nicknames(fct):
+    def read_file(path):
+        global NICKNAMES
+        print("Trying {}".format(path))
+        with open(path, 'r') as handle:
+            NICKNAMES = handle.read().splitlines()
+
+    nicknames_files = [
+        os.path.join(BASE_DIR, 'data/nicknames_dict'),
+        "/usr/share/dict/american-english",
+    ]
+    if NICKNAMES is None:
+        for nick_file in nicknames_files:
+            try:
+                read_file(nick_file)
+                break
+            except FileNotFoundError:
+                pass
+        if NICKNAMES is None:
+            raise FileNotFoundError
+
+    return fct
+
+
 class InvalidData(Exception):
     ''' Thrown when the DB contains invalid data, and cannot perform
     something '''
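require_nicknames performs its file lookup when the decorator is applied, i.e. once at import time of the decorated module, and then returns the function unchanged. A stripped-down sketch of that behaviour (the word list is faked here instead of being read from disk):

NICKNAMES = None


def require_nicknames(fct):
    # Runs once, when the decorator is applied to a function at import time.
    global NICKNAMES
    if NICKNAMES is None:
        NICKNAMES = ["alice", "bob"]   # stands in for reading a word list file
    return fct


@require_nicknames
def create_profile():
    return NICKNAMES[0]


print(create_profile())   # NICKNAMES is guaranteed to be loaded by now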
@@ -72,7 +96,7 @@ class Website(models.Model):
         elif rand <= 0.1:
             url = random.choice(self.notable_pages).url
         elif rand <= 0.8:
             search_term_text = self.name + " " + \
                 random.choice(self.keywords)
             url = user.search_engine.search_url(search_term_text)
         else:
@@ -122,7 +146,6 @@ class Event(models.Model):
         return user.search_engine.search_url(" ".join(possibilities))
 
 
-
 class BrowserFingerprint(models.Model):
     ''' A browser fingerprint, containing things like a user agent '''
 
@@ -147,11 +170,11 @@ class BrowserFingerprint(models.Model):
 
     def serialize_headers(self):
         return {
-            "Description" : str(self.description),
-            "User-Agent" : str(self.useragent),
-            "Accept-Encoding" : str(self.accept_encoding),
-            "Accept" : str(self.accept_default),
-            "Accept-Language" : str(self.accept_lang),
+            "Description": str(self.description),
+            "User-Agent": str(self.useragent),
+            "Accept-Encoding": str(self.accept_encoding),
+            "Accept": str(self.accept_default),
+            "Accept-Language": str(self.accept_lang),
         }
 
 
@@ -162,8 +185,8 @@ class SearchEngine(models.Model):
     url = models.URLField()
     query_pattern = models.CharField(max_length=256)  # This field is the
     # query pattern. It should contain a `{}`, which, when substituted with a
-    # search term (using `.format()`), must yield a URL that can be resolved to
-    # perform the search
+    # search term (using `.format()`), must yield a URL tail that can be
+    # concatenated with `url` to perform a search (eg. `?q={}` for ddg).
 
     def __str__(self):
         return self.name
@@ -171,9 +194,10 @@ class SearchEngine(models.Model):
     def search_url(self, search_term):
         ''' Obtain a url to search `search_term` with this search engine '''
         pattern = str(self.query_pattern)
+        search_term = str(search_term).replace(' ', '+')
         if '{}' not in pattern:
             raise InvalidData("Search engine {}: bad pattern".format(self))
-        return str(self.query_pattern).format(search_term)
+        return self.url + (str(self.query_pattern).format(search_term))
 
 
 class Interest(models.Model):
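The updated comment and search_url() body describe building a search URL by concatenating the engine's base url with the formatted query_pattern, after replacing spaces in the term with '+'. A worked example using the DuckDuckGo-style pattern mentioned in the comment (the field values are illustrative):

url = "https://duckduckgo.com/"       # illustrative SearchEngine.url
query_pattern = "?q={}"               # illustrative SearchEngine.query_pattern

search_term = str("free software").replace(' ', '+')
print(url + query_pattern.format(search_term))
# -> https://duckduckgo.com/?q=free+software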
@@ -214,11 +238,13 @@ def generate_email(nick, first_name, last_name):
     if random.random() < 0.3:
         email = first_name + "." + last_name + "@" + domain
     else:
         email = nick + "@" + domain
     return email
 
 
+@require_nicknames
 def create_profile(nick=None):
-    nick = "".join(random.sample(NICKNAMES, random.randrange(2,5)))
+    nick = "".join(random.sample(NICKNAMES, random.randrange(2, 5)))
     first_name = random.choice(FIRSTNAMES)
     last_name = random.choice(LASTNAMES)
     email = generate_email(nick, first_name, last_name)
@@ -227,7 +253,11 @@ def create_profile(nick=None):
         first_name=first_name,
         last_name=last_name,
         email=email,
-        uses_url=(random.random() < 0.5),
+        uses_urls=(random.random() < 0.5),
     )
     profile.search_engine = random.choice(SearchEngine.objects.all())
     profile.browser_fingerprint = random.choice(BrowserFingerprint.objects.all())
+
+    profile.full_clean()
+    profile.save()
+    return profile