Compare commits

...

9 Commits

6 changed files with 271 additions and 76 deletions


@@ -5,7 +5,7 @@ from urllib.error import URLError
 from urllib.parse import urlparse
 from ssl import CertificateError
-from random import sample, randrange
+from random import sample, randrange, randint
 import re
 from datetime import datetime, timedelta
@@ -15,6 +15,8 @@ import async_timeout
 from bs4 import BeautifulSoup, Comment
+from profiles.models import BrowserFingerprint, SearchEngine

 # Ugly hack to use this module alone instead of integrating it with Django
 # from django.conf import settings
@@ -26,13 +28,11 @@ MAX_PER_PAGE = 10
 FOOTER_URL = re.compile(".*footer.*")
-SEARCH_ENGINE = []

 class Settings:
     USER_AGENT = 'Default User'

 settings = Settings()
-startup_time = datetime.min


 def url_getter(html, current_page, root_url):
@@ -82,8 +82,6 @@ def url_getter(html, current_page, root_url):
     return links_list


 class WebsiteSchedulerMeta(type):
     """ Meta-class for WebsiteScheduler, allowing a singleton class-like
     interface, but spawning one instance per canonical website URL """
@@ -106,12 +104,17 @@ class WebsiteSchedulerMeta(type):
 class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
     """ Schedule the accesses to a website as of robots.txt """
-    def __init__(self, name):
+
+    search_engines = []  # Must be set by CrawlingThread.__init__
+
+    def __init__(self, name, user_agent):
         self.name = name
         self.last_crawled = datetime.fromtimestamp(0)
         self.dead = False
         self.can_fetch_b = False
-        if any(self.urlroot() in item for item in SEARCH_ENGINE):
+        self.user_agent = (user_agent if user_agent is not None
+                           else settings.USER_AGENT)
+        if any(self.urlroot() in item for item in self.search_engines):
             print("found a search engine for %s" % self.urlroot())
             self.crawl_delay = timedelta(seconds=5)
             self.can_fetch_b = True
@@ -125,7 +128,7 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
                 robots_url = self.unsafe_urlroot() + 'robots.txt'
                 self.robot_parser = RobotFileParser(robots_url)
                 self.robot_parser.read()
             except URLError:  # Almost surely an offline website.
                 self.dead = True
                 self.crawl_delay = 0
             except Exception as e:
@@ -134,9 +137,9 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
             if not self.robot_parser.default_entry:
                 self.dead = True
             if not self.dead:
-                delay = self.robot_parser.crawl_delay(settings.USER_AGENT)
+                delay = self.robot_parser.crawl_delay(self.user_agent)
                 if delay is None:
-                    req_rate = self.robot_parser.request_rate(settings.USER_AGENT)
+                    req_rate = self.robot_parser.request_rate(self.user_agent)
                     if req_rate is None:
                         delay = 5
                     else:
@@ -159,7 +162,9 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
     def can_fetch(self, url):
         ''' Check whether this program can fetch a given page '''
-        return (self.can_fetch_b ) or ((not self.dead) and self.robot_parser.can_fetch(settings.USER_AGENT, url))
+        return ((self.can_fetch_b)
+                or ((not self.dead) and
+                    self.robot_parser.can_fetch(self.user_agent, url)))

     def fetching(self):
         ''' Tell the scheduler that a page is being fetched now '''
@@ -170,26 +175,28 @@ class CrawlingThread(Thread):
     """ A separate thread for the crawling task. This is needed to use asyncio,
     since the thread will need its own event loop. """

-    def __init__(self, user, url, engine_list, queue):
-        global settings
-        global SEARCH_ENGINE
-        SEARCH_ENGINE = engine_list
+    def __init__(self, url, queue):
+        engine_list = [engine.url for engine in SearchEngine.objects.all()]
+        WebsiteScheduler.search_engines = engine_list
+
+        nb_fingerprint = len(BrowserFingerprint.objects.all())
+        fingerprint = BrowserFingerprint.objects.all()[
+            randint(0, nb_fingerprint - 1)]
+        self.headers = fingerprint.serialize_headers()
+
         self.queue = queue
         super(CrawlingThread, self).__init__()
-        if user:
-            settings.USER_AGENT = user.serialize_headers()
         self.url = url

     def run(self):
-        global startup_time
         tasks = []
         #tasks.append(async_crawler("http://plus.google.com/+Python"))
         #tasks.append(async_crawler('https://python.org/'))
-        tasks.append(async_crawler(self.url, self.queue))
+        tasks.append(async_crawler(self.url, self.queue, self.headers))

         loop = asyncio.new_event_loop()
         asyncio.set_event_loop(loop)
-        startup_time = datetime.now()
         loop.run_until_complete(asyncio.wait(tasks))
         loop.close()
@@ -197,13 +204,16 @@ class CrawlingThread(Thread):
 class PageGetter:
     """ Asynchronously get a webpage, abiding by robots.txt """
-    def __init__(self, session, url):
+
+    headers = None
+
+    def __init__(self, session, url, user_agent):
         self.url = url
         self.session = session
+        self.user_agent = user_agent

     async def get(self, ssl=True):
         """ Actually retrieve the webpage """
-        scheduler = WebsiteScheduler(self.url)
+        scheduler = WebsiteScheduler(self.url, self.user_agent)
         if not scheduler.can_fetch(self.url):
             return None
@@ -226,16 +236,22 @@ async def async_print(url):
     async with aiohttp.ClientSession() as session:
         html = await PageGetter(session, url).get(ssl=False)

-        print('GOT {}HTML for {} at {}'.format(
+        print('GOT {}HTML for {}'.format(
             'None ' if html is None else '',
             url,
-            datetime.now() - startup_time))
+        ))

-async def async_crawler(url, queue):
+
+async def async_crawler(url, queue, headers=None):
+    if headers is None:
+        headers = {
+            'User-Agent': settings.USER_AGENT,
+        }
     queued = [url]
     crawled = []
     while queued and (len(crawled) < HARD_LIMIT):
-        async with aiohttp.ClientSession() as session:
+        async with aiohttp.ClientSession(headers=headers) as session:
             try:
                 url = queued.pop(0)
             except IndexError:
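
Side note on the new interface: after this change the caller no longer passes a user, search-engine list, or fingerprint to CrawlingThread; it only supplies a start URL and a queue, and the thread pulls SearchEngine and BrowserFingerprint rows from the database itself. A minimal usage sketch (assuming Django is configured and the profiles fixtures are imported), mirroring how generate_partial_history in histories/models.py drives it further down:

    from queue import Queue
    from crawl import crawl

    queue = Queue()
    crawler = crawl.CrawlingThread('https://python.org/', queue)  # url and queue only
    crawler.start()
    crawler.join()
    urls = queue.get()  # URLs collected by async_crawler during the crawl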

data/.gitignore (new file)

@@ -0,0 +1 @@
+nicknames_dict


@@ -0,0 +1,34 @@
+# Generated by Django 2.0.1 on 2018-02-25 19:08
+
+from django.db import migrations, models
+import django.db.models.deletion
+
+
+class Migration(migrations.Migration):
+
+    initial = True
+
+    dependencies = [
+        ('profiles', '0001_initial'),
+    ]
+
+    operations = [
+        migrations.CreateModel(
+            name='History',
+            fields=[
+                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
+                ('start_ts', models.DateTimeField(help_text='The starting timestamp of the history. Useful for cron-like structure.')),
+                ('played', models.BooleanField(default=False)),
+                ('user', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='profiles.Profile')),
+            ],
+        ),
+        migrations.CreateModel(
+            name='HistoryEntry',
+            fields=[
+                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
+                ('search', models.URLField(help_text='The url to be searched')),
+                ('timestamp', models.DateTimeField()),
+                ('history', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='histories.History')),
+            ],
+        ),
+    ]


@@ -3,14 +3,26 @@ entries, which looks like human-based browsing, according to a dedicated user
 interests, keywords...
 """

+from collections import namedtuple
 import random
 from math import floor
 from queue import Queue
+from xml.etree import ElementTree as ET
+from datetime import datetime

 from django.db import models
 import profiles.models as profiles
-from tor_runner import TorInstance
 from crawl import crawl
 from pinocchio.settings import HISTORY_MIN
+from .tor_runner import TorInstance
+
+
+class InvalidXml(Exception):
+    def __init__(self, what='unexpected XML data.'):
+        super().__init__()
+        self.what = what
+
+    def __str__(self):
+        return "Invalid XML: " + self.what


 class HistoryEntry(models.Model):
@@ -28,14 +40,48 @@ class HistoryEntry(models.Model):
         """
         return "{} : {}".format(self.timestamp, self.search)

+    def to_xml(self, xml_root):
+        entry = ET.Element('history')
+        entry_url = ET.Element('url')
+        entry_url.text = self.search
+        entry_ts = ET.Element('timestamp')
+        entry_ts.text = self.timestamp.timestamp()
+        entry.append(entry_url)
+        entry.append(entry_ts)
+        xml_root.append(entry)
+
+    @staticmethod
+    def from_xml(xml_root, in_history):
+        if xml_root.tag != 'history':
+            raise InvalidXml("expected <history> tag here.")
+        url, timestamp = None, None
+        for child in xml_root:
+            if child.tag == 'url':
+                url = child.text
+            elif child.tag == 'timestamp':
+                try:
+                    timestamp = datetime.fromtimestamp(child.text)
+                except TypeError:
+                    raise InvalidXml("invalid timestamp {}".format(child.text))
+            else:
+                raise InvalidXml("unknown tag {} as child of <history>".format(
+                    child.tag))
+        output = HistoryEntry()
+        output.search = url
+        output.timestamp = timestamp
+        output.history = in_history
+        return output
+

 class History(models.Model):
     """ A history for a user, containing some web connections (http, https).
     Each history is timed, in a human-behaviour manner. """

     start_ts = models.DateTimeField(
-        help_text='The starting timestamp of the history. Useful for cron-like '
-        'structure.'
+        help_text=('The starting timestamp of the history. Useful for '
+                   'cron-like structure.')
     )

     played = models.BooleanField(default=False)
@@ -47,18 +93,18 @@ class History(models.Model):
     def return_history(self):
         """ Returns the history, sorted by increasing timestamps
        """
-        history_set = self.history_set.order_by('timestamp')
-        history_set = [(item.search, item.timestamp.date()) for item in history_set]
-        return history_set
+        output_history = self.historyentry_set.order_by('timestamp')
+        output_history = [(item.search, item.timestamp.date())
+                          for item in output_history]
+        return output_history

     def __str__(self):
         """ Returns the string representation of a history.
        """
-        history_set = self.history_set.order_by('timestamp')
+        history_set = self.historyentry_set.order_by('timestamp')
         header = "[History]:\n"
         return header + "\n".join(history_set)

     def play_histories(self):
         """ Actually plays the history.
        """
@@ -66,6 +112,52 @@ class History(models.Model):
         runner = TorInstance(self.history)
         self.save()

+    def to_xml(self, xml_root):
+        ''' Exports the current history to xml '''
+        hist_node = ET.Element("history", attrib={
+            'start-ts': self.start_ts,
+            'played': 1 if self.played else 0,
+            'user': self.user.pk,
+        })
+        xml_root.append(hist_node)
+        for entry in self.historyentry_set:
+            entry.to_xml(hist_node)
+
+    @staticmethod
+    def from_xml(xml_root):
+        ''' Loads an history from an XML file '''
+        REQUIRED_ATTR = ['start-ts', 'played', 'user']
+        if xml_root.tag != 'history':
+            raise InvalidXml('unexpected node {} as root of an history'.format(
+                xml_root.tag))
+        for attr in REQUIRED_ATTR:
+            if attr not in xml_root.attrib:
+                raise InvalidXml(('missing attribute "{}" for tag of type '
+                                  'history').format(attr))
+        start_ts = xml_root.attrib['start-ts']
+        played = xml_root.attrib['played']
+        user_pk = xml_root.attrib['user']
+        users = History.objects.filter(pk=1)
+        if len(users) != 1:
+            raise InvalidXml('primary key for History {} is invalid'.format(
+                user_pk))
+        output = History()
+        output.start_ts = start_ts
+        output.played = played > 0
+        output.user = users[0]
+        for child in xml_root:
+            HistoryEntry.from_xml(child, output)
+        return output
+
+
+PartialHistoryEntry = namedtuple('PartialHistoryEntry',
+                                 ['url', 'timestamp'])
+

 def generate_partial_history(user, t_start):
     """ Generate the part of the history resulting from the crawl starting at
@@ -74,36 +166,51 @@ def generate_partial_history(user, t_start):
     timestamp = t_start
     result = []
     basis = generate_first_url(user)
-    result.append((basis, timestamp))
-    timestamp += 5* random.weibullvariate(1, 1.5)
+    result.append(PartialHistoryEntry(basis, timestamp))
+    t_start += 5 * random.weibullvariate(1, 1.5)
     queue = Queue()
-    search_engine_query = profiles.SearchEngine.objects.all()
-    search_engine_list = [item.url for item in search_engine_query]
-    crawler = crawl.CrawlingThread(user, basis, search_engine_list, queue)
+    crawler = crawl.CrawlingThread(basis, queue)
     crawler.start()
     crawler.join()
     urls = queue.get()
     for url in urls:
-        timestamp += 5* random.weibullvariate(1, 1.5)
-        result.append((url, timestamp))
+        t_start += 5 * random.weibullvariate(1, 1.5)
+        result.append(PartialHistoryEntry(url, timestamp))
     return result


 def generate_first_url(user):
     """ Generate the first url of a partial history, based on the user
     information. """
-    interest = random.choice(
-        [user.interests.keywords.all(), user.interests.places.all(),
-         user.interests.websites.all(), user.interests.events.all()
-         ]
-    )
+
+    def nonempty(seq):
+        out = []
+        for elt in seq:
+            if elt:
+                out.append(elt)
+        return out
+
+    all_keywords = profiles.Keyword.objects.filter(
+        interest__profile__in=[user])
+    all_websites = profiles.Website.objects.filter(
+        interest__profile__in=[user])
+    all_places = profiles.Place.objects.filter(
+        interest__profile__in=[user])
+    all_events = profiles.Event.objects.filter(
+        interest__profile__in=[user])
+
+    interest = random.choice(nonempty([
+        all_keywords,
+        all_websites,
+        all_places,
+        all_events,
+    ]))
+
     search_term = random.choice(interest)
     url = search_term.generate_url(user)
     return url


-def generate_history(user, ts_start):
+def generate_history(user, start_time):
     """ Generate a new history for the user `user`, starting from timestamp
     `ts_start`.

     A few heuristics are used in order to give the impression that the history
@@ -111,19 +218,27 @@ def generate_history(user, ts_start):
     """
     # let's define a new history object.
-    history = History(start_ts=ts_start, user=user)
+    history = History(start_ts=start_time, user=user)
     length = HISTORY_MIN + floor(10 * random.weibullvariate(1, 1.5))
+    history.full_clean()
+    history.save()

     history_line = 0
+    current_timestamp = start_time.timestamp()

     while history_line < length:
-        ts_start += 5 * random.weibullvariate(1, 2.8)
-        history_list = generate_partial_history(user, ts_start)
-        ts_start = history_list[-1].timestamp + 5 * random.weibullvariate(1, 5)
+        current_timestamp += 5 * random.weibullvariate(1, 2.8)
+        history_list = generate_partial_history(user, current_timestamp)
+        current_timestamp = \
+            history_list[-1].timestamp + 5 * random.weibullvariate(1, 5)
         for (url, timestamp) in history_list:
             new_line = HistoryEntry(
                 search=url,
-                timestamp=timestamp,
+                timestamp=datetime.fromtimestamp(timestamp),
                 history=history
             )
+            new_line.full_clean()
             new_line.save()
+
+    return history
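
For orientation, a rough sketch of how the new XML round-trip on History is meant to be used (hypothetical driver code: the 'histories' wrapper tag and the file name are assumptions, and it presupposes saved History rows with their entries):

    from xml.etree import ElementTree as ET

    # Export every history under one container element.
    root = ET.Element('histories')          # hypothetical wrapper tag
    for hist in History.objects.all():
        hist.to_xml(root)
    ET.ElementTree(root).write('histories.xml')

    # Re-import: each <history> child is handed back to History.from_xml.
    for node in ET.parse('histories.xml').getroot():
        restored = History.from_xml(node)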


@@ -1,11 +1,10 @@
 #!/bin/bash
 # -*- coding: UTF8 -*-
-/usr/bin/python3 manage.py import_browser_fp
-/usr/bin/python3 manage.py import_search_engine
-/usr/bin/python3 manage.py import_keywords
-/usr/bin/python3 manage.py import_website
-/usr/bin/python3 manage.py import_places
-/usr/bin/python3 manage.py import_events
-/usr/bin/python3 manage.py import_interests
+python3 manage.py import_browser_fp
+python3 manage.py import_search_engine
+python3 manage.py import_keywords
+python3 manage.py import_website
+python3 manage.py import_places
+python3 manage.py import_events
+python3 manage.py import_interests


@@ -12,12 +12,36 @@ from django.db import models
 BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

-NICKNAMES = open("/usr/share/dict/american-english").read().splitlines()
+NICKNAMES = None
 LASTNAMES = open(BASE_DIR + "/data/lastnames.txt").read().splitlines()
 FIRSTNAMES = open(BASE_DIR + "/data/firstnames.txt").read().splitlines()
 EMAIL_DOMAINS = open(BASE_DIR + "/data/email_domains.txt").read().splitlines()


+def require_nicknames(fct):
+    def read_file(path):
+        global NICKNAMES
+        print("Trying {}".format(path))
+        with open(path, 'r') as handle:
+            NICKNAMES = handle.read().splitlines()
+
+    nicknames_files = [
+        os.path.join(BASE_DIR, 'data/nicknames_dict'),
+        "/usr/share/dict/american-english",
+    ]
+    if NICKNAMES is None:
+        for nick_file in nicknames_files:
+            try:
+                read_file(nick_file)
+                break
+            except FileNotFoundError:
+                pass
+        if NICKNAMES is None:
+            raise FileNotFoundError
+    return fct
+
+
 class InvalidData(Exception):
     ''' Thrown when the DB contains invalid data, and cannot perform
     something '''
@@ -72,7 +96,7 @@ class Website(models.Model):
         elif rand <= 0.1:
             url = random.choice(self.notable_pages).url
         elif rand <= 0.8:
             search_term_text = self.name + " " + \
                 random.choice(self.keywords)
             url = user.search_engine.search_url(search_term_text)
         else:
@@ -122,7 +146,6 @@ class Event(models.Model):
         return user.search_engine.search_url(" ".join(possibilities))


 class BrowserFingerprint(models.Model):
     ''' A browser fingerprint, containing things like a user agent '''
@@ -147,11 +170,11 @@ class BrowserFingerprint(models.Model):
     def serialize_headers(self):
         return {
-            "Description" : str(self.description),
-            "User-Agent" : str(self.useragent),
-            "Accept-Encoding" : str(self.accept_encoding),
-            "Accept" : str(self.accept_default),
-            "Accept-Language" : str(self.accept_lang),
+            "Description": str(self.description),
+            "User-Agent": str(self.useragent),
+            "Accept-Encoding": str(self.accept_encoding),
+            "Accept": str(self.accept_default),
+            "Accept-Language": str(self.accept_lang),
         }
@@ -162,8 +185,8 @@ class SearchEngine(models.Model):
     url = models.URLField()
     query_pattern = models.CharField(max_length=256)  # This field is the
     # query pattern. It should contain a `{}`, which, when substituted with a
-    # search term (using `.format()`), must yield a URL that can be resolved to
-    # perform the search
+    # search term (using `.format()`), must yield a URL tail that can be
+    # concatenated with `url` to perform a search (eg. `?q={}` for ddg).

     def __str__(self):
         return self.name
@@ -171,9 +194,10 @@ class SearchEngine(models.Model):
     def search_url(self, search_term):
         ''' Obtain a url to search `search_term` with this search engine '''
         pattern = str(self.query_pattern)
+        search_term = str(search_term).replace(' ', '+')
         if '{}' not in pattern:
             raise InvalidData("Search engine {}: bad pattern".format(self))
-        return str(self.query_pattern).format(search_term)
+        return self.url + (str(self.query_pattern).format(search_term))


 class Interest(models.Model):
@@ -214,11 +238,13 @@ def generate_email(nick, first_name, last_name):
     if random.random() < 0.3:
         email = first_name + "." + last_name + "@" + domain
     else:
         email = nick + "@" + domain
     return email


+@require_nicknames
 def create_profile(nick=None):
-    nick = "".join(random.sample(NICKNAMES, random.randrange(2,5)))
+    nick = "".join(random.sample(NICKNAMES, random.randrange(2, 5)))
     first_name = random.choice(FIRSTNAMES)
     last_name = random.choice(LASTNAMES)
     email = generate_email(nick, first_name, last_name)
@@ -227,7 +253,11 @@ def create_profile(nick=None):
         first_name=first_name,
         last_name=last_name,
         email=email,
-        uses_url=(random.random() < 0.5),
+        uses_urls=(random.random() < 0.5),
     )
     profile.search_engine = random.choice(SearchEngine.objects.all())
     profile.browser_fingerprint = random.choice(BrowserFingerprint.objects.all())
+    profile.full_clean()
+    profile.save()
+    return profile
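
To illustrate the reworked SearchEngine.search_url: the query pattern is now only the URL tail, the engine's base url is prepended, and spaces in the search term become '+'. A small sketch with a hypothetical engine row (the field values are assumptions, following the ddg example in the comment above):

    engine = SearchEngine(name='DuckDuckGo',
                          url='https://duckduckgo.com/',
                          query_pattern='?q={}')
    engine.search_url('python asyncio')
    # -> 'https://duckduckgo.com/?q=python+asyncio'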