Compare commits

...

9 Commits

6 changed files with 271 additions and 76 deletions

View File

@@ -5,7 +5,7 @@ from urllib.error import URLError
from urllib.parse import urlparse
from ssl import CertificateError
from random import sample, randrange
from random import sample, randrange, randint
import re
from datetime import datetime, timedelta
@@ -15,6 +15,8 @@ import async_timeout
from bs4 import BeautifulSoup, Comment
from profiles.models import BrowserFingerprint, SearchEngine
# Ugly hack to use this module alone instead of integrating it with Django
# from django.conf import settings
@@ -26,13 +28,11 @@ MAX_PER_PAGE = 10
FOOTER_URL = re.compile(".*footer.*")
SEARCH_ENGINE = []
class Settings:
USER_AGENT = 'Default User'
settings = Settings()
startup_time = datetime.min
def url_getter(html, current_page, root_url):
@@ -82,8 +82,6 @@ def url_getter(html, current_page, root_url):
return links_list
class WebsiteSchedulerMeta(type):
""" Meta-class for WebsiteScheduler, allowing a singleton class-like
interface, but spawning one instance per canonical website URL """
@@ -106,12 +104,17 @@ class WebsiteSchedulerMeta(type):
class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
""" Schedule the accesses to a website as of robots.txt """
def __init__(self, name):
search_engines = [] # Must be set by CrawlingThread.__init__
def __init__(self, name, user_agent):
self.name = name
self.last_crawled = datetime.fromtimestamp(0)
self.dead = False
self.can_fetch_b = False
if any(self.urlroot() in item for item in SEARCH_ENGINE):
self.user_agent = (user_agent if user_agent is not None
else settings.USER_AGENT)
if any(self.urlroot() in item for item in self.search_engines):
print("found a search engine for %s" % self.urlroot())
self.crawl_delay = timedelta(seconds=5)
self.can_fetch_b = True
@@ -125,7 +128,7 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
robots_url = self.unsafe_urlroot() + 'robots.txt'
self.robot_parser = RobotFileParser(robots_url)
self.robot_parser.read()
except URLError: # Almost surely an offline website.
except URLError: # Almost surely an offline website.
self.dead = True
self.crawl_delay = 0
except Exception as e:
@@ -134,9 +137,9 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
if not self.robot_parser.default_entry:
self.dead = True
if not self.dead:
delay = self.robot_parser.crawl_delay(settings.USER_AGENT)
delay = self.robot_parser.crawl_delay(self.user_agent)
if delay is None:
req_rate = self.robot_parser.request_rate(settings.USER_AGENT)
req_rate = self.robot_parser.request_rate(self.user_agent)
if req_rate is None:
delay = 5
else:
@@ -159,7 +162,9 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
def can_fetch(self, url):
''' Check whether this program can fetch a given page '''
return (self.can_fetch_b ) or ((not self.dead) and self.robot_parser.can_fetch(settings.USER_AGENT, url))
return ((self.can_fetch_b)
or ((not self.dead) and
self.robot_parser.can_fetch(self.user_agent, url)))
def fetching(self):
''' Tell the scheduler that a page is being fetched now '''
@@ -170,26 +175,28 @@ class CrawlingThread(Thread):
""" A separate thread for the crawling task. This is needed to use asyncio,
since the thread will need its own event loop. """
def __init__(self, user, url, engine_list, queue):
global settings
global SEARCH_ENGINE
SEARCH_ENGINE = engine_list
def __init__(self, url, queue):
engine_list = [engine.url for engine in SearchEngine.objects.all()]
WebsiteScheduler.search_engines = engine_list
nb_fingerprint = len(BrowserFingerprint.objects.all())
fingerprint = BrowserFingerprint.objects.all()[
randint(0, nb_fingerprint - 1)]
self.headers = fingerprint.serialize_headers()
self.queue = queue
super(CrawlingThread, self).__init__()
if user:
settings.USER_AGENT = user.serialize_headers()
self.url = url
def run(self):
global startup_time
tasks = []
#tasks.append(async_crawler("http://plus.google.com/+Python"))
#tasks.append(async_crawler('https://python.org/'))
tasks.append(async_crawler(self.url, self.queue))
tasks.append(async_crawler(self.url, self.queue, self.headers))
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
startup_time = datetime.now()
loop.run_until_complete(asyncio.wait(tasks))
loop.close()
@@ -197,13 +204,16 @@ class CrawlingThread(Thread):
class PageGetter:
""" Asynchronously get a webpage, abiding by robots.txt """
def __init__(self, session, url):
headers = None
def __init__(self, session, url, user_agent):
self.url = url
self.session = session
self.user_agent = user_agent
async def get(self, ssl=True):
""" Actually retrieve the webpage """
scheduler = WebsiteScheduler(self.url)
scheduler = WebsiteScheduler(self.url, self.user_agent)
if not scheduler.can_fetch(self.url):
return None
@@ -226,16 +236,22 @@ async def async_print(url):
async with aiohttp.ClientSession() as session:
html = await PageGetter(session, url).get(ssl=False)
print('GOT {}HTML for {} at {}'.format(
print('GOT {}HTML for {}'.format(
'None ' if html is None else '',
url,
datetime.now() - startup_time))
))
async def async_crawler(url, queue, headers=None):
if headers is None:
headers = {
'User-Agent': settings.USER_AGENT,
}
async def async_crawler(url, queue):
queued = [url]
crawled = []
while queued and (len(crawled) < HARD_LIMIT):
async with aiohttp.ClientSession() as session:
async with aiohttp.ClientSession(headers=headers) as session:
try:
url = queued.pop(0)
except IndexError:
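
Taken together, the changes to this file drop the module-level settings/SEARCH_ENGINE globals: CrawlingThread now takes only a start URL and a result queue, reads the search-engine URLs from SearchEngine.objects.all() itself, and sends the headers of a randomly chosen BrowserFingerprint with every request. A minimal driving sketch of the new interface, assuming Django is configured and the browser-fingerprint and search-engine fixtures have been imported:

from queue import Queue

from crawl import crawl  # import path as used elsewhere in this changeset

queue = Queue()
# No more user / engine_list arguments; headers and engines come from the DB.
crawler = crawl.CrawlingThread('https://python.org/', queue)
crawler.start()
crawler.join()
crawled_urls = queue.get()  # async_crawler puts the list of crawled URLs on the queue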

data/.gitignore vendored Normal file (1 addition)
View File

@@ -0,0 +1 @@
nicknames_dict

View File

@@ -0,0 +1,34 @@
# Generated by Django 2.0.1 on 2018-02-25 19:08
from django.db import migrations, models
import django.db.models.deletion
class Migration(migrations.Migration):
initial = True
dependencies = [
('profiles', '0001_initial'),
]
operations = [
migrations.CreateModel(
name='History',
fields=[
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('start_ts', models.DateTimeField(help_text='The starting timestamp of the history. Useful for cron-like structure.')),
('played', models.BooleanField(default=False)),
('user', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='profiles.Profile')),
],
),
migrations.CreateModel(
name='HistoryEntry',
fields=[
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('search', models.URLField(help_text='The url to be searched')),
('timestamp', models.DateTimeField()),
('history', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='histories.History')),
],
),
]
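
This initial migration for the histories app creates the History and HistoryEntry tables that the model code below relies on. A rough sketch of the resulting ORM usage, assuming the profiles app already contains at least one Profile (field names are taken from the migration; the import path follows the standard Django layout):

from datetime import datetime

from histories.models import History, HistoryEntry
from profiles.models import Profile

user = Profile.objects.first()  # any existing profile
history = History(start_ts=datetime.now(), played=False, user=user)
history.save()
HistoryEntry(search='https://python.org/',
             timestamp=datetime.now(),
             history=history).save()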

View File

@@ -3,14 +3,26 @@ entries, which looks like human-based browsing, according to a dedicated user
interests, keywords...
"""
from collections import namedtuple
import random
from math import floor
from queue import Queue
from xml.etree import ElementTree as ET
from datetime import datetime
from django.db import models
import profiles.models as profiles
from tor_runner import TorInstance
from crawl import crawl
from pinocchio.settings import HISTORY_MIN
from .tor_runner import TorInstance
class InvalidXml(Exception):
def __init__(self, what='unexpected XML data.'):
super().__init__()
self.what = what
def __str__(self):
return "Invalid XML: " + self.what
class HistoryEntry(models.Model):
@@ -28,14 +40,48 @@ class HistoryEntry(models.Model):
"""
return "{} : {}".format(self.timestamp, self.search)
def to_xml(self, xml_root):
entry = ET.Element('history')
entry_url = ET.Element('url')
entry_url.text = self.search
entry_ts = ET.Element('timestamp')
entry_ts.text = self.timestamp.timestamp()
entry.append(entry_url)
entry.append(entry_ts)
xml_root.append(entry)
@staticmethod
def from_xml(xml_root, in_history):
if xml_root.tag != 'history':
raise InvalidXml("expected <history> tag here.")
url, timestamp = None, None
for child in xml_root:
if child.tag == 'url':
url = child.text
elif child.tag == 'timestamp':
try:
timestamp = datetime.fromtimestamp(child.text)
except TypeError:
raise InvalidXml("invalid timestamp {}".format(child.text))
else:
raise InvalidXml("unknown tag {} as child of <history>".format(
child.tag))
output = HistoryEntry()
output.search = url
output.timestamp = timestamp
output.history = in_history
return output
class History(models.Model):
""" A history for a user, containing some web connections (http, https).
Each history is timed, in a human-behaviour manner. """
start_ts = models.DateTimeField(
help_text='The starting timestamp of the history. Useful for cron-like '
'structure.'
help_text=('The starting timestamp of the history. Useful for '
'cron-like structure.')
)
played = models.BooleanField(default=False)
@@ -47,18 +93,18 @@ class History(models.Model):
def return_history(self):
""" Returns the history, sorted by increasing timestamps
"""
history_set = self.history_set.order_by('timestamp')
history_set = [(item.search, item.timestamp.date()) for item in history_set]
return history_set
output_history = self.historyentry_set.order_by('timestamp')
output_history = [(item.search, item.timestamp.date())
for item in output_history]
return output_history
def __str__(self):
""" Returns the string representation of a history.
"""
history_set = self.history_set.order_by('timestamp')
history_set = self.historyentry_set.order_by('timestamp')
header = "[History]:\n"
return header + "\n".join(history_set)
def play_histories(self):
""" Actually plays the history.
"""
@@ -66,6 +112,52 @@ class History(models.Model):
runner = TorInstance(self.history)
self.save()
def to_xml(self, xml_root):
''' Exports the current history to xml '''
hist_node = ET.Element("history", attrib={
'start-ts': self.start_ts,
'played': 1 if self.played else 0,
'user': self.user.pk,
})
xml_root.append(hist_node)
for entry in self.historyentry_set:
entry.to_xml(hist_node)
@staticmethod
def from_xml(xml_root):
''' Loads an history from an XML file '''
REQUIRED_ATTR = ['start-ts', 'played', 'user']
if xml_root.tag != 'history':
raise InvalidXml('unexpected node {} as root of an history'.format(
xml_root.tag))
for attr in REQUIRED_ATTR:
if attr not in xml_root.attrib:
raise InvalidXml(('missing attribute "{}" for tag of type '
'history').format(attr))
start_ts = xml_root.attrib['start-ts']
played = xml_root.attrib['played']
user_pk = xml_root.attrib['user']
users = History.objects.filter(pk=1)
if len(users) != 1:
raise InvalidXml('primary key for History {} is invalid'.format(
user_pk))
output = History()
output.start_ts = start_ts
output.played = played > 0
output.user = users[0]
for child in xml_root:
HistoryEntry.from_xml(child, output)
return output
PartialHistoryEntry = namedtuple('PartialHistoryEntry',
['url', 'timestamp'])
def generate_partial_history(user, t_start):
""" Generate the part of the history resulting from the crawl starting at
@@ -74,36 +166,51 @@ def generate_partial_history(user, t_start):
timestamp = t_start
result = []
basis = generate_first_url(user)
result.append((basis, timestamp))
timestamp += 5* random.weibullvariate(1, 1.5)
result.append(PartialHistoryEntry(basis, timestamp))
t_start += 5 * random.weibullvariate(1, 1.5)
queue = Queue()
search_engine_query = profiles.SearchEngine.objects.all()
search_engine_list = [item.url for item in search_engine_query]
crawler = crawl.CrawlingThread(user, basis, search_engine_list, queue)
crawler = crawl.CrawlingThread(basis, queue)
crawler.start()
crawler.join()
urls = queue.get()
for url in urls:
timestamp += 5* random.weibullvariate(1, 1.5)
result.append((url, timestamp))
t_start += 5 * random.weibullvariate(1, 1.5)
result.append(PartialHistoryEntry(url, timestamp))
return result
def generate_first_url(user):
""" Generate the first url of a partial history, based on the user
information. """
interest = random.choice(
[user.interests.keywords.all(), user.interests.places.all(),
user.interests.websites.all(), user.interests.events.all()
]
)
def nonempty(seq):
out = []
for elt in seq:
if elt:
out.append(elt)
return out
all_keywords = profiles.Keyword.objects.filter(
interest__profile__in=[user])
all_websites = profiles.Website.objects.filter(
interest__profile__in=[user])
all_places = profiles.Place.objects.filter(
interest__profile__in=[user])
all_events = profiles.Event.objects.filter(
interest__profile__in=[user])
interest = random.choice(nonempty([
all_keywords,
all_websites,
all_places,
all_events,
]))
search_term = random.choice(interest)
url = search_term.generate_url(user)
return url
def generate_history(user, ts_start):
def generate_history(user, start_time):
""" Generate a new history for the user `user`, starting from timestamp
`ts_start`.
A few heuristics are used in order to give the impression that the history
@@ -111,19 +218,27 @@ def generate_history(user, ts_start):
"""
# let's define a new history object.
history = History(start_ts=ts_start, user=user)
history = History(start_ts=start_time, user=user)
length = HISTORY_MIN + floor(10 * random.weibullvariate(1, 1.5))
history.full_clean()
history.save()
history_line = 0
current_timestamp = start_time.timestamp()
while history_line < length:
ts_start += 5 * random.weibullvariate(1, 2.8)
history_list = generate_partial_history(user, ts_start)
ts_start = history_list[-1].timestamp + 5 * random.weibullvariate(1, 5)
current_timestamp += 5 * random.weibullvariate(1, 2.8)
history_list = generate_partial_history(user, current_timestamp)
current_timestamp = \
history_list[-1].timestamp + 5 * random.weibullvariate(1, 5)
for (url, timestamp) in history_list:
new_line = HistoryEntry(
search=url,
timestamp=timestamp,
timestamp=datetime.fromtimestamp(timestamp),
history=history
)
new_line.full_clean()
new_line.save()
return history
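
The additions to this file are the XML (de)serialisation helpers and a reworked generation pipeline that tracks timestamps as POSIX floats (PartialHistoryEntry) and converts them back with datetime.fromtimestamp when the entries are saved. A hedged end-to-end sketch, assuming a Profile with non-empty interests exists and the crawler can reach the network:

from datetime import datetime

from histories.models import generate_history
from profiles.models import Profile

user = Profile.objects.first()
history = generate_history(user, datetime.now())  # start_time must be a datetime
for url, day in history.return_history():         # (search, date) pairs ordered by timestamp
    print(day, url)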

View File

@@ -1,11 +1,10 @@
#!/bin/bash
# -*- coding: UTF8 -*-
/usr/bin/python3 manage.py import_browser_fp
/usr/bin/python3 manage.py import_search_engine
/usr/bin/python3 manage.py import_keywords
/usr/bin/python3 manage.py import_website
/usr/bin/python3 manage.py import_places
/usr/bin/python3 manage.py import_events
/usr/bin/python3 manage.py import_interests
python3 manage.py import_browser_fp
python3 manage.py import_search_engine
python3 manage.py import_keywords
python3 manage.py import_website
python3 manage.py import_places
python3 manage.py import_events
python3 manage.py import_interests

View File

@@ -12,12 +12,36 @@ from django.db import models
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
NICKNAMES = open("/usr/share/dict/american-english").read().splitlines()
NICKNAMES = None
LASTNAMES = open(BASE_DIR + "/data/lastnames.txt").read().splitlines()
FIRSTNAMES = open(BASE_DIR + "/data/firstnames.txt").read().splitlines()
EMAIL_DOMAINS = open(BASE_DIR + "/data/email_domains.txt").read().splitlines()
def require_nicknames(fct):
def read_file(path):
global NICKNAMES
print("Trying {}".format(path))
with open(path, 'r') as handle:
NICKNAMES = handle.read().splitlines()
nicknames_files = [
os.path.join(BASE_DIR, 'data/nicknames_dict'),
"/usr/share/dict/american-english",
]
if NICKNAMES is None:
for nick_file in nicknames_files:
try:
read_file(nick_file)
break
except FileNotFoundError:
pass
if NICKNAMES is None:
raise FileNotFoundError
return fct
class InvalidData(Exception):
''' Thrown when the DB contains invalid data, and cannot perform
something '''
@@ -72,7 +96,7 @@ class Website(models.Model):
elif rand <= 0.1:
url = random.choice(self.notable_pages).url
elif rand <= 0.8:
search_term_text = self.name + " " + \
search_term_text = self.name + " " + \
random.choice(self.keywords)
url = user.search_engine.search_url(search_term_text)
else:
@@ -122,7 +146,6 @@ class Event(models.Model):
return user.search_engine.search_url(" ".join(possibilities))
class BrowserFingerprint(models.Model):
''' A browser fingerprint, containing things like a user agent '''
@@ -147,11 +170,11 @@ class BrowserFingerprint(models.Model):
def serialize_headers(self):
return {
"Description" : str(self.description),
"User-Agent" : str(self.useragent),
"Accept-Encoding" : str(self.accept_encoding),
"Accept" : str(self.accept_default),
"Accept-Language" : str(self.accept_lang),
"Description": str(self.description),
"User-Agent": str(self.useragent),
"Accept-Encoding": str(self.accept_encoding),
"Accept": str(self.accept_default),
"Accept-Language": str(self.accept_lang),
}
@@ -162,8 +185,8 @@ class SearchEngine(models.Model):
url = models.URLField()
query_pattern = models.CharField(max_length=256) # This field is the
# query pattern. It should contain a `{}`, which, when substituted with a
# search term (using `.format()`), must yield a URL that can be resolved to
# perform the search
# search term (using `.format()`), must yield a URL tail that can be
# concatenated with `url` to perform a search (eg. `?q={}` for ddg).
def __str__(self):
return self.name
@@ -171,9 +194,10 @@ class SearchEngine(models.Model):
def search_url(self, search_term):
''' Obtain a url to search `search_term` with this search engine '''
pattern = str(self.query_pattern)
search_term = str(search_term).replace(' ', '+')
if '{}' not in pattern:
raise InvalidData("Search engine {}: bad pattern".format(self))
return str(self.query_pattern).format(search_term)
return self.url + (str(self.query_pattern).format(search_term))
class Interest(models.Model):
@@ -214,11 +238,13 @@ def generate_email(nick, first_name, last_name):
if random.random() < 0.3:
email = first_name + "." + last_name + "@" + domain
else:
email = nick + "@" + domain
email = nick + "@" + domain
return email
@require_nicknames
def create_profile(nick=None):
nick = "".join(random.sample(NICKNAMES, random.randrange(2,5)))
nick = "".join(random.sample(NICKNAMES, random.randrange(2, 5)))
first_name = random.choice(FIRSTNAMES)
last_name = random.choice(LASTNAMES)
email = generate_email(nick, first_name, last_name)
@ -227,7 +253,11 @@ def create_profile(nick=None):
first_name=first_name,
last_name=last_name,
email=email,
uses_url=(random.random() < 0.5),
uses_urls=(random.random() < 0.5),
)
profile.search_engine = random.choice(SearchEngine.objects.all())
profile.browser_fingerprint = random.choice(BrowserFingerprint.objects.all())
profile.full_clean()
profile.save()
return profile
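
Finally, SearchEngine.search_url now validates the query pattern and appends it to the engine's base url, while create_profile loads nicknames lazily through require_nicknames and picks a random search engine and browser fingerprint. An illustrative sketch (the DuckDuckGo values follow the '?q={}' hint in the comment above; real rows normally come from the import_search_engine fixture):

from profiles.models import SearchEngine, create_profile

ddg = SearchEngine(name='DuckDuckGo',
                   url='https://duckduckgo.com/',
                   query_pattern='?q={}')
print(ddg.search_url('python asyncio'))
# -> https://duckduckgo.com/?q=python+asyncio

# Needs data/nicknames_dict (or the system dictionary) plus imported fixtures.
profile = create_profile()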