Crawl: do not use global SEARCH_ENGINES

Théophile Bastian 2018-02-26 11:45:08 +01:00
parent 4f0148cb63
commit a4de51b84a
2 changed files with 9 additions and 10 deletions


@@ -15,7 +15,7 @@ import async_timeout
 
 from bs4 import BeautifulSoup, Comment
-from profiles.models import BrowserFingerprint
+from profiles.models import BrowserFingerprint, SearchEngine
 
 # Ugly hack to use this module alone instead of integrating it with Django
 # from django.conf import settings
@@ -28,7 +28,6 @@ MAX_PER_PAGE = 10
 
 FOOTER_URL = re.compile(".*footer.*")
-SEARCH_ENGINE = []
 
 
 class Settings:
     USER_AGENT = 'Default User'
@@ -105,6 +104,9 @@ class WebsiteSchedulerMeta(type):
 
 class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
     """ Schedule the accesses to a website as of robots.txt """
+
+    search_engines = []  # Must be set by CrawlingThread.__init__
+
     def __init__(self, name, user_agent):
         self.name = name
         self.last_crawled = datetime.fromtimestamp(0)
@@ -112,7 +114,7 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
         self.can_fetch_b = False
         self.user_agent = (user_agent if user_agent is not None
                            else settings.USER_AGENT)
-        if any(self.urlroot() in item for item in SEARCH_ENGINE):
+        if any(self.urlroot() in item for item in self.search_engines):
            print("found a search engine for %s" % self.urlroot())
            self.crawl_delay = timedelta(seconds=5)
            self.can_fetch_b = True
@@ -173,10 +175,9 @@ class CrawlingThread(Thread):
     """ A separate thread for the crawling task. This is needed to use asyncio,
     since the thread will need its own event loop. """
 
-    def __init__(self, user, url, engine_list, queue):
-        global settings
-        global SEARCH_ENGINE
-        SEARCH_ENGINE = engine_list
+    def __init__(self, user, url, queue):
+        engine_list = [engine.url for engine in SearchEngine.objects.all()]
+        WebsiteScheduler.search_engines = engine_list
 
         nb_fingerprint = len(BrowserFingerprint.objects.all())
         fingerprint = BrowserFingerprint.objects.all()[
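The net effect in the crawler module is to replace the module-level SEARCH_ENGINE global with a WebsiteScheduler.search_engines class attribute, which CrawlingThread.__init__ now fills from the SearchEngine model before any scheduler is built. A minimal, dependency-free sketch of that pattern follows; the Scheduler and start_crawl names and the example URLs are illustrative only, not taken from the repository:

class Scheduler:
    # Shared by every instance, mirroring WebsiteScheduler.search_engines;
    # it must be populated before the first instance is created.
    search_engines = []

    def __init__(self, urlroot):
        self.urlroot = urlroot
        # Same membership test as the `any(self.urlroot() in item ...)` check above.
        self.is_search_engine = any(self.urlroot in item
                                    for item in self.search_engines)

def start_crawl(engine_urls, roots):
    # Plays the role of CrawlingThread.__init__: set the class attribute once,
    # instead of rebinding a module-level global with `global`.
    Scheduler.search_engines = engine_urls
    return [Scheduler(root) for root in roots]

if __name__ == "__main__":
    for s in start_crawl(["https://duckduckgo.com/?q={}"],
                         ["https://duckduckgo.com/", "https://example.org/"]):
        print(s.urlroot, s.is_search_engine)

The engine list remains shared mutable state, but it now lives on the class that consumes it, and callers no longer have to thread an engine_list argument through the constructor chain.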


@@ -77,9 +77,7 @@ def generate_partial_history(user, t_start):
         result.append((basis, timestamp))
         timestamp += 5* random.weibullvariate(1, 1.5)
     queue = Queue()
-    search_engine_query = profiles.SearchEngine.objects.all()
-    search_engine_list = [item.url for item in search_engine_query]
-    crawler = crawl.CrawlingThread(user, basis, search_engine_list, queue)
+    crawler = crawl.CrawlingThread(user, basis, queue)
     crawler.start()
     crawler.join()
     urls = queue.get()
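At the call site in generate_partial_history, the change amounts to dropping the ORM query and the extra constructor argument. A sketch of the resulting call pattern; it is not runnable on its own, since crawl.CrawlingThread needs the project's Django setup, user and basis are built earlier in that function, and the hunk does not show which Queue class is imported (the standard library queue.Queue is assumed here):

from queue import Queue

import crawl  # the crawler module changed above

queue = Queue()
crawler = crawl.CrawlingThread(user, basis, queue)  # no engine_list argument any more
crawler.start()
crawler.join()
urls = queue.get()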