Crawl: do not use global SEARCH_ENGINE

2018-02-26 11:45:08 +01:00 · 2018-02-26 11:45:08 +01:00 · fd4e1d35c7
commit fd4e1d35c7
parent 8f1d69bc41
2 changed files with 9 additions and 10 deletions
--- a/crawl/crawl.py
+++ b/crawl/crawl.py
@ -15,7 +15,7 @@ import async_timeout

 from bs4 import BeautifulSoup, Comment

-from profiles.models import BrowserFingerprint
+from profiles.models import BrowserFingerprint, SearchEngine

 # Ugly hack to use this module alone instead of integrating it with Django
 # from django.conf import settings
@ -28,7 +28,6 @@ MAX_PER_PAGE = 10

 FOOTER_URL = re.compile(".*footer.*")

-SEARCH_ENGINE = []

 class Settings:
    USER_AGENT = 'Default User'
@ -105,6 +104,9 @@ class WebsiteSchedulerMeta(type):

 class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
    """ Schedule the accesses to a website as of robots.txt """
+
+    search_engines = []  # Must be set by CrawlingThread.__init__
+
    def __init__(self, name, user_agent):
        self.name = name
        self.last_crawled = datetime.fromtimestamp(0)
@ -112,7 +114,7 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
        self.can_fetch_b = False
        self.user_agent = (user_agent if user_agent is not None
                           else settings.USER_AGENT)
-        if any(self.urlroot() in item for item in SEARCH_ENGINE):
+        if any(self.urlroot() in item for item in self.search_engines):
            print("found a search engine for %s" % self.urlroot())
            self.crawl_delay = timedelta(seconds=5)
            self.can_fetch_b = True
@ -173,10 +175,9 @@ class CrawlingThread(Thread):
    """ A separate thread for the crawling task. This is needed to use asyncio,
    since the thread will need its own event loop. """

-    def __init__(self, user, url, engine_list, queue):
-        global settings
-        global SEARCH_ENGINE
-        SEARCH_ENGINE = engine_list
+    def __init__(self, user, url, queue):
+        engine_list = [engine.url for engine in SearchEngine.objects.all()]
+        WebsiteScheduler.search_engines = engine_list

        nb_fingerprint = len(BrowserFingerprint.objects.all())
        fingerprint = BrowserFingerprint.objects.all()[
--- a/histories/models.py
+++ b/histories/models.py
@ -77,9 +77,7 @@ def generate_partial_history(user, t_start):
    result.append((basis, timestamp))
    timestamp += 5* random.weibullvariate(1, 1.5)
    queue = Queue()
-    search_engine_query = profiles.SearchEngine.objects.all()
-    search_engine_list = [item.url for item in search_engine_query]
-    crawler = crawl.CrawlingThread(user, basis, search_engine_list, queue)
+    crawler = crawl.CrawlingThread(user, basis, queue)
    crawler.start()
    crawler.join()
    urls = queue.get()