diff --git a/crawl/crawl.py b/crawl/crawl.py
index 3d050a4..d005d20 100644
--- a/crawl/crawl.py
+++ b/crawl/crawl.py
@@ -15,7 +15,7 @@
 import async_timeout
 from bs4 import BeautifulSoup, Comment

-from profiles.models import BrowserFingerprint
+from profiles.models import BrowserFingerprint, SearchEngine

 # Ugly hack to use this module alone instead of integrating it with Django
 # from django.conf import settings
@@ -28,7 +28,6 @@
 MAX_PER_PAGE = 10

 FOOTER_URL = re.compile(".*footer.*")
-SEARCH_ENGINE = []

 class Settings:
     USER_AGENT = 'Default User'
@@ -105,6 +104,9 @@ class WebsiteSchedulerMeta(type):

 class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
     """ Schedule the accesses to a website as of robots.txt """
+
+    search_engines = [] # Must be set by CrawlingThread.__init__
+
     def __init__(self, name, user_agent):
         self.name = name
         self.last_crawled = datetime.fromtimestamp(0)
@@ -112,7 +114,7 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
         self.can_fetch_b = False
         self.user_agent = (user_agent if user_agent is not None
                            else settings.USER_AGENT)
-        if any(self.urlroot() in item for item in SEARCH_ENGINE):
+        if any(self.urlroot() in item for item in self.search_engines):
             print("found a search engine for %s" % self.urlroot())
             self.crawl_delay = timedelta(seconds=5)
             self.can_fetch_b = True
@@ -173,10 +175,9 @@ class CrawlingThread(Thread):
     """ A separate thread for the crawling task. This is needed to use asyncio,
     since the thread will need its own event loop. """

-    def __init__(self, user, url, engine_list, queue):
-        global settings
-        global SEARCH_ENGINE
-        SEARCH_ENGINE = engine_list
+    def __init__(self, user, url, queue):
+        engine_list = [engine.url for engine in SearchEngine.objects.all()]
+        WebsiteScheduler.search_engines = engine_list

         nb_fingerprint = len(BrowserFingerprint.objects.all())
         fingerprint = BrowserFingerprint.objects.all()[
diff --git a/histories/models.py b/histories/models.py
index d9b20a2..7fd0ae6 100644
--- a/histories/models.py
+++ b/histories/models.py
@@ -77,9 +77,7 @@ def generate_partial_history(user, t_start):
         result.append((basis, timestamp))
         timestamp += 5* random.weibullvariate(1, 1.5)
     queue = Queue()
-    search_engine_query = profiles.SearchEngine.objects.all()
-    search_engine_list = [item.url for item in search_engine_query]
-    crawler = crawl.CrawlingThread(user, basis, search_engine_list, queue)
+    crawler = crawl.CrawlingThread(user, basis, queue)
     crawler.start()
     crawler.join()
     urls = queue.get()
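
Reviewer note: the net effect of this patch is to retire the module-level SEARCH_ENGINE global (previously mutated via `global` inside CrawlingThread.__init__) in favor of a WebsiteScheduler.search_engines class attribute, and to move the SearchEngine ORM lookup into the crawler itself, so callers such as generate_partial_history no longer have to thread engine_list through. Below is a minimal, self-contained sketch of that injection pattern with no Django dependency; the engine URLs and the simplified constructors are illustrative assumptions, not code from this repository.

# Sketch (not part of the patch): the pattern this diff adopts.
# A class attribute set once by the coordinating thread replaces the
# old module-level global; each scheduler instance reads the shared list.
# The URLs below are hypothetical stand-ins for SearchEngine.objects.all().

class WebsiteScheduler:
    search_engines = []  # set by CrawlingThread before schedulers exist

    def __init__(self, name):
        self.name = name
        # Mirrors the patched check: is this site one of the search engines?
        self.is_search_engine = any(self.name in url
                                    for url in self.search_engines)

class CrawlingThread:
    def __init__(self):
        # The crawler performs the lookup itself and injects the result,
        # so callers no longer pass engine_list explicitly.
        WebsiteScheduler.search_engines = [
            "https://www.google.com/search?q={}",
            "https://duckduckgo.com/?q={}",
        ]

CrawlingThread()
assert WebsiteScheduler("duckduckgo.com").is_search_engine
assert not WebsiteScheduler("example.org").is_search_engine

One caveat worth flagging in review: a class attribute is still shared mutable state, so two CrawlingThread instances constructed with different engine lists would race on it, exactly as the old global did; the refactor simplifies the call sites rather than removing the global-state hazard.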