Crawl: do not use global SEARCH_ENGINES

This commit is contained in:
Théophile Bastian 2018-02-26 11:45:08 +01:00
parent 4f0148cb63
commit a4de51b84a
2 changed files with 9 additions and 10 deletions

View file

@ -15,7 +15,7 @@ import async_timeout
from bs4 import BeautifulSoup, Comment
from profiles.models import BrowserFingerprint
from profiles.models import BrowserFingerprint, SearchEngine
# Ugly hack to use this module alone instead of integrating it with Django
# from django.conf import settings
@ -28,7 +28,6 @@ MAX_PER_PAGE = 10
FOOTER_URL = re.compile(".*footer.*")
SEARCH_ENGINE = []
class Settings:
USER_AGENT = 'Default User'
@ -105,6 +104,9 @@ class WebsiteSchedulerMeta(type):
class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
""" Schedule the accesses to a website as of robots.txt """
search_engines = [] # Must be set by CrawlingThread.__init__
def __init__(self, name, user_agent):
self.name = name
self.last_crawled = datetime.fromtimestamp(0)
@ -112,7 +114,7 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
self.can_fetch_b = False
self.user_agent = (user_agent if user_agent is not None
else settings.USER_AGENT)
if any(self.urlroot() in item for item in SEARCH_ENGINE):
if any(self.urlroot() in item for item in self.search_engines):
print("found a search engine for %s" % self.urlroot())
self.crawl_delay = timedelta(seconds=5)
self.can_fetch_b = True
@ -173,10 +175,9 @@ class CrawlingThread(Thread):
""" A separate thread for the crawling task. This is needed to use asyncio,
since the thread will need its own event loop. """
def __init__(self, user, url, engine_list, queue):
global settings
global SEARCH_ENGINE
SEARCH_ENGINE = engine_list
def __init__(self, user, url, queue):
engine_list = [engine.url for engine in SearchEngine.objects.all()]
WebsiteScheduler.search_engines = engine_list
nb_fingerprint = len(BrowserFingerprint.objects.all())
fingerprint = BrowserFingerprint.objects.all()[

View file

@ -77,9 +77,7 @@ def generate_partial_history(user, t_start):
result.append((basis, timestamp))
timestamp += 5* random.weibullvariate(1, 1.5)
queue = Queue()
search_engine_query = profiles.SearchEngine.objects.all()
search_engine_list = [item.url for item in search_engine_query]
crawler = crawl.CrawlingThread(user, basis, search_engine_list, queue)
crawler = crawl.CrawlingThread(user, basis, queue)
crawler.start()
crawler.join()
urls = queue.get()