Crawl: do not use global SEARCH_ENGINES

Théophile Bastian 2018-02-26 11:45:08 +01:00
parent 4f0148cb63
commit a4de51b84a
2 changed files with 9 additions and 10 deletions


@@ -15,7 +15,7 @@ import async_timeout
 
 from bs4 import BeautifulSoup, Comment
-from profiles.models import BrowserFingerprint
+from profiles.models import BrowserFingerprint, SearchEngine
 
 # Ugly hack to use this module alone instead of integrating it with Django
 # from django.conf import settings
@@ -28,7 +28,6 @@ MAX_PER_PAGE = 10
 
 FOOTER_URL = re.compile(".*footer.*")
-SEARCH_ENGINE = []
 
 
 class Settings:
     USER_AGENT = 'Default User'
@@ -105,6 +104,9 @@ class WebsiteSchedulerMeta(type):
 
 class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
     """ Schedule the accesses to a website as of robots.txt """
+
+    search_engines = []  # Must be set by CrawlingThread.__init__
+
     def __init__(self, name, user_agent):
         self.name = name
         self.last_crawled = datetime.fromtimestamp(0)
@@ -112,7 +114,7 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
         self.can_fetch_b = False
         self.user_agent = (user_agent if user_agent is not None
                            else settings.USER_AGENT)
-        if any(self.urlroot() in item for item in SEARCH_ENGINE):
+        if any(self.urlroot() in item for item in self.search_engines):
            print("found a search engine for %s" % self.urlroot())
            self.crawl_delay = timedelta(seconds=5)
            self.can_fetch_b = True
@@ -173,10 +175,9 @@ class CrawlingThread(Thread):
     """ A separate thread for the crawling task. This is needed to use asyncio,
     since the thread will need its own event loop. """
 
-    def __init__(self, user, url, engine_list, queue):
-        global settings
-        global SEARCH_ENGINE
-        SEARCH_ENGINE = engine_list
+    def __init__(self, user, url, queue):
+        engine_list = [engine.url for engine in SearchEngine.objects.all()]
+        WebsiteScheduler.search_engines = engine_list
 
         nb_fingerprint = len(BrowserFingerprint.objects.all())
         fingerprint = BrowserFingerprint.objects.all()[
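The net effect in the crawler module is to replace the module-level SEARCH_ENGINE global with a WebsiteScheduler.search_engines class attribute, which CrawlingThread.__init__ now fills from the SearchEngine model before any scheduler is built. A minimal, dependency-free sketch of that pattern follows; the Scheduler and start_crawl names and the example URLs are illustrative only, not taken from the repository:

class Scheduler:
    # Shared by every instance, mirroring WebsiteScheduler.search_engines;
    # it must be populated before the first instance is created.
    search_engines = []

    def __init__(self, urlroot):
        self.urlroot = urlroot
        # Same membership test as the `any(self.urlroot() in item ...)` check above.
        self.is_search_engine = any(self.urlroot in item
                                    for item in self.search_engines)

def start_crawl(engine_urls, roots):
    # Plays the role of CrawlingThread.__init__: set the class attribute once,
    # instead of rebinding a module-level global with `global`.
    Scheduler.search_engines = engine_urls
    return [Scheduler(root) for root in roots]

if __name__ == "__main__":
    for s in start_crawl(["https://duckduckgo.com/?q={}"],
                         ["https://duckduckgo.com/", "https://example.org/"]):
        print(s.urlroot, s.is_search_engine)

The engine list remains shared mutable state, but it now lives on the class that consumes it, and callers no longer have to thread an engine_list argument through the constructor chain.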


@@ -77,9 +77,7 @@ def generate_partial_history(user, t_start):
         result.append((basis, timestamp))
         timestamp += 5* random.weibullvariate(1, 1.5)
     queue = Queue()
-    search_engine_query = profiles.SearchEngine.objects.all()
-    search_engine_list = [item.url for item in search_engine_query]
-    crawler = crawl.CrawlingThread(user, basis, search_engine_list, queue)
+    crawler = crawl.CrawlingThread(user, basis, queue)
     crawler.start()
     crawler.join()
     urls = queue.get()
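At the call site in generate_partial_history, the change amounts to dropping the ORM query and the extra constructor argument. A sketch of the resulting call pattern; it is not runnable on its own, since crawl.CrawlingThread needs the project's Django setup, user and basis are built earlier in that function, and the hunk does not show which Queue class is imported (the standard library queue.Queue is assumed here):

from queue import Queue

import crawl  # the crawler module changed above

queue = Queue()
crawler = crawl.CrawlingThread(user, basis, queue)  # no engine_list argument any more
crawler.start()
crawler.join()
urls = queue.get()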