Crawl: do not use global SEARCH_ENGINE
This commit is contained in:
parent
8f1d69bc41
commit
fd4e1d35c7
2 changed files with 9 additions and 10 deletions
|
@ -15,7 +15,7 @@ import async_timeout
|
|||
|
||||
from bs4 import BeautifulSoup, Comment
|
||||
|
||||
from profiles.models import BrowserFingerprint
|
||||
from profiles.models import BrowserFingerprint, SearchEngine
|
||||
|
||||
# Ugly hack to use this module alone instead of integrating it with Django
|
||||
# from django.conf import settings
|
||||
|
@ -28,7 +28,6 @@ MAX_PER_PAGE = 10
|
|||
|
||||
FOOTER_URL = re.compile(".*footer.*")
|
||||
|
||||
SEARCH_ENGINE = []
|
||||
|
||||
class Settings:
|
||||
USER_AGENT = 'Default User'
|
||||
|
@ -105,6 +104,9 @@ class WebsiteSchedulerMeta(type):
|
|||
|
||||
class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
|
||||
""" Schedule the accesses to a website as of robots.txt """
|
||||
|
||||
search_engines = [] # Must be set by CrawlingThread.__init__
|
||||
|
||||
def __init__(self, name, user_agent):
|
||||
self.name = name
|
||||
self.last_crawled = datetime.fromtimestamp(0)
|
||||
|
@ -112,7 +114,7 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
|
|||
self.can_fetch_b = False
|
||||
self.user_agent = (user_agent if user_agent is not None
|
||||
else settings.USER_AGENT)
|
||||
if any(self.urlroot() in item for item in SEARCH_ENGINE):
|
||||
if any(self.urlroot() in item for item in self.search_engines):
|
||||
print("found a search engine for %s" % self.urlroot())
|
||||
self.crawl_delay = timedelta(seconds=5)
|
||||
self.can_fetch_b = True
|
||||
|
@ -173,10 +175,9 @@ class CrawlingThread(Thread):
|
|||
""" A separate thread for the crawling task. This is needed to use asyncio,
|
||||
since the thread will need its own event loop. """
|
||||
|
||||
def __init__(self, user, url, engine_list, queue):
|
||||
global settings
|
||||
global SEARCH_ENGINE
|
||||
SEARCH_ENGINE = engine_list
|
||||
def __init__(self, user, url, queue):
|
||||
engine_list = [engine.url for engine in SearchEngine.objects.all()]
|
||||
WebsiteScheduler.search_engines = engine_list
|
||||
|
||||
nb_fingerprint = len(BrowserFingerprint.objects.all())
|
||||
fingerprint = BrowserFingerprint.objects.all()[
|
||||
|
|
|
@ -77,9 +77,7 @@ def generate_partial_history(user, t_start):
|
|||
result.append((basis, timestamp))
|
||||
timestamp += 5* random.weibullvariate(1, 1.5)
|
||||
queue = Queue()
|
||||
search_engine_query = profiles.SearchEngine.objects.all()
|
||||
search_engine_list = [item.url for item in search_engine_query]
|
||||
crawler = crawl.CrawlingThread(user, basis, search_engine_list, queue)
|
||||
crawler = crawl.CrawlingThread(user, basis, queue)
|
||||
crawler.start()
|
||||
crawler.join()
|
||||
urls = queue.get()
|
||||
|
|
Loading…
Reference in a new issue