Crawl: do not use global SEARCH_ENGINES
parent 4f0148cb63
commit a4de51b84a
2 changed files with 9 additions and 10 deletions
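In outline, the commit stops passing the search-engine URL list around through the module-level SEARCH_ENGINE global and instead stores it as a class attribute on WebsiteScheduler, which CrawlingThread fills in from the database. A minimal sketch of that handoff, with both classes stripped down to this one interaction (everything beyond the two class names and the search_engines attribute is illustrative):

class WebsiteScheduler:
    # Shared by every scheduler instance; set once before crawling starts.
    search_engines = []

    def __init__(self, name):
        self.name = name
        # Instances read the shared list through the class attribute.
        self.is_search_engine = any(name in url for url in self.search_engines)


class CrawlingThread:
    def __init__(self, engine_urls):
        # Replaces the old `global SEARCH_ENGINE` assignment.
        WebsiteScheduler.search_engines = engine_urls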
@@ -15,7 +15,7 @@ import async_timeout
 
 from bs4 import BeautifulSoup, Comment
 
-from profiles.models import BrowserFingerprint
+from profiles.models import BrowserFingerprint, SearchEngine
 
 # Ugly hack to use this module alone instead of integrating it with Django
 # from django.conf import settings
@@ -28,7 +28,6 @@ MAX_PER_PAGE = 10
 
 FOOTER_URL = re.compile(".*footer.*")
 
-SEARCH_ENGINE = []
 
 class Settings:
     USER_AGENT = 'Default User'
@@ -105,6 +104,9 @@ class WebsiteSchedulerMeta(type):
 
 class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
     """ Schedule the accesses to a website as of robots.txt """
+
+    search_engines = [] # Must be set by CrawlingThread.__init__
+
     def __init__(self, name, user_agent):
         self.name = name
         self.last_crawled = datetime.fromtimestamp(0)
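The new class attribute works because Python resolves self.search_engines against WebsiteScheduler when no instance attribute shadows it, so a single assignment on the class is seen by every scheduler, including instances created earlier. A small standalone illustration of that lookup rule (Scheduler and the URL are invented for the example):

class Scheduler:
    search_engines = []

existing = Scheduler()
Scheduler.search_engines = ["https://duckduckgo.com/?q={}"]

# The instance has no attribute of its own, so the class attribute is returned.
print(existing.search_engines)  # ['https://duckduckgo.com/?q={}']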
@@ -112,7 +114,7 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
         self.can_fetch_b = False
         self.user_agent = (user_agent if user_agent is not None
                            else settings.USER_AGENT)
-        if any(self.urlroot() in item for item in SEARCH_ENGINE):
+        if any(self.urlroot() in item for item in self.search_engines):
             print("found a search engine for %s" % self.urlroot())
             self.crawl_delay = timedelta(seconds=5)
             self.can_fetch_b = True
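The test above flags a site as a search engine when its URL root occurs as a substring of any configured engine URL, and in that case forces can_fetch_b to True and a fixed 5-second crawl delay. A toy run of the same check with invented values (urlroot() is assumed to return a scheme-plus-host prefix such as "https://duckduckgo.com/"):

search_engines = ["https://duckduckgo.com/?q={}", "https://www.qwant.com/?q={}"]

print(any("https://duckduckgo.com/" in item for item in search_engines))  # True  -> search engine
print(any("https://example.org/" in item for item in search_engines))     # False -> regular site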
@@ -173,10 +175,9 @@ class CrawlingThread(Thread):
     """ A separate thread for the crawling task. This is needed to use asyncio,
     since the thread will need its own event loop. """
 
-    def __init__(self, user, url, engine_list, queue):
-        global settings
-        global SEARCH_ENGINE
-        SEARCH_ENGINE = engine_list
+    def __init__(self, user, url, queue):
+        engine_list = [engine.url for engine in SearchEngine.objects.all()]
+        WebsiteScheduler.search_engines = engine_list
 
         nb_fingerprint = len(BrowserFingerprint.objects.all())
         fingerprint = BrowserFingerprint.objects.all()[
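Moving the query into CrawlingThread.__init__ means it runs in the caller's thread, before the crawl's own event loop exists, and it only ever needs the url column. As a hedged alternative (assuming SearchEngine.url is a plain field, as the list comprehension implies), the ORM could return the URLs directly instead of full model instances:

# Equivalent result to [engine.url for engine in SearchEngine.objects.all()],
# but only the url column is fetched from the database.
engine_list = list(SearchEngine.objects.values_list("url", flat=True))
WebsiteScheduler.search_engines = engine_list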
@@ -77,9 +77,7 @@ def generate_partial_history(user, t_start):
     result.append((basis, timestamp))
     timestamp += 5* random.weibullvariate(1, 1.5)
     queue = Queue()
-    search_engine_query = profiles.SearchEngine.objects.all()
-    search_engine_list = [item.url for item in search_engine_query]
-    crawler = crawl.CrawlingThread(user, basis, search_engine_list, queue)
+    crawler = crawl.CrawlingThread(user, basis, queue)
     crawler.start()
     crawler.join()
     urls = queue.get()
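With the lookup folded into the constructor, callers such as generate_partial_history only supply the user, the start URL and a result queue, but they now have to run where the Django ORM is usable, since CrawlingThread queries SearchEngine itself. The new calling convention in isolation (assuming Queue, the project's crawl module, user and basis are defined as in this file):

queue = Queue()
crawler = crawl.CrawlingThread(user, basis, queue)  # basis: URL the crawl starts from
crawler.start()
crawler.join()       # wait for the crawling thread to finish
urls = queue.get()   # URLs the thread pushed onto the queue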