Check if crawling a search engine
parent 15db8b4697, commit adb892ab7d
2 changed files with 44 additions and 27 deletions
@@ -26,6 +26,8 @@ MAX_PER_PAGE = 10
 
 FOOTER_URL = re.compile(".*footer.*")
 
+SEARCH_ENGINE = []
+
 class Settings:
     USER_AGENT = 'Default User'
 
@@ -108,32 +110,38 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
         self.name = name
         self.last_crawled = datetime.fromtimestamp(0)
         self.dead = False
-        try:
-            robots_url = self.urlroot() + 'robots.txt'
-            self.robot_parser = RobotFileParser(robots_url)
-            self.robot_parser.read()  # TODO async?
-        except (URLError, CertificateError):
+        self.can_fetch_b = False
+        if any(self.urlroot() in item for item in SEARCH_ENGINE):
+            print("found a search engine for %s" % self.urlroot())
+            self.crawl_delay = timedelta(seconds=5)
+            self.can_fetch_b = True
+        else:
             try:
-                robots_url = self.unsafe_urlroot() + 'robots.txt'
+                robots_url = self.urlroot() + 'robots.txt'
                 self.robot_parser = RobotFileParser(robots_url)
-                self.robot_parser.read()
-            except URLError:  # Almost surely an offline website.
-                self.dead = True
-                self.crawl_delay = 0
-        except Exception as e:
-            print(e)
-            raise e
-        if not self.robot_parser.default_entry:
-            self.dead = True
-        if not self.dead:
-            delay = self.robot_parser.crawl_delay(settings.USER_AGENT)
-            if delay is None:
-                req_rate = self.robot_parser.request_rate(settings.USER_AGENT)
-                if req_rate is None:
-                    delay = 5
-                else:
-                    delay = req_rate.requests, req_rate.seconds
-            self.crawl_delay = timedelta(seconds=delay)
+                self.robot_parser.read()  # TODO async?
+            except (URLError, CertificateError):
+                try:
+                    robots_url = self.unsafe_urlroot() + 'robots.txt'
+                    self.robot_parser = RobotFileParser(robots_url)
+                    self.robot_parser.read()
+                except URLError:  # Almost surely an offline website.
+                    self.dead = True
+                    self.crawl_delay = 0
+            except Exception as e:
+                print(e)
+                raise e
+            if not self.robot_parser.default_entry:
+                self.dead = True
+            if not self.dead:
+                delay = self.robot_parser.crawl_delay(settings.USER_AGENT)
+                if delay is None:
+                    req_rate = self.robot_parser.request_rate(settings.USER_AGENT)
+                    if req_rate is None:
+                        delay = 5
+                    else:
+                        delay = req_rate.requests, req_rate.seconds
+                self.crawl_delay = timedelta(seconds=delay)
 
     def urlroot(self):
         ''' Get the root url for this website '''
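
For reference, here is a minimal standalone sketch of the robots.txt delay fallback that the hunk above moves under the else: branch. It uses only urllib.robotparser from the standard library; the user agent, default delay and example URL are assumptions for illustration, and unlike the diff it converts the request rate into a scalar seconds-per-request value before building the timedelta.

# Sketch (not part of the commit): derive a per-site crawl delay
# from robots.txt, falling back to a default when nothing is declared.
from datetime import timedelta
from urllib.robotparser import RobotFileParser

USER_AGENT = 'Default User'   # mirrors settings.USER_AGENT above
DEFAULT_DELAY = 5             # seconds, assumed fallback

def crawl_delay_for(urlroot):
    parser = RobotFileParser(urlroot + 'robots.txt')
    parser.read()             # fetches robots.txt over the network
    delay = parser.crawl_delay(USER_AGENT)
    if delay is None:
        rate = parser.request_rate(USER_AGENT)
        if rate is None:
            delay = DEFAULT_DELAY
        else:
            # request_rate() returns a RequestRate(requests, seconds) tuple;
            # seconds per request is the ratio, not the tuple itself.
            delay = rate.seconds / rate.requests
    return timedelta(seconds=delay)

# Example (hypothetical URL):
# print(crawl_delay_for('https://example.com/'))
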
@@ -151,7 +159,7 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
 
     def can_fetch(self, url):
         ''' Check whether this program can fetch a given page '''
-        return (not self.dead) and self.robot_parser.can_fetch(settings.USER_AGENT, url)
+        return (self.can_fetch_b) or ((not self.dead) and self.robot_parser.can_fetch(settings.USER_AGENT, url))
 
     def fetching(self):
         ''' Tell the scheduler that a page is being fetched now '''
@@ -162,8 +170,10 @@ class CrawlingThread(Thread):
     """ A separate thread for the crawling task. This is needed to use asyncio,
     since the thread will need its own event loop. """
 
-    def __init__(self, user, url, queue):
+    def __init__(self, user, url, engine_list, queue):
         global settings
+        global SEARCH_ENGINE
+        SEARCH_ENGINE = engine_list
         self.queue = queue
         super(CrawlingThread, self).__init__()
         if user:
@@ -204,6 +214,7 @@ class PageGetter:
         scheduler.fetching()
         async with async_timeout.timeout(10):
             async with self.session.get(self.url, verify_ssl=ssl) as resp:
+                print("Resp status %s" % resp.status)
                 try:
                     return await resp.text()
                 except UnicodeDecodeError:
@@ -248,11 +259,15 @@ async def async_crawler(url, queue):
             queued += [sample_url for sample_url in sampled if
                        sample_url not in queued and sample_url not in
                        crawled]
+        else:
+            print("No html received")
     print(crawled)
     queue.put(crawled)
 
 if __name__ == '__main__':
     queue = Queue()
-    crawl = CrawlingThread(None, "https://python.org/", queue)
+    crawl = CrawlingThread(None,
+                           "https://google.com/search?q=fabriquer+masque+manif",
+                           ["https://google.com/search/"], queue)
     crawl.start()
     crawl.join()
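
As a quick illustration (not part of the commit), the substring test introduced in WebsiteScheduler.__init__ behaves as follows, assuming urlroot() returns something like 'https://google.com/' and reusing the engine list from the __main__ block above:

# Sketch of the SEARCH_ENGINE membership test; URLs are illustrative.
SEARCH_ENGINE = ["https://google.com/search/"]

def is_search_engine(urlroot):
    # A site counts as a search engine when its root URL is a substring
    # of one of the configured engine URLs.
    return any(urlroot in item for item in SEARCH_ENGINE)

print(is_search_engine("https://google.com/"))  # True: fixed 5 s delay, robots.txt skipped
print(is_search_engine("https://python.org/"))  # False: normal robots.txt handling
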
@@ -77,7 +77,9 @@ def generate_partial_history(user, t_start):
         result.append((basis, timestamp))
         timestamp += 5* random.weibullvariate(1, 1.5)
         queue = Queue()
-        crawler = crawl.CrawlingThread(user, basis, queue)
+        search_engine_query = profiles.SearchEngine.objects.all()
+        search_engine_list = [item.url for item in search_engine_query]
+        crawler = crawl.CrawlingThread(user, basis, search_engine_list, queue)
         crawler.start()
         crawler.join()
         urls = queue.get()
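
For context, a hedged sketch of how the search_engine_list handed to CrawlingThread can be built from the profiles.SearchEngine model; the import path is assumed, and values_list is an equivalent Django shortcut to the list comprehension used in the diff:

# Sketch only: collect search-engine URLs for CrawlingThread.
# Assumes a Django model profiles.SearchEngine with a `url` field.
from profiles import models as profiles  # import path is an assumption

def search_engine_urls():
    # Equivalent to [item.url for item in SearchEngine.objects.all()],
    # but values_list avoids instantiating full model instances.
    return list(profiles.SearchEngine.objects.values_list('url', flat=True))
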