Check if crawling a search engine

Rémi Oudin 2018-02-26 11:12:36 +01:00
parent 15db8b4697
commit adb892ab7d
2 changed files with 44 additions and 27 deletions
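
The commit threads a list of known search-engine URLs into the crawler: CrawlingThread now takes an engine_list argument, stores it in the module-level SEARCH_ENGINE list, and WebsiteScheduler skips robots.txt for those sites, giving them a fixed five-second crawl delay and marking them as always fetchable. A rough standalone sketch of the new check (schedule_for is a hypothetical helper; in the diff the same test lives in WebsiteScheduler.__init__):

from datetime import timedelta

# Module-level whitelist; in the diff it is filled by CrawlingThread.__init__
# from its new engine_list argument.
SEARCH_ENGINE = ["https://google.com/search/"]

def schedule_for(urlroot):
    """Return (crawl_delay, can_fetch_b) for a site root, bypassing robots.txt
    when the root belongs to a known search engine."""
    if any(urlroot in item for item in SEARCH_ENGINE):
        # Same substring test as the diff: the site root must appear
        # inside one of the configured engine URLs.
        return timedelta(seconds=5), True
    # Otherwise the real scheduler falls back to robots.txt handling.
    return None, False

print(schedule_for("https://google.com/"))  # 5-second delay, robots.txt bypassed
print(schedule_for("https://python.org/"))  # (None, False): normal robots.txt path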

View File

@@ -26,6 +26,8 @@ MAX_PER_PAGE = 10
 FOOTER_URL = re.compile(".*footer.*")
 
+SEARCH_ENGINE = []
+
 class Settings:
     USER_AGENT = 'Default User'
@@ -108,32 +110,38 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
         self.name = name
         self.last_crawled = datetime.fromtimestamp(0)
         self.dead = False
-        try:
-            robots_url = self.urlroot() + 'robots.txt'
-            self.robot_parser = RobotFileParser(robots_url)
-            self.robot_parser.read() # TODO async?
-        except (URLError, CertificateError):
+        self.can_fetch_b = False
+        if any(self.urlroot() in item for item in SEARCH_ENGINE):
+            print("found a search engine for %s" % self.urlroot())
+            self.crawl_delay = timedelta(seconds=5)
+            self.can_fetch_b = True
+        else:
             try:
-                robots_url = self.unsafe_urlroot() + 'robots.txt'
+                robots_url = self.urlroot() + 'robots.txt'
                 self.robot_parser = RobotFileParser(robots_url)
-                self.robot_parser.read()
-            except URLError: # Almost surely an offline website.
+                self.robot_parser.read() # TODO async?
+            except (URLError, CertificateError):
+                try:
+                    robots_url = self.unsafe_urlroot() + 'robots.txt'
+                    self.robot_parser = RobotFileParser(robots_url)
+                    self.robot_parser.read()
+                except URLError: # Almost surely an offline website.
+                    self.dead = True
+                    self.crawl_delay = 0
+            except Exception as e:
+                print(e)
+                raise e
+            if not self.robot_parser.default_entry:
                 self.dead = True
-                self.crawl_delay = 0
-        except Exception as e:
-            print(e)
-            raise e
-        if not self.robot_parser.default_entry:
-            self.dead = True
-        if not self.dead:
-            delay = self.robot_parser.crawl_delay(settings.USER_AGENT)
-            if delay is None:
-                req_rate = self.robot_parser.request_rate(settings.USER_AGENT)
-                if req_rate is None:
-                    delay = 5
-                else:
-                    delay = req_rate.requests, req_rate.seconds
-            self.crawl_delay = timedelta(seconds=delay)
+            if not self.dead:
+                delay = self.robot_parser.crawl_delay(settings.USER_AGENT)
+                if delay is None:
+                    req_rate = self.robot_parser.request_rate(settings.USER_AGENT)
+                    if req_rate is None:
+                        delay = 5
+                    else:
+                        delay = req_rate.requests, req_rate.seconds
+                self.crawl_delay = timedelta(seconds=delay)
 
     def urlroot(self):
         ''' Get the root url for this website '''
@@ -151,7 +159,7 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
 
     def can_fetch(self, url):
         ''' Check whether this program can fetch a given page '''
-        return (not self.dead) and self.robot_parser.can_fetch(settings.USER_AGENT, url)
+        return (self.can_fetch_b ) or ((not self.dead) and self.robot_parser.can_fetch(settings.USER_AGENT, url))
 
     def fetching(self):
         ''' Tell the scheduler that a page is being fetched now '''
@@ -162,8 +170,10 @@ class CrawlingThread(Thread):
     """ A separate thread for the crawling task. This is needed to use asyncio,
     since the thread will need its own event loop. """
 
-    def __init__(self, user, url, queue):
+    def __init__(self, user, url, engine_list, queue):
         global settings
+        global SEARCH_ENGINE
+        SEARCH_ENGINE = engine_list
         self.queue = queue
         super(CrawlingThread, self).__init__()
         if user:
@@ -204,6 +214,7 @@ class PageGetter:
         scheduler.fetching()
         async with async_timeout.timeout(10):
             async with self.session.get(self.url, verify_ssl=ssl) as resp:
+                print("Resp status %s" % resp.status)
                 try:
                     return await resp.text()
                 except UnicodeDecodeError:
@@ -248,11 +259,15 @@ async def async_crawler(url, queue):
             queued += [sample_url for sample_url in sampled if
                        sample_url not in queued and sample_url not in
                        crawled]
+        else:
+            print("No html received")
     print(crawled)
     queue.put(crawled)
 
 if __name__ == '__main__':
     queue = Queue()
-    crawl = CrawlingThread(None, "https://python.org/", queue)
+    crawl = CrawlingThread(None,
+            "https://google.com/search?q=fabriquer+masque+manif",
+            ["https://google.com/search/"], queue)
     crawl.start()
     crawl.join()

View File

@@ -77,7 +77,9 @@ def generate_partial_history(user, t_start):
     result.append((basis, timestamp))
     timestamp += 5* random.weibullvariate(1, 1.5)
     queue = Queue()
-    crawler = crawl.CrawlingThread(user, basis, queue)
+    search_engine_query = profiles.SearchEngine.objects.all()
+    search_engine_list = [item.url for item in search_engine_query]
+    crawler = crawl.CrawlingThread(user, basis, search_engine_list, queue)
     crawler.start()
     crawler.join()
     urls = queue.get()
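
Putting the two files together, the intended call sequence is roughly the following (a sketch only: it assumes, as the hunk above does, that each profiles.SearchEngine row exposes a url field; crawl_with_engines is a hypothetical wrapper, not part of the commit):

from queue import Queue

import crawl      # crawler module patched in the first file above
import profiles   # Django app providing the SearchEngine model

def crawl_with_engines(user, start_url):
    """Build the known-engine list once, then hand it to the crawling thread."""
    engine_query = profiles.SearchEngine.objects.all()
    engine_list = [item.url for item in engine_query]
    queue = Queue()
    crawler = crawl.CrawlingThread(user, start_url, engine_list, queue)
    crawler.start()
    crawler.join()
    return queue.get()  # async_crawler puts the list of crawled URLs here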