Check if crawling a search engine

Rémi Oudin 2018-02-26 11:12:36 +01:00
parent 15db8b4697
commit adb892ab7d
2 changed files with 44 additions and 27 deletions


@@ -26,6 +26,8 @@ MAX_PER_PAGE = 10
 FOOTER_URL = re.compile(".*footer.*")
+SEARCH_ENGINE = []
+
 class Settings:
     USER_AGENT = 'Default User'
@@ -108,32 +110,38 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
         self.name = name
         self.last_crawled = datetime.fromtimestamp(0)
         self.dead = False
-        try:
-            robots_url = self.urlroot() + 'robots.txt'
-            self.robot_parser = RobotFileParser(robots_url)
-            self.robot_parser.read() # TODO async?
-        except (URLError, CertificateError):
-            try:
-                robots_url = self.unsafe_urlroot() + 'robots.txt'
-                self.robot_parser = RobotFileParser(robots_url)
-                self.robot_parser.read()
-            except URLError: # Almost surely an offline website.
-                self.dead = True
-                self.crawl_delay = 0
-        except Exception as e:
-            print(e)
-            raise e
-        if not self.robot_parser.default_entry:
-            self.dead = True
-        if not self.dead:
-            delay = self.robot_parser.crawl_delay(settings.USER_AGENT)
-            if delay is None:
-                req_rate = self.robot_parser.request_rate(settings.USER_AGENT)
-                if req_rate is None:
-                    delay = 5
-                else:
-                    delay = req_rate.requests, req_rate.seconds
-            self.crawl_delay = timedelta(seconds=delay)
+        self.can_fetch_b = False
+        if any(self.urlroot() in item for item in SEARCH_ENGINE):
+            print("found a search engine for %s" % self.urlroot())
+            self.crawl_delay = timedelta(seconds=5)
+            self.can_fetch_b = True
+        else:
+            try:
+                robots_url = self.urlroot() + 'robots.txt'
+                self.robot_parser = RobotFileParser(robots_url)
+                self.robot_parser.read() # TODO async?
+            except (URLError, CertificateError):
+                try:
+                    robots_url = self.unsafe_urlroot() + 'robots.txt'
+                    self.robot_parser = RobotFileParser(robots_url)
+                    self.robot_parser.read()
+                except URLError: # Almost surely an offline website.
+                    self.dead = True
+                    self.crawl_delay = 0
+            except Exception as e:
+                print(e)
+                raise e
+            if not self.robot_parser.default_entry:
+                self.dead = True
+            if not self.dead:
+                delay = self.robot_parser.crawl_delay(settings.USER_AGENT)
+                if delay is None:
+                    req_rate = self.robot_parser.request_rate(settings.USER_AGENT)
+                    if req_rate is None:
+                        delay = 5
+                    else:
+                        delay = req_rate.requests, req_rate.seconds
+                self.crawl_delay = timedelta(seconds=delay)
 
     def urlroot(self):
         ''' Get the root url for this website '''
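
A note on the delay fallback carried over into the new else branch: RobotFileParser.request_rate() returns a named tuple, so `delay = req_rate.requests, req_rate.seconds` binds a tuple to delay, which timedelta(seconds=...) cannot accept. The sketch below shows one plausible reading of the intent (seconds divided by requests, i.e. the spacing between requests); robots_crawl_delay is a hypothetical helper, not part of this commit.

    # Hedged sketch, not committed code: one plausible reading of the
    # robots.txt delay logic, turning a request rate into seconds-per-request.
    from datetime import timedelta
    from urllib.robotparser import RobotFileParser

    def robots_crawl_delay(robots_url, user_agent, default=5):
        parser = RobotFileParser(robots_url)
        parser.read()
        delay = parser.crawl_delay(user_agent)
        if delay is None:
            req_rate = parser.request_rate(user_agent)
            if req_rate is None:
                delay = default
            else:
                # e.g. RequestRate(requests=10, seconds=60) -> 6 s between requests
                delay = req_rate.seconds / req_rate.requests
        return timedelta(seconds=delay)
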
@@ -151,7 +159,7 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
     def can_fetch(self, url):
         ''' Check whether this program can fetch a given page '''
-        return (not self.dead) and self.robot_parser.can_fetch(settings.USER_AGENT, url)
+        return (self.can_fetch_b ) or ((not self.dead) and self.robot_parser.can_fetch(settings.USER_AGENT, url))
 
     def fetching(self):
         ''' Tell the scheduler that a page is being fetched now '''
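
The updated can_fetch relies on `or` short-circuiting: in the search-engine branch of the constructor, self.robot_parser is never created, so can_fetch_b must be tested before robot_parser is touched. An equivalent, slightly more explicit form (a sketch, not the committed code):

    def can_fetch(self, url):
        ''' Check whether this program can fetch a given page '''
        if self.can_fetch_b:  # search engine: no robot_parser was built
            return True
        return (not self.dead) and \
            self.robot_parser.can_fetch(settings.USER_AGENT, url)
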
@@ -162,8 +170,10 @@ class CrawlingThread(Thread):
     """ A separate thread for the crawling task. This is needed to use asyncio,
     since the thread will need its own event loop. """
 
-    def __init__(self, user, url, queue):
+    def __init__(self, user, url, engine_list, queue):
         global settings
+        global SEARCH_ENGINE
+        SEARCH_ENGINE = engine_list
         self.queue = queue
         super(CrawlingThread, self).__init__()
         if user:
@@ -204,6 +214,7 @@ class PageGetter:
         scheduler.fetching()
         async with async_timeout.timeout(10):
             async with self.session.get(self.url, verify_ssl=ssl) as resp:
+                print("Resp status %s" % resp.status)
                 try:
                     return await resp.text()
                 except UnicodeDecodeError:
@@ -248,11 +259,15 @@ async def async_crawler(url, queue):
            queued += [sample_url for sample_url in sampled if
                       sample_url not in queued and sample_url not in
                       crawled]
+        else:
+            print("No html received")
     print(crawled)
     queue.put(crawled)
 
 if __name__ == '__main__':
     queue = Queue()
-    crawl = CrawlingThread(None, "https://python.org/", queue)
+    crawl = CrawlingThread(None,
+                           "https://google.com/search?q=fabriquer+masque+manif",
+                           ["https://google.com/search/"], queue)
     crawl.start()
     crawl.join()
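
The new SEARCH_ENGINE check is a plain substring test: a scheduler counts as a search engine when its URL root appears inside one of the configured engine URLs. A small sketch of how the __main__ example above matches, assuming urlroot() yields the scheme and host with a trailing slash (that method's body is not shown in this diff):

    # Assumes urlroot() returns e.g. "https://google.com/".
    SEARCH_ENGINE = ["https://google.com/search/"]

    def is_search_engine(url_root):
        return any(url_root in item for item in SEARCH_ENGINE)

    print(is_search_engine("https://google.com/"))  # True  -> fixed 5 s delay, can_fetch_b set
    print(is_search_engine("https://python.org/"))  # False -> fall back to robots.txt handling
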


@@ -77,7 +77,9 @@ def generate_partial_history(user, t_start):
         result.append((basis, timestamp))
         timestamp += 5* random.weibullvariate(1, 1.5)
     queue = Queue()
-    crawler = crawl.CrawlingThread(user, basis, queue)
+    search_engine_query = profiles.SearchEngine.objects.all()
+    search_engine_list = [item.url for item in search_engine_query]
+    crawler = crawl.CrawlingThread(user, basis, search_engine_list, queue)
     crawler.start()
     crawler.join()
     urls = queue.get()
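
The second file only needs the url attribute of profiles.SearchEngine; the model itself is not part of this diff. A minimal assumed shape, for illustration only:

    # Assumed Django model shape; only the url field is actually used above.
    from django.db import models

    class SearchEngine(models.Model):
        url = models.URLField()  # e.g. "https://google.com/search/"

        def __str__(self):
            return self.url
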