Check if crawling a search engine
This commit is contained in:
parent
15db8b4697
commit
adb892ab7d
2 changed files with 44 additions and 27 deletions
|
@ -26,6 +26,8 @@ MAX_PER_PAGE = 10
|
|||
|
||||
FOOTER_URL = re.compile(".*footer.*")
|
||||
|
||||
SEARCH_ENGINE = []
|
||||
|
||||
class Settings:
|
||||
USER_AGENT = 'Default User'
|
||||
|
||||
|
@ -108,32 +110,38 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
|
|||
self.name = name
|
||||
self.last_crawled = datetime.fromtimestamp(0)
|
||||
self.dead = False
|
||||
try:
|
||||
robots_url = self.urlroot() + 'robots.txt'
|
||||
self.robot_parser = RobotFileParser(robots_url)
|
||||
self.robot_parser.read() # TODO async?
|
||||
except (URLError, CertificateError):
|
||||
self.can_fetch_b = False
|
||||
if any(self.urlroot() in item for item in SEARCH_ENGINE):
|
||||
print("found a search engine for %s" % self.urlroot())
|
||||
self.crawl_delay = timedelta(seconds=5)
|
||||
self.can_fetch_b = True
|
||||
else:
|
||||
try:
|
||||
robots_url = self.unsafe_urlroot() + 'robots.txt'
|
||||
robots_url = self.urlroot() + 'robots.txt'
|
||||
self.robot_parser = RobotFileParser(robots_url)
|
||||
self.robot_parser.read()
|
||||
except URLError: # Almost surely an offline website.
|
||||
self.robot_parser.read() # TODO async?
|
||||
except (URLError, CertificateError):
|
||||
try:
|
||||
robots_url = self.unsafe_urlroot() + 'robots.txt'
|
||||
self.robot_parser = RobotFileParser(robots_url)
|
||||
self.robot_parser.read()
|
||||
except URLError: # Almost surely an offline website.
|
||||
self.dead = True
|
||||
self.crawl_delay = 0
|
||||
except Exception as e:
|
||||
print(e)
|
||||
raise e
|
||||
if not self.robot_parser.default_entry:
|
||||
self.dead = True
|
||||
self.crawl_delay = 0
|
||||
except Exception as e:
|
||||
print(e)
|
||||
raise e
|
||||
if not self.robot_parser.default_entry:
|
||||
self.dead = True
|
||||
if not self.dead:
|
||||
delay = self.robot_parser.crawl_delay(settings.USER_AGENT)
|
||||
if delay is None:
|
||||
req_rate = self.robot_parser.request_rate(settings.USER_AGENT)
|
||||
if req_rate is None:
|
||||
delay = 5
|
||||
else:
|
||||
delay = req_rate.requests, req_rate.seconds
|
||||
self.crawl_delay = timedelta(seconds=delay)
|
||||
if not self.dead:
|
||||
delay = self.robot_parser.crawl_delay(settings.USER_AGENT)
|
||||
if delay is None:
|
||||
req_rate = self.robot_parser.request_rate(settings.USER_AGENT)
|
||||
if req_rate is None:
|
||||
delay = 5
|
||||
else:
|
||||
delay = req_rate.requests, req_rate.seconds
|
||||
self.crawl_delay = timedelta(seconds=delay)
|
||||
|
||||
def urlroot(self):
|
||||
''' Get the root url for this website '''
|
||||
|
@ -151,7 +159,7 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
|
|||
|
||||
def can_fetch(self, url):
|
||||
''' Check whether this program can fetch a given page '''
|
||||
return (not self.dead) and self.robot_parser.can_fetch(settings.USER_AGENT, url)
|
||||
return (self.can_fetch_b ) or ((not self.dead) and self.robot_parser.can_fetch(settings.USER_AGENT, url))
|
||||
|
||||
def fetching(self):
|
||||
''' Tell the scheduler that a page is being fetched now '''
|
||||
|
@ -162,8 +170,10 @@ class CrawlingThread(Thread):
|
|||
""" A separate thread for the crawling task. This is needed to use asyncio,
|
||||
since the thread will need its own event loop. """
|
||||
|
||||
def __init__(self, user, url, queue):
|
||||
def __init__(self, user, url, engine_list, queue):
|
||||
global settings
|
||||
global SEARCH_ENGINE
|
||||
SEARCH_ENGINE = engine_list
|
||||
self.queue = queue
|
||||
super(CrawlingThread, self).__init__()
|
||||
if user:
|
||||
|
@ -204,6 +214,7 @@ class PageGetter:
|
|||
scheduler.fetching()
|
||||
async with async_timeout.timeout(10):
|
||||
async with self.session.get(self.url, verify_ssl=ssl) as resp:
|
||||
print("Resp status %s" % resp.status)
|
||||
try:
|
||||
return await resp.text()
|
||||
except UnicodeDecodeError:
|
||||
|
@ -248,11 +259,15 @@ async def async_crawler(url, queue):
|
|||
queued += [sample_url for sample_url in sampled if
|
||||
sample_url not in queued and sample_url not in
|
||||
crawled]
|
||||
else:
|
||||
print("No html received")
|
||||
print(crawled)
|
||||
queue.put(crawled)
|
||||
|
||||
if __name__ == '__main__':
|
||||
queue = Queue()
|
||||
crawl = CrawlingThread(None, "https://python.org/", queue)
|
||||
crawl = CrawlingThread(None,
|
||||
"https://google.com/search?q=fabriquer+masque+manif",
|
||||
["https://google.com/search/"], queue)
|
||||
crawl.start()
|
||||
crawl.join()
|
||||
|
|
|
@ -77,7 +77,9 @@ def generate_partial_history(user, t_start):
|
|||
result.append((basis, timestamp))
|
||||
timestamp += 5* random.weibullvariate(1, 1.5)
|
||||
queue = Queue()
|
||||
crawler = crawl.CrawlingThread(user, basis, queue)
|
||||
search_engine_query = profiles.SearchEngine.objects.all()
|
||||
search_engine_list = [item.url for item in search_engine_query]
|
||||
crawler = crawl.CrawlingThread(user, basis, search_engine_list, queue)
|
||||
crawler.start()
|
||||
crawler.join()
|
||||
urls = queue.get()
|
||||
|
|
Loading…
Reference in a new issue