Better filter

This commit is contained in:
Rémi Oudin 2018-02-24 11:39:04 +01:00
parent f0b8672c89
commit e56c088632


@@ -60,7 +60,7 @@ def url_getter(html, current_page, root_url):
         elif link.startswith('/'): #Internal link, linking to page root url
             links_list.append(root_url + link)
         elif link.startswith("#"):
-            print("Invalid link : internal bookmark")
+            continue
         else:
             links_list.append(current_page + "/" + link)
@@ -71,6 +71,11 @@ def url_getter(html, current_page, root_url):
     # Works only with python >= 3.6
     links_list = list(dict.fromkeys(links_list))
+    forbidden_words = ['login', 'agreement']
+    links_list = [link for link in links_list if not any(word in link.lower()
+                                                         for word in
+                                                         forbidden_words)]
     return links_list
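
Note: a minimal standalone sketch of the de-duplicate and filter step added in this hunk. Only forbidden_words and the dict.fromkeys() de-duplication come from the diff; sample_links and its URLs are made up for illustration.

# Illustration only: sample_links is hypothetical.
sample_links = [
    "http://example.com/article",
    "http://example.com/login/form",
    "http://example.com/article",           # duplicate, dropped by dict.fromkeys
    "http://example.com/User-Agreement",    # matches 'agreement' once lower-cased
]

links_list = list(dict.fromkeys(sample_links))   # order-preserving de-dup (dicts keep insertion order in Python >= 3.6)
forbidden_words = ['login', 'agreement']
links_list = [link for link in links_list
              if not any(word in link.lower() for word in forbidden_words)]

print(links_list)   # ['http://example.com/article']
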
@@ -235,6 +240,7 @@ async def async_crawler(url):
                   sample_url not in queue and sample_url not in
                   crawled]
     print(crawled)
+    return crawled

 if __name__ == '__main__':
     crawl = CrawlingThread()
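
With return crawled, async_crawler hands its result back to the caller instead of only printing it. The snippet below is a hedged sketch of how such a coroutine can be consumed with asyncio; async_crawler_stub is a made-up stand-in, since the real entry point in this repository goes through CrawlingThread.

import asyncio

async def async_crawler_stub(url):
    # Stand-in for async_crawler: returns its list of crawled URLs instead of printing it.
    return [url + "/page1", url + "/page2"]

async def main():
    crawled = await async_crawler_stub("https://example.com")
    print(crawled)   # the caller decides what to do with the result

asyncio.run(main())   # Python >= 3.7
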