From e56c088632725bb2d5325b1f27f72eb9e9c94bf5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Oudin?= Date: Sat, 24 Feb 2018 11:39:04 +0100 Subject: [PATCH] Better filter --- crawl/crawl.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/crawl/crawl.py b/crawl/crawl.py index 6eb748c..cc86f18 100644 --- a/crawl/crawl.py +++ b/crawl/crawl.py @@ -60,7 +60,7 @@ def url_getter(html, current_page, root_url): elif link.startswith('/'): #Internal link, linking to page root url links_list.append(root_url + link) elif link.startswith("#"): - print("Invalid link : internal bookmark") + continue else: links_list.append(current_page + "/" + link) @@ -71,6 +71,11 @@ def url_getter(html, current_page, root_url): # Works only with python >= 3.6 links_list = list(dict.fromkeys(links_list)) + forbidden_words = ['login', 'agreement'] + links_list = [link for link in links_list if not any(word in link.lower() + for word in + forbidden_words)] + return links_list @@ -235,6 +240,7 @@ async def async_crawler(url): sample_url not in queue and sample_url not in crawled] print(crawled) + return crawled if __name__ == '__main__': crawl = CrawlingThread()