From e56c088632725bb2d5325b1f27f72eb9e9c94bf5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Oudin?= <oudin@crans.org>
Date: Sat, 24 Feb 2018 11:39:04 +0100
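Subject: [PATCH] Better link filter in url_getter

Skip internal bookmark ('#') links silently instead of printing a message,
drop links whose URL contains 'login' or 'agreement' (case-insensitive),
and make async_crawler return `crawled` in addition to printing it.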
---
 crawl/crawl.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/crawl/crawl.py b/crawl/crawl.py
index 6eb748c..cc86f18 100644
--- a/crawl/crawl.py
+++ b/crawl/crawl.py
@@ -60,7 +60,7 @@ def url_getter(html, current_page, root_url):
             elif link.startswith('/'): #Internal link, linking to page root url
                 links_list.append(root_url + link)
             elif link.startswith("#"):
-                print("Invalid link : internal bookmark")
+                continue
             else:
                 links_list.append(current_page + "/" + link)
 
@@ -71,6 +71,11 @@ def url_getter(html, current_page, root_url):
     # Works only with python >= 3.6
     links_list = list(dict.fromkeys(links_list))
 
+    forbidden_words = ['login', 'agreement']
+    links_list = [link for link in links_list if not any(word in link.lower()
+                                                         for word in
+                                                         forbidden_words)]
+
     return links_list
 
 
@@ -235,6 +240,7 @@ async def async_crawler(url):
                               sample_url not in queue and sample_url not in
                               crawled]
     print(crawled)
+    return crawled
 
 if __name__ == '__main__':
     crawl = CrawlingThread()