From 4e6ac5ac7baf8fd5cb1866fbd30efadc7e1e8044 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Oudin?=
Date: Wed, 21 Feb 2018 22:51:05 +0100
Subject: [PATCH] URL getter function: retrieve the list of relevant links

---
 crawl/crawl.py | 24 ++++++++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/crawl/crawl.py b/crawl/crawl.py
index 25d8de9..76affb3 100644
--- a/crawl/crawl.py
+++ b/crawl/crawl.py
@@ -24,7 +24,8 @@
 settings = Settings()
 startup_time = datetime.now()
 
-def url_getter(html):
+def url_getter(html, current_page, root_url):
+    links_list = [] # The final result
     soup = BeautifulSoup(html, "html.parser")
     # Get only the body
     body = soup.find('body')
@@ -38,8 +39,24 @@ def url_getter(html):
     # Remove all bookmark links pointing to the current html page.
     links = body.find_all("a")
     for link in links:
-        if re.match(BOOKMARK_URL, link["href"]):
-            link.extract()
+        if link["href"].startswith("http"): # Absolute link, kept as-is
+            links_list.append(link["href"])
+        elif link["href"].startswith('/'): # Internal link, relative to the root url
+            links_list.append(root_url + link["href"])
+        elif link["href"].startswith("#"):
+            print("Invalid link: internal bookmark")
+        else: # Relative link, resolved against the current page
+            links_list.append(current_page + link["href"])
+
+    ## order-preserving uniqifier, for python < 3.6
+    #seen = set()
+    #links_list = [x for x in links_list if x not in seen and not seen.add(x)]
+
+    # uniqifier, preserving insertion order
+    # Works only with python >= 3.6 (dicts keep insertion order)
+    links_list = list(dict.fromkeys(links_list))
+
+    return links_list
 
 
 class WebsiteSchedulerMeta(type):
@@ -140,7 +157,6 @@ class PageGetter:
         async with self.session.get(self.url) as resp:
            return await resp.text()
 
-async def async_parser(html_text)
 
 async def async_print(url):
     """ Debug function to follow what's actually happening """
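
Note: a quick sanity check of the patched url_getter, as a minimal sketch. The
sample HTML and URLs are hypothetical, and the import path simply mirrors the
file touched by this patch (crawl/crawl.py):

    from crawl.crawl import url_getter

    html = """<body>
    <a href="http://example.com/a">absolute</a>
    <a href="/b">root-relative</a>
    <a href="#top">bookmark</a>
    <a href="b">page-relative</a>
    </body>"""

    links = url_getter(html, "http://example.com/dir/", "http://example.com")
    # The "#top" bookmark is reported as invalid and skipped; expected result:
    # ['http://example.com/a', 'http://example.com/b', 'http://example.com/dir/b']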