URL getter function: retrieves the list of so-called relevant links

This commit is contained in:
Rémi Oudin 2018-02-21 22:51:05 +01:00
parent a907cad33d
commit 4e6ac5ac7b
1 changed file with 20 additions and 4 deletions

@@ -24,7 +24,8 @@ settings = Settings()
startup_time = datetime.now()
def url_getter(html):
def url_getter(html, current_page, root_url):
links_list = [] # The final result
soup = BeautifulSoup(html, "html.parser")
# Get only the body
body = soup.find('body')
@@ -38,8 +39,24 @@ def url_getter(html):
# Remove all bookmark links pointing to the current html page.
links = body.find_all("a")
for link in links:
if re.match(BOOKMARK_URL, link["href"]):
link.extract()
if link["href"].startswith("http"):
links_list.append(link["href"])
elif link["href"].startswith('/'): # Internal link, relative to the root url
links_list.append(root_url + link["href"])
elif link["href"].startswith("#"):
print("Invalid link: internal bookmark")
else:
links_list.append(current_page + link["href"])
## uniqifier works with python <= 3.6
#seen = set()
#links_list = [x for x in links_list if x not in seen and not seen.add(x)]
# uniqifier
# Works only with python >= 3.6
links_list = list(dict.fromkeys(links_list))
class WebsiteSchedulerMeta(type):
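Aside (not part of the diff): the two uniqifier variants mentioned in the comments above differ only in how they preserve insertion order. A minimal illustration, with made-up sample data:

# Illustration only; the sample list below is invented.
sample = ["http://a.example", "http://b.example", "http://a.example"]

# Variant that works on any Python version: a set tracks what was seen,
# the list comprehension keeps the first occurrence of each item.
seen = set()
unique = [x for x in sample if x not in seen and not seen.add(x)]

# Variant relying on insertion-ordered dicts (CPython 3.6 behaviour,
# guaranteed by the language from Python 3.7 onwards).
unique = list(dict.fromkeys(sample))

print(unique)  # ['http://a.example', 'http://b.example']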
@@ -140,7 +157,6 @@ class PageGetter:
async with self.session.get(self.url) as resp:
return await resp.text()
async def async_parser(html_text)
async def async_print(url):
""" Debug function to follow what's actually happening """