From 4e6ac5ac7baf8fd5cb1866fbd30efadc7e1e8044 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Oudin?=
Date: Wed, 21 Feb 2018 22:51:05 +0100
Subject: [PATCH] URL getter function: retrieve the list of relevant links

---
 crawl/crawl.py | 24 ++++++++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/crawl/crawl.py b/crawl/crawl.py
index 25d8de9..76affb3 100644
--- a/crawl/crawl.py
+++ b/crawl/crawl.py
@@ -24,7 +24,8 @@
 settings = Settings()
 startup_time = datetime.now()
 
-def url_getter(html):
+def url_getter(html, current_page, root_url):
+    links_list = [] # The final result
     soup = BeautifulSoup(html, "html.parser")
     # Get only the body
     body = soup.find('body')
@@ -38,8 +39,24 @@ def url_getter(html):
     # Remove all bookmark links pointing to the current html page.
     links = body.find_all("a")
     for link in links:
-        if re.match(BOOKMARK_URL, link["href"]):
-            link.extract()
+        if link["href"].startswith("http"): # Absolute link, kept as-is
+            links_list.append(link["href"])
+        elif link["href"].startswith('/'): # Internal link, relative to the root url
+            links_list.append(root_url + link["href"])
+        elif link["href"].startswith("#"):
+            print("Invalid link: internal bookmark")
+        else: # Relative link, resolved against the current page
+            links_list.append(current_page + link["href"])
+
+    ## order-preserving uniqifier, for python < 3.6
+    #seen = set()
+    #links_list = [x for x in links_list if x not in seen and not seen.add(x)]
+
+    # uniqifier, preserving insertion order
+    # Works only with python >= 3.6 (dicts keep insertion order)
+    links_list = list(dict.fromkeys(links_list))
+
+    return links_list
 
 
 class WebsiteSchedulerMeta(type):
@@ -140,7 +157,6 @@ class PageGetter:
         async with self.session.get(self.url) as resp:
            return await resp.text()
 
-async def async_parser(html_text)
 
 async def async_print(url):
     """ Debug function to follow what's actually happening """
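
Note: a quick sanity check of the patched url_getter, as a minimal sketch. The
sample HTML and URLs are hypothetical, and the import path simply mirrors the
file touched by this patch (crawl/crawl.py):

    from crawl.crawl import url_getter

    html = """<body>
    <a href="http://example.com/a">absolute</a>
    <a href="/b">root-relative</a>
    <a href="#top">bookmark</a>
    <a href="b">page-relative</a>
    </body>"""

    links = url_getter(html, "http://example.com/dir/", "http://example.com")
    # The "#top" bookmark is reported as invalid and skipped; expected result:
    # ['http://example.com/a', 'http://example.com/b', 'http://example.com/dir/b']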