URL getter function: retrieves the list of so-called relevant links

Rémi Oudin 2018-02-21 22:51:05 +01:00
parent a907cad33d
commit 4e6ac5ac7b


@@ -24,7 +24,8 @@ settings = Settings()
 startup_time = datetime.now()
-def url_getter(html):
+def url_getter(html, current_page, root_url):
+    links_list = []  # The final result
     soup = BeautifulSoup(html, "html.parser")
     # Get only the body
     body = soup.find('body')
@@ -38,8 +39,24 @@ def url_getter(html):
     # Remove all bookmark links pointing to the current html page.
     links = body.find_all("a")
     for link in links:
-        if re.match(BOOKMARK_URL, link["href"]):
-            link.extract()
+        if link["href"].startswith("http"):
+            # Absolute link: keep it as-is.
+            links_list.append(link["href"])
+        elif link["href"].startswith('/'):
+            # Internal link, relative to the page's root url.
+            links_list.append(root_url + link["href"])
+        elif link["href"].startswith("#"):
+            print("Invalid link: internal bookmark")
+        else:
+            # Relative link: resolve against the current page.
+            links_list.append(current_page + link["href"])
+    ## uniquifier, fallback for python < 3.6
+    #seen = set()
+    #links_list = [x for x in links_list if x not in seen and not seen.add(x)]
+    # uniquifier: relies on dict insertion order (python >= 3.6)
+    links_list = list(dict.fromkeys(links_list))
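
For illustration, a minimal usage sketch of the new signature. This is my own example, not part of the commit: it assumes url_getter ends by returning links_list (the return is not shown in this hunk) and that the function is importable; the URLs are made up.

html = ('<body><a href="https://example.com/a">a</a>'
        '<a href="/about">b</a><a href="#top">c</a></body>')
links = url_getter(html, "https://example.com/page/", "https://example.com")
# Prints "Invalid link: internal bookmark" for the "#top" anchor, then:
# links == ['https://example.com/a', 'https://example.com/about']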
class WebsiteSchedulerMeta(type): class WebsiteSchedulerMeta(type):
@@ -140,7 +157,6 @@ class PageGetter:
         async with self.session.get(self.url) as resp:
             return await resp.text()
-    async def async_parser(html_text)

 async def async_print(url):
     """ Debug function to follow what's actually happening """