URL getter function: retrieves the list of so-called relevant links
parent a907cad33d
commit 4e6ac5ac7b
1 changed file with 20 additions and 4 deletions
@@ -24,7 +24,8 @@ settings = Settings()
 
 startup_time = datetime.now()
 
-def url_getter(html):
+def url_getter(html, current_page, root_url):
+    links_list = []  # The final result
     soup = BeautifulSoup(html, "html.parser")
     # Get only the body
     body = soup.find('body')
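For context, here is a minimal stand-alone sketch (not from the repo; the sample HTML is invented) of the BeautifulSoup calls the function relies on. It also uses .get("href") rather than the subscript form, since link["href"] raises KeyError on anchors that carry no href attribute:

from bs4 import BeautifulSoup

html = "<head><title>t</title></head><body><a href='/y'>ok</a><a name='x'>no href</a></body>"
# Parse the document and isolate the body, as url_getter does.
body = BeautifulSoup(html, "html.parser").find("body")
# Collect hrefs, skipping anchors that have none.
print([a.get("href") for a in body.find_all("a") if a.get("href")])  # ['/y']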
@@ -38,8 +39,24 @@ def url_getter(html):
     # Remove all bookmark links pointing to the current html page.
     links = body.find_all("a")
     for link in links:
         if re.match(BOOKMARK_URL, link["href"]):
             link.extract()
+        if link["href"].startswith("http"):
+            links_list.append(link["href"])
+        elif link["href"].startswith('/'):  # Internal link, relative to the page root url
+            links_list.append(root_url + link["href"])
+        elif link["href"].startswith("#"):
+            print("Invalid link: internal bookmark")
+        else:
+            links_list.append(current_page + link["href"])
+
+    ## uniqifier works with python <= 3.6
+    #seen = set()
+    #links_list = [x for x in links_list if x not in seen and not seen.add(x)]
+
+    # uniqifier
+    # Works only with python >= 3.6
+    links_list = list(dict.fromkeys(links_list))
 
 
 class WebsiteSchedulerMeta(type):
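As an aside (not part of the commit), the standard library's urllib.parse.urljoin implements the same resolution rules as the elif chain above, and it also handles the trailing-slash subtlety that the current_page + link concatenation assumes. The URLs below are hypothetical:

from urllib.parse import urljoin

current_page = "https://example.com/blog/article.html"  # hypothetical URL

# Absolute links pass through unchanged.
assert urljoin(current_page, "https://other.org/x") == "https://other.org/x"
# Root-relative links attach to the site root.
assert urljoin(current_page, "/about") == "https://example.com/about"
# Page-relative links resolve against the current page's directory.
assert urljoin(current_page, "contact.html") == "https://example.com/blog/contact.html"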
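On the uniqifier itself: dict.fromkeys deduplicates while preserving order because dicts keep insertion order on CPython 3.6 (guaranteed by the language from 3.7), with the first occurrence of each key winning:

urls = ["https://a.example", "https://b.example", "https://a.example"]
# Duplicate keys collapse; insertion order of first occurrences is kept.
print(list(dict.fromkeys(urls)))  # ['https://a.example', 'https://b.example']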
@@ -140,7 +157,6 @@ class PageGetter:
         async with self.session.get(self.url) as resp:
             return await resp.text()
 
-async def async_parser(html_text)
 
 async def async_print(url):
     """ Debug function to follow what's actually happening """
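Finally, a sketch of how the PageGetter-style request above composes into a runnable script with aiohttp; the names fetch and main are illustrative, not from the repo:

import asyncio
import aiohttp

async def fetch(session, url):
    # Same pattern as PageGetter: request the page and return its body text.
    async with session.get(url) as resp:
        return await resp.text()

async def main():
    async with aiohttp.ClientSession() as session:
        html = await fetch(session, "https://example.com")
        print(html[:200])  # quick sanity check on the fetched markup

asyncio.run(main())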