Url getter function : retrieves the list of so-called relevant links
This commit is contained in:
parent
a907cad33d
commit
4e6ac5ac7b
1 changed files with 20 additions and 4 deletions
|
@ -24,7 +24,8 @@ settings = Settings()
|
||||||
startup_time = datetime.now()
|
startup_time = datetime.now()
|
||||||
|
|
||||||
|
|
||||||
def url_getter(html):
|
def url_getter(html, current_page, root_url):
|
||||||
|
links_list = [] # The final result
|
||||||
soup = BeautifulSoup(html, "html.parser")
|
soup = BeautifulSoup(html, "html.parser")
|
||||||
# Get only the body
|
# Get only the body
|
||||||
body = soup.find('body')
|
body = soup.find('body')
|
||||||
|
@ -38,8 +39,24 @@ def url_getter(html):
|
||||||
# Remove all bookmark links pointing to the current html page.
|
# Remove all bookmark links pointing to the current html page.
|
||||||
links = body.find_all("a")
|
links = body.find_all("a")
|
||||||
for link in links:
|
for link in links:
|
||||||
if re.match(BOOKMARK_URL, link["href"]):
|
if link.startswith("http"):
|
||||||
link.extract()
|
links_list.append(link)
|
||||||
|
elif link.startswith('/'): #Internal link, linking to page root url
|
||||||
|
link_list.append(root_url + link)
|
||||||
|
elif link.startswith("#"):
|
||||||
|
print("Invalid link : internal bookmark")
|
||||||
|
else:
|
||||||
|
links_list.append(current_page + link)
|
||||||
|
|
||||||
|
## uniqifier that also works with python < 3.6 (does not rely on dict ordering)
|
||||||
|
#seen = set()
|
||||||
|
#links_list = [x for x in links_list if x not in seen and not seen.add(x)]
|
||||||
|
|
||||||
|
# uniqifier
|
||||||
|
# Works only with python >= 3.6 (relies on dict preserving insertion order; guaranteed from 3.7)
|
||||||
|
links_list = list(dict.fromkeys(seq))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class WebsiteSchedulerMeta(type):
|
class WebsiteSchedulerMeta(type):
|
||||||
|
@ -140,7 +157,6 @@ class PageGetter:
|
||||||
async with self.session.get(self.url) as resp:
|
async with self.session.get(self.url) as resp:
|
||||||
return await resp.text()
|
return await resp.text()
|
||||||
|
|
||||||
async def async_parser(html_text)
|
|
||||||
|
|
||||||
async def async_print(url):
|
async def async_print(url):
|
||||||
""" Debug function to follow what's actually happening """
|
""" Debug function to follow what's actually happening """
|
||||||
|
|
Loading…
Reference in a new issue