Better filter: skip internal bookmark links and drop URLs containing 'login' or 'agreement'
This commit is contained in:
parent
f0b8672c89
commit
e56c088632
1 changed file with 7 additions and 1 deletion
|
@@ -60,7 +60,7 @@ def url_getter(html, current_page, root_url):
|
||||||
elif link.startswith('/'): #Internal link, linking to page root url
|
elif link.startswith('/'): #Internal link, linking to page root url
|
||||||
links_list.append(root_url + link)
|
links_list.append(root_url + link)
|
||||||
elif link.startswith("#"):
|
elif link.startswith("#"):
|
||||||
print("Invalid link : internal bookmark")
|
continue
|
||||||
else:
|
else:
|
||||||
links_list.append(current_page + "/" + link)
|
links_list.append(current_page + "/" + link)
|
||||||
|
|
||||||
|
@@ -71,6 +71,11 @@ def url_getter(html, current_page, root_url):
|
||||||
# Works only with python >= 3.6
|
# Works only with python >= 3.6
|
||||||
links_list = list(dict.fromkeys(links_list))
|
links_list = list(dict.fromkeys(links_list))
|
||||||
|
|
||||||
|
forbidden_words = ['login', 'agreement']
|
||||||
|
links_list = [link for link in links_list if not any(word in link.lower()
|
||||||
|
for word in
|
||||||
|
forbidden_words)]
|
||||||
|
|
||||||
return links_list
|
return links_list
|
||||||
|
|
||||||
|
|
||||||
|
@@ -235,6 +240,7 @@ async def async_crawler(url):
|
||||||
sample_url not in queue and sample_url not in
|
sample_url not in queue and sample_url not in
|
||||||
crawled]
|
crawled]
|
||||||
print(crawled)
|
print(crawled)
|
||||||
|
return crawled
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
crawl = CrawlingThread()
|
crawl = CrawlingThread()
|
||||||
|
|
Loading…
Reference in a new issue