Better filter
This commit is contained in:
parent
f0b8672c89
commit
e56c088632
1 changed file with 7 additions and 1 deletion
|
@ -60,7 +60,7 @@ def url_getter(html, current_page, root_url):
|
|||
elif link.startswith('/'): #Internal link, linking to page root url
|
||||
links_list.append(root_url + link)
|
||||
elif link.startswith("#"):
|
||||
print("Invalid link : internal bookmark")
|
||||
continue
|
||||
else:
|
||||
links_list.append(current_page + "/" + link)
|
||||
|
||||
|
@ -71,6 +71,11 @@ def url_getter(html, current_page, root_url):
|
|||
# Works only with python >= 3.6
|
||||
links_list = list(dict.fromkeys(links_list))
|
||||
|
||||
forbidden_words = ['login', 'agreement']
|
||||
links_list = [link for link in links_list if not any(word in link.lower()
|
||||
for word in
|
||||
forbidden_words)]
|
||||
|
||||
return links_list
|
||||
|
||||
|
||||
|
@ -235,6 +240,7 @@ async def async_crawler(url):
|
|||
sample_url not in queue and sample_url not in
|
||||
crawled]
|
||||
print(crawled)
|
||||
return crawled
|
||||
|
||||
if __name__ == '__main__':
|
||||
crawl = CrawlingThread()
|
||||
|
|
Loading…
Reference in a new issue