Nearly working crawler

Rémi Oudin 2018-02-22 14:33:07 +01:00
parent e19e623df1
commit 9b78e268c9
1 changed file with 15 additions and 5 deletions

@@ -21,8 +21,10 @@ from bs4 import BeautifulSoup, Comment
 HARD_LIMIT = 20
 MAX_PER_PAGE = 10
+FOOTER_URL = re.compile(".*footer.*")
 class Settings:
-    USER_AGENT = 'Blah'
+    USER_AGENT = 'BlahBlah'
 settings = Settings()
 startup_time = datetime.now()
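
Note on the new FOOTER_URL constant: despite the name it is a compiled regex over element ids, and BeautifulSoup accepts a compiled pattern directly as an attribute filter, which is how the next hunk uses it. A minimal sketch, with the sample HTML invented for illustration:

import re
from bs4 import BeautifulSoup

FOOTER_URL = re.compile(".*footer.*")

# invented page, just to show the regex attribute filter at work
html = '<body><div id="page-footer">links</div><p id="content">text</p></body>'
soup = BeautifulSoup(html, "html.parser")

# findAll matches every element whose id attribute matches the regex
print([tag["id"] for tag in soup.findAll(id=FOOTER_URL)])  # ['page-footer']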
@@ -34,12 +36,18 @@ def url_getter(html, current_page, root_url):
     # Get only the body
     body = soup.find('body')
     # remove the footer
-    body.footer.decompose()
+    if body.footer:
+        body.footer.decompose()
     # remove all comments
     comments = soup.findAll(text=lambda text: isinstance(text, Comment))
     for comment in comments:
         comment.extract()
+    print("Retrieving footers")
+    footers = soup.findAll(id=FOOTER_URL)
+    for footer in footers:
+        footer.extract()
     # Remove all bookmark links pointing to the current html page.
     links = map(lambda link: link["href"], body.find_all("a"))
     for link in links:
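
The added guard matters because body.footer is None on pages with no <footer> element, so the previous unconditional decompose() call raised AttributeError. A minimal sketch of the same cleanup, on an invented page with a comment but no footer:

from bs4 import BeautifulSoup, Comment

html = "<body><p>hello</p><!-- tracking --></body>"
soup = BeautifulSoup(html, "html.parser")
body = soup.find("body")

# body.footer is None here; without the guard, .decompose() would raise
# AttributeError: 'NoneType' object has no attribute 'decompose'
if body.footer:
    body.footer.decompose()

# Comment nodes are NavigableString subclasses; extract() detaches them
for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
    comment.extract()

print(soup)  # <body><p>hello</p></body>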
@@ -132,7 +140,7 @@ class CrawlingThread(Thread):
     def run(self):
         tasks = []
-        tasks.append(async_crawler('https://python.org'))
+        tasks.append(async_crawler("https://python.org/"))
         #tasks.append(async_print('https://python.org/about/gettingstarted'))
         loop = asyncio.new_event_loop()
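
run() executes in a worker Thread, which has no default event loop, hence the fresh asyncio.new_event_loop(). The rest of the method is not shown in this hunk, so the sketch below only illustrates the usual per-thread loop pattern, with a stub coroutine standing in for the real async_crawler:

import asyncio
from threading import Thread

async def async_crawler(url):
    # stub standing in for the real coroutine
    await asyncio.sleep(0)
    print("would crawl", url)

class CrawlingThread(Thread):
    def run(self):
        # a fresh loop bound to this thread; only the main thread
        # gets a default event loop
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        try:
            loop.run_until_complete(async_crawler("https://python.org/"))
        finally:
            loop.close()

CrawlingThread().start()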
@@ -192,11 +200,13 @@ async def async_crawler(url):
             url,
             parsed_url.scheme + "://" + parsed_url.netloc
         )
-        crawled += url
-        queue += sample(
+        crawled += [url]
+        sampled = sample(
             new_urls,
             randrange(min(MAX_PER_PAGE, len(new_urls)))
         )
+        queue += [sample_url for sample_url in sampled if sample_url not in
+                  queue and sample_url not in crawled]
     print(crawled)
 if __name__ == '__main__':
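
The replaced lines fix two issues: crawled += url extended the list character by character, since a str is itself iterable, and sampled URLs were enqueued with no duplicate check. A minimal sketch of the corrected bookkeeping, with the URLs invented for illustration:

from random import randrange, sample

MAX_PER_PAGE = 10
crawled, queue = [], []
url = "https://a.example/"
new_urls = ["https://a.example/about", "https://b.example/", "https://a.example/"]

# wrapping the str in a list appends the whole URL; `crawled += url`
# would have appended each character separately
crawled += [url]

# pick a random subset of the freshly extracted links
sampled = sample(new_urls, randrange(min(MAX_PER_PAGE, len(new_urls))))

# only enqueue URLs that are neither queued nor already crawled
queue += [sample_url for sample_url in sampled
          if sample_url not in queue and sample_url not in crawled]
print(crawled, queue)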