From 9b78e268c90c3edeab7a6ac12830ef9a0bb40998 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Oudin?=
Date: Thu, 22 Feb 2018 14:33:07 +0100
Subject: [PATCH] Nearly working crawler

---
 crawl/crawl.py | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/crawl/crawl.py b/crawl/crawl.py
index c85a220..132acee 100644
--- a/crawl/crawl.py
+++ b/crawl/crawl.py
@@ -21,8 +21,10 @@ from bs4 import BeautifulSoup, Comment
 HARD_LIMIT = 20
 MAX_PER_PAGE = 10
 
+FOOTER_URL = re.compile(".*footer.*")
+
 class Settings:
-    USER_AGENT = 'Blah'
+    USER_AGENT = 'BlahBlah'
 
 settings = Settings()
 startup_time = datetime.now()
@@ -34,12 +36,18 @@ def url_getter(html, current_page, root_url):
     # Get only the body
     body = soup.find('body')
     # remove the body
-    body.footer.decompose()
+    if body.footer:
+        body.footer.decompose()
     # remove all comments
     comments = soup.findAll(text=lambda text: isinstance(text, Comment))
     for comment in comments:
         comment.extract()
 
+    print("Retrieving footers")
+    footers = soup.findAll(id=FOOTER_URL)
+    for footer in footers:
+        footer.extract()
+
     # Remove all bookmark links pointing to the current html page.
     links = map(lambda link: link["href"], body.find_all("a"))
     for link in links:
@@ -132,7 +140,7 @@ class CrawlingThread(Thread):
 
     def run(self):
         tasks = []
-        tasks.append(async_crawler('https://python.org'))
+        tasks.append(async_crawler("https://python.org/"))
         #tasks.append(async_print('https://python.org/about/gettingstarted'))
 
         loop = asyncio.new_event_loop()
@@ -192,11 +200,13 @@ async def async_crawler(url):
                 url,
                 parsed_url.scheme + "://" + parsed_url.netloc
             )
-            crawled += url
-            queue += sample(
+            crawled += [url]
+            sampled = sample(
                 new_urls,
                 randrange(min(MAX_PER_PAGE, len(new_urls)))
             )
+            queue += [sample_url for sample_url in sampled if sample_url not in
+                      queue and sample_url not in crawled]
             print(crawled)
 
 if __name__ == '__main__':
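
Note on the footer stripping: BeautifulSoup's findAll accepts a compiled
regular expression as an attribute filter, so soup.findAll(id=FOOTER_URL)
catches any element whose id contains "footer", in addition to the literal
<footer> tag handled by the new body.footer guard. A minimal standalone
sketch of that behaviour, with invented HTML for illustration:

    import re
    from bs4 import BeautifulSoup

    FOOTER_URL = re.compile(".*footer.*")

    # Invented markup: a semantic <footer> plus a div whose id matches the regex.
    html = """
    <body>
      <p><a href="/article">Article</a></p>
      <div id="page-footer"><a href="/legal">Legal</a></div>
      <footer><a href="/contact">Contact</a></footer>
    </body>
    """

    soup = BeautifulSoup(html, "html.parser")
    body = soup.find("body")

    # Same guard as the patch: decompose <footer> only if it exists.
    if body.footer:
        body.footer.decompose()

    # The compiled regex acts as an id filter; this catches id="page-footer".
    for footer in soup.findAll(id=FOOTER_URL):
        footer.extract()

    print([a["href"] for a in body.find_all("a")])  # ['/article']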
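
Note on the queue update: the old crawled += url extended the list with the
individual characters of the URL string; crawled += [url] appends the URL
itself. The sampled links are now also filtered against queue and crawled
before being enqueued. A self-contained sketch of the updated logic, with
invented URLs (the helper name is hypothetical, and a guard is added here
because randrange(0) raises ValueError on pages with no links, an edge case
the patch itself still has):

    from random import randrange, sample

    MAX_PER_PAGE = 10

    def update_queue(queue, crawled, url, new_urls):
        # Hypothetical helper mirroring the patch's queue update.
        crawled += [url]  # '+= [url]' appends one item; '+= url' appends each character
        if not new_urls:  # guard for this sketch only: randrange(0) raises ValueError
            return
        sampled = sample(
            new_urls,
            randrange(min(MAX_PER_PAGE, len(new_urls)))
        )
        queue += [u for u in sampled if u not in queue and u not in crawled]

    queue, crawled = [], []
    update_queue(queue, crawled, "https://python.org/",
                 ["https://python.org/about/", "https://python.org/doc/"])
    print(crawled)  # ['https://python.org/']
    print(queue)    # zero or one of the invented links, duplicates excluded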