From 9b78e268c90c3edeab7a6ac12830ef9a0bb40998 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Oudin?=
Date: Thu, 22 Feb 2018 14:33:07 +0100
Subject: [PATCH] Nearly working crawler

---
 crawl/crawl.py | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/crawl/crawl.py b/crawl/crawl.py
index c85a220..132acee 100644
--- a/crawl/crawl.py
+++ b/crawl/crawl.py
@@ -21,8 +21,10 @@ from bs4 import BeautifulSoup, Comment
 HARD_LIMIT = 20
 MAX_PER_PAGE = 10
 
+FOOTER_URL = re.compile(".*footer.*")
+
 class Settings:
-    USER_AGENT = 'Blah'
+    USER_AGENT = 'BlahBlah'
 
 settings = Settings()
 startup_time = datetime.now()
@@ -34,12 +36,18 @@ def url_getter(html, current_page, root_url):
     # Get only the body
     body = soup.find('body')
     # remove the body
-    body.footer.decompose()
+    if body.footer:
+        body.footer.decompose()
     # remove all comments
     comments = soup.findAll(text=lambda text: isinstance(text, Comment))
     for comment in comments:
         comment.extract()
 
+    print("Retrieving footers")
+    footers = soup.findAll(id=FOOTER_URL)
+    for footer in footers:
+        footer.extract()
+
     # Remove all bookmark links pointing to the current html page.
     links = map(lambda link: link["href"], body.find_all("a"))
     for link in links:
@@ -132,7 +140,7 @@ class CrawlingThread(Thread):
 
     def run(self):
         tasks = []
-        tasks.append(async_crawler('https://python.org'))
+        tasks.append(async_crawler("https://python.org/"))
         #tasks.append(async_print('https://python.org/about/gettingstarted'))
 
         loop = asyncio.new_event_loop()
@@ -192,11 +200,13 @@ async def async_crawler(url):
                 url,
                 parsed_url.scheme + "://" + parsed_url.netloc
             )
-            crawled += url
-            queue += sample(
+            crawled += [url]
+            sampled = sample(
                 new_urls,
                 randrange(min(MAX_PER_PAGE, len(new_urls)))
             )
+            queue += [sample_url for sample_url in sampled if sample_url not in
+                      queue and sample_url not in crawled]
             print(crawled)
 
 if __name__ == '__main__':
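
Note on the footer stripping: BeautifulSoup's findAll accepts a compiled
regular expression as an attribute filter, so soup.findAll(id=FOOTER_URL)
catches any element whose id contains "footer", in addition to the literal
<footer> tag handled by the new body.footer guard. A minimal standalone
sketch of that behaviour, with invented HTML for illustration:

    import re
    from bs4 import BeautifulSoup

    FOOTER_URL = re.compile(".*footer.*")

    # Invented markup: a semantic <footer> plus a div whose id matches the regex.
    html = """
    <body>
      <p><a href="/article">Article</a></p>
      <div id="page-footer"><a href="/legal">Legal</a></div>
      <footer><a href="/contact">Contact</a></footer>
    </body>
    """

    soup = BeautifulSoup(html, "html.parser")
    body = soup.find("body")

    # Same guard as the patch: decompose <footer> only if it exists.
    if body.footer:
        body.footer.decompose()

    # The compiled regex acts as an id filter; this catches id="page-footer".
    for footer in soup.findAll(id=FOOTER_URL):
        footer.extract()

    print([a["href"] for a in body.find_all("a")])  # ['/article']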
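
Note on the queue update: the old crawled += url extended the list with the
individual characters of the URL string; crawled += [url] appends the URL
itself. The sampled links are now also filtered against queue and crawled
before being enqueued. A self-contained sketch of the updated logic, with
invented URLs (the helper name is hypothetical, and a guard is added here
because randrange(0) raises ValueError on pages with no links, an edge case
the patch itself still has):

    from random import randrange, sample

    MAX_PER_PAGE = 10

    def update_queue(queue, crawled, url, new_urls):
        # Hypothetical helper mirroring the patch's queue update.
        crawled += [url]  # '+= [url]' appends one item; '+= url' appends each character
        if not new_urls:  # guard for this sketch only: randrange(0) raises ValueError
            return
        sampled = sample(
            new_urls,
            randrange(min(MAX_PER_PAGE, len(new_urls)))
        )
        queue += [u for u in sampled if u not in queue and u not in crawled]

    queue, crawled = [], []
    update_queue(queue, crawled, "https://python.org/",
                 ["https://python.org/about/", "https://python.org/doc/"])
    print(crawled)  # ['https://python.org/']
    print(queue)    # zero or one of the invented links, duplicates excluded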