Nearly working crawler

Rémi Oudin 2018-02-22 14:33:07 +01:00
parent e19e623df1
commit 9b78e268c9
1 changed file with 15 additions and 5 deletions

@@ -21,8 +21,10 @@ from bs4 import BeautifulSoup, Comment
 HARD_LIMIT = 20
 MAX_PER_PAGE = 10
+FOOTER_URL = re.compile(".*footer.*")
 class Settings:
-    USER_AGENT = 'Blah'
+    USER_AGENT = 'BlahBlah'
 settings = Settings()
 startup_time = datetime.now()
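
Note on the new FOOTER_URL constant: despite the name it is a compiled regex over element ids, and BeautifulSoup accepts a compiled pattern directly as an attribute filter, which is how the next hunk uses it. A minimal sketch, with the sample HTML invented for illustration:

import re
from bs4 import BeautifulSoup

FOOTER_URL = re.compile(".*footer.*")

# invented page, just to show the regex attribute filter at work
html = '<body><div id="page-footer">links</div><p id="content">text</p></body>'
soup = BeautifulSoup(html, "html.parser")

# findAll matches every element whose id attribute matches the regex
print([tag["id"] for tag in soup.findAll(id=FOOTER_URL)])  # ['page-footer']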
@@ -34,12 +36,18 @@ def url_getter(html, current_page, root_url):
     # Get only the body
     body = soup.find('body')
     # remove the footer
-    body.footer.decompose()
+    if body.footer:
+        body.footer.decompose()
     # remove all comments
     comments = soup.findAll(text=lambda text: isinstance(text, Comment))
     for comment in comments:
         comment.extract()
+    print("Retrieving footers")
+    footers = soup.findAll(id=FOOTER_URL)
+    for footer in footers:
+        footer.extract()
     # Remove all bookmark links pointing to the current html page.
     links = map(lambda link: link["href"], body.find_all("a"))
     for link in links:
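
The added guard matters because body.footer is None on pages with no <footer> element, so the previous unconditional decompose() call raised AttributeError. A minimal sketch of the same cleanup, on an invented page with a comment but no footer:

from bs4 import BeautifulSoup, Comment

html = "<body><p>hello</p><!-- tracking --></body>"
soup = BeautifulSoup(html, "html.parser")
body = soup.find("body")

# body.footer is None here; without the guard, .decompose() would raise
# AttributeError: 'NoneType' object has no attribute 'decompose'
if body.footer:
    body.footer.decompose()

# Comment nodes are NavigableString subclasses; extract() detaches them
for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
    comment.extract()

print(soup)  # <body><p>hello</p></body>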
@@ -132,7 +140,7 @@ class CrawlingThread(Thread):
     def run(self):
         tasks = []
-        tasks.append(async_crawler('https://python.org'))
+        tasks.append(async_crawler("https://python.org/"))
         #tasks.append(async_print('https://python.org/about/gettingstarted'))
         loop = asyncio.new_event_loop()
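
run() executes in a worker Thread, which has no default event loop, hence the fresh asyncio.new_event_loop(). The rest of the method is not shown in this hunk, so the sketch below only illustrates the usual per-thread loop pattern, with a stub coroutine standing in for the real async_crawler:

import asyncio
from threading import Thread

async def async_crawler(url):
    # stub standing in for the real coroutine
    await asyncio.sleep(0)
    print("would crawl", url)

class CrawlingThread(Thread):
    def run(self):
        # a fresh loop bound to this thread; only the main thread
        # gets a default event loop
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        try:
            loop.run_until_complete(async_crawler("https://python.org/"))
        finally:
            loop.close()

CrawlingThread().start()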
@@ -192,11 +200,13 @@ async def async_crawler(url):
             url,
             parsed_url.scheme + "://" + parsed_url.netloc
         )
-        crawled += url
-        queue += sample(
+        crawled += [url]
+        sampled = sample(
             new_urls,
             randrange(min(MAX_PER_PAGE, len(new_urls)))
         )
+        queue += [sample_url for sample_url in sampled if sample_url not in
+                  queue and sample_url not in crawled]
     print(crawled)
 if __name__ == '__main__':
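
The replaced lines fix two issues: crawled += url extended the list character by character, since a str is itself iterable, and sampled URLs were enqueued with no duplicate check. A minimal sketch of the corrected bookkeeping, with the URLs invented for illustration:

from random import randrange, sample

MAX_PER_PAGE = 10
crawled, queue = [], []
url = "https://a.example/"
new_urls = ["https://a.example/about", "https://b.example/", "https://a.example/"]

# wrapping the str in a list appends the whole URL; `crawled += url`
# would have appended each character separately
crawled += [url]

# pick a random subset of the freshly extracted links
sampled = sample(new_urls, randrange(min(MAX_PER_PAGE, len(new_urls))))

# only enqueue URLs that are neither queued nor already crawled
queue += [sample_url for sample_url in sampled
          if sample_url not in queue and sample_url not in crawled]
print(crawled, queue)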