Nearly working crawler
parent e19e623df1
commit 9b78e268c9
1 changed file with 15 additions and 5 deletions
@@ -21,8 +21,10 @@ from bs4 import BeautifulSoup, Comment
 HARD_LIMIT = 20
 MAX_PER_PAGE = 10
 
+FOOTER_URL = re.compile(".*footer.*")
+
 class Settings:
-    USER_AGENT = 'Blah'
+    USER_AGENT = 'BlahBlah'
 
 settings = Settings()
 startup_time = datetime.now()
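The new FOOTER_URL pattern is a compiled regex that the next hunk passes to findAll as an id filter. A minimal sketch of how BeautifulSoup matches a compiled pattern against attribute values; the HTML snippet and variable names below are illustrative, not taken from the crawler:

```python
import re

from bs4 import BeautifulSoup

FOOTER_URL = re.compile(".*footer.*")

# Illustrative markup only; any element whose id matches the pattern is returned.
html = '<body><div id="page-footer">bye</div><div id="content">hi</div></body>'
soup = BeautifulSoup(html, "html.parser")

footers = soup.findAll(id=FOOTER_URL)
print([tag["id"] for tag in footers])  # ['page-footer']
```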
|
@@ -34,12 +36,18 @@ def url_getter(html, current_page, root_url):
     # Get only the body
     body = soup.find('body')
     # remove the footer
-    body.footer.decompose()
+    if body.footer:
+        body.footer.decompose()
     # remove all comments
     comments = soup.findAll(text=lambda text: isinstance(text, Comment))
     for comment in comments:
         comment.extract()
 
+    print("Retrieving footers")
+    footers = soup.findAll(id=FOOTER_URL)
+    for footer in footers:
+        footer.extract()
+
     # Remove all bookmark links pointing to the current html page.
     links = map(lambda link: link["href"], body.find_all("a"))
     for link in links:
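Putting the hunk together: the decompose() call is now guarded (body.footer is None on pages without a <footer> tag), comment nodes are extracted, and elements whose id matches FOOTER_URL are dropped as well. A self-contained sketch of that cleanup pass, using made-up markup rather than a real crawled page:

```python
import re

from bs4 import BeautifulSoup, Comment

FOOTER_URL = re.compile(".*footer.*")

html = """
<body>
  <p>keep me</p>
  <!-- a comment to strip -->
  <footer>site footer</footer>
  <div id="inline-footer">legal text</div>
</body>
"""
soup = BeautifulSoup(html, "html.parser")
body = soup.find('body')

# body.footer is None when the page has no <footer>, so guard the decompose().
if body.footer:
    body.footer.decompose()

# Drop HTML comment nodes.
for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
    comment.extract()

# Drop any remaining element whose id looks footer-like.
for footer in soup.findAll(id=FOOTER_URL):
    footer.extract()

print(body)  # the <p> element is all that is left inside <body>
```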
|
@@ -132,7 +140,7 @@ class CrawlingThread(Thread):
 
     def run(self):
         tasks = []
-        tasks.append(async_crawler('https://python.org'))
+        tasks.append(async_crawler("https://python.org/"))
         #tasks.append(async_print('https://python.org/about/gettingstarted'))
 
         loop = asyncio.new_event_loop()
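Only the task setup and the fresh event loop are visible in this hunk; the rest of run() sits outside the diff. A rough sketch of the underlying pattern, driving coroutines from a Thread that owns its own loop, with a stand-in coroutine and class name instead of the real async_crawler and CrawlingThread:

```python
import asyncio
from threading import Thread

async def fake_crawler(url):
    # Stand-in for async_crawler; the real coroutine fetches and parses pages.
    await asyncio.sleep(0.1)
    return url

class CrawlerThread(Thread):
    def run(self):
        tasks = [fake_crawler("https://python.org/")]

        async def main():
            return await asyncio.gather(*tasks)

        # Worker threads have no default event loop, so create and install one.
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        try:
            print(loop.run_until_complete(main()))
        finally:
            loop.close()

thread = CrawlerThread()
thread.start()
thread.join()
```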
|
@@ -192,11 +200,13 @@ async def async_crawler(url):
             url,
             parsed_url.scheme + "://" + parsed_url.netloc
         )
-        crawled += url
-        queue += sample(
+        crawled += [url]
+        sampled = sample(
            new_urls,
            randrange(min(MAX_PER_PAGE, len(new_urls)))
        )
+        queue += [sample_url for sample_url in sampled if sample_url not in
+                  queue and sample_url not in crawled]
         print(crawled)
 
 if __name__ == '__main__':
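Two fixes land in this hunk: assuming crawled is a list (as the new right-hand side suggests), `crawled += url` extended it with the individual characters of the URL, so it becomes `crawled += [url]`; and the sampled links are now filtered so that nothing already queued or already crawled is enqueued again. A small sketch of that sampling-and-dedup step with made-up URLs:

```python
from random import randrange, sample

MAX_PER_PAGE = 10

queue = ["https://python.org/"]
crawled = ["https://python.org/about/"]
new_urls = [
    "https://python.org/",        # already queued: skipped
    "https://python.org/about/",  # already crawled: skipped
    "https://docs.python.org/3/",
    "https://pypi.org/",
]

# Pick a random subset of the discovered links, capped at MAX_PER_PAGE.
# Note that randrange(n) can return 0 (sample nothing) and raises ValueError
# when new_urls is empty.
sampled = sample(new_urls, randrange(min(MAX_PER_PAGE, len(new_urls))))

# Only enqueue links that are not already queued or crawled.
queue += [url for url in sampled if url not in queue and url not in crawled]
print(queue)
```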
|
|