Multiple bug fixes. TODO: remove <div id=footer>-like patterns

Rémi Oudin 2018-02-22 14:07:53 +01:00
parent 236e15296c
commit e19e623df1
1 changed file with 37 additions and 8 deletions


@@ -1,7 +1,8 @@
from threading import Thread
from urllib.robotparser import RobotFileParser
from urllib.parse import urlparse
from bs4 import BeautifulSoup, Comment
from random import sample, randrange
import re
from datetime import datetime, timedelta
@@ -9,13 +10,16 @@ import asyncio
import aiohttp
import async_timeout
from bs4 import BeautifulSoup, Comment
# Ugly hack to use this module alone instead of integrating it with Django
# from django.conf import settings
# Matches direct bookmarks (in-page anchors) in the html.
# We want this so we can avoid following this kind of link
BOOKMARK_URL = "#.*"
HARD_LIMIT = 20
MAX_PER_PAGE = 10
class Settings:
    USER_AGENT = 'Blah'
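Note: BOOKMARK_URL is only defined as a regex string here; the later url_getter branch checks link.startswith("#") instead. A minimal sketch (not part of this commit) of how the pattern could be applied, using a hypothetical is_bookmark helper:

import re

BOOKMARK_URL = "#.*"

def is_bookmark(href):
    # True for in-page anchors such as "#footer", which the crawler should skip.
    return re.match(BOOKMARK_URL, href) is not None

assert is_bookmark("#footer")
assert not is_bookmark("https://python.org/about/")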
@@ -32,17 +36,17 @@ def url_getter(html, current_page, root_url):
    # remove the footer
    body.footer.decompose()
    # remove all comments
    comments = soup.findAll(text=lambda text:isinstance(text, Comment))
    comments = soup.findAll(text=lambda text: isinstance(text, Comment))
    for comment in comments:
        comment.extract()
    # Remove all bookmark links pointing to the current html page.
    links = body.find_all("a")
    links = map(lambda link: link["href"], body.find_all("a"))
    for link in links:
        if link.startswith("http"):
            links_list.append(link)
        elif link.startswith('/'):  # Internal link, linking to page root url
            link_list.append(root_url + link)
            links_list.append(root_url + link)
        elif link.startswith("#"):
            print("Invalid link: internal bookmark")
        else:
@@ -53,7 +57,8 @@ def url_getter(html, current_page, root_url):
    #links_list = [x for x in links_list if x not in seen and not seen.add(x)]
    # uniqifier
    # Works only with python >= 3.6
    links_list = list(dict.fromkeys(seq))
    links_list = list(dict.fromkeys(links_list))
    print(links_list)
    return links_list
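Note: the dict.fromkeys call is the commit's "uniqifier": it drops duplicate links while keeping their first-seen order, relying on dict insertion order (an implementation detail of CPython 3.6, guaranteed by the language from 3.7). A minimal illustration, not part of the commit:

links_list = ["https://a.org/", "https://b.org/", "https://a.org/"]
deduped = list(dict.fromkeys(links_list))
assert deduped == ["https://a.org/", "https://b.org/"]  # duplicates removed, order kept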
@@ -127,8 +132,8 @@ class CrawlingThread(Thread):
    def run(self):
        tasks = []
        tasks.append(async_print('https://python.org'))
        tasks.append(async_print('https://python.org/about/gettingstarted'))
        tasks.append(async_crawler('https://python.org'))
        #tasks.append(async_print('https://python.org/about/gettingstarted'))
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
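Note: the hunk stops right after the per-thread event loop is created; the lines that actually run the tasks are outside the diff. A minimal sketch of the usual continuation for this setup (assumed, not shown in the commit), reusing the loop and tasks built above:

try:
    # Drive every queued coroutine to completion on this thread's private loop.
    loop.run_until_complete(asyncio.wait(tasks))
finally:
    loop.close()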
@@ -169,6 +174,30 @@ async def async_print(url):
        url,
        datetime.now() - startup_time))
async def async_crawler(url):
    queue = [url]
    crawled = []
    # Keep crawling while URLs remain and the hard limit is not reached.
    while queue and len(crawled) < HARD_LIMIT:
        async with aiohttp.ClientSession() as session:
            try:
                url = queue.pop(0)
            except IndexError:
                print("Error: queue is empty")
                return crawled
            parsed_url = urlparse(url)
            print("Crawling {}".format(url))
            html = await PageGetter(session, url).get()
            new_urls = url_getter(
                html,
                url,
                parsed_url.scheme + "://" + parsed_url.netloc
            )
            # Record the crawled page itself, not its individual characters.
            crawled.append(url)
            # Enqueue at most MAX_PER_PAGE randomly chosen outgoing links.
            if new_urls:
                queue += sample(
                    new_urls,
                    randrange(min(MAX_PER_PAGE, len(new_urls)))
                )
    print(crawled)
if __name__ == '__main__':
    crawl = CrawlingThread()
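Note: the __main__ block is cut off by the hunk; a minimal sketch of the expected usage, assuming the remaining lines simply start and join the crawler thread (not shown in this commit):

if __name__ == '__main__':
    crawl = CrawlingThread()
    crawl.start()   # run() builds the task list and drives the per-thread event loop
    crawl.join()    # block until the crawl finishes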