diff --git a/crawl/crawl.py b/crawl/crawl.py
index f18f4cf..c85a220 100644
--- a/crawl/crawl.py
+++ b/crawl/crawl.py
@@ -1,7 +1,8 @@
 from threading import Thread
 from urllib.robotparser import RobotFileParser
+from urllib.parse import urlparse
 
-from bs4 import BeautifulSoup, Comment
+from random import sample, randrange
 
 import re
 from datetime import datetime, timedelta
@@ -9,13 +10,16 @@
 import asyncio
 import aiohttp
 import async_timeout
 
+from bs4 import BeautifulSoup, Comment
+
 # Ugly hack to use this module alone instead of integrating it with Django
 # from django.conf import settings
 
 # Gets all the direct bookmarks in the html.
 # We want this to avoid following this kind of bookmark
-BOOKMARK_URL = "#.*"
+HARD_LIMIT = 20
+MAX_PER_PAGE = 10
 
 class Settings:
     USER_AGENT = 'Blah'
@@ -32,17 +36,17 @@ def url_getter(html, current_page, root_url):
     # remove the body
     body.footer.decompose()
     # remove all comments
-    comments = soup.findAll(text=lambda text:isinstance(text, Comment))
+    comments = soup.findAll(text=lambda text: isinstance(text, Comment))
     for comment in comments:
         comment.extract()
 
     # Remove all bookmark links pointing to the current html page.
-    links = body.find_all("a")
+    links = map(lambda link: link["href"], body.find_all("a", href=True))
     for link in links:
         if link.startswith("http"):
             links_list.append(link)
         elif link.startswith('/'): #Internal link, linking to page root url
-            link_list.append(root_url + link)
+            links_list.append(root_url + link)
         elif link.startswith("#"):
             print("Invalid link : internal bookmark")
         else:
@@ -53,7 +57,8 @@ def url_getter(html, current_page, root_url):
 
     #links_list = [x for x in links_list if x not in seen and not seen.add(x)] # uniqifier
     # Works only with python >= 3.6
-    links_list = list(dict.fromkeys(seq))
+    links_list = list(dict.fromkeys(links_list))
+    print(links_list)
 
     return links_list
 
@@ -127,8 +132,8 @@ class CrawlingThread(Thread):
 
     def run(self):
         tasks = []
-        tasks.append(async_print('https://python.org'))
-        tasks.append(async_print('https://python.org/about/gettingstarted'))
+        tasks.append(async_crawler('https://python.org'))
+        #tasks.append(async_print('https://python.org/about/gettingstarted'))
 
         loop = asyncio.new_event_loop()
         asyncio.set_event_loop(loop)
@@ -169,6 +174,32 @@ async def async_print(url):
         url, datetime.now() - startup_time))
 
+async def async_crawler(url):
+    queue = [url]
+    crawled = []
+    # Breadth-first crawl, bounded by HARD_LIMIT pages.
+    while queue and len(crawled) < HARD_LIMIT:
+        async with aiohttp.ClientSession() as session:
+            try:
+                url = queue.pop(0)
+            except IndexError:
+                print("Error queue is empty")
+                return crawled
+            parsed_url = urlparse(url)
+            print("Crawling {}".format(url))
+            html = await PageGetter(session, url).get()
+            new_urls = url_getter(
+                html,
+                url,
+                parsed_url.scheme + "://" + parsed_url.netloc
+            )
+            crawled.append(url)
+            queue += sample(
+                new_urls,
+                randrange(min(MAX_PER_PAGE, len(new_urls)) + 1)
+            )
+    print(crawled)
+    return crawled
 
 if __name__ == '__main__':
     crawl = CrawlingThread()
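
Note: below is a minimal, untested sketch of how the new async_crawler coroutine could be driven on its own, outside CrawlingThread, mirroring the event-loop setup already used in CrawlingThread.run(). The import path crawl.crawl and the seed URL are assumptions for illustration only, not part of the patch.

import asyncio

from crawl.crawl import async_crawler  # assumed import path for the patched module


def main():
    # Same pattern as CrawlingThread.run(): create a dedicated event loop
    # and block until the bounded crawl returns its list of visited URLs.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        crawled = loop.run_until_complete(async_crawler('https://python.org'))
    finally:
        loop.close()
    print("{} pages crawled".format(len(crawled)))


if __name__ == '__main__':
    main()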