From 15323c34659af445751592d8a5b4e3a60a41004d Mon Sep 17 00:00:00 2001
From: Théophile Bastian
Date: Sun, 25 Feb 2018 15:08:06 +0100
Subject: [PATCH] [REBASE ME] Crawl: enhance efficiency and output a tree

---
 crawl/crawl.py | 43 +++++++++++++++++++++++++++++--------------
 1 file changed, 29 insertions(+), 14 deletions(-)

diff --git a/crawl/crawl.py b/crawl/crawl.py
index 48aaba6..091bfe0 100644
--- a/crawl/crawl.py
+++ b/crawl/crawl.py
@@ -162,9 +162,9 @@ class CrawlingThread(Thread):
     """ A separate thread for the crawling task. This is needed to use asyncio,
     since the thread will need its own event loop. """
 
-    def __init__(self, user, url, queue):
+    def __init__(self, user, url, output_tree):
         global settings
-        self.queue = queue
+        self.output_tree = output_tree
         super(CrawlingThread, self).__init__()
         if user:
             settings.USER_AGENT = user.serialize_headers()
@@ -175,7 +175,7 @@ class CrawlingThread(Thread):
         tasks = []
         #tasks.append(async_crawler("http://plus.google.com/+Python"))
         #tasks.append(async_crawler('https://python.org/'))
-        tasks.append(async_crawler(self.url, self.queue))
+        tasks.append(async_crawler(self.url, self.output_tree))
 
         loop = asyncio.new_event_loop()
         asyncio.set_event_loop(loop)
@@ -220,39 +220,54 @@ async def async_print(url):
         url,
         datetime.now() - startup_time))
 
-async def async_crawler(url, queue):
-    queued = [url]
-    crawled = []
+
+class CrawlElem:
+    ''' Describes a crawled element, to be assembled into a tree '''
+
+    def __init__(self, url, parent):
+        self.url = url
+        self.parent = parent
+
+async def async_crawler(url, output_tree):
+    queued = [CrawlElem(url, None)]
+    crawled = set()
+    crawl_tree = []
+
     while queued and (len(crawled) < HARD_LIMIT):
         async with aiohttp.ClientSession() as session:
             try:
-                url = queued.pop(0)
+                crawl_elt = queued.pop(0)
+                url = crawl_elt.url
             except IndexError:
                 print("Error queue is empty")
                 return crawled
+            crawled.add(url)
             parsed_url = urlparse(url)
             print("Crawling {}".format(url))
             html = await PageGetter(session, url).get(ssl=False)
             if html:
+                crawl_tree.append(crawl_elt)
                 new_urls = url_getter(
                     html,
                     url,
                     parsed_url.scheme + "://" + parsed_url.netloc
                 )
-                crawled += [url]
                 if new_urls:
                     sampled = sample(
                         new_urls,
                         randrange(min(MAX_PER_PAGE, len(new_urls)))
                     )
-                    queued += [sample_url for sample_url in sampled if
-                               sample_url not in queued and sample_url not in
-                               crawled]
+                    queued += [
+                        CrawlElem(sample_url, crawl_elt)
+                        for sample_url in sampled
+                        if sample_url not in (elt.url for elt in queued)
+                        and sample_url not in crawled
+                    ]
     print(crawled)
-    queue.put(crawled)
+    output_tree += crawl_tree
 
 if __name__ == '__main__':
-    queue = Queue()
-    crawl = CrawlingThread(None, "https://python.org/", queue)
+    crawl_tree = []
+    crawl = CrawlingThread(None, "https://python.org/", crawl_tree)
     crawl.start()
     crawl.join()
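
Note (not part of the patch itself): after this change the crawler hands back a
flat crawl_tree list of CrawlElem nodes, each carrying a parent pointer. Below
is a minimal sketch of how a caller could fold that list into an explicit
parent -> children mapping once crawl.join() returns; the group_children helper
is hypothetical and is not defined anywhere in crawl.py.

    from collections import defaultdict

    def group_children(crawl_tree):
        """Group CrawlElem nodes by their parent; roots have parent == None."""
        roots = []
        children = defaultdict(list)
        for elem in crawl_tree:
            if elem.parent is None:
                roots.append(elem)                  # crawl entry points
            else:
                children[elem.parent].append(elem)  # edges of the crawl tree
        return roots, children

    # Example usage once the crawling thread has finished:
    # roots, children = group_children(crawl_tree)
    # for root in roots:
    #     print(root.url, [child.url for child in children[root]])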