[REBASE ME] Crawl: enhance efficiency and output a tree

commit 15323c3465
parent bc7348f677
Author: Théophile Bastian
Date:   2018-02-25 15:08:06 +01:00


@@ -162,9 +162,9 @@ class CrawlingThread(Thread):
     """ A separate thread for the crawling task. This is needed to use asyncio,
     since the thread will need its own event loop. """
 
-    def __init__(self, user, url, queue):
+    def __init__(self, user, url, output_tree):
         global settings
-        self.queue = queue
+        self.output_tree = output_tree
         super(CrawlingThread, self).__init__()
         if user:
             settings.USER_AGENT = user.serialize_headers()
@@ -175,7 +175,7 @@ class CrawlingThread(Thread):
         tasks = []
         #tasks.append(async_crawler("http://plus.google.com/+Python"))
         #tasks.append(async_crawler('https://python.org/'))
-        tasks.append(async_crawler(self.url, self.queue))
+        tasks.append(async_crawler(self.url, self.output_tree))
 
         loop = asyncio.new_event_loop()
         asyncio.set_event_loop(loop)
@@ -220,39 +220,54 @@ async def async_print(url):
         url,
         datetime.now() - startup_time))
 
-async def async_crawler(url, queue):
-    queued = [url]
-    crawled = []
+
+class CrawlElem:
+    ''' Describes a crawled element, to be assembled into a tree '''
+
+    def __init__(self, url, parent):
+        self.url = url
+        self.parent = parent
+
+
+async def async_crawler(url, output_tree):
+    queued = [CrawlElem(url, None)]
+    crawled = set()
+    crawl_tree = []
     while queued and (len(crawled) < HARD_LIMIT):
         async with aiohttp.ClientSession() as session:
             try:
-                url = queued.pop(0)
+                crawl_elt = queued.pop(0)
+                url = crawl_elt.url
             except IndexError:
                 print("Error queue is empty")
                 return crawled
+            crawled += url
             parsed_url = urlparse(url)
             print("Crawling {}".format(url))
             html = await PageGetter(session, url).get(ssl=False)
             if html:
+                crawl_tree.append(crawl_elt)
                 new_urls = url_getter(
                     html,
                     url,
                     parsed_url.scheme + "://" + parsed_url.netloc
                 )
-                crawled += [url]
                 if new_urls:
                     sampled = sample(
                         new_urls,
                         randrange(min(MAX_PER_PAGE, len(new_urls)))
                     )
-                    queued += [sample_url for sample_url in sampled if
-                               sample_url not in queued and sample_url not in
-                               crawled]
+                    queued += [
+                        CrawlElem(sample_url, crawl_elt)
+                        for sample_url in sampled
+                        if sample_url not in queued
+                        and sample_url not in crawled
+                    ]
     print(crawled)
-    queue.put(crawled)
+    output_tree += crawl_tree
 
 if __name__ == '__main__':
-    queue = Queue()
-    crawl = CrawlingThread(None, "https://python.org/", queue)
+    crawl_tree = []
+    crawl = CrawlingThread(None, "https://python.org/", crawl_tree)
     crawl.start()
     crawl.join()
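
The crawling thread now fills a plain list of CrawlElem nodes, each carrying a parent link, so the tree itself still has to be assembled by the caller once crawl.join() returns. Below is a minimal sketch of that assembly, assuming the CrawlElem and CrawlingThread definitions from the diff above; build_tree, print_tree and the children mapping are illustrative names, not part of the commit. Note also that with crawled now a set(), the added `crawled += url` line would raise a TypeError at runtime; something like crawled.add(url) is presumably the intent.

    from collections import defaultdict

    def build_tree(crawl_tree):
        ''' Group the flat list of CrawlElem nodes by their parent link.
        Returns (roots, children): the nodes with no parent, and a mapping
        from each node to the nodes discovered from it.
        Illustrative helper, not part of the commit. '''
        children = defaultdict(list)
        roots = []
        for elem in crawl_tree:
            if elem.parent is None:
                roots.append(elem)
            else:
                children[elem.parent].append(elem)
        return roots, children

    def print_tree(node, children, depth=0):
        ''' Print one node per line, indented by its depth in the tree. '''
        print('  ' * depth + node.url)
        for child in children.get(node, []):
            print_tree(child, children, depth + 1)

    if __name__ == '__main__':
        crawl_tree = []
        crawl = CrawlingThread(None, "https://python.org/", crawl_tree)
        crawl.start()
        crawl.join()
        roots, children = build_tree(crawl_tree)
        for root in roots:
            print_tree(root, children)

Because CrawlingThread is handed the same list object and async_crawler extends it in place (output_tree += crawl_tree), the caller sees the collected nodes as soon as join() returns, without going through a Queue.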