[REBASE ME] Crawl: enhance efficiency and output a tree

Théophile Bastian 2018-02-25 15:08:06 +01:00
parent bc7348f677
commit 15323c3465
1 changed file with 29 additions and 14 deletions


@@ -162,9 +162,9 @@ class CrawlingThread(Thread):
     """ A separate thread for the crawling task. This is needed to use asyncio,
     since the thread will need its own event loop. """
-    def __init__(self, user, url, queue):
+    def __init__(self, user, url, output_tree):
         global settings
-        self.queue = queue
+        self.output_tree = output_tree
         super(CrawlingThread, self).__init__()
         if user:
             settings.USER_AGENT = user.serialize_headers()
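
With this change the crawler no longer reports its result through a Queue: the caller hands the thread a plain list (output_tree), the worker extends that list in place, and the caller reads it back once the thread has been joined. A minimal standalone sketch of that hand-off pattern, with illustrative names that are not part of this codebase:

import threading

class Worker(threading.Thread):
    ''' Toy worker that publishes its results into a caller-provided list. '''
    def __init__(self, output):
        super().__init__()
        self.output = output

    def run(self):
        # Build the results locally, then publish them in one step,
        # mirroring the final output_tree += crawl_tree in this commit.
        results = ['a', 'b', 'c']
        self.output += results

out = []
worker = Worker(out)
worker.start()
worker.join()   # only read the list once the worker has finished
print(out)      # ['a', 'b', 'c']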
@@ -175,7 +175,7 @@ class CrawlingThread(Thread):
         tasks = []
         #tasks.append(async_crawler("http://plus.google.com/+Python"))
         #tasks.append(async_crawler('https://python.org/'))
-        tasks.append(async_crawler(self.url, self.queue))
+        tasks.append(async_crawler(self.url, self.output_tree))
         loop = asyncio.new_event_loop()
         asyncio.set_event_loop(loop)
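
As the class docstring notes, each CrawlingThread has to create and register its own event loop: asyncio only provisions a loop automatically for the main thread, so a worker thread must build one before it can run coroutines. A minimal standalone sketch of that pattern, with an illustrative coroutine that is not part of this codebase:

import asyncio
import threading

async def do_work(tag):
    # Stand-in for the real crawling coroutine.
    await asyncio.sleep(0.1)
    print('done:', tag)

def thread_main():
    # A freshly spawned thread has no event loop yet: create one,
    # register it for this thread, then drive the coroutine to completion.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    loop.run_until_complete(do_work('in-thread'))
    loop.close()

t = threading.Thread(target=thread_main)
t.start()
t.join()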
@@ -220,39 +220,54 @@ async def async_print(url):
         url,
         datetime.now() - startup_time))
 
-async def async_crawler(url, queue):
-    queued = [url]
-    crawled = []
+class CrawlElem:
+    ''' Describes a crawled element, to be assembled into a tree '''
+    def __init__(self, url, parent):
+        self.url = url
+        self.parent = parent
+
+async def async_crawler(url, output_tree):
+    queued = [CrawlElem(url, None)]
+    crawled = set()
+    crawl_tree = []
     while queued and (len(crawled) < HARD_LIMIT):
         async with aiohttp.ClientSession() as session:
             try:
-                url = queued.pop(0)
+                crawl_elt = queued.pop(0)
+                url = crawl_elt.url
             except IndexError:
                 print("Error queue is empty")
                 return crawled
-            crawled += url
             parsed_url = urlparse(url)
             print("Crawling {}".format(url))
             html = await PageGetter(session, url).get(ssl=False)
             if html:
+                crawl_tree.append(crawl_elt)
                 new_urls = url_getter(
                     html,
                     url,
                     parsed_url.scheme + "://" + parsed_url.netloc
                 )
                 crawled += [url]
                 if new_urls:
                     sampled = sample(
                         new_urls,
                         randrange(min(MAX_PER_PAGE, len(new_urls)))
                     )
-                    queued += [sample_url for sample_url in sampled if
-                               sample_url not in queued and sample_url not in
-                               crawled]
+                    queued += [
+                        CrawlElem(sample_url, crawl_elt)
+                        for sample_url in sampled
+                        if sample_url not in queued
+                        and sample_url not in crawled
+                    ]
     print(crawled)
-    queue.put(crawled)
+    output_tree += crawl_tree
 
 if __name__ == '__main__':
-    queue = Queue()
-    crawl = CrawlingThread(None, "https://python.org/", queue)
+    crawl_tree = []
+    crawl = CrawlingThread(None, "https://python.org/", crawl_tree)
     crawl.start()
     crawl.join()
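
The crawl itself only collects flat CrawlElem records, each carrying a url and a pointer to the parent element it was discovered from; turning them into an actual tree (the "output a tree" part of the commit message) is left to the consumer of output_tree. A possible sketch of that assembly step, using hypothetical helpers that are not part of this commit:

def build_children(crawl_tree):
    # Map each parent URL to the URLs discovered from it.
    children = {}
    for elem in crawl_tree:
        if elem.parent is not None:
            children.setdefault(elem.parent.url, []).append(elem.url)
    return children

def print_tree(children, root, depth=0):
    # Depth-first pretty-printer following the parent links downwards.
    print('  ' * depth + root)
    for child in children.get(root, []):
        print_tree(children, child, depth + 1)

# After crawl.join(), crawl_tree holds the visited CrawlElem objects:
#   children = build_children(crawl_tree)
#   print_tree(children, "https://python.org/")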