[REBASE ME] Crawl: enhance efficiency and output a tree
parent bc7348f677
commit 15323c3465
1 changed file with 29 additions and 14 deletions
@@ -162,9 +162,9 @@ class CrawlingThread(Thread):
     """ A separate thread for the crawling task. This is needed to use asyncio,
     since the thread will need its own event loop. """

-    def __init__(self, user, url, queue):
+    def __init__(self, user, url, output_tree):
         global settings
-        self.queue = queue
+        self.output_tree = output_tree
         super(CrawlingThread, self).__init__()
         if user:
             settings.USER_AGENT = user.serialize_headers()
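The constructor now takes a plain list (output_tree) in place of the former Queue. Since the caller only reads that list after join(), no queue synchronisation is needed. A minimal sketch of the handoff pattern, with illustrative names that are not code from this repository:

from threading import Thread

def worker(results):
    # The worker owns the list while it runs and only appends to it.
    results.extend(["a", "b", "c"])

collected = []                               # shared output container
t = Thread(target=worker, args=(collected,))
t.start()
t.join()                                     # read only after the thread is done
print(collected)                             # ['a', 'b', 'c']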
@@ -175,7 +175,7 @@ class CrawlingThread(Thread):
         tasks = []
         #tasks.append(async_crawler("http://plus.google.com/+Python"))
         #tasks.append(async_crawler('https://python.org/'))
-        tasks.append(async_crawler(self.url, self.queue))
+        tasks.append(async_crawler(self.url, self.output_tree))

         loop = asyncio.new_event_loop()
         asyncio.set_event_loop(loop)
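The hunk ends just after the event loop is created; presumably run() then drives the task list to completion on that thread-local loop. A self-contained sketch of the pattern, using a stand-in coroutine in place of the real async_crawler (assumed, since the rest of run() is not shown in this diff):

import asyncio
from threading import Thread

async def fake_crawler(url, output_tree):
    # Stand-in for async_crawler: pretend one page was crawled.
    output_tree.append(url)

class DemoCrawlingThread(Thread):
    def __init__(self, url, output_tree):
        super(DemoCrawlingThread, self).__init__()
        self.url = url
        self.output_tree = output_tree

    def run(self):
        tasks = [fake_crawler(self.url, self.output_tree)]
        loop = asyncio.new_event_loop()      # each thread needs its own loop
        asyncio.set_event_loop(loop)
        loop.run_until_complete(asyncio.gather(*tasks))
        loop.close()

tree = []
crawl = DemoCrawlingThread("https://python.org/", tree)
crawl.start()
crawl.join()
print(tree)                                  # ['https://python.org/']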
@@ -220,39 +220,54 @@ async def async_print(url):
         url,
         datetime.now() - startup_time))

-async def async_crawler(url, queue):
-    queued = [url]
-    crawled = []
+class CrawlElem:
+    ''' Describes a crawled element, to be assembled into a tree '''
+
+    def __init__(self, url, parent):
+        self.url = url
+        self.parent = parent
+
+
+async def async_crawler(url, output_tree):
+    queued = [CrawlElem(url, None)]
+    crawled = set()
+    crawl_tree = []

     while queued and (len(crawled) < HARD_LIMIT):
         async with aiohttp.ClientSession() as session:
             try:
-                url = queued.pop(0)
+                crawl_elt = queued.pop(0)
+                url = crawl_elt.url
             except IndexError:
                 print("Error queue is empty")
                 return crawled
+            crawled.add(url)
             parsed_url = urlparse(url)
             print("Crawling {}".format(url))
             html = await PageGetter(session, url).get(ssl=False)
             if html:
+                crawl_tree.append(crawl_elt)
                 new_urls = url_getter(
                     html,
                     url,
                     parsed_url.scheme + "://" + parsed_url.netloc
                 )
-                crawled += [url]
                 if new_urls:
                     sampled = sample(
                         new_urls,
                         randrange(min(MAX_PER_PAGE, len(new_urls)))
                     )
-                    queued += [sample_url for sample_url in sampled if
-                               sample_url not in queued and sample_url not in
-                               crawled]
+                    queued += [
+                        CrawlElem(sample_url, crawl_elt)
+                        for sample_url in sampled
+                        if sample_url not in queued
+                        and sample_url not in crawled
+                    ]
         print(crawled)
-    queue.put(crawled)
+    output_tree += crawl_tree

 if __name__ == '__main__':
-    queue = Queue()
-    crawl = CrawlingThread(None, "https://python.org/", queue)
+    crawl_tree = []
+    crawl = CrawlingThread(None, "https://python.org/", crawl_tree)
     crawl.start()
     crawl.join()
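After join(), crawl_tree holds a flat list of CrawlElem objects linked through their parent attribute. A sketch of how a nested tree could be rebuilt from that flat list; the build_tree helper is illustrative and not part of this commit:

from collections import defaultdict

def build_tree(crawl_elems):
    # Group each element under its parent, then expand recursively from the roots.
    children = defaultdict(list)
    roots = []
    for elem in crawl_elems:
        if elem.parent is None:
            roots.append(elem)
        else:
            children[elem.parent].append(elem)

    def expand(elem):
        return (elem.url, [expand(child) for child in children[elem]])

    return [expand(root) for root in roots]

# e.g. print(build_tree(crawl_tree)) after crawl.join() above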