Add a timeout to a single page retrieval

This commit is contained in:
Théophile Bastian 2018-02-26 15:42:36 +01:00
parent e140d4a8a7
commit 67ad232533
1 changed file with 5 additions and 1 deletion

@@ -289,7 +289,11 @@ async def async_crawler(url, out_tree, crawled, user_agent, session, parent):
     crawled.add(simplify_url(url))
     parsed_url = urlparse(url)
     print("Crawling {}".format(url))
-    html = await PageGetter(session, url, user_agent).get(ssl=False)
+    try:
+        with async_timeout.timeout(3):
+            html = await PageGetter(session, url, user_agent).get(ssl=False)
+    except asyncio.TimeoutError:
+        return
     new_tasks = []
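
For context, here is a minimal, self-contained sketch of the same timeout pattern outside the crawler. It is not the project's actual code: slow_fetch and the 3-second budget are illustrative stand-ins for PageGetter(session, url, user_agent).get(ssl=False). Note that recent versions of async_timeout require "async with"; the plain "with" form used in the commit above was accepted by the library as of 2018.

    import asyncio
    import async_timeout

    async def fetch_with_timeout(url, timeout_s=3):
        # Stand-in for the real page retrieval; simulates a page that
        # takes longer than the timeout budget to respond.
        async def slow_fetch():
            await asyncio.sleep(10)
            return "<html>...</html>"

        try:
            async with async_timeout.timeout(timeout_s):
                return await slow_fetch()
        except asyncio.TimeoutError:
            # Mirror the commit: give up on this page and move on,
            # rather than stalling the whole crawl on one slow host.
            return None

    if __name__ == "__main__":
        print(asyncio.run(fetch_with_timeout("https://example.com")))

Returning early on asyncio.TimeoutError means a slow or unresponsive page is simply skipped, so one bad host can no longer block the crawler's task queue indefinitely.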