From 67ad232533c9d2301d45d68ac696ad82a5545a4a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Th=C3=A9ophile=20Bastian?=
Date: Mon, 26 Feb 2018 15:42:36 +0100
Subject: [PATCH] Add a timeout to a single page retrieval

---
 crawl/crawl.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/crawl/crawl.py b/crawl/crawl.py
index 28564f8..8754530 100644
--- a/crawl/crawl.py
+++ b/crawl/crawl.py
@@ -289,7 +289,11 @@ async def async_crawler(url, out_tree, crawled, user_agent, session, parent):
     crawled.add(simplify_url(url))
     parsed_url = urlparse(url)
     print("Crawling {}".format(url))
-    html = await PageGetter(session, url, user_agent).get(ssl=False)
+    try:
+        with async_timeout.timeout(3):
+            html = await PageGetter(session, url, user_agent).get(ssl=False)
+    except asyncio.TimeoutError:
+        return
 
     new_tasks = []
 