Add a timeout to a single page retrieval

This commit is contained in:
Théophile Bastian 2018-02-26 15:42:36 +01:00
parent e140d4a8a7
commit 67ad232533
1 changed file with 5 additions and 1 deletion

@@ -289,7 +289,11 @@ async def async_crawler(url, out_tree, crawled, user_agent, session, parent):
     crawled.add(simplify_url(url))
     parsed_url = urlparse(url)
     print("Crawling {}".format(url))
-    html = await PageGetter(session, url, user_agent).get(ssl=False)
+    try:
+        with async_timeout.timeout(3):
+            html = await PageGetter(session, url, user_agent).get(ssl=False)
+    except asyncio.TimeoutError:
+        return
     new_tasks = []
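
For context, here is a minimal, self-contained sketch of the same timeout pattern outside the crawler. It is not the project's actual code: slow_fetch and the 3-second budget are illustrative stand-ins for PageGetter(session, url, user_agent).get(ssl=False). Note that recent versions of async_timeout require "async with"; the plain "with" form used in the commit above was accepted by the library as of 2018.

    import asyncio
    import async_timeout

    async def fetch_with_timeout(url, timeout_s=3):
        # Stand-in for the real page retrieval; simulates a page that
        # takes longer than the timeout budget to respond.
        async def slow_fetch():
            await asyncio.sleep(10)
            return "<html>...</html>"

        try:
            async with async_timeout.timeout(timeout_s):
                return await slow_fetch()
        except asyncio.TimeoutError:
            # Mirror the commit: give up on this page and move on,
            # rather than stalling the whole crawl on one slow host.
            return None

    if __name__ == "__main__":
        print(asyncio.run(fetch_with_timeout("https://example.com")))

Returning early on asyncio.TimeoutError means a slow or unresponsive page is simply skipped, so one bad host can no longer block the crawler's task queue indefinitely.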