From 67ad232533c9d2301d45d68ac696ad82a5545a4a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Th=C3=A9ophile=20Bastian?= <contact@tobast.fr>
Date: Mon, 26 Feb 2018 15:42:36 +0100
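Subject: [PATCH] Add a timeout to a single page retrieval

Wrap the page fetch in async_crawler in async_timeout.timeout(3).
When the fetch raises asyncio.TimeoutError, return from the crawler
without processing that page. The URL has already been added to
`crawled` at that point, so it is not fetched again.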
---
 crawl/crawl.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/crawl/crawl.py b/crawl/crawl.py
index 28564f8..8754530 100644
--- a/crawl/crawl.py
+++ b/crawl/crawl.py
@@ -289,7 +289,11 @@ async def async_crawler(url, out_tree, crawled, user_agent, session, parent):
     crawled.add(simplify_url(url))
     parsed_url = urlparse(url)
     print("Crawling {}".format(url))
-    html = await PageGetter(session, url, user_agent).get(ssl=False)
+    try:
+        with async_timeout.timeout(3):
+            html = await PageGetter(session, url, user_agent).get(ssl=False)
+    except asyncio.TimeoutError:
+        return
 
     new_tasks = []