From 02e91bb2b744a67e62e812dfeeea4d1a541fc121 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Th=C3=A9ophile=20Bastian?=
Date: Mon, 26 Feb 2018 11:56:02 +0100
Subject: [PATCH] Fix function calls

---
 crawl/crawl.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/crawl/crawl.py b/crawl/crawl.py
index 9d266f3..10c7a53 100644
--- a/crawl/crawl.py
+++ b/crawl/crawl.py
@@ -234,7 +234,8 @@ class PageGetter:
 async def async_print(url):
     """ Debug function to follow what's actually happening """
     async with aiohttp.ClientSession() as session:
-        html = await PageGetter(session, url).get(ssl=False)
+        html = await PageGetter(session, url,
+                                settings.USER_AGENT).get(ssl=False)
 
     print('GOT {}HTML for {}'.format(
         'None ' if html is None else '',
@@ -244,9 +245,11 @@ async def async_print(url):
 
 async def async_crawler(url, queue, headers=None):
     if headers is None:
-        headers = {
-            'User-Agent': settings.USER_AGENT,
-        }
+        headers = {}
+    if 'User-Agent' not in headers:
+        headers['User-Agent'] = settings.USER_AGENT
+
+    user_agent = headers['User-Agent']
 
     queued = [url]
     crawled = []
@@ -259,7 +262,7 @@ async def async_crawler(url, queue, headers=None):
             return crawled
         parsed_url = urlparse(url)
         print("Crawling {}".format(url))
-        html = await PageGetter(session, url).get(ssl=False)
+        html = await PageGetter(session, url, user_agent).get(ssl=False)
         if html:
            new_urls = url_getter(
                html,
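
Below is a minimal sketch of the calling convention this patch settles on: PageGetter now receives the user agent explicitly as a third argument, and async_crawler normalizes its headers dict before extracting it. The fetch_html and normalize_headers helper names are hypothetical, and the PageGetter import path is assumed from the diff header; only the header-normalization logic and the three-argument PageGetter call come from the patch itself.

import aiohttp

import settings                     # project module providing USER_AGENT, per the diff
from crawl.crawl import PageGetter  # import path assumed from the diff header


def normalize_headers(headers=None):
    """Mirror async_crawler's new header handling: always return a dict
    carrying a User-Agent, defaulting to settings.USER_AGENT."""
    if headers is None:
        headers = {}
    if 'User-Agent' not in headers:
        headers['User-Agent'] = settings.USER_AGENT
    return headers


async def fetch_html(url, headers=None):
    """Hypothetical helper showing the patched call style."""
    headers = normalize_headers(headers)
    async with aiohttp.ClientSession() as session:
        # The user agent is now passed explicitly to PageGetter,
        # matching the three-argument calls the patch introduces.
        return await PageGetter(session, url,
                                headers['User-Agent']).get(ssl=False)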