Exception handling

Big problem with the URL https://plus.google.com/+Python concerning
robots.txt parsing.
I didn't find the bug. @tobast, could you take a look if you have some time? :)
Rémi Oudin 2018-02-23 00:37:36 +01:00
parent 77ca7ebcb9
commit 0e02f22d08
1 changed file with 69 additions and 44 deletions
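
To make the failure easier to pin down, here is a minimal, standalone probe (not part of this commit; the function name and the 'Blah' user agent are placeholders) that reads robots.txt with RobotFileParser the same way WebsiteScheduler now does, trying https first and falling back to plain http when URLError or CertificateError is raised:

# Standalone probe (assumption: not part of this commit) for the robots.txt
# failure described above. Requires Python >= 3.6 for crawl_delay/request_rate.
from urllib.robotparser import RobotFileParser
from urllib.error import URLError
from ssl import CertificateError

def probe_robots(host, user_agent='Blah'):
    """Try https://<host>/robots.txt, then fall back to plain http."""
    for scheme in ('https', 'http'):
        robots_url = '{}://{}/robots.txt'.format(scheme, host)
        parser = RobotFileParser(robots_url)
        try:
            parser.read()
        except (URLError, CertificateError) as exc:
            print('{} failed: {!r}'.format(robots_url, exc))
            continue
        print('{} ok, crawl_delay={}, request_rate={}'.format(
            robots_url,
            parser.crawl_delay(user_agent),
            parser.request_rate(user_agent)))
        return parser
    return None  # both schemes failed: the site is probably dead

probe_robots('plus.google.com')

Running it against plus.google.com should surface the exception that the new try/except in WebsiteScheduler.__init__ is meant to absorb.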


@@ -1,7 +1,9 @@
 from threading import Thread
 from urllib.robotparser import RobotFileParser
+from urllib.error import URLError
 from urllib.parse import urlparse
+from ssl import CertificateError
 from random import sample, randrange
 import re
 from datetime import datetime, timedelta
@@ -24,7 +26,7 @@ MAX_PER_PAGE = 10
 FOOTER_URL = re.compile(".*footer.*")

 class Settings:
-    USER_AGENT = 'BlahBlah'
+    USER_AGENT = 'Blah'

 settings = Settings()
 startup_time = datetime.now()
@@ -35,6 +37,8 @@ def url_getter(html, current_page, root_url):
     soup = BeautifulSoup(html, "html.parser")
     # Get only the body
     body = soup.find('body')
+    if not body:
+        return links_list
     # remove the body
     if body.footer:
         body.footer.decompose()
@@ -43,22 +47,22 @@ def url_getter(html, current_page, root_url):

     for comment in comments:
         comment.extract()
-    print("Retrieving footers")
     footers = soup.findAll(id=FOOTER_URL)
     for footer in footers:
         footer.extract()

     # Remove all bookmark links pointing to the current html page.
-    links = map(lambda link: link["href"], body.find_all("a"))
+    links = map(lambda link: link.get("href", ""), body.find_all("a"))
     for link in links:
-        if link.startswith("http"):
-            links_list.append(link)
-        elif link.startswith('/'): #Internal link, linking to page root url
-            links_list.append(root_url + link)
-        elif link.startswith("#"):
-            print("Invalid link : internal bookmark")
-        else:
-            links_list.append(current_page + link)
+        if link: #Edge case, if no href found.
+            if link.startswith("http"):
+                links_list.append(link)
+            elif link.startswith('/'): #Internal link, linking to page root url
+                links_list.append(root_url + link)
+            elif link.startswith("#"):
+                print("Invalid link : internal bookmark")
+            else:
+                links_list.append(current_page + "/" + link)

     ## uniqifier works with python <= 3.6
     #seen = set()
@@ -66,7 +70,6 @@ def url_getter(html, current_page, root_url):
     # uniqifier
     # Works only with python >= 3.6
     links_list = list(dict.fromkeys(links_list))
-    print(links_list)

     return links_list

@@ -98,23 +101,39 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
     def __init__(self, name):
         self.name = name
         self.last_crawled = datetime.fromtimestamp(0)
-        robots_url = self.urlroot() + 'robots.txt'
-        self.robot_parser = RobotFileParser(robots_url)
-        self.robot_parser.read() # TODO async?
-
-        delay = self.robot_parser.crawl_delay(settings.USER_AGENT)
-        if delay is None:
-            req_rate = self.robot_parser.request_rate(settings.USER_AGENT)
-            if req_rate is None:
-                delay = 5
-            else:
-                delay = req_rate.requests, req_rate.seconds
-        self.crawl_delay = timedelta(seconds=delay)
+        self.dead = False
+        try:
+            robots_url = self.urlroot() + 'robots.txt'
+            self.robot_parser = RobotFileParser(robots_url)
+            self.robot_parser.read() # TODO async?
+        except (URLError, CertificateError):
+            try:
+                robots_url = self.unsafe_urlroot() + 'robots.txt'
+                self.robot_parser = RobotFileParser(robots_url)
+                self.robot_parser.read()
+            except URLError: # Almost surely an offline website.
+                self.dead = True
+                self.crawl_delay = 0
+        except Exception as e:
+            print(e)
+            raise e
+        if not self.dead:
+            delay = self.robot_parser.crawl_delay(settings.USER_AGENT)
+            if delay is None:
+                req_rate = self.robot_parser.request_rate(settings.USER_AGENT)
+                if req_rate is None:
+                    delay = 5
+                else:
+                    delay = req_rate.requests, req_rate.seconds
+            self.crawl_delay = timedelta(seconds=delay)

     def urlroot(self):
         ''' Get the root url for this website '''
         return 'https://{}/'.format(self.name)

+    def unsafe_urlroot(self):
+        return 'http://{}/'.format(self.name)
+
     def fetch_delay(self):
         ''' Get the delay needed before fetching a page is possible '''
         can_fetch_time = self.last_crawled + self.crawl_delay
@@ -124,7 +143,7 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):

     def can_fetch(self, url):
         ''' Check whether this program can fetch a given page '''
-        return self.robot_parser.can_fetch(settings.USER_AGENT, url)
+        return (not self.dead) and self.robot_parser.can_fetch(settings.USER_AGENT, url)

     def fetching(self):
         ''' Tell the scheduler that a page is being fetched now '''
@@ -140,8 +159,8 @@ class CrawlingThread(Thread):

     def run(self):
         tasks = []
-        tasks.append(async_crawler("https://python.org/"))
-        #tasks.append(async_print('https://python.org/about/gettingstarted'))
+        #tasks.append(async_crawler("http://plus.google.com/+Python"))
+        tasks.append(async_print('https://python.org/'))

         loop = asyncio.new_event_loop()
         asyncio.set_event_loop(loop)
@@ -156,7 +175,7 @@ class PageGetter:
         self.url = url
         self.session = session

-    async def get(self):
+    async def get(self, ssl=True):
         """ Actually retrieve the webpage """
         scheduler = WebsiteScheduler(self.url)
         if not scheduler.can_fetch(self.url):
@@ -168,14 +187,17 @@ class PageGetter:
             delay = scheduler.fetch_delay()
         scheduler.fetching()
         async with async_timeout.timeout(10):
-            async with self.session.get(self.url) as resp:
-                return await resp.text()
+            async with self.session.get(self.url, ssl=ssl) as resp:
+                try:
+                    return await resp.text()
+                except UnicodeDecodeError:
+                    return None


 async def async_print(url):
     """ Debug function to follow what's actually happening """
     async with aiohttp.ClientSession() as session:
-        html = await PageGetter(session, url).get()
+        html = await PageGetter(session, url).get(ssl=False)

         print('GOT {}HTML for {} at {}'.format(
             'None ' if html is None else '',
@@ -194,19 +216,22 @@ async def async_crawler(url):
             return crawled
         parsed_url = urlparse(url)
         print("Crawling {}".format(url))
-        html = await PageGetter(session, url).get()
-        new_urls = url_getter(
-            html,
-            url,
-            parsed_url.scheme + "://" + parsed_url.netloc
-        )
-        crawled += [url]
-        sampled = sample(
-            new_urls,
-            randrange(min(MAX_PER_PAGE, len(new_urls)))
-        )
-        queue += [sample_url for sample_url in sampled if sample_url not in
-            queue and sample_url not in crawled]
+        html = await PageGetter(session, url).get(ssl=False)
+        if html:
+            new_urls = url_getter(
+                html,
+                url,
+                parsed_url.scheme + "://" + parsed_url.netloc
+            )
+            crawled += [url]
+            if new_urls:
+                sampled = sample(
+                    new_urls,
+                    randrange(min(MAX_PER_PAGE, len(new_urls)))
+                )
+                queue += [sample_url for sample_url in sampled if
+                    sample_url not in queue and sample_url not in
+                    crawled]
     print(crawled)

 if __name__ == '__main__':
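
For completeness, a small driver of the changed PageGetter path is sketched below. It is not part of the commit; it assumes it runs in the same module as the classes in this diff, and it only illustrates how the new ssl flag and a None result (for example a page whose body could not be decoded) are meant to be consumed.

# Hypothetical driver, not in this commit: exercises PageGetter.get(ssl=False)
# and treats a None result as "skip this page" rather than an error.
import asyncio
import aiohttp

async def fetch_once(url):
    async with aiohttp.ClientSession() as session:
        html = await PageGetter(session, url).get(ssl=False)
        if html is None:
            print("Skipped {} (disallowed, dead, or undecodable)".format(url))
        else:
            print("Fetched {} ({} characters)".format(url, len(html)))

loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
loop.run_until_complete(fetch_once('https://python.org/'))
loop.close()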