Exception handling

Big problem with the URL https://plus.google.com/+Python concerning
robots.txt parsing.
I didn't find the bug. @tobast, if you have some time, please take a look at it :)
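For investigation, here is a minimal standalone sketch that fetches and parses robots.txt for that host outside the crawler. The helper name check_robots is hypothetical, and the assumption that the failure originates in RobotFileParser.read() is exactly what still needs to be confirmed.

# Hypothetical standalone check, independent of the crawler classes in the diff below.
# Assumption: the failure originates in RobotFileParser.read() for this host.
from urllib.robotparser import RobotFileParser
from urllib.error import URLError
from ssl import CertificateError

def check_robots(root_url, user_agent='Blah'):
    parser = RobotFileParser(root_url + 'robots.txt')
    try:
        parser.read()
    except (URLError, CertificateError) as exc:
        print("robots.txt fetch failed for {}: {}".format(root_url, exc))
        return None
    print("crawl_delay:", parser.crawl_delay(user_agent))
    print("request_rate:", parser.request_rate(user_agent))
    print("can_fetch root:", parser.can_fetch(user_agent, root_url))
    return parser

if __name__ == '__main__':
    check_robots('https://plus.google.com/')

If the call fails with an exception here as well, the traceback should point at the underlying cause.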
Rémi Oudin 2018-02-23 00:37:36 +01:00
parent 77ca7ebcb9
commit 0e02f22d08
1 changed file with 69 additions and 44 deletions


@@ -1,7 +1,9 @@
from threading import Thread
from urllib.robotparser import RobotFileParser
from urllib.error import URLError
from urllib.parse import urlparse
from ssl import CertificateError
from random import sample, randrange
import re
from datetime import datetime, timedelta
@@ -24,7 +26,7 @@ MAX_PER_PAGE = 10
FOOTER_URL = re.compile(".*footer.*")
class Settings:
USER_AGENT = 'BlahBlah'
USER_AGENT = 'Blah'
settings = Settings()
startup_time = datetime.now()
@@ -35,6 +37,8 @@ def url_getter(html, current_page, root_url):
soup = BeautifulSoup(html, "html.parser")
# Get only the body
body = soup.find('body')
if not body:
return links_list
# Remove the footer from the body
if body.footer:
body.footer.decompose()
@@ -43,22 +47,22 @@ def url_getter(html, current_page, root_url):
for comment in comments:
comment.extract()
print("Retrieving footers")
footers = soup.findAll(id=FOOTER_URL)
for footer in footers:
footer.extract()
# Remove all bookmark links pointing to the current html page.
links = map(lambda link: link["href"], body.find_all("a"))
links = map(lambda link: link.get("href", ""), body.find_all("a"))
for link in links:
if link.startswith("http"):
links_list.append(link)
elif link.startswith('/'): #Internal link, linking to page root url
links_list.append(root_url + link)
elif link.startswith("#"):
print("Invalid link : internal bookmark")
else:
links_list.append(current_page + link)
if link: #Edge case, if no href found.
if link.startswith("http"):
links_list.append(link)
elif link.startswith('/'): #Internal link, linking to page root url
links_list.append(root_url + link)
elif link.startswith("#"):
print("Invalid link : internal bookmark")
else:
links_list.append(current_page + "/" + link)
## uniqifier works with python <= 3.6
#seen = set()
@@ -66,7 +70,6 @@ def url_getter(html, current_page, root_url):
# uniqifier
# Works only with python >= 3.6
links_list = list(dict.fromkeys(links_list))
print(links_list)
return links_list
@@ -98,23 +101,39 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
def __init__(self, name):
self.name = name
self.last_crawled = datetime.fromtimestamp(0)
robots_url = self.urlroot() + 'robots.txt'
self.robot_parser = RobotFileParser(robots_url)
self.robot_parser.read() # TODO async?
delay = self.robot_parser.crawl_delay(settings.USER_AGENT)
if delay is None:
req_rate = self.robot_parser.request_rate(settings.USER_AGENT)
if req_rate is None:
delay = 5
else:
delay = req_rate.requests, req_rate.seconds
self.crawl_delay = timedelta(seconds=delay)
self.dead = False
try:
robots_url = self.urlroot() + 'robots.txt'
self.robot_parser = RobotFileParser(robots_url)
self.robot_parser.read() # TODO async?
except (URLError, CertificateError):
try:
robots_url = self.unsafe_urlroot() + 'robots.txt'
self.robot_parser = RobotFileParser(robots_url)
self.robot_parser.read()
except URLError: # Almost surely an offline website.
self.dead = True
self.crawl_delay = 0
except Exception as e:
print(e)
raise e
if not self.dead:
delay = self.robot_parser.crawl_delay(settings.USER_AGENT)
if delay is None:
req_rate = self.robot_parser.request_rate(settings.USER_AGENT)
if req_rate is None:
delay = 5
else:
delay = req_rate.requests, req_rate.seconds
self.crawl_delay = timedelta(seconds=delay)
def urlroot(self):
''' Get the root url for this website '''
return 'https://{}/'.format(self.name)
def unsafe_urlroot(self):
return 'http://{}/'.format(self.name)
def fetch_delay(self):
''' Get the delay needed before fetching a page is possible '''
can_fetch_time = self.last_crawled + self.crawl_delay
@@ -124,7 +143,7 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
def can_fetch(self, url):
''' Check whether this program can fetch a given page '''
return self.robot_parser.can_fetch(settings.USER_AGENT, url)
return (not self.dead) and self.robot_parser.can_fetch(settings.USER_AGENT, url)
def fetching(self):
''' Tell the scheduler that a page is being fetched now '''
@@ -140,8 +159,8 @@ class CrawlingThread(Thread):
def run(self):
tasks = []
tasks.append(async_crawler("https://python.org/"))
#tasks.append(async_print('https://python.org/about/gettingstarted'))
#tasks.append(async_crawler("http://plus.google.com/+Python"))
tasks.append(async_print('https://python.org/'))
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
@@ -156,7 +175,7 @@ class PageGetter:
self.url = url
self.session = session
async def get(self):
async def get(self, ssl=True):
""" Actually retrieve the webpage """
scheduler = WebsiteScheduler(self.url)
if not scheduler.can_fetch(self.url):
@@ -168,14 +187,17 @@ class PageGetter:
delay = scheduler.fetch_delay()
scheduler.fetching()
async with async_timeout.timeout(10):
async with self.session.get(self.url) as resp:
return await resp.text()
async with self.session.get(self.url, ssl=ssl) as resp:
try:
return await resp.text()
except UnicodeDecodeError:
return None
async def async_print(url):
""" Debug function to follow what's actually happening """
async with aiohttp.ClientSession() as session:
html = await PageGetter(session, url).get()
html = await PageGetter(session, url).get(ssl=False)
print('GOT {}HTML for {} at {}'.format(
'None ' if html is None else '',
@@ -194,19 +216,22 @@ async def async_crawler(url):
return crawled
parsed_url = urlparse(url)
print("Crawling {}".format(url))
html = await PageGetter(session, url).get()
new_urls = url_getter(
html,
url,
parsed_url.scheme + "://" + parsed_url.netloc
)
crawled += [url]
sampled = sample(
new_urls,
randrange(min(MAX_PER_PAGE, len(new_urls)))
)
queue += [sample_url for sample_url in sampled if sample_url not in
queue and sample_url not in crawled]
html = await PageGetter(session, url).get(ssl=False)
if html:
new_urls = url_getter(
html,
url,
parsed_url.scheme + "://" + parsed_url.netloc
)
crawled += [url]
if new_urls:
sampled = sample(
new_urls,
randrange(min(MAX_PER_PAGE, len(new_urls)))
)
queue += [sample_url for sample_url in sampled if
sample_url not in queue and sample_url not in
crawled]
print(crawled)
if __name__ == '__main__':