# mpri-webdam/crawl/crawl.py


from threading import Thread
from urllib.robotparser import RobotFileParser
from urllib.error import URLError
from urllib.parse import urlparse
from ssl import CertificateError
from random import sample, randrange, randint
import re
from datetime import datetime, timedelta
import asyncio
import aiohttp
import async_timeout
from bs4 import BeautifulSoup, Comment
from profiles.models import BrowserFingerprint, SearchEngine
# Ugly hack to use this module alone instead of integrating it with Django
# from django.conf import settings

# Crawling limits: stop after HARD_LIMIT pages in total, and follow at most
# MAX_PER_PAGE links from any single page.
HARD_LIMIT = 20
MAX_PER_PAGE = 10

FOOTER_URL = re.compile(".*footer.*")


class Settings:
    USER_AGENT = 'Default User'


settings = Settings()


def url_getter(html, current_page, root_url):
    """ Extract from `html` the list of links worth following """
    links_list = []  # The final result

    soup = BeautifulSoup(html, "html.parser")
    # Get only the body
    body = soup.find('body')
    if not body:
        return links_list

    # Remove the footer, if any
    if body.footer:
        body.footer.decompose()

    # Remove all comments
    comments = soup.findAll(text=lambda text: isinstance(text, Comment))
    for comment in comments:
        comment.extract()

    # Remove every element whose id looks like a footer
    footers = soup.findAll(id=FOOTER_URL)
    for footer in footers:
        footer.extract()

    # Collect the href of every remaining link
    links = map(lambda link: link.get("href", ""), body.find_all("a"))

    for link in links:
        if link:  # Edge case: no href found.
            if link.startswith("http"):
                links_list.append(link)
            elif link.startswith('/'):  # Internal link, relative to the site root
                links_list.append(root_url + link)
            elif link.startswith("#"):
                # Bookmark link pointing inside the current page: do not follow it.
                continue
            else:
                links_list.append(current_page + "/" + link)

    # Uniquifier: dict.fromkeys keeps insertion order (Python >= 3.6 only).
    # For older Pythons, use instead:
    #   seen = set()
    #   links_list = [x for x in links_list if x not in seen and not seen.add(x)]
    links_list = list(dict.fromkeys(links_list))

    # Drop links that are almost surely not worth crawling
    forbidden_words = ['login', 'agreement', 'mailto', 'settings']
    links_list = [link for link in links_list
                  if not any(word in link.lower()
                             for word in forbidden_words)]

    return links_list
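
# A small sketch of what url_getter() produces; the page url and markup below
# are illustrative only:
#
#   html = ('<body><a href="/about">a</a><a href="#top">b</a>'
#           '<a href="http://other.org/">c</a></body>')
#   url_getter(html, "http://example.com/page", "http://example.com")
#   # -> ['http://example.com/about', 'http://other.org/']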


class WebsiteSchedulerMeta(type):
    """ Meta-class for WebsiteScheduler, allowing a singleton class-like
    interface, but spawning one instance per canonical website URL """

    _instances = {}
    _canonicalize = re.compile(r'(https?://)?([^/]+)(/?|$)')

    def canonical_url(cls, url):
        """ Canonicalize a url """
        return cls._canonicalize.search(url).groups()[1]

    def __call__(cls, url, *args, **kwargs):
        canonical = cls.canonical_url(url)
        if canonical not in cls._instances:
            cls._instances[canonical] = \
                super(WebsiteSchedulerMeta, cls) \
                .__call__(canonical, *args, **kwargs)
        return cls._instances[canonical]
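
# Illustration of the metaclass behaviour (hypothetical urls): two pages on
# the same host share a single scheduler instance, so robots.txt rules and
# crawl delays are enforced per website rather than per page.
#
#   a = WebsiteScheduler('https://example.com/foo', 'SomeBot')
#   b = WebsiteScheduler('https://example.com/bar', 'SomeBot')
#   assert a is b   # both canonicalize to 'example.com'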


class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
    """ Schedule the accesses to a website as of robots.txt """

    search_engines = []  # Must be set by CrawlingThread.__init__

    def __init__(self, name, user_agent):
        self.name = name
        self.last_crawled = datetime.fromtimestamp(0)
        self.dead = False
        self.can_fetch_b = False
        self.user_agent = (user_agent if user_agent is not None
                           else settings.USER_AGENT)

        if any(self.urlroot() in item for item in self.search_engines):
            print("found a search engine for %s" % self.urlroot())
            self.crawl_delay = timedelta(seconds=5)
            self.can_fetch_b = True
        else:
            try:
                robots_url = self.urlroot() + 'robots.txt'
                self.robot_parser = RobotFileParser(robots_url)
                self.robot_parser.read()  # TODO async?
            except (URLError, CertificateError):
                try:
                    robots_url = self.unsafe_urlroot() + 'robots.txt'
                    self.robot_parser = RobotFileParser(robots_url)
                    self.robot_parser.read()
                except URLError:  # Almost surely an offline website.
                    self.dead = True
                    self.crawl_delay = timedelta(0)
            except Exception as e:
                print(e)
                raise e

            if not self.robot_parser.default_entry:
                self.dead = True
            if not self.dead:
                delay = self.robot_parser.crawl_delay(self.user_agent)
                if delay is None:
                    req_rate = self.robot_parser.request_rate(self.user_agent)
                    if req_rate is None:
                        delay = 5
                    else:
                        # One request every (seconds / requests) seconds,
                        # not a (requests, seconds) tuple.
                        delay = req_rate.seconds / req_rate.requests
                self.crawl_delay = timedelta(seconds=delay)

    def urlroot(self):
        ''' Get the root url for this website '''
        return 'https://{}/'.format(self.name)

    def unsafe_urlroot(self):
        ''' Get the root url for this website, over plain http '''
        return 'http://{}/'.format(self.name)

    def fetch_delay(self):
        ''' Get the delay needed before fetching a page is possible '''
        can_fetch_time = self.last_crawled + self.crawl_delay
        if can_fetch_time < datetime.now():
            return timedelta(0)
        return can_fetch_time - datetime.now()

    def can_fetch(self, url):
        ''' Check whether this program can fetch a given page '''
        return ((self.can_fetch_b)
                or ((not self.dead) and
                    self.robot_parser.can_fetch(self.user_agent, url)))

    def fetching(self):
        ''' Tell the scheduler that a page is being fetched now '''
        self.last_crawled = datetime.now()
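
# Worked example of the delay computation in __init__ above: if robots.txt
# declares "Request-rate: 2/10" and no Crawl-delay, request_rate() returns a
# named tuple with requests=2 and seconds=10, so the per-request delay is
# 10 / 2 = 5 seconds and crawl_delay becomes timedelta(seconds=5).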


class CrawlingThread(Thread):
    """ A separate thread for the crawling task. This is needed to use asyncio,
    since the thread will need its own event loop. """

    def __init__(self, url):
        engine_list = [engine.url for engine in SearchEngine.objects.all()]
        WebsiteScheduler.search_engines = engine_list

        nb_fingerprint = len(BrowserFingerprint.objects.all())
        fingerprint = BrowserFingerprint.objects.all()[
            randint(0, nb_fingerprint - 1)]
        self.headers = fingerprint.serialize_headers()

        self.output_tree = []

        super(CrawlingThread, self).__init__()
        self.url = url

    def run(self):
        tasks = []
        #tasks.append(async_crawler("http://plus.google.com/+Python"))
        #tasks.append(async_crawler('https://python.org/'))
        tasks.append(run_crawl(self.url, self.output_tree, self.headers))

        # Create the loop outside the try block, so that `loop` is always
        # defined when the finally clause runs.
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        try:
            loop.run_until_complete(asyncio.wait(tasks))
        finally:
            loop.close()
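
# Typical usage sketch, assuming the Django ORM is configured and the
# SearchEngine and BrowserFingerprint tables each contain at least one row
# (the url is illustrative):
#
#   crawler = CrawlingThread('https://python.org/')
#   crawler.start()
#   crawler.join()
#   tree = crawler.output_tree   # flat list of CrawlElem, linked via .parent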


class PageGetter:
    """ Asynchronously get a webpage, abiding by robots.txt """

    headers = None

    def __init__(self, session, url, user_agent):
        self.url = url
        self.session = session
        self.user_agent = user_agent

    async def get(self, ssl=True):
        """ Actually retrieve the webpage """
        scheduler = WebsiteScheduler(self.url, self.user_agent)
        if not scheduler.can_fetch(self.url):
            return None

        delay = scheduler.fetch_delay()
        while delay > timedelta(0):
            await asyncio.sleep(delay.total_seconds())
            delay = scheduler.fetch_delay()
        scheduler.fetching()

        async with async_timeout.timeout(10):
            async with self.session.get(self.url, verify_ssl=ssl) as resp:
                try:
                    return await resp.text()
                except UnicodeDecodeError:
                    return None


async def async_print(url):
    """ Debug function to follow what's actually happening """
    async with aiohttp.ClientSession() as session:
        html = await PageGetter(session, url,
                                settings.USER_AGENT).get(ssl=False)

    print('GOT {}HTML for {}'.format(
        'None ' if html is None else '',
        url,
    ))


class CrawlElem:
    ''' Describes a crawled element, to be assembled into a tree '''

    def __init__(self, url, parent):
        self.url = url
        self.parent = parent
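

# A minimal sketch (not used elsewhere in this module, purely an illustration)
# of how the flat output_tree list of CrawlElem can be assembled into the tree
# promised by the docstring above: children are grouped under their parent.
def build_tree(crawl_elems):
    ''' Group CrawlElem instances by parent; the None key holds the roots '''
    children = {}
    for elem in crawl_elems:
        children.setdefault(elem.parent, []).append(elem)
    return children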


async def run_crawl(url, output_tree, headers=None):
    ''' Starts a crawling session '''
    if headers is None:
        headers = {}
    if 'User-Agent' not in headers:
        headers['User-Agent'] = settings.USER_AGENT
    user_agent = headers['User-Agent']

    crawled = set()

    async with aiohttp.ClientSession(headers=headers) as session:
        await async_crawler(
            url, output_tree, crawled, user_agent, session, None)


def simplify_url(url):
    ''' Strip the anchor, the scheme and a leading www. from a url, so that
    trivially equivalent urls compare equal when deduplicating '''
    anchor = url.find('#')
    if anchor >= 0:
        url = url[:anchor]
    prot = url.find('://')
    if prot >= 0:
        url = url[prot+3:]
    if url.startswith('www.'):
        url = url[4:]
    return url
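
# For instance (an illustrative url, not one taken from a crawl):
#
#   simplify_url('https://www.example.com/page#section')
#   # -> 'example.com/page'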


async def async_crawler(url, out_tree, crawled, user_agent, session, parent):
    if len(crawled) >= HARD_LIMIT:
        return
    crawled.add(simplify_url(url))
    parsed_url = urlparse(url)
    print("Crawling {}".format(url))
    try:
        async with async_timeout.timeout(3):
            html = await PageGetter(session, url, user_agent).get(ssl=False)
    except asyncio.TimeoutError:
        return

    new_tasks = []
    if html:
        this_elem = CrawlElem(url, parent)
        out_tree.append(this_elem)
        new_urls = url_getter(
            html,
            url,
            parsed_url.scheme + "://" + parsed_url.netloc
        )
        if new_urls:
            # Follow a random subset of the links found on this page
            sampled = sample(
                new_urls,
                randrange(min(MAX_PER_PAGE, len(new_urls)))
            )
            for sample_url in sampled:
                if simplify_url(sample_url) not in crawled:
                    new_tasks.append(async_crawler(
                        sample_url, out_tree, crawled, user_agent, session,
                        this_elem))
    else:
        print("No HTML received")

    if len(crawled) >= HARD_LIMIT:
        return
    if new_tasks:
        await asyncio.wait(new_tasks)
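

# Minimal standalone sketch for manual testing, assuming network access is
# available; it only exercises async_print(), which does not depend on the
# Django models (the url below is just an example).
if __name__ == '__main__':
    demo_loop = asyncio.new_event_loop()
    asyncio.set_event_loop(demo_loop)
    try:
        demo_loop.run_until_complete(async_print('https://python.org/'))
    finally:
        demo_loop.close()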