# mpri-webdam/crawl/crawl.py


from threading import Thread
from urllib.robotparser import RobotFileParser
from urllib.error import URLError
from urllib.parse import urlparse
from ssl import CertificateError
from random import sample, randrange, randint
import re
from datetime import datetime, timedelta
import asyncio
import aiohttp
import async_timeout
from bs4 import BeautifulSoup, Comment
from profiles.models import BrowserFingerprint, SearchEngine
HARD_LIMIT = 20    # Hard cap on the total number of pages crawled in one session
MAX_PER_PAGE = 10  # Maximum number of links followed from a single page

# Matches ids of elements that belong to a page footer
FOOTER_URL = re.compile(".*footer.*")


# Ugly hack to use this module alone instead of integrating it with Django
# from django.conf import settings
class Settings:
    USER_AGENT = 'Default User'


settings = Settings()

def url_getter(html, current_page, root_url):
    """ Extract the outgoing links of an html page, skipping direct bookmark
    links (href="#..."), which we do not want to follow, as well as footers
    and html comments. """
    links_list = []  # The final result

    soup = BeautifulSoup(html, "html.parser")
    # Get only the body
    body = soup.find('body')
    if not body:
        return links_list

    # Remove the footer
    if body.footer:
        body.footer.decompose()

    # Remove all comments
    comments = soup.findAll(text=lambda text: isinstance(text, Comment))
    for comment in comments:
        comment.extract()

    # Remove all elements whose id looks like a footer
    footers = soup.findAll(id=FOOTER_URL)
    for footer in footers:
        footer.extract()

    # Collect the href of every link, skipping bookmark links pointing to the
    # current html page.
    links = map(lambda link: link.get("href", ""), body.find_all("a"))
    for link in links:
        if link:  # Edge case, if no href found.
            if link.startswith("http"):
                links_list.append(link)
            elif link.startswith('/'):  # Internal link, relative to the root url
                links_list.append(root_url + link)
            elif link.startswith("#"):  # Bookmark inside the current page
                continue
            else:
                links_list.append(current_page + "/" + link)

    ## uniqifier which works with python <= 3.6
    #seen = set()
    #links_list = [x for x in links_list if x not in seen and not seen.add(x)]

    # uniqifier which only works with python >= 3.6
    links_list = list(dict.fromkeys(links_list))

    forbidden_words = ['login', 'agreement', 'mailto', 'settings']
    links_list = [link for link in links_list
                  if not any(word in link.lower() for word in forbidden_words)]

    return links_list

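
# Illustrative sketch (not part of the original pipeline, never called): shows
# the kind of list url_getter returns for a tiny hand-written html snippet.
# The snippet and the urls below are made-up examples.
def _demo_url_getter():
    html = ('<body><a href="/about">About</a>'
            '<a href="#top">Top</a>'
            '<a href="https://other.example/page">Other</a></body>')
    # Expected: ['https://example.com/about', 'https://other.example/page']
    return url_getter(html, 'https://example.com/page', 'https://example.com')
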
class WebsiteSchedulerMeta(type):
    """ Meta-class for WebsiteScheduler, allowing a singleton class-like
    interface, but spawning one instance per canonical website URL """

    _instances = {}
    _canonicalize = re.compile(r'(https?://)?([^/]+)(/?|$)')

    def canonical_url(cls, url):
        """ Canonicalize a url """
        return cls._canonicalize.search(url).groups()[1]

    def __call__(cls, url, *args, **kwargs):
        canonical = cls.canonical_url(url)
        if canonical not in cls._instances:
            cls._instances[canonical] = \
                super(WebsiteSchedulerMeta, cls) \
                .__call__(canonical, *args, **kwargs)
        return cls._instances[canonical]

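
# Illustration (not from the original code): because of this metaclass, two
# urls on the same host share a single scheduler, e.g. (placeholder host)
#     WebsiteScheduler('https://example.com/a', None) \
#         is WebsiteScheduler('https://example.com/b', None)    # -> True
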
class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
    """ Schedule the accesses to a website as of robots.txt """

    search_engines = []  # Must be set by CrawlingThread.__init__

    def __init__(self, name, user_agent):
        self.name = name
        self.last_crawled = datetime.fromtimestamp(0)
        self.dead = False
        self.can_fetch_b = False
        self.user_agent = (user_agent if user_agent is not None
                           else settings.USER_AGENT)
        if any(self.urlroot() in item for item in self.search_engines):
            print("found a search engine for %s" % self.urlroot())
            self.crawl_delay = timedelta(seconds=5)
            self.can_fetch_b = True
        else:
            try:
                robots_url = self.urlroot() + 'robots.txt'
                self.robot_parser = RobotFileParser(robots_url)
                self.robot_parser.read()  # TODO async?
            except (URLError, CertificateError):
                try:
                    robots_url = self.unsafe_urlroot() + 'robots.txt'
                    self.robot_parser = RobotFileParser(robots_url)
                    self.robot_parser.read()
                except URLError:  # Almost surely an offline website.
                    self.dead = True
                    self.crawl_delay = timedelta(0)
            except Exception as e:
                print(e)
                raise e
            if not self.robot_parser.default_entry:
                self.dead = True
            if not self.dead:
                delay = self.robot_parser.crawl_delay(self.user_agent)
                if delay is None:
                    req_rate = self.robot_parser.request_rate(self.user_agent)
                    if req_rate is None:
                        delay = 5
                    else:
                        # Spread the allowed requests evenly over their window
                        delay = req_rate.seconds / req_rate.requests
                self.crawl_delay = timedelta(seconds=delay)

    def urlroot(self):
        ''' Get the root url for this website '''
        return 'https://{}/'.format(self.name)

    def unsafe_urlroot(self):
        ''' Get the root url for this website, over plain http '''
        return 'http://{}/'.format(self.name)

    def fetch_delay(self):
        ''' Get the delay needed before fetching a page is possible '''
        can_fetch_time = self.last_crawled + self.crawl_delay
        if can_fetch_time < datetime.now():
            return timedelta(0)
        return can_fetch_time - datetime.now()

    def can_fetch(self, url):
        ''' Check whether this program can fetch a given page '''
        return (self.can_fetch_b
                or ((not self.dead)
                    and self.robot_parser.can_fetch(self.user_agent, url)))

    def fetching(self):
        ''' Tell the scheduler that a page is being fetched now '''
        self.last_crawled = datetime.now()

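
# Minimal sketch (illustrative only, never called) of the intended calling
# pattern, mirroring PageGetter.get below: wait until fetch_delay() has
# elapsed, then record the access with fetching() before issuing the request.
async def _demo_polite_fetch(url):
    scheduler = WebsiteScheduler(url, settings.USER_AGENT)
    if not scheduler.can_fetch(url):
        return False
    while scheduler.fetch_delay() > timedelta(0):
        await asyncio.sleep(scheduler.fetch_delay().total_seconds())
    scheduler.fetching()
    return True
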
class CrawlingThread(Thread):
    """ A separate thread for the crawling task. This is needed to use asyncio,
    since the thread will need its own event loop. """

    def __init__(self, url):
        engine_list = [engine.url for engine in SearchEngine.objects.all()]
        WebsiteScheduler.search_engines = engine_list

        nb_fingerprint = len(BrowserFingerprint.objects.all())
        fingerprint = BrowserFingerprint.objects.all()[
            randint(0, nb_fingerprint - 1)]
        self.headers = fingerprint.serialize_headers()

        self.output_tree = []
        super(CrawlingThread, self).__init__()
        self.url = url

    def run(self):
        tasks = []
        #tasks.append(async_crawler("http://plus.google.com/+Python"))
        #tasks.append(async_crawler('https://python.org/'))
        tasks.append(run_crawl(self.url, self.output_tree, self.headers))

        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        try:
            loop.run_until_complete(asyncio.wait(tasks))
        finally:
            loop.close()

class PageGetter:
    """ Asynchronously get a webpage, abiding by robots.txt """

    headers = None

    def __init__(self, session, url, user_agent):
        self.url = url
        self.session = session
        self.user_agent = user_agent

    async def get(self, ssl=True):
        """ Actually retrieve the webpage """
        scheduler = WebsiteScheduler(self.url, self.user_agent)
        if not scheduler.can_fetch(self.url):
            return None

        delay = scheduler.fetch_delay()
        while delay > timedelta(0):
            await asyncio.sleep(delay.total_seconds())
            delay = scheduler.fetch_delay()
        scheduler.fetching()

        async with async_timeout.timeout(10):
            async with self.session.get(self.url, verify_ssl=ssl) as resp:
                try:
                    return await resp.text()
                except UnicodeDecodeError:
                    return None

async def async_print(url):
    """ Debug function to follow what's actually happening """
    async with aiohttp.ClientSession() as session:
        html = await PageGetter(session, url,
                                settings.USER_AGENT).get(ssl=False)

    print('GOT {}HTML for {}'.format(
        'None ' if html is None else '',
        url,
    ))

class CrawlElem:
    ''' Describes a crawled element, to be assembled into a tree '''

    def __init__(self, url, parent):
        self.url = url
        self.parent = parent

async def run_crawl(url, output_tree, headers=None):
    ''' Starts a crawling session '''
    if headers is None:
        headers = {}
    if 'User-Agent' not in headers:
        headers['User-Agent'] = settings.USER_AGENT
    user_agent = headers['User-Agent']

    crawled = set()

    async with aiohttp.ClientSession(headers=headers) as session:
        await async_crawler(
            url, output_tree, crawled, user_agent, session, None)

def simplify_url(url):
    ''' Strip the anchor, the protocol and a leading 'www.' from a url, so
    that pages reached under slightly different urls are only crawled once '''
    anchor = url.find('#')
    if anchor >= 0:
        url = url[:anchor]

    prot = url.find('://')
    if prot >= 0:
        url = url[prot+3:]

    if url.startswith('www.'):
        url = url[4:]

    return url

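
# Illustrative check (not part of the original module, never called); the
# urls are made-up examples of the normalisation performed by simplify_url.
def _demo_simplify_url():
    assert simplify_url('https://www.example.com/page#section') == 'example.com/page'
    assert simplify_url('http://example.com/a') == 'example.com/a'
    assert simplify_url('example.com/#top') == 'example.com/'
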
async def async_crawler(url, out_tree, crawled, user_agent, session, parent):
    if len(crawled) >= HARD_LIMIT:
        return
    crawled.add(simplify_url(url))
    parsed_url = urlparse(url)
    print("Crawling {}".format(url))
    try:
        async with async_timeout.timeout(3):
            html = await PageGetter(session, url, user_agent).get(ssl=False)
    except asyncio.TimeoutError:
        return

    new_tasks = []
    if html:
        this_elem = CrawlElem(url, parent)
        out_tree.append(this_elem)
        new_urls = url_getter(
            html,
            url,
            parsed_url.scheme + "://" + parsed_url.netloc
        )
        if new_urls:
            sampled = sample(
                new_urls,
                randrange(min(MAX_PER_PAGE, len(new_urls)))
            )
            for sample_url in sampled:
                if simplify_url(sample_url) not in crawled:
                    new_tasks.append(async_crawler(
                        sample_url, out_tree, crawled, user_agent, session,
                        this_elem))
    else:
        print("No html received")

    if len(crawled) >= HARD_LIMIT:
        return
    if new_tasks:
        await asyncio.wait(new_tasks)
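
# Usage sketch (assumptions: a configured Django project whose SearchEngine
# and BrowserFingerprint tables each hold at least one row; the url is a
# placeholder). Defined for illustration only, not executed on import.
def _demo_crawling_thread():
    thread = CrawlingThread('https://example.com/')
    thread.start()
    thread.join()
    for elem in thread.output_tree:
        print(elem.url, elem.parent.url if elem.parent else None)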