mpri-webdam/crawl/crawl.py


from threading import Thread
from urllib.robotparser import RobotFileParser
from bs4 import BeautifulSoup, Comment
import re
from datetime import datetime, timedelta
import asyncio
import aiohttp
import async_timeout
# Ugly hack to use this module alone instead of integrating it with Django
# from django.conf import settings
# Matches direct bookmark links in the html (e.g. "#section").
# We use this to avoid following that kind of link.
BOOKMARK_URL = "#.*"


class Settings:
    USER_AGENT = 'Blah'


settings = Settings()
startup_time = datetime.now()


def url_getter(html, current_page, root_url):
    links_list = []  # The final result
    soup = BeautifulSoup(html, "html.parser")
    # Get only the body
    body = soup.find('body')
    # Remove the footer
    if body.footer is not None:
        body.footer.decompose()
    # Remove all comments
    comments = soup.findAll(text=lambda text: isinstance(text, Comment))
    for comment in comments:
        comment.extract()
    # Collect the href targets, skipping bookmark links pointing to the
    # current html page.
    links = body.find_all("a")
    for link in links:
        href = link.get("href")
        if href is None:
            continue
        if href.startswith("http"):
            links_list.append(href)
        elif href.startswith('/'):  # Internal link, relative to the site root url
            links_list.append(root_url + href)
        elif href.startswith("#"):
            print("Invalid link: internal bookmark")
        else:
            links_list.append(current_page + href)

    ## uniqifier, works with python <= 3.6
    #seen = set()
    #links_list = [x for x in links_list if x not in seen and not seen.add(x)]

    # uniqifier, works only with python >= 3.6 (dicts preserve insertion order)
    links_list = list(dict.fromkeys(links_list))

    return links_list
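
# Illustration only (not part of the original file): a minimal sketch of how
# url_getter could be exercised. The HTML snippet and example.com URLs below
# are hypothetical; the expected result is the absolute link kept as-is, the
# root-relative link resolved against root_url, and the bookmark link skipped.
def _demo_url_getter():
    html = (
        '<html><body>'
        '<a href="https://example.com/docs">absolute</a>'
        '<a href="/about">root-relative</a>'
        '<a href="#top">bookmark</a>'
        '</body></html>'
    )
    return url_getter(html, 'https://example.com/start/', 'https://example.com')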


class WebsiteSchedulerMeta(type):
    """ Meta-class for WebsiteScheduler, allowing a singleton class-like
    interface, but spawning one instance per canonical website URL """

    _instances = {}
    _canonicalize = re.compile(r'(https?://)?([^/]+)(/?|$)')

    def canonical_url(cls, url):
        """ Canonicalize a url """
        return cls._canonicalize.search(url).groups()[1]

    def __call__(cls, url, *args, **kwargs):
        canonical = cls.canonical_url(url)
        if canonical not in cls._instances:
            cls._instances[canonical] = \
                super(WebsiteSchedulerMeta, cls) \
                .__call__(canonical, *args, **kwargs)
        return cls._instances[canonical]


class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
    """ Schedule the accesses to a website according to its robots.txt """

    def __init__(self, name):
        self.name = name
        self.last_crawled = datetime.fromtimestamp(0)
        robots_url = self.urlroot() + 'robots.txt'
        self.robot_parser = RobotFileParser(robots_url)
        self.robot_parser.read()  # TODO async?

        delay = self.robot_parser.crawl_delay(settings.USER_AGENT)
        if delay is None:
            req_rate = self.robot_parser.request_rate(settings.USER_AGENT)
            if req_rate is None:
                delay = 5
            else:
                # Spread the allowed requests evenly over the rate window
                delay = req_rate.seconds / req_rate.requests
        self.crawl_delay = timedelta(seconds=delay)

    def urlroot(self):
        ''' Get the root url for this website '''
        return 'https://{}/'.format(self.name)

    def fetch_delay(self):
        ''' Get the delay needed before fetching a page is possible '''
        can_fetch_time = self.last_crawled + self.crawl_delay
        if can_fetch_time < datetime.now():
            return timedelta(0)
        return can_fetch_time - datetime.now()

    def can_fetch(self, url):
        ''' Check whether this program can fetch a given page '''
        return self.robot_parser.can_fetch(settings.USER_AGENT, url)

    def fetching(self):
        ''' Tell the scheduler that a page is being fetched now '''
        self.last_crawled = datetime.now()
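
# Illustration only (not part of the original file): a quick sanity check of
# the per-website singleton behaviour. canonical_url() strips the scheme and
# path, so every URL on one host maps to a single WebsiteScheduler instance.
# The example.com URLs are hypothetical; no network access happens here.
def _demo_canonicalization():
    a = WebsiteScheduler.canonical_url('https://example.com/some/page')
    b = WebsiteScheduler.canonical_url('http://example.com/other')
    assert a == b == 'example.com'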


class CrawlingThread(Thread):
    """ A separate thread for the crawling task. This is needed to use asyncio,
    since the thread will need its own event loop. """

    def __init__(self):
        super(CrawlingThread, self).__init__()

    def run(self):
        tasks = []
        tasks.append(async_print('https://python.org'))
        tasks.append(async_print('https://python.org/about/gettingstarted'))

        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        loop.run_until_complete(asyncio.wait(tasks))
        loop.close()
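
# Illustration only (not part of the original file): the same "one event loop
# per thread" pattern that CrawlingThread.run() uses, factored into a
# standalone helper. `coros` is any iterable of coroutine objects;
# asyncio.gather is used here instead of asyncio.wait purely for the sketch.
def _run_in_fresh_loop(coros):
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        loop.run_until_complete(asyncio.gather(*coros))
    finally:
        loop.close()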


class PageGetter:
    """ Asynchronously get a webpage, abiding by robots.txt """

    def __init__(self, session, url):
        self.url = url
        self.session = session

    async def get(self):
        """ Actually retrieve the webpage """
        scheduler = WebsiteScheduler(self.url)
        if not scheduler.can_fetch(self.url):
            return None

        delay = scheduler.fetch_delay()
        while delay > timedelta(0):
            await asyncio.sleep(delay.total_seconds())
            delay = scheduler.fetch_delay()
        scheduler.fetching()

        async with async_timeout.timeout(10):
            async with self.session.get(self.url) as resp:
                return await resp.text()


async def async_print(url):
    """ Debug function to follow what's actually happening """
    async with aiohttp.ClientSession() as session:
        html = await PageGetter(session, url).get()

        print('GOT {}HTML for {} at {}'.format(
            'None ' if html is None else '',
            url,
            datetime.now() - startup_time))


if __name__ == '__main__':
    crawl = CrawlingThread()
    crawl.start()
    crawl.join()