from threading import Thread
from urllib.robotparser import RobotFileParser
from urllib.parse import urlparse
from random import sample, randrange
import re
from datetime import datetime, timedelta

import asyncio
import aiohttp
import async_timeout

from bs4 import BeautifulSoup, Comment

# Ugly hack to use this module alone instead of integrating it with Django
# from django.conf import settings

# Bookmark links ("#..." anchors) point back into the current html page;
# they are detected below so the crawler avoids following this kind of link.

HARD_LIMIT = 20
MAX_PER_PAGE = 10

FOOTER_URL = re.compile(".*footer.*")


class Settings:
    USER_AGENT = 'BlahBlah'

settings = Settings()
startup_time = datetime.now()


def url_getter(html, current_page, root_url):
    links_list = []  # The final result
    soup = BeautifulSoup(html, "html.parser")
    # Get only the body
    body = soup.find('body')
    if body is None:
        return links_list
    # Remove the footer, if any
    if body.footer:
        body.footer.decompose()
    # Remove all comments
    comments = soup.find_all(string=lambda text: isinstance(text, Comment))
    for comment in comments:
        comment.extract()

    # Remove elements whose id looks like a footer
    print("Retrieving footers")
    footers = soup.find_all(id=FOOTER_URL)
    for footer in footers:
        footer.extract()

    # Collect the links, skipping bookmark links pointing to the current page
    links = (link["href"] for link in body.find_all("a", href=True))
    for link in links:
        if link.startswith("http"):
            links_list.append(link)
        elif link.startswith('/'):
            # Internal link, relative to the page root url
            links_list.append(root_url + link)
        elif link.startswith("#"):
            print("Invalid link: internal bookmark")
        else:
            links_list.append(current_page + link)

    ## uniqifier working with python <= 3.6
    #seen = set()
    #links_list = [x for x in links_list if x not in seen and not seen.add(x)]

    # uniqifier, works only with python >= 3.6 (relies on insertion-ordered dicts)
    links_list = list(dict.fromkeys(links_list))

    print(links_list)
    return links_list


class WebsiteSchedulerMeta(type):
    """ Meta-class for WebsiteScheduler, allowing a singleton class-like
    interface, but spawning one instance per canonical website URL """

    _instances = {}
    _canonicalize = re.compile(r'(https?://)?([^/]+)(/?|$)')

    def canonical_url(cls, url):
        """ Canonicalize a url """
        return cls._canonicalize.search(url).groups()[1]

    def __call__(cls, url, *args, **kwargs):
        canonical = cls.canonical_url(url)
        if canonical not in cls._instances:
            cls._instances[canonical] = \
                super(WebsiteSchedulerMeta, cls) \
                .__call__(canonical, *args, **kwargs)
        return cls._instances[canonical]


class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
    """ Schedule the accesses to a website as per its robots.txt """

    def __init__(self, name):
        self.name = name
        self.last_crawled = datetime.fromtimestamp(0)
        robots_url = self.urlroot() + 'robots.txt'
        self.robot_parser = RobotFileParser(robots_url)
        self.robot_parser.read()  # TODO async?
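        # Derive the delay between two fetches of this site: prefer the
        # Crawl-delay directive, fall back to Request-rate (interpreted as
        # seconds per request), and default to 5 seconds when robots.txt
        # specifies neither.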
        delay = self.robot_parser.crawl_delay(settings.USER_AGENT)
        if delay is None:
            req_rate = self.robot_parser.request_rate(settings.USER_AGENT)
            if req_rate is None:
                delay = 5
            else:
                delay = req_rate.seconds / req_rate.requests
        self.crawl_delay = timedelta(seconds=delay)

    def urlroot(self):
        ''' Get the root url for this website '''
        return 'https://{}/'.format(self.name)

    def fetch_delay(self):
        ''' Get the delay needed before fetching a page is possible '''
        can_fetch_time = self.last_crawled + self.crawl_delay
        if can_fetch_time < datetime.now():
            return timedelta(0)
        return can_fetch_time - datetime.now()

    def can_fetch(self, url):
        ''' Check whether this program can fetch a given page '''
        return self.robot_parser.can_fetch(settings.USER_AGENT, url)

    def fetching(self):
        ''' Tell the scheduler that a page is being fetched now '''
        self.last_crawled = datetime.now()


class CrawlingThread(Thread):
    """ A separate thread for the crawling task. This is needed to use asyncio,
    since the thread will need its own event loop. """

    def __init__(self):
        super(CrawlingThread, self).__init__()

    def run(self):
        tasks = []
        tasks.append(async_crawler("https://python.org/"))
        #tasks.append(async_print('https://python.org/about/gettingstarted'))

        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        loop.run_until_complete(asyncio.gather(*tasks))
        loop.close()


class PageGetter:
    """ Asynchronously get a webpage, abiding by robots.txt """

    def __init__(self, session, url):
        self.url = url
        self.session = session

    async def get(self):
        """ Actually retrieve the webpage """
        scheduler = WebsiteScheduler(self.url)
        if not scheduler.can_fetch(self.url):
            return None

        delay = scheduler.fetch_delay()
        while delay > timedelta(0):
            await asyncio.sleep(delay.total_seconds())
            delay = scheduler.fetch_delay()
        scheduler.fetching()
        async with async_timeout.timeout(10):
            async with self.session.get(self.url) as resp:
                return await resp.text()


async def async_print(url):
    """ Debug function to follow what's actually happening """
    async with aiohttp.ClientSession() as session:
        html = await PageGetter(session, url).get()

    print('GOT {}HTML for {} at {}'.format(
        'None ' if html is None else '',
        url,
        datetime.now() - startup_time))


async def async_crawler(url):
    queue = [url]
    crawled = []
    while queue and len(crawled) < HARD_LIMIT:
        async with aiohttp.ClientSession() as session:
            try:
                url = queue.pop(0)
            except IndexError:
                print("Error: queue is empty")
                return crawled
            parsed_url = urlparse(url)
            print("Crawling {}".format(url))
            html = await PageGetter(session, url).get()
            crawled.append(url)
            if html is None:
                # Fetch disallowed by robots.txt: skip this page
                continue
            new_urls = url_getter(
                html,
                url,
                parsed_url.scheme + "://" + parsed_url.netloc
            )
            if not new_urls:
                continue
            sampled = sample(
                new_urls,
                randrange(min(MAX_PER_PAGE, len(new_urls)))
            )
            queue += [sample_url for sample_url in sampled if
                      sample_url not in queue and
                      sample_url not in crawled]
    print(crawled)
    return crawled


if __name__ == '__main__':
    crawl = CrawlingThread()
    crawl.start()
    crawl.join()
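

# A minimal offline sketch (not part of the crawler itself) showing how
# url_getter() behaves on a hand-written HTML snippet; the example.com URLs
# below are illustrative assumptions. Call _demo_url_getter() manually to
# see the extracted links.
def _demo_url_getter():
    sample_html = """
    <html><body>
      <a href="https://example.com/absolute">absolute link, kept as-is</a>
      <a href="/docs">root-relative link, prefixed with the root url</a>
      <a href="#top">bookmark, skipped</a>
      <a href="child">relative link, prefixed with the current page</a>
      <footer><a href="https://example.com/legal">footer link, dropped</a></footer>
    </body></html>
    """
    return url_getter(sample_html,
                      "https://example.com/start/",
                      "https://example.com")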