From a907cad33d52865c6e6034b8cbd7d6c777a549b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Oudin?= Date: Wed, 21 Feb 2018 19:06:46 +0100 Subject: [PATCH] Start of url getter function --- crawl/crawl.py | 28 +++++++++++++++++++++++++++- requirements.txt | 1 + 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/crawl/crawl.py b/crawl/crawl.py index 6baafad..25d8de9 100644 --- a/crawl/crawl.py +++ b/crawl/crawl.py @@ -1,6 +1,7 @@ from threading import Thread from urllib.robotparser import RobotFileParser +from bs4 import BeautifulSoup, Comment import re from datetime import datetime, timedelta @@ -11,6 +12,11 @@ import async_timeout # Ugly hack to use this module alone instead of integrating it with Django # from django.conf import settings +# Gets all the direct bookmarks in the html. +# We want this to avoid following this kind of bookmark + +BOOKMARK_URL = "#.*" + class Settings: USER_AGENT = 'Blah' @@ -18,6 +24,24 @@ settings = Settings() startup_time = datetime.now() +def url_getter(html): + soup = BeautifulSoup(html, "html.parser") + # Get only the body + body = soup.find('body') + # remove the footer + body.footer.decompose() + # remove all comments + comments = soup.findAll(text=lambda text:isinstance(text, Comment)) + for comment in comments: + comment.extract() + + # Remove all bookmark links pointing to the current html page.
+ links = body.find_all("a") + for link in links: + if re.match(BOOKMARK_URL, link["href"]): + link.extract() + + class WebsiteSchedulerMeta(type): """ Meta-class for WebsiteScheduler, allowing a singleton class-like interface, but spawning one instance per canonical website URL """ @@ -54,7 +78,6 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta): delay = 5 else: delay = req_rate.requests, req_rate.seconds - self.crawl_delay = timedelta(seconds=delay) def urlroot(self): @@ -87,6 +110,7 @@ class CrawlingThread(Thread): def run(self): tasks = [] tasks.append(async_print('https://python.org')) + tasks.append(async_print('https://python.org/about/gettingstarted')) loop = asyncio.new_event_loop() asyncio.set_event_loop(loop) @@ -116,11 +140,13 @@ class PageGetter: async with self.session.get(self.url) as resp: return await resp.text() +async def async_parser(html_text): pass  # TODO: implement parser async def async_print(url): """ Debug function to follow what's actually happening """ async with aiohttp.ClientSession() as session: html = await PageGetter(session, url).get() + print('GOT {}HTML for {} at {}'.format( 'None ' if html is None else '', url, diff --git a/requirements.txt b/requirements.txt index bea30c4..480760f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,3 +11,4 @@ multidict==4.1.0 pycares==2.3.0 pytz==2017.3 yarl==1.1.1 +beautifulsoup4==4.6.0