Start of url getter function

Rémi Oudin 2018-02-21 19:06:46 +01:00
parent b05e642c79
commit a907cad33d
2 changed files with 28 additions and 1 deletion


@@ -1,6 +1,7 @@
from threading import Thread
from urllib.robotparser import RobotFileParser
from bs4 import BeautifulSoup, Comment
import re
from datetime import datetime, timedelta
@@ -11,6 +12,11 @@ import async_timeout
# Ugly hack to use this module alone instead of integrating it with Django
# from django.conf import settings
# Matches in-page bookmark links (href values starting with "#").
# We want to catch these so we can avoid following this kind of link.
BOOKMARK_URL = "#.*"
class Settings:
    USER_AGENT = 'Blah'
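Since BOOKMARK_URL is applied with re.match, which anchors at the beginning of the string, it only catches hrefs that start with "#"; a quick illustration:

import re
re.match("#.*", "#contents")        # match: in-page bookmark, will be dropped
re.match("#.*", "/doc/#contents")   # None: real path, will be kept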
@@ -18,6 +24,24 @@ settings = Settings()
startup_time = datetime.now()
def url_getter(html):
    soup = BeautifulSoup(html, "html.parser")
    # Get only the body
    body = soup.find('body')
    # Remove the footer, if any (pages without one would otherwise raise)
    if body.footer:
        body.footer.decompose()
    # Remove all comments
    comments = soup.find_all(text=lambda text: isinstance(text, Comment))
    for comment in comments:
        comment.extract()
    # Remove all bookmark links pointing to the current html page.
    links = body.find_all("a")
    for link in links:
        if re.match(BOOKMARK_URL, link.get("href", "")):
            link.extract()
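As committed, url_getter only prunes the tree and returns nothing; a hypothetical continuation (an assumption based on the commit title "Start of url getter function", not code from this commit) would collect the hrefs that survive the filtering:

    # Hypothetical follow-up, not in this commit: return the remaining links.
    return [link["href"] for link in body.find_all("a", href=True)]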
class WebsiteSchedulerMeta(type):
""" Meta-class for WebsiteScheduler, allowing a singleton class-like
interface, but spawning one instance per canonical website URL """
@@ -54,7 +78,6 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
            delay = 5
        else:
            # req_rate is a RequestRate(requests, seconds) named tuple:
            # spread the allowed requests evenly over the time window.
            # (The committed line built a tuple, which timedelta rejects.)
            delay = req_rate.seconds / req_rate.requests
        self.crawl_delay = timedelta(seconds=delay)

    def urlroot(self):
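For context, RobotFileParser.request_rate() (Python 3.6+) returns a RequestRate(requests, seconds) named tuple when robots.txt declares a Request-rate, or None when it does not, which is what the corrected branch above relies on. A standalone sketch of the same logic (the 5-second fallback mirrors the code above):

from urllib.robotparser import RobotFileParser

robots = RobotFileParser("https://www.python.org/robots.txt")
robots.read()
req_rate = robots.request_rate("*")
if req_rate is None:
    delay = 5                                     # no Request-rate declared
else:
    delay = req_rate.seconds / req_rate.requests  # spread requests over window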
@@ -87,6 +110,7 @@ class CrawlingThread(Thread):
    def run(self):
        tasks = []
        tasks.append(async_print('https://python.org'))
        tasks.append(async_print('https://python.org/about/gettingstarted'))
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
@@ -116,11 +140,13 @@ class PageGetter:
        async with self.session.get(self.url) as resp:
            return await resp.text()
async def async_parser(html_text):
    pass
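Only the signature is added here (the committed line was even missing its colon); a possible body (purely an assumption, the commit does not say what async_parser will do) would hand the fetched HTML to url_getter:

async def async_parser(html_text):
    # Hypothetical sketch: delegate the filtering to url_getter; assumes
    # url_getter is extended to return the extracted links.
    return url_getter(html_text)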
async def async_print(url):
    """ Debug function to follow what's actually happening """
    async with aiohttp.ClientSession() as session:
        html = await PageGetter(session, url).get()
        print('GOT {}HTML for {} at {}'.format(
            'None ' if html is None else '',
            url,


@@ -11,3 +11,4 @@ multidict==4.1.0
pycares==2.3.0
pytz==2017.3
yarl==1.1.1
beautifulsoup4==4.6.0