Start of url getter function

Rémi Oudin 2018-02-21 19:06:46 +01:00
parent b05e642c79
commit a907cad33d
2 changed files with 28 additions and 1 deletion

@@ -1,6 +1,7 @@
 from threading import Thread
 from urllib.robotparser import RobotFileParser
+from bs4 import BeautifulSoup, Comment
 import re
 from datetime import datetime, timedelta
@@ -11,6 +12,11 @@ import async_timeout

 # Ugly hack to use this module alone instead of integrating it with Django
 # from django.conf import settings

+# Matches in-page bookmark links (hrefs starting with "#").
+# We want to avoid following this kind of link.
+BOOKMARK_URL = "#.*"

 class Settings:
     USER_AGENT = 'Blah'
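
Note (not part of the commit): re.match anchors at the start of the string, so this pattern only catches hrefs that are pure fragments; a full URL with a trailing #fragment is not treated as a bookmark. A quick check:

    import re

    BOOKMARK_URL = "#.*"
    print(bool(re.match(BOOKMARK_URL, "#top")))                      # True
    print(bool(re.match(BOOKMARK_URL, "https://python.org/#top")))   # False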
@@ -18,6 +24,24 @@ settings = Settings()

 startup_time = datetime.now()

+def url_getter(html):
+    soup = BeautifulSoup(html, "html.parser")
+    # Work on the body of the document only
+    body = soup.find('body')
+    # Remove the footer, if any
+    if body.footer:
+        body.footer.decompose()
+    # Remove all comments
+    comments = soup.find_all(text=lambda text: isinstance(text, Comment))
+    for comment in comments:
+        comment.extract()
+    # Remove all bookmark links pointing to the current html page.
+    links = body.find_all("a", href=True)
+    for link in links:
+        if re.match(BOOKMARK_URL, link["href"]):
+            link.extract()

 class WebsiteSchedulerMeta(type):
     """ Meta-class for WebsiteScheduler, allowing a singleton class-like
     interface, but spawning one instance per canonical website URL """
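
As committed, url_getter only prunes the soup in place and does not return anything yet (the commit message says it is just a start). A minimal standalone sketch of the same filtering, assuming a later version returns the surviving hrefs; the helper name and sample document below are illustrative only:

    from bs4 import BeautifulSoup, Comment
    import re

    BOOKMARK_URL = "#.*"

    def url_getter_sketch(html):
        soup = BeautifulSoup(html, "html.parser")
        body = soup.find('body')
        if body.footer:
            body.footer.decompose()
        for comment in soup.find_all(text=lambda text: isinstance(text, Comment)):
            comment.extract()
        # Keep only links that are not in-page bookmarks
        return [a["href"] for a in body.find_all("a", href=True)
                if not re.match(BOOKMARK_URL, a["href"])]

    sample_html = """
    <html><body>
      <a href="#top">bookmark</a>
      <a href="https://python.org/about">real link</a>
      <!-- an HTML comment -->
      <footer><a href="/legal">footer link</a></footer>
    </body></html>
    """
    print(url_getter_sketch(sample_html))  # ['https://python.org/about']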
@@ -54,7 +78,6 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
             delay = 5
         else:
             # request_rate allows req_rate.requests per req_rate.seconds
             delay = req_rate.seconds / req_rate.requests
         self.crawl_delay = timedelta(seconds=delay)

     def urlroot(self):
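
For reference, a sketch of where crawl_delay comes from, assuming the intent is one request every req_rate.seconds / req_rate.requests seconds (RobotFileParser.request_rate exists since Python 3.6; the robots.txt lines below are made up):

    from datetime import timedelta
    from urllib.robotparser import RobotFileParser

    rp = RobotFileParser()
    rp.modified()  # mark the parser as having data, so request_rate() answers
    rp.parse([
        "User-agent: *",
        "Request-rate: 2/10",   # 2 requests per 10 seconds
    ])
    req_rate = rp.request_rate("*")
    delay = req_rate.seconds / req_rate.requests
    print(timedelta(seconds=delay))  # 0:00:05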
@@ -87,6 +110,7 @@ class CrawlingThread(Thread):
     def run(self):
         tasks = []
         tasks.append(async_print('https://python.org'))
+        tasks.append(async_print('https://python.org/about/gettingstarted'))

         loop = asyncio.new_event_loop()
         asyncio.set_event_loop(loop)
@@ -116,11 +140,13 @@ class PageGetter:
         async with self.session.get(self.url) as resp:
             return await resp.text()

+async def async_parser(html_text):
+    pass  # stub, to be filled in
+
 async def async_print(url):
     """ Debug function to follow what's actually happening """
     async with aiohttp.ClientSession() as session:
         html = await PageGetter(session, url).get()

         print('GOT {}HTML for {} at {}'.format(
             'None ' if html is None else '',
             url,
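
The new async_parser stub takes html_text, so it will presumably hand fetched HTML to url_getter. A hypothetical completion, assuming it runs inside this module next to PageGetter and url_getter; none of the bodies below are in the commit:

    import aiohttp
    import asyncio

    async def async_parser(html_text):
        # Hypothetical body: feed the raw HTML to the url_getter added above.
        url_getter(html_text)

    async def fetch_and_parse(url):
        # Hypothetical driver mirroring async_print
        async with aiohttp.ClientSession() as session:
            html = await PageGetter(session, url).get()
        if html is not None:
            await async_parser(html)

    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    loop.run_until_complete(fetch_and_parse('https://python.org'))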

@@ -11,3 +11,4 @@ multidict==4.1.0
 pycares==2.3.0
 pytz==2017.3
 yarl==1.1.1
+beautifulsoup4==4.6.0