Start of url getter function

parent b05e642c79
commit a907cad33d

2 changed files with 28 additions and 1 deletion
@@ -1,6 +1,7 @@
from threading import Thread
from urllib.robotparser import RobotFileParser

from bs4 import BeautifulSoup, Comment
import re
from datetime import datetime, timedelta
@@ -11,6 +12,11 @@ import async_timeout

# Ugly hack to use this module alone instead of integrating it with Django
# from django.conf import settings

# Matches the in-page bookmarks of an html document.
# We want to match these in order to avoid following this kind of link.
BOOKMARK_URL = "#.*"


class Settings:
    USER_AGENT = 'Blah'
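As a side note on the pattern above: re.match anchors at the start of the string, so BOOKMARK_URL only flags hrefs that begin with '#'. A minimal check, not part of the commit:

    import re

    BOOKMARK_URL = "#.*"

    print(bool(re.match(BOOKMARK_URL, "#top")))         # True: in-page bookmark
    print(bool(re.match(BOOKMARK_URL, "/about")))       # False: ordinary relative link
    print(bool(re.match(BOOKMARK_URL, "page.html#x")))  # False: '#' is not at the start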
@@ -18,6 +24,24 @@ settings = Settings()
startup_time = datetime.now()


def url_getter(html):
    soup = BeautifulSoup(html, "html.parser")
    # Get only the body
    body = soup.find('body')
    # Remove the footer, if there is one
    if body.footer:
        body.footer.decompose()
    # Remove all comments
    comments = soup.findAll(text=lambda text: isinstance(text, Comment))
    for comment in comments:
        comment.extract()

    # Remove all bookmark links pointing to the current html page.
    links = body.find_all("a")
    for link in links:
        if re.match(BOOKMARK_URL, link.get("href", "")):
            link.extract()


class WebsiteSchedulerMeta(type):
    """ Meta-class for WebsiteScheduler, allowing a singleton class-like
    interface, but spawning one instance per canonical website URL """
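The body of WebsiteSchedulerMeta sits outside this hunk, so only its docstring is visible here. A minimal sketch of the singleton-per-URL pattern the docstring describes, with an illustrative _instances cache and a hypothetical canonicalization step, could look like:

    class WebsiteSchedulerMeta(type):
        _instances = {}

        def __call__(cls, url, *args, **kwargs):
            # Hypothetical canonicalization: strip a trailing slash.
            canonical = url.rstrip('/')
            if canonical not in cls._instances:
                cls._instances[canonical] = super().__call__(canonical, *args, **kwargs)
            return cls._instances[canonical]

With such a metaclass, WebsiteScheduler('https://python.org') and WebsiteScheduler('https://python.org/') would return the same instance.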
@@ -54,7 +78,6 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
            delay = 5
        else:
            # One request every (seconds / requests) seconds
            delay = req_rate.seconds / req_rate.requests
        self.crawl_delay = timedelta(seconds=delay)

    def urlroot(self):
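For context on where req_rate comes from: it is presumably the result of RobotFileParser.request_rate(), which returns None or a named tuple with requests and seconds fields. A standalone sketch of the lookup this hunk consumes, with an illustrative URL and user agent:

    from urllib.robotparser import RobotFileParser

    parser = RobotFileParser("https://example.com/robots.txt")
    parser.read()  # fetch and parse robots.txt

    req_rate = parser.request_rate("Blah")
    if req_rate is None:
        delay = 5  # fall back to a fixed delay, as above
    else:
        delay = req_rate.seconds / req_rate.requests  # seconds per request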
@@ -87,6 +110,7 @@ class CrawlingThread(Thread):
    def run(self):
        tasks = []
        tasks.append(async_print('https://python.org'))
        tasks.append(async_print('https://python.org/about/gettingstarted'))

        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
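The hunk ends before showing how the tasks list is driven. A self-contained sketch of the same new-loop-per-thread idea, assuming run() is meant to block until every coroutine finishes:

    import asyncio

    async def async_print(url):  # stand-in for the coroutine defined below
        print(url)

    tasks = [async_print('https://python.org')]
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        loop.run_until_complete(asyncio.gather(*tasks))  # drive all tasks to completion
    finally:
        loop.close()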
@@ -116,11 +140,13 @@ class PageGetter:
        async with self.session.get(self.url) as resp:
            return await resp.text()


async def async_parser(html_text):
    pass  # TODO: not implemented yet in this commit


async def async_print(url):
    """ Debug function to follow what's actually happening """
    async with aiohttp.ClientSession() as session:
        html = await PageGetter(session, url).get()

    print('GOT {}HTML for {} at {}'.format(
        'None ' if html is None else '',
        url,
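Since async_parser is only stubbed in this commit, here is a hypothetical sketch of how it might eventually hand fetched pages to url_getter, reusing PageGetter and the aiohttp session from above; the fetch_and_parse helper is an assumption, not code from this diff:

    async def async_parser(html_text):
        # Assumption: parsing itself stays synchronous for now.
        return url_getter(html_text)

    async def fetch_and_parse(url):  # hypothetical helper
        async with aiohttp.ClientSession() as session:
            html = await PageGetter(session, url).get()
        if html is not None:
            await async_parser(html)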
@@ -11,3 +11,4 @@ multidict==4.1.0
pycares==2.3.0
pytz==2017.3
yarl==1.1.1
beautifulsoup4==4.6.0