Start of url getter function
This commit is contained in:
parent
b05e642c79
commit
a907cad33d
2 changed files with 28 additions and 1 deletions
|
@ -1,6 +1,7 @@
|
||||||
from threading import Thread
|
from threading import Thread
|
||||||
from urllib.robotparser import RobotFileParser
|
from urllib.robotparser import RobotFileParser
|
||||||
|
|
||||||
|
from bs4 import BeautifulSoup, Comment
|
||||||
import re
|
import re
|
||||||
from datetime import datetime, timedelta
|
from datetime import datetime, timedelta
|
||||||
|
|
||||||
|
@ -11,6 +12,11 @@ import async_timeout
|
||||||
# Ugly hack to use this module alone instead of integrating it with Django
|
# Ugly hack to use this module alone instead of integrating it with Django
|
||||||
# from django.conf import settings
|
# from django.conf import settings
|
||||||
|
|
||||||
|
# Pattern matching in-page bookmark fragments ("#...").
# Links whose href is only a fragment point back into the current page,
# so the crawler must skip them rather than follow them.
BOOKMARK_URL = "#.*"
||||||
class Settings:
    """Minimal stand-in for Django's settings object.

    Lets this module run on its own (see the "ugly hack" note above)
    instead of depending on ``django.conf.settings``.
    """

    USER_AGENT = 'Blah'
||||||
|
@ -18,6 +24,24 @@ settings = Settings()
|
||||||
startup_time = datetime.now()
|
startup_time = datetime.now()
|
||||||
|
|
||||||
|
|
||||||
|
def url_getter(html):
    """Clean *html* in place of boilerplate and same-page bookmark links.

    Parses the document, removes the footer and all HTML comments, then
    extracts every ``<a>`` whose href is only a fragment (matches
    ``BOOKMARK_URL``, i.e. ``"#..."``) so the crawler does not follow
    links pointing back into the current page.

    Parameters:
        html: raw HTML text of a fetched page.
    """
    soup = BeautifulSoup(html, "html.parser")
    # Restrict processing to the document body.
    body = soup.find('body')
    if body is None:
        # Malformed or body-less document: nothing to clean.
        return
    # Remove the footer, if present (original crashed with
    # AttributeError on pages without a <footer>).
    if body.footer is not None:
        body.footer.decompose()
    # Remove all comments (find_all/string are the non-deprecated
    # spellings of findAll/text).
    comments = soup.find_all(string=lambda text: isinstance(text, Comment))
    for comment in comments:
        comment.extract()
    # Remove all bookmark links pointing to the current html page.
    for link in body.find_all("a"):
        # Anchors may lack an href entirely; .get avoids the KeyError
        # the original subscript raised on such tags.
        href = link.get("href")
        if href is not None and re.match(BOOKMARK_URL, href):
            link.extract()
||||||
class WebsiteSchedulerMeta(type):
|
class WebsiteSchedulerMeta(type):
|
||||||
""" Meta-class for WebsiteScheduler, allowing a singleton class-like
|
""" Meta-class for WebsiteScheduler, allowing a singleton class-like
|
||||||
interface, but spawning one instance per canonical website URL """
|
interface, but spawning one instance per canonical website URL """
|
||||||
|
@ -54,7 +78,6 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
|
||||||
delay = 5
|
delay = 5
|
||||||
else:
|
else:
|
||||||
delay = req_rate.requests, req_rate.seconds
|
delay = req_rate.requests, req_rate.seconds
|
||||||
|
|
||||||
self.crawl_delay = timedelta(seconds=delay)
|
self.crawl_delay = timedelta(seconds=delay)
|
||||||
|
|
||||||
def urlroot(self):
|
def urlroot(self):
|
||||||
|
@ -87,6 +110,7 @@ class CrawlingThread(Thread):
|
||||||
def run(self):
|
def run(self):
|
||||||
tasks = []
|
tasks = []
|
||||||
tasks.append(async_print('https://python.org'))
|
tasks.append(async_print('https://python.org'))
|
||||||
|
tasks.append(async_print('https://python.org/about/gettingstarted'))
|
||||||
|
|
||||||
loop = asyncio.new_event_loop()
|
loop = asyncio.new_event_loop()
|
||||||
asyncio.set_event_loop(loop)
|
asyncio.set_event_loop(loop)
|
||||||
|
@ -116,11 +140,13 @@ class PageGetter:
|
||||||
async with self.session.get(self.url) as resp:
|
async with self.session.get(self.url) as resp:
|
||||||
return await resp.text()
|
return await resp.text()
|
||||||
|
|
||||||
|
async def async_parser(html_text)
|
||||||
|
|
||||||
async def async_print(url):
|
async def async_print(url):
|
||||||
""" Debug function to follow what's actually happening """
|
""" Debug function to follow what's actually happening """
|
||||||
async with aiohttp.ClientSession() as session:
|
async with aiohttp.ClientSession() as session:
|
||||||
html = await PageGetter(session, url).get()
|
html = await PageGetter(session, url).get()
|
||||||
|
|
||||||
print('GOT {}HTML for {} at {}'.format(
|
print('GOT {}HTML for {} at {}'.format(
|
||||||
'None ' if html is None else '',
|
'None ' if html is None else '',
|
||||||
url,
|
url,
|
||||||
|
|
|
@ -11,3 +11,4 @@ multidict==4.1.0
|
||||||
pycares==2.3.0
|
pycares==2.3.0
|
||||||
pytz==2017.3
|
pytz==2017.3
|
||||||
yarl==1.1.1
|
yarl==1.1.1
|
||||||
|
beautifulsoup4==4.6.0
|
||||||
|
|
Loading…
Reference in a new issue