Start of url getter function

This commit is contained in:
Rémi Oudin 2018-02-21 19:06:46 +01:00
parent b05e642c79
commit a907cad33d
2 changed files with 28 additions and 1 deletion

View File

@@ -1,6 +1,7 @@
from threading import Thread
from urllib.robotparser import RobotFileParser
from bs4 import BeautifulSoup, Comment
import re
from datetime import datetime, timedelta
@@ -11,6 +12,11 @@ import async_timeout
# Ugly hack to use this module alone instead of integrating it with Django
# from django.conf import settings
# Matches in-page bookmark links in the html (href values starting with '#').
# We want this so we can avoid following that kind of link.
BOOKMARK_URL = "#.*"
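Since re.match anchors at the start of the string, this pattern only catches href values that begin with '#'; a quick illustrative check:

import re
assert re.match("#.*", "#top")                        # in-page bookmark: filtered out
assert not re.match("#.*", "https://python.org/#x")   # absolute URL: kept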
class Settings:
    USER_AGENT = 'Blah'
@@ -18,6 +24,24 @@ settings = Settings()
startup_time = datetime.now()
def url_getter(html):
    soup = BeautifulSoup(html, "html.parser")
    # Work on the body only
    body = soup.find('body')
    # Remove the footer, if there is one
    if body.footer:
        body.footer.decompose()
    # Remove all comments
    comments = soup.find_all(text=lambda text: isinstance(text, Comment))
    for comment in comments:
        comment.extract()
    # Remove all bookmark links pointing to the current html page.
    links = body.find_all("a")
    for link in links:
        if re.match(BOOKMARK_URL, link.get("href", "")):
            link.extract()
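url_getter only starts here (the commit title says as much) and does not return anything yet. A minimal sketch of what the cleaning steps do to sample markup, with a hypothetical final step that collects the surviving hrefs:

sample = ('<body><a href="#top">bookmark</a>'
          '<a href="https://python.org">real link</a>'
          '<footer><a href="/legal">footer link</a></footer></body>')
cleaned = BeautifulSoup(sample, "html.parser").find('body')
cleaned.footer.decompose()
kept = [a["href"] for a in cleaned.find_all("a")
        if not re.match(BOOKMARK_URL, a["href"])]
print(kept)   # ['https://python.org']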
class WebsiteSchedulerMeta(type):
""" Meta-class for WebsiteScheduler, allowing a singleton class-like
interface, but spawning one instance per canonical website URL """
@@ -54,7 +78,6 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
            delay = 5
        else:
            delay = req_rate.seconds / req_rate.requests  # seconds per request
        self.crawl_delay = timedelta(seconds=delay)
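For reference, robotparser exposes a robots.txt Request-rate directive as a named tuple with .requests and .seconds fields; a minimal sketch of deriving a per-request delay from it (the URL is a placeholder):

from urllib.robotparser import RobotFileParser

parser = RobotFileParser("https://example.com/robots.txt")
parser.read()
req_rate = parser.request_rate(settings.USER_AGENT)
if req_rate is not None:
    # e.g. "Request-rate: 1/10" allows 1 request per 10 seconds
    print(timedelta(seconds=req_rate.seconds / req_rate.requests))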
    def urlroot(self):
@@ -87,6 +110,7 @@ class CrawlingThread(Thread):
    def run(self):
        tasks = []
        tasks.append(async_print('https://python.org'))
        tasks.append(async_print('https://python.org/about/gettingstarted'))
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
@@ -116,11 +140,13 @@ class PageGetter:
        async with self.session.get(self.url) as resp:
            return await resp.text()
async def async_parser(html_text):
    pass
async def async_print(url):
    """ Debug function to follow what's actually happening """
    async with aiohttp.ClientSession() as session:
        html = await PageGetter(session, url).get()
        print('GOT {}HTML for {} at {}'.format(
            'None ' if html is None else '',
            url,
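The tail of this hunk is cut off here. As a sketch (not necessarily the exact continuation), a task list built the way run() builds one is typically driven to completion on the thread-local loop like so:

loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
try:
    loop.run_until_complete(asyncio.gather(*tasks))
finally:
    loop.close()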

View File

@@ -11,3 +11,4 @@ multidict==4.1.0
pycares==2.3.0
pytz==2017.3
yarl==1.1.1
beautifulsoup4==4.6.0