From a907cad33d52865c6e6034b8cbd7d6c777a549b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Oudin?= Date: Wed, 21 Feb 2018 19:06:46 +0100 Subject: [PATCH] Start of url getter function --- crawl/crawl.py | 28 +++++++++++++++++++++++++++- requirements.txt | 1 + 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/crawl/crawl.py b/crawl/crawl.py index 6baafad..25d8de9 100644 --- a/crawl/crawl.py +++ b/crawl/crawl.py @@ -1,6 +1,7 @@ from threading import Thread from urllib.robotparser import RobotFileParser +from bs4 import BeautifulSoup, Comment import re from datetime import datetime, timedelta @@ -11,6 +12,11 @@ import async_timeout # Ugly hack to use this module alone instead of integrating it with Django # from django.conf import settings +# Gets all the direct bookmarks in the html. +# We want this to avoid following this kind of bookmark + +BOOKMARK_URL = "#.*" + class Settings: USER_AGENT = 'Blah' @@ -18,6 +24,24 @@ settings = Settings() startup_time = datetime.now() +def url_getter(html): + soup = BeautifulSoup(html, "html.parser") + # Get only the body + body = soup.find('body') + # remove the footer + body.footer.decompose() + # remove all comments + comments = soup.findAll(text=lambda text:isinstance(text, Comment)) + for comment in comments: + comment.extract() + + # Remove all bookmark links pointing to the current html page.
+ links = body.find_all("a") + for link in links: + if re.match(BOOKMARK_URL, link["href"]): + link.extract() + + class WebsiteSchedulerMeta(type): """ Meta-class for WebsiteScheduler, allowing a singleton class-like interface, but spawning one instance per canonical website URL """ @@ -54,7 +78,6 @@ class WebsiteScheduler(metaclass=WebsiteSchedulerMeta): delay = 5 else: delay = req_rate.requests, req_rate.seconds - self.crawl_delay = timedelta(seconds=delay) def urlroot(self): @@ -87,6 +110,7 @@ class CrawlingThread(Thread): def run(self): tasks = [] tasks.append(async_print('https://python.org')) + tasks.append(async_print('https://python.org/about/gettingstarted')) loop = asyncio.new_event_loop() asyncio.set_event_loop(loop) @@ -116,11 +140,13 @@ class PageGetter: async with self.session.get(self.url) as resp: return await resp.text() +async def async_parser(html_text): pass  # TODO: implement parser async def async_print(url): """ Debug function to follow what's actually happening """ async with aiohttp.ClientSession() as session: html = await PageGetter(session, url).get() + print('GOT {}HTML for {} at {}'.format( 'None ' if html is None else '', url, diff --git a/requirements.txt b/requirements.txt index bea30c4..480760f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,3 +11,4 @@ multidict==4.1.0 pycares==2.3.0 pytz==2017.3 yarl==1.1.1 +beautifulsoup4==4.6.0