Multiple bug fixes. TODO : remove <div id=footer>-like patterns

This commit is contained in:
Rémi Oudin 2018-02-22 14:07:53 +01:00
parent 236e15296c
commit e19e623df1

View file

@ -1,7 +1,8 @@
from threading import Thread
from urllib.robotparser import RobotFileParser
from urllib.parse import urlparse
from bs4 import BeautifulSoup, Comment
from random import sample, randrange
import re
from datetime import datetime, timedelta
@ -9,13 +10,16 @@ import asyncio
import aiohttp
import async_timeout
from bs4 import BeautifulSoup, Comment
# Ugly hack to use this module alone instead of integrating it with Django
# from django.conf import settings
# Gets all the direct bookmarks in the html.
# We want this to avoid following this kind of bookmark
BOOKMARK_URL = "#.*"
HARD_LIMIT = 20
MAX_PER_PAGE = 10
class Settings:
USER_AGENT = 'Blah'
@ -32,17 +36,17 @@ def url_getter(html, current_page, root_url):
# remove the body
body.footer.decompose()
# remove all comments
comments = soup.findAll(text=lambda text:isinstance(text, Comment))
comments = soup.findAll(text=lambda text: isinstance(text, Comment))
for comment in comments:
comment.extract()
# Remove all bookmark links pointing to the current html page.
links = body.find_all("a")
links = map(lambda link: link["href"], body.find_all("a"))
for link in links:
if link.startswith("http"):
links_list.append(link)
elif link.startswith('/'): #Internal link, linking to page root url
link_list.append(root_url + link)
links_list.append(root_url + link)
elif link.startswith("#"):
print("Invalid link : internal bookmark")
else:
@ -53,7 +57,8 @@ def url_getter(html, current_page, root_url):
#links_list = [x for x in links_list if x not in seen and not seen.add(x)]
# uniqifier
# Works only with python >= 3.6
links_list = list(dict.fromkeys(seq))
links_list = list(dict.fromkeys(links_list))
print(links_list)
return links_list
@ -127,8 +132,8 @@ class CrawlingThread(Thread):
def run(self):
tasks = []
tasks.append(async_print('https://python.org'))
tasks.append(async_print('https://python.org/about/gettingstarted'))
tasks.append(async_crawler('https://python.org'))
#tasks.append(async_print('https://python.org/about/gettingstarted'))
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
@ -169,6 +174,30 @@ async def async_print(url):
url,
datetime.now() - startup_time))
async def async_crawler(url):
queue = [url]
crawled = []
while (not queue) or (len(crawled) < HARD_LIMIT):
async with aiohttp.ClientSession() as session:
try:
url = queue.pop(0)
except IndexError:
print("Error queue is empty")
return crawled
parsed_url = urlparse(url)
print("Crawling {}".format(url))
html = await PageGetter(session, url).get()
new_urls = url_getter(
html,
url,
parsed_url.scheme + "://" + parsed_url.netloc
)
crawled += url
queue += sample(
new_urls,
randrange(min(MAX_PER_PAGE, len(new_urls)))
)
print(crawled)
if __name__ == '__main__':
crawl = CrawlingThread()