Multiple bug fixes. TODO : remove <div id=footer>-like patterns

Rémi Oudin 2018-02-22 14:07:53 +01:00
parent 236e15296c
commit e19e623df1

@@ -1,7 +1,8 @@
 from threading import Thread
 from urllib.robotparser import RobotFileParser
-from bs4 import BeautifulSoup, Comment
+from urllib.parse import urlparse
+from random import sample, randrange
 import re
 from datetime import datetime, timedelta
@@ -9,13 +10,16 @@ import asyncio
 import aiohttp
 import async_timeout
+from bs4 import BeautifulSoup, Comment

 # Ugly hack to use this module alone instead of integrating it with Django
 # from django.conf import settings

 # Gets all the direct bookmarks in the html.
 # We want this to avoid following this kind of bookmark
-BOOKMARK_URL = "#.*"
+HARD_LIMIT = 20
+MAX_PER_PAGE = 10

 class Settings:
     USER_AGENT = 'Blah'
@@ -32,17 +36,17 @@ def url_getter(html, current_page, root_url):
     # remove the body
     body.footer.decompose()
     # remove all comments
-    comments = soup.findAll(text=lambda text:isinstance(text, Comment))
+    comments = soup.findAll(text=lambda text: isinstance(text, Comment))
     for comment in comments:
         comment.extract()
     # Remove all bookmark links pointing to the current html page.
-    links = body.find_all("a")
+    links = map(lambda link: link["href"], body.find_all("a"))
     for link in links:
         if link.startswith("http"):
             links_list.append(link)
         elif link.startswith('/'): #Internal link, linking to page root url
-            link_list.append(root_url + link)
+            links_list.append(root_url + link)
         elif link.startswith("#"):
             print("Invalid link : internal bookmark")
         else:
@@ -53,7 +57,8 @@ def url_getter(html, current_page, root_url):
     #links_list = [x for x in links_list if x not in seen and not seen.add(x)]
     # uniqifier
     # Works only with python >= 3.6
-    links_list = list(dict.fromkeys(seq))
+    links_list = list(dict.fromkeys(links_list))
+    print(links_list)
     return links_list
@@ -127,8 +132,8 @@ class CrawlingThread(Thread):
     def run(self):
         tasks = []
-        tasks.append(async_print('https://python.org'))
-        tasks.append(async_print('https://python.org/about/gettingstarted'))
+        tasks.append(async_crawler('https://python.org'))
+        #tasks.append(async_print('https://python.org/about/gettingstarted'))

         loop = asyncio.new_event_loop()
         asyncio.set_event_loop(loop)
@@ -169,6 +174,30 @@ async def async_print(url):
         url,
         datetime.now() - startup_time))

+async def async_crawler(url):
+    queue = [url]
+    crawled = []
+    while (not queue) or (len(crawled) < HARD_LIMIT):
+        async with aiohttp.ClientSession() as session:
+            try:
+                url = queue.pop(0)
+            except IndexError:
+                print("Error queue is empty")
+                return crawled
+            parsed_url = urlparse(url)
+            print("Crawling {}".format(url))
+            html = await PageGetter(session, url).get()
+            new_urls = url_getter(
+                html,
+                url,
+                parsed_url.scheme + "://" + parsed_url.netloc
+            )
+            crawled += url
+            queue += sample(
+                new_urls,
+                randrange(min(MAX_PER_PAGE, len(new_urls)))
+            )
+    print(crawled)

 if __name__ == '__main__':
     crawl = CrawlingThread()
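
The TODO in the commit message (removing <div id=footer>-like patterns) is not addressed by this diff. A minimal sketch of how it might be handled with BeautifulSoup is shown below; strip_footer_like_divs is a hypothetical helper, not part of this commit.

import re
from bs4 import BeautifulSoup

def strip_footer_like_divs(html):
    # Remove <div id="footer">-style elements as well as plain <footer> tags.
    # Hypothetical helper, not part of this commit.
    soup = BeautifulSoup(html, "html.parser")
    for tag in soup.find_all("div", id=re.compile("footer", re.I)):
        tag.decompose()
    for footer in soup.find_all("footer"):
        footer.decompose()
    return str(soup)

Matching the id with a case-insensitive regex would also catch variants such as "page-footer" that the existing body.footer.decompose() call cannot reach.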