Multiple bug fixes. TODO: remove <div id=footer>-like patterns
This commit is contained in:
parent 236e15296c
commit e19e623df1
1 changed file with 37 additions and 8 deletions
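The TODO in the commit message refers to boilerplate containers such as <div id="footer"> whose links the crawler should not follow. A minimal sketch of how such blocks could be stripped with BeautifulSoup before link extraction; the strip_footer_like helper and the id pattern are illustrative assumptions, not part of this commit:

import re
from bs4 import BeautifulSoup

FOOTER_LIKE = re.compile(r"footer", re.I)  # hypothetical pattern, not in the commit

def strip_footer_like(html):
    soup = BeautifulSoup(html, "html.parser")
    # find_all accepts a compiled regex as an attribute filter, so this matches
    # <div id="footer">, <div id="page-footer">, and similar containers.
    for tag in soup.find_all("div", id=FOOTER_LIKE):
        tag.decompose()  # removes the tag together with everything inside it
    return str(soup)

print(strip_footer_like('<p>keep</p><div id="footer">drop</div>'))  # <p>keep</p>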
@@ -1,7 +1,8 @@
 from threading import Thread
 from urllib.robotparser import RobotFileParser
+from urllib.parse import urlparse
 
-from bs4 import BeautifulSoup, Comment
+from random import sample, randrange
 import re
 from datetime import datetime, timedelta
 
@@ -9,13 +10,16 @@ import asyncio
 import aiohttp
 import async_timeout
 
+from bs4 import BeautifulSoup, Comment
+
 # Ugly hack to use this module alone instead of integrating it with Django
 # from django.conf import settings
 
 # Gets all the direct bookmarks in the html.
 # We want this to avoid following this kind of bookmark
 
-BOOKMARK_URL = "#.*"
+HARD_LIMIT = 20
+MAX_PER_PAGE = 10
 
 class Settings:
     USER_AGENT = 'Blah'
@@ -32,17 +36,17 @@ def url_getter(html, current_page, root_url):
     # remove the body
     body.footer.decompose()
     # remove all comments
-    comments = soup.findAll(text=lambda text:isinstance(text, Comment))
+    comments = soup.findAll(text=lambda text: isinstance(text, Comment))
     for comment in comments:
         comment.extract()
 
     # Remove all bookmark links pointing to the current html page.
-    links = body.find_all("a")
+    links = map(lambda link: link["href"], body.find_all("a"))
     for link in links:
         if link.startswith("http"):
             links_list.append(link)
         elif link.startswith('/'): #Internal link, linking to page root url
-            link_list.append(root_url + link)
+            links_list.append(root_url + link)
         elif link.startswith("#"):
             print("Invalid link : internal bookmark")
         else:
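The rewritten line feeds href strings rather than Tag objects into the startswith checks above, which is what the old code was missing. One caveat, shown in this hedged sketch (the sample HTML is made up): link["href"] raises KeyError for an <a> without an href attribute, so a comprehension with .get() is a slightly safer equivalent.

from bs4 import BeautifulSoup

body = BeautifulSoup('<a href="/doc">doc</a><a name="anchor-only">no href</a>', "html.parser")
# Same result as map(lambda link: link["href"], body.find_all("a")) when every
# <a> has an href, but tags without one are skipped instead of raising KeyError.
links = [a.get("href") for a in body.find_all("a") if a.get("href")]
print(links)  # ['/doc']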
@@ -53,7 +57,8 @@ def url_getter(html, current_page, root_url):
     #links_list = [x for x in links_list if x not in seen and not seen.add(x)]
     # uniqifier
     # Works only with python >= 3.6
-    links_list = list(dict.fromkeys(seq))
+    links_list = list(dict.fromkeys(links_list))
+    print(links_list)
 
     return links_list
 
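The fixed uniqifier relies on dict.fromkeys keeping insertion order, which CPython 3.6 provides as an implementation detail and Python 3.7+ guarantees, hence the "Works only with python >= 3.6" comment. A quick check with made-up URLs:

links_list = ["https://a.example/", "https://b.example/", "https://a.example/"]
# Dict keys keep first-seen order, so duplicates collapse without reshuffling.
links_list = list(dict.fromkeys(links_list))
print(links_list)  # ['https://a.example/', 'https://b.example/']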
@@ -127,8 +132,8 @@ class CrawlingThread(Thread):
 
     def run(self):
         tasks = []
-        tasks.append(async_print('https://python.org'))
-        tasks.append(async_print('https://python.org/about/gettingstarted'))
+        tasks.append(async_crawler('https://python.org'))
+        #tasks.append(async_print('https://python.org/about/gettingstarted'))
 
         loop = asyncio.new_event_loop()
         asyncio.set_event_loop(loop)
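The hunk ends before the tasks are actually driven; presumably run() goes on to run them on the freshly created loop. A minimal, self-contained sketch of that pattern, independent of this codebase (CrawlerThread and fake_crawl are placeholder names):

import asyncio
from threading import Thread

async def fake_crawl(url):
    await asyncio.sleep(0)  # stands in for the real network work
    return url

class CrawlerThread(Thread):
    def run(self):
        # A worker thread has no default event loop, hence new_event_loop().
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        tasks = [loop.create_task(fake_crawl('https://python.org'))]
        loop.run_until_complete(asyncio.wait(tasks))
        print([t.result() for t in tasks])
        loop.close()

t = CrawlerThread()
t.start()
t.join()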
@@ -169,6 +174,30 @@ async def async_print(url):
         url,
         datetime.now() - startup_time))
 
+async def async_crawler(url):
+    queue = [url]
+    crawled = []
+    while (not queue) or (len(crawled) < HARD_LIMIT):
+        async with aiohttp.ClientSession() as session:
+            try:
+                url = queue.pop(0)
+            except IndexError:
+                print("Error queue is empty")
+                return crawled
+            parsed_url = urlparse(url)
+            print("Crawling {}".format(url))
+            html = await PageGetter(session, url).get()
+            new_urls = url_getter(
+                html,
+                url,
+                parsed_url.scheme + "://" + parsed_url.netloc
+            )
+            crawled += url
+            queue += sample(
+                new_urls,
+                randrange(min(MAX_PER_PAGE, len(new_urls)))
+            )
+    print(crawled)
 
 if __name__ == '__main__':
     crawl = CrawlingThread()
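Two details of the new coroutine may be worth flagging: list += str extends a list with the individual characters of the string, so crawled += url records letters rather than URLs, and the while condition is true whenever the queue is empty, leaving termination to the IndexError handler. A hedged sketch of a tightened variant with the same flow; PageGetter, url_getter, HARD_LIMIT and MAX_PER_PAGE come from this file, while the changes themselves are only suggestions, not part of the commit:

async def async_crawler(url):
    queue = [url]
    crawled = []
    # One session for the whole crawl instead of one per iteration.
    async with aiohttp.ClientSession() as session:
        while queue and len(crawled) < HARD_LIMIT:
            url = queue.pop(0)
            parsed_url = urlparse(url)
            print("Crawling {}".format(url))
            html = await PageGetter(session, url).get()
            new_urls = url_getter(
                html,
                url,
                parsed_url.scheme + "://" + parsed_url.netloc
            )
            crawled.append(url)  # append the URL itself, not its characters
            if new_urls:  # guard: randrange(0) would raise ValueError
                queue += sample(
                    new_urls,
                    randrange(min(MAX_PER_PAGE, len(new_urls)))
                )
    print(crawled)
    return crawled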