Nearly working crawler

Rémi Oudin 2018-02-22 14:33:07 +01:00
parent e19e623df1
commit 9b78e268c9

@@ -21,8 +21,10 @@ from bs4 import BeautifulSoup, Comment
 HARD_LIMIT = 20
 MAX_PER_PAGE = 10
+FOOTER_URL = re.compile(".*footer.*")
 
 class Settings:
-    USER_AGENT = 'Blah'
+    USER_AGENT = 'BlahBlah'
 
 settings = Settings()
 startup_time = datetime.now()
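
A quick illustrative check, not part of the commit, of what the new FOOTER_URL pattern accepts: ".*footer.*" matches any id that merely contains "footer", not only the literal word.

import re

FOOTER_URL = re.compile(".*footer.*")

for candidate in ("footer", "page-footer", "footer-links", "header"):
    print(candidate, bool(FOOTER_URL.match(candidate)))
# only "header" is rejected; anything containing "footer" matches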
@@ -34,12 +36,18 @@ def url_getter(html, current_page, root_url):
     # Get only the body
     body = soup.find('body')
     # remove the body
-    body.footer.decompose()
+    if body.footer:
+        body.footer.decompose()
     # remove all comments
     comments = soup.findAll(text=lambda text: isinstance(text, Comment))
     for comment in comments:
         comment.extract()
+    print("Retrieving footers")
+    footers = soup.findAll(id=FOOTER_URL)
+    for footer in footers:
+        footer.extract()
 
     # Remove all bookmark links pointing to the current html page.
     links = map(lambda link: link["href"], body.find_all("a"))
     for link in links:
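
A toy reproduction of the hardened clean-up, with an invented input page rather than anything from the crawler: a body without a <footer> element no longer hits the decompose() call, and any tag whose id matches FOOTER_URL is stripped before links are collected.

import re
from bs4 import BeautifulSoup, Comment

FOOTER_URL = re.compile(".*footer.*")

html = """
<body>
  <div id="site-footer"><a href="/legal">legal</a></div>
  <!-- tracking comment -->
  <a href="/about">about</a>
</body>
"""
soup = BeautifulSoup(html, "html.parser")
body = soup.find('body')

if body.footer:                    # no <footer> tag here, so this is skipped
    body.footer.decompose()
for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
    comment.extract()
for footer in soup.findAll(id=FOOTER_URL):
    footer.extract()               # drops the whole id="site-footer" subtree

print([link["href"] for link in body.find_all("a")])   # ['/about']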
@@ -132,7 +140,7 @@ class CrawlingThread(Thread):
     def run(self):
         tasks = []
-        tasks.append(async_crawler('https://python.org'))
+        tasks.append(async_crawler("https://python.org/"))
         #tasks.append(async_print('https://python.org/about/gettingstarted'))
 
         loop = asyncio.new_event_loop()
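
For context, a simplified stand-in for the thread-plus-event-loop pattern visible in this hunk; fetch_stub and LoopThread are made up here and do not reflect the project's actual CrawlingThread or async_crawler.

import asyncio
from threading import Thread

async def fetch_stub(url):
    # placeholder coroutine; the real crawler would fetch and parse the page
    await asyncio.sleep(0)
    print("would crawl", url)

class LoopThread(Thread):
    def run(self):
        urls = ["https://python.org/"]

        async def crawl_all():
            await asyncio.gather(*(fetch_stub(url) for url in urls))

        # a worker thread has no event loop of its own, so create one
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        try:
            loop.run_until_complete(crawl_all())
        finally:
            loop.close()

thread = LoopThread()
thread.start()
thread.join()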
@@ -192,11 +200,13 @@ async def async_crawler(url):
             url,
             parsed_url.scheme + "://" + parsed_url.netloc
         )
-        crawled += url
-        queue += sample(
+        crawled += [url]
+        sampled = sample(
             new_urls,
             randrange(min(MAX_PER_PAGE, len(new_urls)))
         )
+        queue += [sample_url for sample_url in sampled if sample_url not in
+                  queue and sample_url not in crawled]
     print(crawled)
 
 if __name__ == '__main__':
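
An isolated run of the new queueing logic on toy data (the URLs and list contents below are invented), showing that already-queued and already-crawled links are no longer re-added:

from random import sample, randrange

MAX_PER_PAGE = 10

new_urls = ["https://python.org/a", "https://python.org/b",
            "https://python.org/c", "https://python.org/d"]
queue = ["https://python.org/a"]      # 'a' was already discovered
crawled = ["https://python.org/b"]    # 'b' was already visited

sampled = sample(new_urls, randrange(min(MAX_PER_PAGE, len(new_urls))))
queue += [sample_url for sample_url in sampled if sample_url not in
          queue and sample_url not in crawled]

print(queue)   # 'a' appears only once and 'b' is never re-queued

Worth noting: randrange(n) never returns n itself, so at most min(MAX_PER_PAGE, len(new_urls)) - 1 links are kept per page, and randrange(0) raises ValueError when a page yields no links at all.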