Nearly working crawler
parent e19e623df1
commit 9b78e268c9
1 changed file with 15 additions and 5 deletions
@@ -21,8 +21,10 @@ from bs4 import BeautifulSoup, Comment
 HARD_LIMIT = 20
 MAX_PER_PAGE = 10
 
+FOOTER_URL = re.compile(".*footer.*")
+
 class Settings:
-    USER_AGENT = 'Blah'
+    USER_AGENT = 'BlahBlah'
 
 settings = Settings()
 startup_time = datetime.now()
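The new FOOTER_URL pattern is a compiled regex that the next hunk passes to findAll as an id filter. A minimal sketch of how BeautifulSoup matches a compiled pattern against attribute values; the HTML snippet and variable names below are illustrative, not taken from the crawler:

```python
import re

from bs4 import BeautifulSoup

FOOTER_URL = re.compile(".*footer.*")

# Illustrative markup only; any element whose id matches the pattern is returned.
html = '<body><div id="page-footer">bye</div><div id="content">hi</div></body>'
soup = BeautifulSoup(html, "html.parser")

footers = soup.findAll(id=FOOTER_URL)
print([tag["id"] for tag in footers])  # ['page-footer']
```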
|
@@ -34,12 +36,18 @@ def url_getter(html, current_page, root_url):
     # Get only the body
     body = soup.find('body')
     # remove the footer
-    body.footer.decompose()
+    if body.footer:
+        body.footer.decompose()
     # remove all comments
     comments = soup.findAll(text=lambda text: isinstance(text, Comment))
     for comment in comments:
         comment.extract()
 
+    print("Retrieving footers")
+    footers = soup.findAll(id=FOOTER_URL)
+    for footer in footers:
+        footer.extract()
+
     # Remove all bookmark links pointing to the current html page.
     links = map(lambda link: link["href"], body.find_all("a"))
     for link in links:
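Putting the hunk together: the decompose() call is now guarded (body.footer is None on pages without a <footer> tag), comment nodes are extracted, and elements whose id matches FOOTER_URL are dropped as well. A self-contained sketch of that cleanup pass, using made-up markup rather than a real crawled page:

```python
import re

from bs4 import BeautifulSoup, Comment

FOOTER_URL = re.compile(".*footer.*")

html = """
<body>
  <p>keep me</p>
  <!-- a comment to strip -->
  <footer>site footer</footer>
  <div id="inline-footer">legal text</div>
</body>
"""
soup = BeautifulSoup(html, "html.parser")
body = soup.find('body')

# body.footer is None when the page has no <footer>, so guard the decompose().
if body.footer:
    body.footer.decompose()

# Drop HTML comment nodes.
for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
    comment.extract()

# Drop any remaining element whose id looks footer-like.
for footer in soup.findAll(id=FOOTER_URL):
    footer.extract()

print(body)  # the <p> element is all that is left inside <body>
```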
|
@@ -132,7 +140,7 @@ class CrawlingThread(Thread):
 
     def run(self):
         tasks = []
-        tasks.append(async_crawler('https://python.org'))
+        tasks.append(async_crawler("https://python.org/"))
         #tasks.append(async_print('https://python.org/about/gettingstarted'))
 
         loop = asyncio.new_event_loop()
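Only the task setup and the fresh event loop are visible in this hunk; the rest of run() sits outside the diff. A rough sketch of the underlying pattern, driving coroutines from a Thread that owns its own loop, with a stand-in coroutine and class name instead of the real async_crawler and CrawlingThread:

```python
import asyncio
from threading import Thread

async def fake_crawler(url):
    # Stand-in for async_crawler; the real coroutine fetches and parses pages.
    await asyncio.sleep(0.1)
    return url

class CrawlerThread(Thread):
    def run(self):
        tasks = [fake_crawler("https://python.org/")]

        async def main():
            return await asyncio.gather(*tasks)

        # Worker threads have no default event loop, so create and install one.
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        try:
            print(loop.run_until_complete(main()))
        finally:
            loop.close()

thread = CrawlerThread()
thread.start()
thread.join()
```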
|
@@ -192,11 +200,13 @@ async def async_crawler(url):
             url,
             parsed_url.scheme + "://" + parsed_url.netloc
         )
-        crawled += url
-        queue += sample(
+        crawled += [url]
+        sampled = sample(
            new_urls,
            randrange(min(MAX_PER_PAGE, len(new_urls)))
        )
+        queue += [sample_url for sample_url in sampled if sample_url not in
+                  queue and sample_url not in crawled]
         print(crawled)
 
 if __name__ == '__main__':
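Two fixes land in this hunk: assuming crawled is a list (as the new right-hand side suggests), `crawled += url` extended it with the individual characters of the URL, so it becomes `crawled += [url]`; and the sampled links are now filtered so that nothing already queued or already crawled is enqueued again. A small sketch of that sampling-and-dedup step with made-up URLs:

```python
from random import randrange, sample

MAX_PER_PAGE = 10

queue = ["https://python.org/"]
crawled = ["https://python.org/about/"]
new_urls = [
    "https://python.org/",        # already queued: skipped
    "https://python.org/about/",  # already crawled: skipped
    "https://docs.python.org/3/",
    "https://pypi.org/",
]

# Pick a random subset of the discovered links, capped at MAX_PER_PAGE.
# Note that randrange(n) can return 0 (sample nothing) and raises ValueError
# when new_urls is empty.
sampled = sample(new_urls, randrange(min(MAX_PER_PAGE, len(new_urls))))

# Only enqueue links that are not already queued or crawled.
queue += [url for url in sampled if url not in queue and url not in crawled]
print(queue)
```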
|
|