2018-02-20 12:48:53 +01:00
|
|
|
from threading import Thread
|
2018-02-24 23:17:24 +01:00
|
|
|
from queue import Queue
|
2018-02-20 12:48:53 +01:00
|
|
|
from urllib.robotparser import RobotFileParser
|
2018-02-23 00:37:36 +01:00
|
|
|
from urllib.error import URLError
|
2018-02-22 14:07:53 +01:00
|
|
|
from urllib.parse import urlparse
|
2018-02-20 12:48:53 +01:00
|
|
|
|
2018-02-23 00:37:36 +01:00
|
|
|
from ssl import CertificateError
|
2018-02-26 11:27:07 +01:00
|
|
|
from random import sample, randrange, randint
|
2018-02-20 12:48:53 +01:00
|
|
|
import re
|
|
|
|
from datetime import datetime, timedelta
|
|
|
|
|
|
|
|
import asyncio
|
|
|
|
import aiohttp
|
|
|
|
import async_timeout
|
|
|
|
|
2018-02-22 14:07:53 +01:00
|
|
|
from bs4 import BeautifulSoup, Comment
|
|
|
|
|
2018-02-26 11:45:08 +01:00
|
|
|
from profiles.models import BrowserFingerprint, SearchEngine
|
2018-02-26 11:27:07 +01:00
|
|
|
|
2018-02-21 11:54:41 +01:00
|
|
|
# Ugly hack to use this module alone instead of integrating it with Django
|
|
|
|
# from django.conf import settings
|
2018-02-20 12:48:53 +01:00
|
|
|
|
2018-02-21 19:06:46 +01:00
|
|
|
# Bookmark links ("#...") found in a page are skipped during link
# extraction: they only point inside the current page, so following
# them would re-crawl the same document.
|
2018-02-22 14:07:53 +01:00
|
|
|
# Maximum total number of pages crawled during one crawling session.
HARD_LIMIT = 20

# Maximum number of links sampled from a single page for further crawling.
MAX_PER_PAGE = 10

# Matches element ids that look like a footer, so footer link farms can be
# stripped before link extraction.
FOOTER_URL = re.compile(".*footer.*")
|
|
|
|
|
2018-02-26 11:12:36 +01:00
|
|
|
|
2018-02-20 12:48:53 +01:00
|
|
|
class Settings:
    """ Minimal stand-in for django.conf.settings so this module can be
    run standalone (see the "ugly hack" note above). """

    # Fallback User-Agent used when no browser fingerprint supplies one.
    USER_AGENT = 'Default User'

settings = Settings()
|
|
|
|
|
|
|
|
|
2018-02-21 22:51:05 +01:00
|
|
|
def url_getter(html, current_page, root_url):
    """ Extract the followable links from an html page.

    Parameters:
        html: the raw html text of the page.
        current_page: the url the html was fetched from (used to resolve
            relative links).
        root_url: the site root, scheme included (used to resolve
            root-relative links).

    Resolution rules for each <a href="...">:
    - "//host/path" (protocol-relative) inherits root_url's scheme,
    - "http..." absolute links are kept as-is,
    - "/path" is resolved against root_url,
    - "#..." bookmarks are skipped (they point inside current_page),
    - anything else is treated as relative to current_page.

    Returns a de-duplicated list of urls, with links containing
    blacklisted words (login pages, mailto, ...) filtered out.
    """
    links_list = []  # The final result
    soup = BeautifulSoup(html, "html.parser")
    # Get only the body; a page without one has no links worth following.
    body = soup.find('body')
    if not body:
        return links_list

    # Remove the footer: mostly navigation boilerplate.
    if body.footer:
        body.footer.decompose()

    # Remove all html comments.
    comments = soup.findAll(text=lambda text: isinstance(text, Comment))
    for comment in comments:
        comment.extract()

    # Remove every element whose id looks like a footer.
    footers = soup.findAll(id=FOOTER_URL)
    for footer in footers:
        footer.extract()

    links = map(lambda link: link.get("href", ""), body.find_all("a"))
    # Scheme used to complete protocol-relative links.
    scheme = urlparse(root_url).scheme or "http"
    for link in links:
        if not link:  # Edge case, if no href found.
            continue
        if link.startswith("//"):
            # BUG fix: protocol-relative link; previously resolved as
            # root-relative, yielding e.g. root_url + "//cdn.example.com/x".
            links_list.append(scheme + ":" + link)
        elif link.startswith("http"):
            links_list.append(link)
        elif link.startswith('/'):  # Internal link, relative to site root.
            links_list.append(root_url + link)
        elif link.startswith("#"):  # Bookmark into the current page.
            continue
        else:
            links_list.append(current_page + "/" + link)

    # Order-preserving uniqifier (dict keeps insertion order, python >= 3.6).
    links_list = list(dict.fromkeys(links_list))

    # Drop links that almost surely lead out of crawlable content.
    forbidden_words = ['login', 'agreement', 'mailto']
    links_list = [link for link in links_list
                  if not any(word in link.lower()
                             for word in forbidden_words)]

    return links_list
|
|
|
|
|
2018-02-21 22:51:05 +01:00
|
|
|
|
2018-02-20 12:48:53 +01:00
|
|
|
class WebsiteSchedulerMeta(type):
    """ Meta-class for WebsiteScheduler, allowing a singleton class-like
    interface, but spawning one instance per canonical website URL """

    _instances = {}
    _canonicalize = re.compile(r'(https?://)?([^/]+)(/?|$)')

    def canonical_url(cls, url):
        """ Canonicalize a url """
        match = cls._canonicalize.search(url)
        return match.groups()[1]

    def __call__(cls, url, *args, **kwargs):
        site = cls.canonical_url(url)
        try:
            # One scheduler per canonical site: reuse if already spawned.
            return cls._instances[site]
        except KeyError:
            instance = super(WebsiteSchedulerMeta, cls).__call__(
                site, *args, **kwargs)
            cls._instances[site] = instance
            return instance
|
|
|
|
|
|
|
|
|
|
|
|
class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
    """ Schedule the accesses to a website as of robots.txt """

    search_engines = []  # Must be set by CrawlingThread.__init__

    def __init__(self, name, user_agent):
        """ name: canonical site name (host); user_agent: UA string used
        for robots.txt matching, or None for the default one. """
        self.name = name
        self.last_crawled = datetime.fromtimestamp(0)
        self.dead = False
        self.can_fetch_b = False
        self.user_agent = (user_agent if user_agent is not None
                           else settings.USER_AGENT)
        if any(self.urlroot() in item for item in self.search_engines):
            # Search engines get a fixed politeness delay and bypass
            # robots.txt (can_fetch_b short-circuits can_fetch()).
            print("found a search engine for %s" % self.urlroot())
            self.crawl_delay = timedelta(seconds=5)
            self.can_fetch_b = True
        else:
            try:
                robots_url = self.urlroot() + 'robots.txt'
                self.robot_parser = RobotFileParser(robots_url)
                self.robot_parser.read()  # TODO async?
            except (URLError, CertificateError):
                # https failed: retry over plain http.
                try:
                    robots_url = self.unsafe_urlroot() + 'robots.txt'
                    self.robot_parser = RobotFileParser(robots_url)
                    self.robot_parser.read()
                except URLError:  # Almost surely an offline website.
                    self.dead = True
                    # BUG fix: was the int 0, which would make
                    # fetch_delay() raise TypeError on datetime + int.
                    self.crawl_delay = timedelta(0)
            except Exception as e:
                print(e)
                raise e
            if not self.dead and not self.robot_parser.default_entry:
                # robots.txt could not be parsed into any entry.
                self.dead = True
            if not self.dead:
                delay = self.robot_parser.crawl_delay(self.user_agent)
                if delay is None:
                    req_rate = self.robot_parser.request_rate(self.user_agent)
                    if req_rate is None:
                        delay = 5
                    else:
                        # BUG fix: previously built the tuple
                        # (requests, seconds), which crashed timedelta().
                        # RequestRate allows `requests` per `seconds`,
                        # i.e. seconds/requests seconds between fetches.
                        delay = req_rate.seconds / req_rate.requests
                self.crawl_delay = timedelta(seconds=delay)

    def urlroot(self):
        ''' Get the root url for this website '''
        return 'https://{}/'.format(self.name)

    def unsafe_urlroot(self):
        ''' Get the root url over plain http (robots.txt fallback) '''
        return 'http://{}/'.format(self.name)

    def fetch_delay(self):
        ''' Get the delay needed before fetching a page is possible '''
        can_fetch_time = self.last_crawled + self.crawl_delay
        if can_fetch_time < datetime.now():
            return timedelta(0)
        return can_fetch_time - datetime.now()

    def can_fetch(self, url):
        ''' Check whether this program can fetch a given page '''
        return ((self.can_fetch_b)
                or ((not self.dead) and
                    self.robot_parser.can_fetch(self.user_agent, url)))

    def fetching(self):
        ''' Tell the scheduler that a page is being fetched now '''
        self.last_crawled = datetime.now()
|
|
|
|
|
|
|
|
|
|
|
|
class CrawlingThread(Thread):
    """ A separate thread for the crawling task. This is needed to use asyncio,
    since the thread will need its own event loop. """

    def __init__(self, url, queue):
        """ url: the page to start crawling from; queue: a queue.Queue the
        final list of crawled urls is put into. """
        engine_list = [engine.url for engine in SearchEngine.objects.all()]
        WebsiteScheduler.search_engines = engine_list

        fingerprints = BrowserFingerprint.objects.all()
        nb_fingerprint = len(fingerprints)
        if nb_fingerprint > 0:
            # Impersonate a random browser fingerprint from the database.
            fingerprint = fingerprints[randint(0, nb_fingerprint - 1)]
            self.headers = fingerprint.serialize_headers()
        else:
            # BUG fix: with an empty table, randint(0, -1) raised
            # ValueError. Fall back to default headers instead.
            self.headers = {}

        self.queue = queue
        super(CrawlingThread, self).__init__()
        self.url = url

    def run(self):
        """ Run the crawl in this thread's own asyncio event loop. """
        tasks = []
        tasks.append(async_crawler(self.url, self.queue, self.headers))

        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        loop.run_until_complete(asyncio.wait(tasks))
        loop.close()
|
|
|
|
|
|
|
|
|
|
|
|
class PageGetter:
    """ Asynchronously get a webpage, abiding by robots.txt """

    # NOTE(review): class attribute appears unused in this class — request
    # headers come from the aiohttp session; confirm before removing.
    headers = None

    def __init__(self, session, url, user_agent):
        # session: an aiohttp.ClientSession the request goes through.
        # url: the page to fetch.
        # user_agent: forwarded to the per-site scheduler for robots.txt.
        self.url = url
        self.session = session
        self.user_agent = user_agent

    async def get(self, ssl=True):
        """ Actually retrieve the webpage.

        Returns the page text, or None if robots.txt forbids fetching or
        the body cannot be decoded. ssl=False disables certificate
        verification (not HTTPS itself). """
        # One scheduler per website (singleton-per-site metaclass), so the
        # crawl delay below is shared by every fetcher hitting that site.
        scheduler = WebsiteScheduler(self.url, self.user_agent)
        if not scheduler.can_fetch(self.url):
            return None

        delay = scheduler.fetch_delay()
        # Re-check after each sleep: another coroutine may have fetched
        # from this site meanwhile, pushing the next allowed time further.
        while delay > timedelta(0):
            await asyncio.sleep(delay.total_seconds())
            delay = scheduler.fetch_delay()
        # Record the fetch before doing it, so concurrent coroutines wait.
        scheduler.fetching()
        async with async_timeout.timeout(10):
            async with self.session.get(self.url, verify_ssl=ssl) as resp:
                print("Resp status %s" % resp.status)
                try:
                    return await resp.text()
                except UnicodeDecodeError:
                    # Binary or mis-declared encoding: treat as no content.
                    return None
|
2018-02-20 12:48:53 +01:00
|
|
|
|
|
|
|
|
|
|
|
async def async_print(url):
    """ Debug function to follow what's actually happening """
    async with aiohttp.ClientSession() as session:
        getter = PageGetter(session, url, settings.USER_AGENT)
        html = await getter.get(ssl=False)

    prefix = 'None ' if html is None else ''
    print('GOT {}HTML for {}'.format(prefix, url))
|
|
|
|
|
|
|
|
|
|
|
|
async def async_crawler(url, queue, headers=None):
    """ Crawl at most HARD_LIMIT pages starting from `url`, following a
    random sample of links on each page, and put the list of crawled urls
    into `queue`.

    headers: optional request headers; a User-Agent is filled in from the
    default settings when missing. """
    if headers is None:
        headers = {}
    if 'User-Agent' not in headers:
        headers['User-Agent'] = settings.USER_AGENT

    user_agent = headers['User-Agent']

    queued = [url]
    crawled = []
    # PERF fix: open one session for the whole crawl instead of one per
    # page — aiohttp sessions hold a connection pool meant to be reused.
    async with aiohttp.ClientSession(headers=headers) as session:
        while queued and (len(crawled) < HARD_LIMIT):
            try:
                url = queued.pop(0)
            except IndexError:
                print("Error queue is empty")
                return crawled
            parsed_url = urlparse(url)
            print("Crawling {}".format(url))
            html = await PageGetter(session, url, user_agent).get(ssl=False)
            if html:
                new_urls = url_getter(
                    html,
                    url,
                    parsed_url.scheme + "://" + parsed_url.netloc
                )
                crawled += [url]
                if new_urls:
                    # Follow only a random subset of the discovered links.
                    sampled = sample(
                        new_urls,
                        randrange(min(MAX_PER_PAGE, len(new_urls)))
                    )
                    queued += [sample_url for sample_url in sampled if
                               sample_url not in queued and
                               sample_url not in crawled]
            else:
                print("No html received")
    print(crawled)
    queue.put(crawled)
|
2018-02-20 12:48:53 +01:00
|
|
|
|
|
|
|
if __name__ == '__main__':
    queue = Queue()
    # BUG fix: CrawlingThread takes (url, queue) — the search-engine list
    # is fetched from the database in __init__, not passed in. The old
    # 4-argument call raised TypeError.
    crawl = CrawlingThread(
        "https://google.com/search?q=fabriquer+masque+manif",
        queue)
    crawl.start()
    crawl.join()
|