Add tentative crawl file
Nothing functional, just tests
This commit is contained in:
parent c05c2561d2
commit c97acb22b5
1 changed file with 126 additions and 0 deletions
crawl/crawl.py (Normal file, 126 additions)
@@ -0,0 +1,126 @@
from threading import Thread
from urllib.robotparser import RobotFileParser

import random
import re
from datetime import datetime, timedelta

import asyncio
import aiohttp
import async_timeout
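# aiohttp and async_timeout are third-party packages from PyPI
# (pip install aiohttp async-timeout); everything else above is stdlib.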

#from django.conf import settings


class Settings:
    USER_AGENT = 'Blah'

settings = Settings()
startup_time = datetime.now()


class WebsiteSchedulerMeta(type):
    _instances = {}
    _canonicalize = re.compile(r'(https?://)?([^/]+)(/?|$)')

    def canonical_url(cls, url):
        return cls._canonicalize.search(url).groups()[1]
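    # Illustrative results (example URLs assumed, not part of the commit):
    #   canonical_url('https://python.org/about/')  ->  'python.org'
    #   canonical_url('python.org')                 ->  'python.org'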

    def __call__(cls, url, *args, **kwargs):
        canonical = cls.canonical_url(url)
        if canonical not in cls._instances:
            cls._instances[canonical] = \
                super(WebsiteSchedulerMeta, cls) \
                .__call__(canonical, *args, **kwargs)
        return cls._instances[canonical]
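    # Net effect, sketched with an assumed pair of URLs (not in the commit):
    #   WebsiteScheduler('https://python.org/3.5/') is WebsiteScheduler('python.org')
    # holds, because both canonicalize to 'python.org'.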


class WebsiteScheduler(metaclass=WebsiteSchedulerMeta):
    """ Schedule accesses to a website according to its robots.txt """
    def __init__(self, name):
        self.name = name
        self.last_crawled = datetime.fromtimestamp(0)
        robots_url = self.urlroot() + 'robots.txt'
        self.robot_parser = RobotFileParser(robots_url)
        self.robot_parser.read()  # TODO async?

        delay = self.robot_parser.crawl_delay(settings.USER_AGENT)
        if delay is None:
            req_rate = self.robot_parser.request_rate(settings.USER_AGENT)
            if req_rate is None:
                delay = 5
            else:
                # derive a per-request delay in seconds; a bare
                # (requests, seconds) tuple would crash timedelta() below
                delay = req_rate.seconds / req_rate.requests

        self.crawl_delay = timedelta(seconds=delay)
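        # Worked example under an assumed robots.txt (not from the commit):
        # "Request-rate: 2/10" parses to requests=2, seconds=10, hence a
        # delay of 10 / 2 = 5 seconds between fetches of this site.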

    def urlroot(self):
        ''' Get the root url for this website '''
        return 'https://{}/'.format(self.name)

    def fetch_delay(self):
        ''' Get the delay needed before fetching a page is possible '''
        can_fetch_time = self.last_crawled + self.crawl_delay
        if can_fetch_time < datetime.now():
            return timedelta(0)
        return can_fetch_time - datetime.now()
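        # Example with assumed timings: if the site was crawled 2 s ago and
        # crawl_delay is 5 s, this returns roughly timedelta(seconds=3).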

    def can_fetch(self, url):
        ''' Check whether this program can fetch a given page '''
        return self.robot_parser.can_fetch(settings.USER_AGENT, url)

    def fetching(self):
        ''' Tell the scheduler that a page is being fetched now '''
        self.last_crawled = datetime.now()


class CrawlingThread(Thread):
    def __init__(self):
        super(CrawlingThread, self).__init__()

    def run(self):
        tasks = []
        tasks.append(async_print('https://python.org'))
        tasks.append(async_print('https://python.org/webstats/'))
        tasks.append(async_print('https://python.org/3.5/'))

        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        loop.run_until_complete(asyncio.wait(tasks))
        loop.close()
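    # The explicit new_event_loop()/set_event_loop() pair above is what lets
    # this run inside a worker thread: only the main thread gets an implicit
    # default event loop.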


class PageGetter:
    def __init__(self, session, url):
        self.url = url
        self.session = session

    async def get(self):
        scheduler = WebsiteScheduler(self.url)
        if not scheduler.can_fetch(self.url):
            return None

        delay = scheduler.fetch_delay()
        while delay > timedelta(0):
            await asyncio.sleep(delay.total_seconds())
            delay = scheduler.fetch_delay()
        scheduler.fetching()
        async with async_timeout.timeout(10):
            async with self.session.get(self.url) as resp:
                return await resp.text()
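    # Re-checking fetch_delay() after each sleep matters when several
    # PageGetters share one (singleton) scheduler: whichever coroutine wakes
    # first calls fetching(), pushing the next slot back for the others.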


async def async_print(url):
    async with aiohttp.ClientSession() as session:
        html = await PageGetter(session, url).get()
        print('GOT {}HTML for {} at {}'.format(
            'None ' if html is None else '',
            url,
            datetime.now() - startup_time))


if __name__ == '__main__':
    crawl = CrawlingThread()
    crawl.start()
    crawl.join()