Integration of crawl module in histories

Rémi Oudin 2018-02-24 23:17:24 +01:00
parent 60bfc8cb77
commit bc7348f677
2 changed files with 34 additions and 16 deletions

File 1 of 2: the crawl module

@@ -1,4 +1,5 @@
 from threading import Thread
+from queue import Queue
 from urllib.robotparser import RobotFileParser
 from urllib.error import URLError
 from urllib.parse import urlparse
@@ -26,10 +27,10 @@ MAX_PER_PAGE = 10
 FOOTER_URL = re.compile(".*footer.*")

 class Settings:
-    USER_AGENT = 'Blah'
+    USER_AGENT = 'Default User'

 settings = Settings()
-startup_time = datetime.now()
+startup_time = datetime.min

 def url_getter(html, current_page, root_url):
@@ -161,16 +162,24 @@ class CrawlingThread(Thread):
     """ A separate thread for the crawling task. This is needed to use asyncio,
     since the thread will need its own event loop. """

-    def __init__(self):
+    def __init__(self, user, url, queue):
+        global settings
+        self.queue = queue
         super(CrawlingThread, self).__init__()
+        if user:
+            settings.USER_AGENT = user.serialize_headers()
+        self.url = url

     def run(self):
+        global startup_time
         tasks = []
         #tasks.append(async_crawler("http://plus.google.com/+Python"))
-        tasks.append(async_crawler('https://python.org/'))
+        #tasks.append(async_crawler('https://python.org/'))
+        tasks.append(async_crawler(self.url, self.queue))

         loop = asyncio.new_event_loop()
         asyncio.set_event_loop(loop)
+        startup_time = datetime.now()
         loop.run_until_complete(asyncio.wait(tasks))
         loop.close()
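
The new constructor and run() above follow a common pattern: a Thread that owns a private asyncio event loop and hands its result back through a queue.Queue. A minimal, self-contained sketch of that pattern (the class name and the dummy coroutine are illustrative, not part of the project):

    import asyncio
    from queue import Queue
    from threading import Thread

    class LoopThread(Thread):
        """Runs one coroutine in its own event loop and reports back via a queue."""
        def __init__(self, url, queue):
            super().__init__()
            self.url = url
            self.queue = queue

        def run(self):
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
            loop.run_until_complete(self.fake_crawl())
            loop.close()

        async def fake_crawl(self):
            # Stand-in for async_crawler: pretend one page was crawled.
            await asyncio.sleep(0)
            self.queue.put([self.url])

    queue = Queue()
    thread = LoopThread("https://example.org/", queue)
    thread.start()
    thread.join()
    print(queue.get())  # ['https://example.org/']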
@@ -211,13 +220,13 @@ async def async_print(url):
         url,
         datetime.now() - startup_time))

-async def async_crawler(url):
-    queue = [url]
+async def async_crawler(url, queue):
+    queued = [url]
     crawled = []
-    while queue and (len(crawled) < HARD_LIMIT):
+    while queued and (len(crawled) < HARD_LIMIT):
         async with aiohttp.ClientSession() as session:
             try:
-                url = queue.pop(0)
+                url = queued.pop(0)
             except IndexError:
                 print("Error queue is empty")
                 return crawled
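
Renaming the local list to queued keeps the crawl frontier distinct from the queue.Queue used for the result hand-off. Stripped of the HTTP and parsing details, the frontier logic is a bounded breadth-first walk with random sampling; a rough sketch under assumed constants (get_links stands in for the real fetch-and-extract step):

    import random

    HARD_LIMIT = 20       # illustrative values, not the project's settings
    MAX_PER_PAGE = 10

    def crawl_frontier(start_url, get_links):
        queued = [start_url]
        crawled = []
        while queued and len(crawled) < HARD_LIMIT:
            url = queued.pop(0)
            crawled.append(url)
            new_urls = get_links(url)          # stub for fetch + url_getter
            if not new_urls:
                continue
            sampled = random.sample(
                new_urls,
                random.randrange(min(MAX_PER_PAGE, len(new_urls))),
            )
            queued += [u for u in sampled
                       if u not in queued and u not in crawled]
        return crawled

    print(crawl_frontier("https://example.org/", lambda url: []))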
@@ -236,13 +245,14 @@ async def async_crawler(url):
                 new_urls,
                 randrange(min(MAX_PER_PAGE, len(new_urls)))
             )
-            queue += [sample_url for sample_url in sampled if
-                      sample_url not in queue and sample_url not in
+            queued += [sample_url for sample_url in sampled if
+                       sample_url not in queued and sample_url not in
                       crawled]
     print(crawled)
-    return crawled
+    queue.put(crawled)

 if __name__ == '__main__':
-    crawl = CrawlingThread()
+    queue = Queue()
+    crawl = CrawlingThread(None, "https://python.org/", queue)
     crawl.start()
     crawl.join()
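
On the caller side, the __main__ block above only starts and joins the thread; the crawled URLs are read back from the queue afterwards, as the histories module below does. Roughly (assuming the crawler is importable as crawl.crawl, mirroring the import added in the second file):

    from queue import Queue
    from crawl import crawl   # import path assumed from the histories module

    queue = Queue()
    thread = crawl.CrawlingThread(None, "https://python.org/", queue)
    thread.start()
    thread.join()                  # wait for the crawl to finish
    crawled_urls = queue.get()     # the list that async_crawler put()s
    print(len(crawled_urls), "URLs crawled")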

File 2 of 2: the histories module (Django models)

@@ -5,9 +5,10 @@ interests, keywords...
 import random
 from math import floor
+from queue import Queue

 from django.db import models

 import profiles.models as profiles
-#from crawl import crawl
+from crawl import crawl
 from pinocchio.settings import HISTORY_MIN
@@ -63,7 +64,7 @@ class History(models.Model):
         self.save()

-def generate_partial_history(user, t_start, history):
+def generate_partial_history(user, t_start):
     """ Generate the part of the history resulting from the crawl starting at
     the given url.
     """
@@ -71,7 +72,14 @@ def generate_partial_history(user, t_start, history):
     basis = generate_first_url(user)
     result.append((basis, t_start))
     t_start += 5* random.weibullvariate(1, 1.5)
-    #crawler = crawl.CrawlingThread()
+    queue = Queue()
+    crawler = crawl.CrawlingThread(user, basis, queue)
+    crawler.start()
+    crawler.join()
+    urls = queue.get()
+    for url in urls:
+        t_start += 5* random.weibullvariate(1, 1.5)
+        result.append((url, t_start))
     return result

 def generate_first_url(user):
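
The new body of generate_partial_history turns the crawler's output into (url, timestamp) pairs, offsetting each visit by a Weibull-distributed delay. The timing step in isolation, with the crawl result stubbed in (the real list comes from queue.get()):

    import random

    def spread_timestamps(urls, t_start):
        # Each URL is "visited" a Weibull-distributed number of
        # seconds after the previous one, as in the loop above.
        result = []
        for url in urls:
            t_start += 5 * random.weibullvariate(1, 1.5)
            result.append((url, t_start))
        return result

    print(spread_timestamps(["https://example.org/a", "https://example.org/b"], 0.0))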
@@ -104,7 +112,7 @@ def generate_history(user, ts_start):
     while history_line < length:
         ts_start += 5 * random.weibullvariate(1, 2.8)
-        history_list = generate_partial_history(user, ts_start, history)
+        history_list = generate_partial_history(user, ts_start)
         ts_start = history_list[-1].timestamp + 5 * random.weibullvariate(1, 5)
         for (url, timestamp) in history_list:
             new_line = HistoryEntry(