Integration of crawl module in histories

This commit is contained in:
Rémi Oudin 2018-02-24 23:17:24 +01:00
parent 60bfc8cb77
commit bc7348f677
2 changed files with 34 additions and 16 deletions

View file

@ -1,4 +1,5 @@
from threading import Thread
from queue import Queue
from urllib.robotparser import RobotFileParser
from urllib.error import URLError
from urllib.parse import urlparse
@ -26,10 +27,10 @@ MAX_PER_PAGE = 10
FOOTER_URL = re.compile(".*footer.*")
class Settings:
    """Crawler-wide configuration values."""

    # User-agent value used unless a crawl overrides it.
    USER_AGENT = 'Default User'
# Module-wide settings instance; its USER_AGENT may be overwritten when a
# crawl is started for a specific user.
settings = Settings()
# Placeholder value: the real start time is assigned when a crawl begins and
# is then used to report elapsed time.
startup_time = datetime.min
def url_getter(html, current_page, root_url):
@ -161,16 +162,24 @@ class CrawlingThread(Thread):
""" A separate thread for the crawling task. This is needed to use asyncio,
since the thread will need its own event loop. """
def __init__(self, user, url, queue):
    """ Prepare a crawl of ``url`` whose result is handed back via ``queue``.

    :param user: object providing ``serialize_headers()``; a falsy value
        leaves ``settings.USER_AGENT`` untouched (presumably a profile
        object -- confirm against callers)
    :param url: URL the crawl starts from
    :param queue: queue forwarded to ``async_crawler`` when the thread runs
    """
    super().__init__()
    self.url = url
    self.queue = queue
    if user:
        # NOTE: ``settings`` is module-wide, so this assignment is visible
        # to every crawl started afterwards, not only to this thread.
        settings.USER_AGENT = user.serialize_headers()
def run(self):
    """ Crawl from ``self.url`` on an event loop created for this thread.

    Records the crawl start in the module-level ``startup_time`` and runs
    ``async_crawler(self.url, self.queue)`` to completion. An exception
    raised by the crawler propagates out of this method instead of being
    silently stored on a task.
    """
    global startup_time
    # The thread needs its own loop: the default loop belongs to the main
    # thread.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    startup_time = datetime.now()
    try:
        # Run the coroutine directly: handing bare coroutines to
        # asyncio.wait() is rejected by Python 3.11+.
        loop.run_until_complete(async_crawler(self.url, self.queue))
    finally:
        # Always release the loop, even when the crawl fails.
        loop.close()
@ -211,13 +220,13 @@ async def async_print(url):
url,
datetime.now() - startup_time))
async def async_crawler(url):
queue = [url]
async def async_crawler(url, queue):
queued = [url]
crawled = []
while queue and (len(crawled) < HARD_LIMIT):
while queued and (len(crawled) < HARD_LIMIT):
async with aiohttp.ClientSession() as session:
try:
url = queue.pop(0)
url = queued.pop(0)
except IndexError:
print("Error queue is empty")
return crawled
@ -236,13 +245,14 @@ async def async_crawler(url):
new_urls,
randrange(min(MAX_PER_PAGE, len(new_urls)))
)
queue += [sample_url for sample_url in sampled if
sample_url not in queue and sample_url not in
queued += [sample_url for sample_url in sampled if
sample_url not in queued and sample_url not in
crawled]
print(crawled)
return crawled
queue.put(crawled)
if __name__ == '__main__':
    # Manual run: crawl python.org with the default user agent and wait for
    # the crawling thread to finish.
    queue = Queue()
    crawl = CrawlingThread(None, "https://python.org/", queue)
    crawl.start()
    crawl.join()

View file

@ -5,9 +5,10 @@ interests, keywords...
import random
from math import floor
from queue import Queue
from django.db import models
import profiles.models as profiles
#from crawl import crawl
from crawl import crawl
from pinocchio.settings import HISTORY_MIN
@ -63,7 +64,7 @@ class History(models.Model):
self.save()
def generate_partial_history(user, t_start, history):
def generate_partial_history(user, t_start):
""" Generate the part of the history resulting from the crawl starting at
the given url.
"""
@ -71,7 +72,14 @@ def generate_partial_history(user, t_start, history):
basis = generate_first_url(user)
result.append((basis, t_start))
t_start += 5* random.weibullvariate(1, 1.5)
#crawler = crawl.CrawlingThread()
queue = Queue()
crawler = crawl.CrawlingThread(user, basis, queue)
crawler.start()
crawler.join()
urls = queue.get()
for url in urls:
t_start += 5* random.weibullvariate(1, 1.5)
result.append((url, t_start)
return result
def generate_first_url(user):
@ -104,7 +112,7 @@ def generate_history(user, ts_start):
while history_line < length:
ts_start += 5 * random.weibullvariate(1, 2.8)
history_list = generate_partial_history(user, ts_start, history)
history_list = generate_partial_history(user, ts_start)
ts_start = history_list[-1].timestamp + 5 * random.weibullvariate(1, 5)
for (url, timestamp) in history_list:
new_line = HistoryEntry(