diff --git a/crawl/crawl.py b/crawl/crawl.py
index e8467f1..48aaba6 100644
--- a/crawl/crawl.py
+++ b/crawl/crawl.py
@@ -1,4 +1,5 @@
 from threading import Thread
+from queue import Queue
 from urllib.robotparser import RobotFileParser
 from urllib.error import URLError
 from urllib.parse import urlparse
@@ -26,10 +27,10 @@ MAX_PER_PAGE = 10
 FOOTER_URL = re.compile(".*footer.*")
 
 class Settings:
-    USER_AGENT = 'Blah'
+    USER_AGENT = 'Default User'
 
 settings = Settings()
-startup_time = datetime.now()
+startup_time = datetime.min
 
 
 def url_getter(html, current_page, root_url):
@@ -161,16 +162,24 @@ class CrawlingThread(Thread):
     """ A separate thread for the crawling task. This is needed to use asyncio,
     since the thread will need its own event loop. """
 
-    def __init__(self):
+    def __init__(self, user, url, queue):
+        global settings
+        self.queue = queue
         super(CrawlingThread, self).__init__()
+        if user:
+            settings.USER_AGENT = user.serialize_headers()
+        self.url = url
 
     def run(self):
+        global startup_time
         tasks = []
         #tasks.append(async_crawler("http://plus.google.com/+Python"))
-        tasks.append(async_crawler('https://python.org/'))
+        #tasks.append(async_crawler('https://python.org/'))
+        tasks.append(async_crawler(self.url, self.queue))
 
         loop = asyncio.new_event_loop()
         asyncio.set_event_loop(loop)
+        startup_time = datetime.now()
         loop.run_until_complete(asyncio.wait(tasks))
         loop.close()
 
@@ -211,13 +220,13 @@ async def async_print(url):
         url,
         datetime.now() - startup_time))
 
-async def async_crawler(url):
-    queue = [url]
+async def async_crawler(url, queue):
+    queued = [url]
     crawled = []
-    while queue and (len(crawled) < HARD_LIMIT):
+    while queued and (len(crawled) < HARD_LIMIT):
         async with aiohttp.ClientSession() as session:
             try:
-                url = queue.pop(0)
+                url = queued.pop(0)
             except IndexError:
                 print("Error queue is empty")
                 return crawled
@@ -236,13 +245,14 @@ async def async_crawler(url):
                 new_urls,
                 randrange(min(MAX_PER_PAGE, len(new_urls)))
             )
-            queue += [sample_url for sample_url in sampled if
-                      sample_url not in queue and sample_url not in
+            queued += [sample_url for sample_url in sampled if
+                      sample_url not in queued and sample_url not in
                       crawled]
     print(crawled)
-    return crawled
+    queue.put(crawled)
 
 if __name__ == '__main__':
-    crawl = CrawlingThread()
+    queue = Queue()
+    crawl = CrawlingThread(None, "https://python.org/", queue)
     crawl.start()
     crawl.join()
diff --git a/histories/models.py b/histories/models.py
index 2f52ae2..a0a8af7 100644
--- a/histories/models.py
+++ b/histories/models.py
@@ -5,9 +5,10 @@ interests, keywords...
 import random
 from math import floor
+from queue import Queue
 
 from django.db import models
 
 import profiles.models as profiles
-#from crawl import crawl
+from crawl import crawl
 
 from pinocchio.settings import HISTORY_MIN
@@ -63,7 +64,7 @@ class History(models.Model):
         self.save()
 
 
-def generate_partial_history(user, t_start, history):
+def generate_partial_history(user, t_start):
     """ Generate the part of the history resulting from the crawl starting at
     the given url.
""" @@ -71,7 +72,14 @@ def generate_partial_history(user, t_start, history): basis = generate_first_url(user) result.append((basis, t_start)) t_start += 5* random.weibullvariate(1, 1.5) - #crawler = crawl.CrawlingThread() + queue = Queue() + crawler = crawl.CrawlingThread(user, basis, queue) + crawler.start() + crawler.join() + urls = queue.get() + for url in urls: + t_start += 5* random.weibullvariate(1, 1.5) + result.append((url, t_start) return result def generate_first_url(user): @@ -104,7 +112,7 @@ def generate_history(user, ts_start): while history_line < length: ts_start += 5 * random.weibullvariate(1, 2.8) - history_list = generate_partial_history(user, ts_start, history) + history_list = generate_partial_history(user, ts_start) ts_start = history_list[-1].timestamp + 5 * random.weibullvariate(1, 5) for (url, timestamp) in history_list: new_line = HistoryEntry(