From 15323c34659af445751592d8a5b4e3a60a41004d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Th=C3=A9ophile=20Bastian?=
Date: Sun, 25 Feb 2018 15:08:06 +0100
Subject: [PATCH 1/2] [REBASE ME] Crawl: enhance efficiency and output a tree

---
 crawl/crawl.py | 43 +++++++++++++++++++++++++++++--------------
 1 file changed, 29 insertions(+), 14 deletions(-)

diff --git a/crawl/crawl.py b/crawl/crawl.py
index 48aaba6..091bfe0 100644
--- a/crawl/crawl.py
+++ b/crawl/crawl.py
@@ -162,9 +162,9 @@ class CrawlingThread(Thread):
     """ A separate thread for the crawling task. This is needed to use asyncio,
     since the thread will need its own event loop. """
 
-    def __init__(self, user, url, queue):
+    def __init__(self, user, url, output_tree):
         global settings
-        self.queue = queue
+        self.output_tree = output_tree
         super(CrawlingThread, self).__init__()
         if user:
             settings.USER_AGENT = user.serialize_headers()
@@ -175,7 +175,7 @@ class CrawlingThread(Thread):
         tasks = []
         #tasks.append(async_crawler("http://plus.google.com/+Python"))
         #tasks.append(async_crawler('https://python.org/'))
-        tasks.append(async_crawler(self.url, self.queue))
+        tasks.append(async_crawler(self.url, self.output_tree))
 
         loop = asyncio.new_event_loop()
         asyncio.set_event_loop(loop)
@@ -220,39 +220,54 @@ async def async_print(url):
         url, datetime.now() - startup_time))
 
 
-async def async_crawler(url, queue):
-    queued = [url]
-    crawled = []
+
+class CrawlElem:
+    ''' Describes a crawled element, to be assembled into a tree '''
+
+    def __init__(self, url, parent):
+        self.url = url
+        self.parent = parent
+
+async def async_crawler(url, output_tree):
+    queued = [CrawlElem(url, None)]
+    crawled = set()
+    crawl_tree = []
+
     while queued and (len(crawled) < HARD_LIMIT):
         async with aiohttp.ClientSession() as session:
             try:
-                url = queued.pop(0)
+                crawl_elt = queued.pop(0)
+                url = crawl_elt.url
             except IndexError:
                 print("Error queue is empty")
                 return crawled
+            crawled.add(url)
             parsed_url = urlparse(url)
             print("Crawling {}".format(url))
             html = await PageGetter(session, url).get(ssl=False)
             if html:
+                crawl_tree.append(crawl_elt)
                 new_urls = url_getter(
                     html,
                     url,
                     parsed_url.scheme + "://" + parsed_url.netloc
                 )
-                crawled += [url]
                 if new_urls:
                     sampled = sample(
                         new_urls,
                         randrange(min(MAX_PER_PAGE, len(new_urls)))
                     )
-                    queued += [sample_url for sample_url in sampled if
-                               sample_url not in queued and sample_url not in
-                               crawled]
+                    queued += [
+                        CrawlElem(sample_url, crawl_elt)
+                        for sample_url in sampled
+                        if sample_url not in queued
+                        and sample_url not in crawled
+                    ]
     print(crawled)
-    queue.put(crawled)
+    output_tree += crawl_tree
 
 if __name__ == '__main__':
-    queue = Queue()
-    crawl = CrawlingThread(None, "https://python.org/", queue)
+    crawl_tree = []
+    crawl = CrawlingThread(None, "https://python.org/", crawl_tree)
     crawl.start()
     crawl.join()
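With this patch the crawler no longer hands a flat list of URLs back through a Queue: async_crawler fills output_tree with CrawlElem nodes, each recording a page URL and the CrawlElem it was discovered from, so the caller can rebuild the crawl as an explicit tree. The helpers below are a minimal sketch of that reconstruction step under the patch's data model; build_tree and print_tree are illustrative names that do not appear anywhere in the series.

    from collections import defaultdict

    def build_tree(crawl_tree):
        """ Group CrawlElem nodes by parent; nodes with parent None are roots. """
        children = defaultdict(list)   # CrawlElem -> list of child CrawlElem
        roots = []
        for elem in crawl_tree:
            if elem.parent is None:
                roots.append(elem)
            else:
                children[elem.parent].append(elem)
        return roots, children

    def print_tree(elem, children, depth=0):
        """ Print one branch of the crawl tree, indented by depth. """
        print("  " * depth + elem.url)
        for child in children[elem]:
            print_tree(child, children, depth + 1)

Used after crawl.join() in the __main__ block, build_tree(crawl_tree) followed by print_tree on each root would print the sampled link structure as an indented outline.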
+""" + +import json +from datetime import datetime +from django.core.management.base import BaseCommand +from django.db import models +from django.core.exceptions import ObjectDoesNotExist +from profiles.models import Keyword, Interest, Place, Website, Event + +def import_file(filename): + with open(filename, mode='r') as file: + data = json.load(file) + for interest in data: + import_interest(interest) + + +def import_interest(_interest): + keywords = [] + places = [] + websites = [] + print(_interest) + for keyword in _interest.get("keywords", []): + try: + stored = Keyword.objects.get(text=keyword["keyword"]) + keywords.append(stored) + except ObjectDoesNotExist: + new_keyword = Keyword(text=keyword["keyword"]) + new_keyword.save() + keywords.append(new_keyword) + print("New keyword %s" % new_keyword) + for place in _interest.get("places", []): + places.append(Place.objects.get(name=place["place"])) + for website in _interest.get("websites", []): + websites.append(Website.objects.get(name=website["website"])) + + interest = Interest( + name=_interest.get("name", ""), + ) + interest.save() + for keyword in keywords: + print(keyword) + interest.keywords.add(keyword) + for place in places: + interest.places.add(place) + for website in websites: + interest.websites.add(website) + interest.save() + +class Command(BaseCommand): + def handle(self, *args, **kwargs): + import_file("data/interests.json")