diff --git a/crawl/crawl.py b/crawl/crawl.py
index 10c7a53..162b26b 100644
--- a/crawl/crawl.py
+++ b/crawl/crawl.py
@@ -1,5 +1,4 @@
 from threading import Thread
-from queue import Queue
 from urllib.robotparser import RobotFileParser
 from urllib.error import URLError
 from urllib.parse import urlparse
@@ -175,7 +174,7 @@ class CrawlingThread(Thread):
     """ A separate thread for the crawling task. This is needed to use asyncio,
     since the thread will need its own event loop. """

-    def __init__(self, url, queue):
+    def __init__(self, url, output_tree):
         engine_list = [engine.url for engine in SearchEngine.objects.all()]
         WebsiteScheduler.search_engines = engine_list

@@ -184,7 +183,7 @@ class CrawlingThread(Thread):
             randint(0, nb_fingerprint - 1)]
         self.headers = fingerprint.serialize_headers()

-        self.queue = queue
+        self.output_tree = output_tree
         super(CrawlingThread, self).__init__()
         self.url = url

@@ -193,7 +192,7 @@ class CrawlingThread(Thread):
         #tasks.append(async_crawler("http://plus.google.com/+Python"))
         #tasks.append(async_crawler('https://python.org/'))

-        tasks.append(async_crawler(self.url, self.queue, self.headers))
+        tasks.append(async_crawler(self.url, self.output_tree))

         loop = asyncio.new_event_loop()
         asyncio.set_event_loop(loop)
@@ -243,50 +242,60 @@ async def async_print(url):
     ))


-async def async_crawler(url, queue, headers=None):
+
+class CrawlElem:
+    ''' Describes a crawled element, to be assembled into a tree '''
+
+    def __init__(self, url, parent):
+        self.url = url
+        self.parent = parent
+
+async def async_crawler(url, output_tree, headers=None):
     if headers is None:
         headers = {}
     if 'User-Agent' not in headers:
         headers['User-Agent'] = settings.USER_AGENT
     user_agent = headers['User-Agent']
+    queued = [CrawlElem(url, None)]
+    crawled = set()
+    crawl_tree = []
-    queued = [url]
-    crawled = []
     while queued and (len(crawled) < HARD_LIMIT):
         async with aiohttp.ClientSession(headers=headers) as session:
             try:
-                url = queued.pop(0)
+                crawl_elt = queued.pop(0)
+                url = crawl_elt.url
             except IndexError:
                 print("Error queue is empty")
                 return crawled
+            crawled.add(url)
             parsed_url = urlparse(url)
             print("Crawling {}".format(url))
             html = await PageGetter(session, url, user_agent).get(ssl=False)
             if html:
+                crawl_tree.append(crawl_elt)
                 new_urls = url_getter(
                     html,
                     url,
                     parsed_url.scheme + "://" + parsed_url.netloc
                 )
-                crawled += [url]
                 if new_urls:
                     sampled = sample(
                         new_urls,
                         randrange(min(MAX_PER_PAGE, len(new_urls)))
                     )
-                    queued += [sample_url for sample_url in sampled if
-                               sample_url not in queued and sample_url not in
-                               crawled]
-            else:
-                print("No html received")
+                    queued += [
+                        CrawlElem(sample_url, crawl_elt)
+                        for sample_url in sampled
+                        if sample_url not in queued
+                        and sample_url not in crawled
+                    ]
     print(crawled)
-    queue.put(crawled)
+    output_tree += crawl_tree

 if __name__ == '__main__':
-    queue = Queue()
-    crawl = CrawlingThread(None,
-                           "https://google.com/search?q=fabriquer+masque+manif",
-                           ["https://google.com/search/"], queue)
+    crawl_tree = []
+    crawl = CrawlingThread("https://google.com/search?q=fabriquer+masque+manif", crawl_tree)
     crawl.start()
     crawl.join()
diff --git a/profiles/management/commands/import_interests.py b/profiles/management/commands/import_interests.py
index 1776c3d..d5c5aaf 100644
--- a/profiles/management/commands/import_interests.py
+++ b/profiles/management/commands/import_interests.py
@@ -5,6 +5,7 @@ import json
 from datetime import datetime
 from django.core.management.base import BaseCommand
 from django.db import models
+from django.core.exceptions import ObjectDoesNotExist

 from profiles.models import Keyword, Interest, Place, Website, Event
 def import_file(filename):
@@ -19,15 +20,14 @@ def import_interest(_interest):
     places = []
     websites = []
     for keyword in _interest.get("keywords", []):
-        if not Keyword.objects.get(keyword["keyword"]):
-            keywords.append(
-                Keyword(
-                    text=keyword["keyword"]
-                )
-            )
-            print("New keyword %s" % new_keywords)
-        else:
-            keywords.append(Keyword.objects.get(text=keyword["keyword"]))
+        try:
+            stored = Keyword.objects.get(text=keyword["keyword"])
+            keywords.append(stored)
+        except ObjectDoesNotExist:
+            new_keyword = Keyword(text=keyword["keyword"])
+            new_keyword.save()
+            keywords.append(new_keyword)
+            print("New keyword %s" % new_keyword)
     for place in _interest.get("places", []):
         places.append(Place.objects.get(name=place["place"]))
     for website in _interest.get("websites", []):
@@ -36,7 +36,9 @@ def import_interest(_interest):
     interest = Interest(
         name=_interest.get("name", ""),
     )
+    interest.save()
     for keyword in keywords:
+        print(keyword)
         interest.keywords.add(keyword)
     for place in places:
         interest.places.add(place)
@@ -46,4 +48,4 @@ def import_interest(_interest):

 class Command(BaseCommand):
     def handle(self, *args, **kwargs):
-        import_file("data/events.json")
+        import_file("data/interests.json")
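
Note on the crawl.py change (not part of the patch): async_crawler now records each successfully fetched page as a CrawlElem holding its url and the CrawlElem it was discovered from, and appends that flat list to output_tree. A minimal sketch of how a caller might rebuild the tree from those parent links once the thread has joined; the helper names below are illustrative only, not part of the codebase:

    from collections import defaultdict

    def build_children_map(output_tree):
        # Group each crawled element under the element it was discovered from;
        # the None key holds the root URL passed to async_crawler.
        children = defaultdict(list)
        for elem in output_tree:
            children[elem.parent].append(elem)
        return children

    def path_to_root(elem):
        # Follow parent links upwards to recover the crawl path for one page.
        path = []
        while elem is not None:
            path.append(elem.url)
            elem = elem.parent
        return list(reversed(path))

For example, after crawl.join() in the __main__ block, build_children_map(crawl_tree)[None] would list the elements reached directly from the start URL.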