Merge commit from histories_tobast into histories_models

Rémi Oudin 2018-02-26 12:59:38 +01:00
commit 33bdae96e4
2 changed files with 40 additions and 29 deletions

Changed file 1 of 2:

@@ -1,5 +1,4 @@
 from threading import Thread
-from queue import Queue
 from urllib.robotparser import RobotFileParser
 from urllib.error import URLError
 from urllib.parse import urlparse
@@ -175,7 +174,7 @@ class CrawlingThread(Thread):
     """ A separate thread for the crawling task. This is needed to use asyncio,
     since the thread will need its own event loop. """

-    def __init__(self, url, queue):
+    def __init__(self, url, output_tree):
         engine_list = [engine.url for engine in SearchEngine.objects.all()]
         WebsiteScheduler.search_engines = engine_list

@@ -184,7 +183,7 @@ class CrawlingThread(Thread):
            randint(0, nb_fingerprint - 1)]
        self.headers = fingerprint.serialize_headers()

-        self.queue = queue
+        self.output_tree = output_tree
         super(CrawlingThread, self).__init__()
         self.url = url

@@ -193,7 +192,7 @@ class CrawlingThread(Thread):
        #tasks.append(async_crawler("http://plus.google.com/+Python"))
        #tasks.append(async_crawler('https://python.org/'))
-        tasks.append(async_crawler(self.url, self.queue, self.headers))
+        tasks.append(async_crawler(self.url, self.output_tree))

         loop = asyncio.new_event_loop()
         asyncio.set_event_loop(loop)
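
Note: the hunk above only shows the loop being created and installed. A minimal sketch of the per-thread event-loop pattern this relies on; the gather/run_until_complete/close steps are assumptions, since they fall outside the lines shown and may differ in the actual CrawlingThread.run():

    import asyncio

    def run_in_fresh_loop(tasks):
        # A worker thread has no event loop of its own, so create one...
        loop = asyncio.new_event_loop()
        # ...and make it the current loop for this thread.
        asyncio.set_event_loop(loop)
        try:
            loop.run_until_complete(asyncio.gather(*tasks))
        finally:
            loop.close()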
@@ -243,50 +242,60 @@ async def async_print(url):
     ))

-async def async_crawler(url, queue, headers=None):
+class CrawlElem:
+    ''' Describes a crawled element, to be assembled into a tree '''
+
+    def __init__(self, url, parent):
+        self.url = url
+        self.parent = parent
+
+
+async def async_crawler(url, output_tree, headers=None):
     if headers is None:
         headers = {}
     if 'User-Agent' not in headers:
         headers['User-Agent'] = settings.USER_AGENT
     user_agent = headers['User-Agent']
-    queued = [url]
-    crawled = []
+    queued = [CrawlElem(url, None)]
+    crawled = set()
+    crawl_tree = []
     while queued and (len(crawled) < HARD_LIMIT):
         async with aiohttp.ClientSession(headers=headers) as session:
             try:
-                url = queued.pop(0)
+                crawl_elt = queued.pop(0)
+                url = crawl_elt.url
             except IndexError:
                 print("Error queue is empty")
                 return crawled
+            crawled.add(url)
             parsed_url = urlparse(url)
             print("Crawling {}".format(url))
             html = await PageGetter(session, url, user_agent).get(ssl=False)
             if html:
+                crawl_tree.append(crawl_elt)
                 new_urls = url_getter(
                     html,
                     url,
                     parsed_url.scheme + "://" + parsed_url.netloc
                 )
-                crawled += [url]
                 if new_urls:
                     sampled = sample(
                         new_urls,
                         randrange(min(MAX_PER_PAGE, len(new_urls)))
                     )
-                    queued += [sample_url for sample_url in sampled if
-                               sample_url not in queued and sample_url not in
-                               crawled]
-            else:
-                print("No html received")
+                    queued += [
+                        CrawlElem(sample_url, crawl_elt)
+                        for sample_url in sampled
+                        if sample_url not in queued
+                        and sample_url not in crawled
+                    ]
     print(crawled)
-    queue.put(crawled)
+    output_tree += crawl_tree

 if __name__ == '__main__':
-    queue = Queue()
-    crawl = CrawlingThread(None,
-                           "https://google.com/search?q=fabriquer+masque+manif",
-                           ["https://google.com/search/"], queue)
+    crawl_tree = []
+    crawl = CrawlingThread(None, "https://google.com/search?q=fabriquer+masque+manif", crawl_tree)
     crawl.start()
     crawl.join()
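
This hunk replaces the Queue-based result passing with a shared output_tree list of CrawlElem nodes, each of which remembers the page it was discovered from. A minimal sketch of how those parent links could be walked after crawl.join() to recover the discovery path of any crawled URL; the crawl_path helper is illustrative, not part of this commit:

    def crawl_path(elem):
        """Follow CrawlElem.parent links back to the root URL."""
        path = []
        while elem is not None:
            path.append(elem.url)
            elem = elem.parent
        return list(reversed(path))  # root URL first, crawled URL last

    # Usage, assuming crawl_tree was filled as in the __main__ block above:
    # for elem in crawl_tree:
    #     print(" -> ".join(crawl_path(elem)))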

Changed file 2 of 2:

@@ -5,6 +5,7 @@ import json
 from datetime import datetime
 from django.core.management.base import BaseCommand
 from django.db import models
+from django.core.exceptions import ObjectDoesNotExist
 from profiles.models import Keyword, Interest, Place, Website, Event

 def import_file(filename):
@@ -19,15 +20,14 @@ def import_interest(_interest):
     places = []
     websites = []
     for keyword in _interest.get("keywords", []):
-        if not Keyword.objects.get(keyword["keyword"]):
-            keywords.append(
-                Keyword(
-                    text=keyword["keyword"]
-                )
-            )
-            print("New keyword %s" % new_keywords)
-        else:
-            keywords.append(Keyword.objects.get(text=keyword["keyword"]))
+        try:
+            stored = Keyword.objects.get(text=keyword["keyword"])
+            keywords.append(stored)
+        except ObjectDoesNotExist:
+            new_keyword = Keyword(text=keyword["keyword"])
+            new_keyword.save()
+            keywords.append(new_keyword)
+            print("New keyword %s" % new_keyword)
     for place in _interest.get("places", []):
         places.append(Place.objects.get(name=place["place"]))
     for website in _interest.get("websites", []):
@@ -36,7 +36,9 @@ def import_interest(_interest):
     interest = Interest(
         name=_interest.get("name", ""),
     )
+    interest.save()
     for keyword in keywords:
+        print(keyword)
         interest.keywords.add(keyword)
     for place in places:
         interest.places.add(place)
@@ -46,4 +48,4 @@ def import_interest(_interest):

 class Command(BaseCommand):
     def handle(self, *args, **kwargs):
-        import_file("data/events.json")
+        import_file("data/interests.json")