merge commit from histories_tobast into histories_models

Rémi Oudin 2018-02-26 12:59:38 +01:00
commit 33bdae96e4
2 changed files with 40 additions and 29 deletions

View file

@@ -1,5 +1,4 @@
from threading import Thread
from queue import Queue
from urllib.robotparser import RobotFileParser
from urllib.error import URLError
from urllib.parse import urlparse
@@ -175,7 +174,7 @@ class CrawlingThread(Thread):
""" A separate thread for the crawling task. This is needed to use asyncio,
since the thread will need its own event loop. """
def __init__(self, url, queue):
def __init__(self, url, output_tree):
engine_list = [engine.url for engine in SearchEngine.objects.all()]
WebsiteScheduler.search_engines = engine_list
@@ -184,7 +183,7 @@
randint(0, nb_fingerprint - 1)]
self.headers = fingerprint.serialize_headers()
self.queue = queue
self.output_tree = output_tree
super(CrawlingThread, self).__init__()
self.url = url
@@ -193,7 +192,7 @@
#tasks.append(async_crawler("http://plus.google.com/+Python"))
#tasks.append(async_crawler('https://python.org/'))
tasks.append(async_crawler(self.url, self.queue, self.headers))
tasks.append(async_crawler(self.url, self.output_tree))
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
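
As the CrawlingThread docstring notes, each crawler thread needs its own asyncio event loop, since only the main thread gets one implicitly. A minimal sketch of that per-thread loop pattern, using illustrative names that are not taken from this repository:

import asyncio
from threading import Thread

class LoopThread(Thread):
    """Illustrative: drive a coroutine on a private event loop inside a worker thread."""
    def __init__(self, coro):
        super().__init__()
        self.coro = coro

    def run(self):
        # Worker threads have no current event loop, so create one,
        # register it for this thread, then run the coroutine to completion.
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        try:
            loop.run_until_complete(self.coro)
        finally:
            loop.close()

async def greet():
    await asyncio.sleep(0.1)
    print("hello from a worker thread")

t = LoopThread(greet())
t.start()
t.join()
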
@@ -243,50 +242,60 @@ async def async_print(url):
))
async def async_crawler(url, queue, headers=None):
class CrawlElem:
''' Describes a crawled element, to be assembled into a tree '''
def __init__(self, url, parent):
self.url = url
self.parent = parent
async def async_crawler(url, output_tree, headers=None):
if headers is None:
headers = {}
if 'User-Agent' not in headers:
headers['User-Agent'] = settings.USER_AGENT
user_agent = headers['User-Agent']
queued = [CrawlElem(url, None)]
crawled = set()
crawl_tree = []
queued = [url]
crawled = []
while queued and (len(crawled) < HARD_LIMIT):
async with aiohttp.ClientSession(headers=headers) as session:
try:
url = queued.pop(0)
crawl_elt = queued.pop(0)
url = crawl_elt.url
except IndexError:
print("Error queue is empty")
return crawled
crawled.add(url)
parsed_url = urlparse(url)
print("Crawling {}".format(url))
html = await PageGetter(session, url, user_agent).get(ssl=False)
if html:
crawl_tree.append(crawl_elt)
new_urls = url_getter(
html,
url,
parsed_url.scheme + "://" + parsed_url.netloc
)
crawled += [url]
if new_urls:
sampled = sample(
new_urls,
randrange(min(MAX_PER_PAGE, len(new_urls)))
)
queued += [sample_url for sample_url in sampled if
sample_url not in queued and sample_url not in
crawled]
else:
print("No html received")
queued += [
CrawlElem(sample_url, crawl_elt)
for sample_url in sampled
if sample_url not in queued
and sample_url not in crawled
]
print(crawled)
queue.put(crawled)
output_tree += crawl_tree
if __name__ == '__main__':
queue = Queue()
crawl = CrawlingThread(None,
"https://google.com/search?q=fabriquer+masque+manif",
["https://google.com/search/"], queue)
crawl_tree = []
crawl = CrawlingThread(None, "https://google.com/search?q=fabriquer+masque+manif", crawl_tree)
crawl.start()
crawl.join()
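
With this change, async_crawler no longer puts a flat list of URLs into a Queue; it extends output_tree with CrawlElem nodes, each of which keeps a reference to the CrawlElem it was discovered from. A minimal sketch of how that flat list could be folded back into a parent-to-children mapping once crawl.join() returns; build_tree is a hypothetical helper, not part of this commit:

from collections import defaultdict

def build_tree(crawl_tree):
    # Group each crawled URL under the URL of its parent (None for the seed).
    children = defaultdict(list)
    for elem in crawl_tree:
        parent_url = elem.parent.url if elem.parent is not None else None
        children[parent_url].append(elem.url)
    return children

# tree = build_tree(crawl_tree)   # after crawl.join()
# tree[None] holds the seed URL(s); tree[url] lists the pages reached from url.
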

View file

@@ -5,6 +5,7 @@ import json
from datetime import datetime
from django.core.management.base import BaseCommand
from django.db import models
from django.core.exceptions import ObjectDoesNotExist
from profiles.models import Keyword, Interest, Place, Website, Event
def import_file(filename):
@@ -19,15 +20,14 @@ def import_interest(_interest):
places = []
websites = []
for keyword in _interest.get("keywords", []):
if not Keyword.objects.get(keyword["keyword"]):
keywords.append(
Keyword(
text=keyword["keyword"]
)
)
print("New keyword %s" % new_keywords)
else:
keywords.append(Keyword.objects.get(text=keyword["keyword"]))
try:
stored = Keyword.objects.get(text=keyword["keyword"])
keywords.append(stored)
except ObjectDoesNotExist:
new_keyword = Keyword(text=keyword["keyword"])
new_keyword.save()
keywords.append(new_keyword)
print("New keyword %s" % new_keyword)
for place in _interest.get("places", []):
places.append(Place.objects.get(name=place["place"]))
for website in _interest.get("websites", []):
@@ -36,7 +36,9 @@ def import_interest(_interest):
interest = Interest(
name=_interest.get("name", ""),
)
interest.save()
for keyword in keywords:
print(keyword)
interest.keywords.add(keyword)
for place in places:
interest.places.add(place)
@@ -46,4 +48,4 @@ def import_interest(_interest):
class Command(BaseCommand):
def handle(self, *args, **kwargs):
import_file("data/events.json")
import_file("data/interests.json")