merge commit from histories_tobast into histories_models
Commit 33bdae96e4
2 changed files with 40 additions and 29 deletions
@@ -1,5 +1,4 @@
 from threading import Thread
-from queue import Queue
 from urllib.robotparser import RobotFileParser
 from urllib.error import URLError
 from urllib.parse import urlparse
@@ -175,7 +174,7 @@ class CrawlingThread(Thread):
     """ A separate thread for the crawling task. This is needed to use asyncio,
     since the thread will need its own event loop. """

-    def __init__(self, url, queue):
+    def __init__(self, url, output_tree):
         engine_list = [engine.url for engine in SearchEngine.objects.all()]
         WebsiteScheduler.search_engines = engine_list

@@ -184,7 +183,7 @@ class CrawlingThread(Thread):
             randint(0, nb_fingerprint - 1)]
         self.headers = fingerprint.serialize_headers()

-        self.queue = queue
+        self.output_tree = output_tree
         super(CrawlingThread, self).__init__()
         self.url = url

@@ -193,7 +192,7 @@ class CrawlingThread(Thread):

         #tasks.append(async_crawler("http://plus.google.com/+Python"))
         #tasks.append(async_crawler('https://python.org/'))
-        tasks.append(async_crawler(self.url, self.queue, self.headers))
+        tasks.append(async_crawler(self.url, self.output_tree))

         loop = asyncio.new_event_loop()
         asyncio.set_event_loop(loop)
@@ -243,50 +242,60 @@ async def async_print(url):
     ))


-async def async_crawler(url, queue, headers=None):
+class CrawlElem:
+    ''' Describes a crawled element, to be assembled into a tree '''
+
+    def __init__(self, url, parent):
+        self.url = url
+        self.parent = parent
+
+async def async_crawler(url, output_tree, headers=None):
     if headers is None:
         headers = {}
     if 'User-Agent' not in headers:
         headers['User-Agent'] = settings.USER_AGENT

     user_agent = headers['User-Agent']
-
-    queued = [url]
-    crawled = []
+    queued = [CrawlElem(url, None)]
+    crawled = set()
+    crawl_tree = []
+
     while queued and (len(crawled) < HARD_LIMIT):
         async with aiohttp.ClientSession(headers=headers) as session:
             try:
-                url = queued.pop(0)
+                crawl_elt = queued.pop(0)
+                url = crawl_elt.url
             except IndexError:
                 print("Error queue is empty")
                 return crawled
+            crawled.add(url)
             parsed_url = urlparse(url)
             print("Crawling {}".format(url))
             html = await PageGetter(session, url, user_agent).get(ssl=False)
             if html:
+                crawl_tree.append(crawl_elt)
                 new_urls = url_getter(
                     html,
                     url,
                     parsed_url.scheme + "://" + parsed_url.netloc
                 )
-                crawled += [url]
                 if new_urls:
                     sampled = sample(
                         new_urls,
                         randrange(min(MAX_PER_PAGE, len(new_urls)))
                     )
-                    queued += [sample_url for sample_url in sampled if
-                               sample_url not in queued and sample_url not in
-                               crawled]
+                    queued += [
+                        CrawlElem(sample_url, crawl_elt)
+                        for sample_url in sampled
+                        if sample_url not in queued
+                        and sample_url not in crawled
+                    ]
             else:
                 print("No html received")
-    print(crawled)
-    queue.put(crawled)
+    output_tree += crawl_tree

 if __name__ == '__main__':
-    queue = Queue()
-    crawl = CrawlingThread(None,
-                           "https://google.com/search?q=fabriquer+masque+manif",
-                           ["https://google.com/search/"], queue)
+    crawl_tree = []
+    crawl = CrawlingThread(None, "https://google.com/search?q=fabriquer+masque+manif", crawl_tree)
     crawl.start()
     crawl.join()
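Taken together, the crawler-side changes replace the Queue handoff with a shared output_tree list of CrawlElem nodes, each pointing back to the page it was discovered from. Below is a minimal sketch, not part of the commit, of how a caller could walk that tree back into crawl paths once the thread has joined; the helper names path_to_root and print_crawl_paths are illustrative only, and rely solely on the CrawlElem fields defined above (url and parent).

# Sketch only: consumes the crawl_tree list filled in by CrawlingThread above.
def path_to_root(elem):
    """Return the chain of URLs from the start page down to this element."""
    path = []
    while elem is not None:
        path.append(elem.url)
        elem = elem.parent
    return list(reversed(path))


def print_crawl_paths(crawl_tree):
    for elem in crawl_tree:
        print(" -> ".join(path_to_root(elem)))

# Usage, mirroring the __main__ block above (example URL is hypothetical):
#     crawl_tree = []
#     crawl = CrawlingThread(None, "https://example.com/", crawl_tree)
#     crawl.start()
#     crawl.join()
#     print_crawl_paths(crawl_tree)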
@@ -5,6 +5,7 @@ import json
 from datetime import datetime
 from django.core.management.base import BaseCommand
 from django.db import models
+from django.core.exceptions import ObjectDoesNotExist
 from profiles.models import Keyword, Interest, Place, Website, Event

 def import_file(filename):
@@ -19,15 +20,14 @@ def import_interest(_interest):
     places = []
     websites = []
     for keyword in _interest.get("keywords", []):
-        if not Keyword.objects.get(keyword["keyword"]):
-            keywords.append(
-                Keyword(
-                    text=keyword["keyword"]
-                )
-            )
-            print("New keyword %s" % new_keywords)
-        else:
-            keywords.append(Keyword.objects.get(text=keyword["keyword"]))
+        try:
+            stored = Keyword.objects.get(text=keyword["keyword"])
+            keywords.append(stored)
+        except ObjectDoesNotExist:
+            new_keyword = Keyword(text=keyword["keyword"])
+            new_keyword.save()
+            keywords.append(new_keyword)
+            print("New keyword %s" % new_keyword)
     for place in _interest.get("places", []):
         places.append(Place.objects.get(name=place["place"]))
     for website in _interest.get("websites", []):
@@ -36,7 +36,9 @@ def import_interest(_interest):
     interest = Interest(
         name=_interest.get("name", ""),
     )
+    interest.save()
     for keyword in keywords:
+        print(keyword)
         interest.keywords.add(keyword)
     for place in places:
         interest.places.add(place)
@@ -46,4 +48,4 @@ def import_interest(_interest):

 class Command(BaseCommand):
     def handle(self, *args, **kwargs):
-        import_file("data/events.json")
+        import_file("data/interests.json")
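The try/except ObjectDoesNotExist pattern introduced above is a hand-rolled version of Django's built-in get_or_create. A sketch of the same keyword handling using that shortcut, assuming the Keyword model's text field as in the diff; the helper name collect_keywords is illustrative only and not part of the commit.

# Sketch only: equivalent keyword handling with QuerySet.get_or_create,
# which performs the get() and, on a miss, creates and saves the object.
from profiles.models import Keyword


def collect_keywords(_interest):
    keywords = []
    for keyword in _interest.get("keywords", []):
        stored, created = Keyword.objects.get_or_create(text=keyword["keyword"])
        if created:
            print("New keyword %s" % stored)
        keywords.append(stored)
    return keywords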