merge commit from histories_tobast into histories_models
commit 33bdae96e4
2 changed files with 40 additions and 29 deletions
@@ -1,5 +1,4 @@
 from threading import Thread
-from queue import Queue
 from urllib.robotparser import RobotFileParser
 from urllib.error import URLError
 from urllib.parse import urlparse
@@ -175,7 +174,7 @@ class CrawlingThread(Thread):
     """ A separate thread for the crawling task. This is needed to use asyncio,
     since the thread will need its own event loop. """

-    def __init__(self, url, queue):
+    def __init__(self, url, output_tree):
         engine_list = [engine.url for engine in SearchEngine.objects.all()]
         WebsiteScheduler.search_engines = engine_list

@@ -184,7 +183,7 @@ class CrawlingThread(Thread):
             randint(0, nb_fingerprint - 1)]
         self.headers = fingerprint.serialize_headers()

-        self.queue = queue
+        self.output_tree = output_tree
         super(CrawlingThread, self).__init__()
         self.url = url

@@ -193,7 +192,7 @@ class CrawlingThread(Thread):

         #tasks.append(async_crawler("http://plus.google.com/+Python"))
         #tasks.append(async_crawler('https://python.org/'))
-        tasks.append(async_crawler(self.url, self.queue, self.headers))
+        tasks.append(async_crawler(self.url, self.output_tree))

         loop = asyncio.new_event_loop()
         asyncio.set_event_loop(loop)
@@ -243,50 +242,60 @@ async def async_print(url):
     ))


-async def async_crawler(url, queue, headers=None):
+class CrawlElem:
+    ''' Describes a crawled element, to be assembled into a tree '''
+
+    def __init__(self, url, parent):
+        self.url = url
+        self.parent = parent
+
+
+async def async_crawler(url, output_tree, headers=None):
     if headers is None:
         headers = {}
     if 'User-Agent' not in headers:
         headers['User-Agent'] = settings.USER_AGENT

     user_agent = headers['User-Agent']
-    queued = [url]
-    crawled = []
+    queued = [CrawlElem(url, None)]
+    crawled = set()
+    crawl_tree = []
+
     while queued and (len(crawled) < HARD_LIMIT):
         async with aiohttp.ClientSession(headers=headers) as session:
             try:
-                url = queued.pop(0)
+                crawl_elt = queued.pop(0)
+                url = crawl_elt.url
             except IndexError:
                 print("Error queue is empty")
                 return crawled
+            crawled.add(url)
             parsed_url = urlparse(url)
             print("Crawling {}".format(url))
             html = await PageGetter(session, url, user_agent).get(ssl=False)
             if html:
+                crawl_tree.append(crawl_elt)
                 new_urls = url_getter(
                     html,
                     url,
                     parsed_url.scheme + "://" + parsed_url.netloc
                 )
-                crawled += [url]
                 if new_urls:
                     sampled = sample(
                         new_urls,
                         randrange(min(MAX_PER_PAGE, len(new_urls)))
                     )
-                    queued += [sample_url for sample_url in sampled if
-                               sample_url not in queued and sample_url not in
-                               crawled]
-            else:
-                print("No html received")
+                    queued += [
+                        CrawlElem(sample_url, crawl_elt)
+                        for sample_url in sampled
+                        if sample_url not in queued
+                        and sample_url not in crawled
+                    ]
     print(crawled)
-    queue.put(crawled)
+    output_tree += crawl_tree

 if __name__ == '__main__':
-    queue = Queue()
-    crawl = CrawlingThread(None,
-                           "https://google.com/search?q=fabriquer+masque+manif",
-                           ["https://google.com/search/"], queue)
+    crawl_tree = []
+    crawl = CrawlingThread(None, "https://google.com/search?q=fabriquer+masque+manif", crawl_tree)
     crawl.start()
     crawl.join()
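The CrawlElem docstring above says the crawled elements are "to be assembled into a tree", but the commit itself only gathers the flat crawl_tree list. As a minimal sketch (not part of the commit), that list could be folded into a parent-to-children mapping using only the url and parent fields shown in the diff; CrawlElem is redefined locally so the sketch runs on its own.

# Sketch only: assumes nothing beyond the two fields visible in the diff above.
from collections import defaultdict


class CrawlElem:
    ''' Describes a crawled element, to be assembled into a tree '''

    def __init__(self, url, parent):
        self.url = url
        self.parent = parent


def build_tree(crawl_tree):
    ''' Map each parent URL to the URLs crawled from it; roots map from None. '''
    children = defaultdict(list)
    for elem in crawl_tree:
        parent_url = elem.parent.url if elem.parent else None
        children[parent_url].append(elem.url)
    return children


if __name__ == '__main__':
    root = CrawlElem("https://example.org/", None)
    child = CrawlElem("https://example.org/about", root)
    print(dict(build_tree([root, child])))
    # {None: ['https://example.org/'], 'https://example.org/': ['https://example.org/about']}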
@@ -5,6 +5,7 @@ import json
 from datetime import datetime
 from django.core.management.base import BaseCommand
 from django.db import models
+from django.core.exceptions import ObjectDoesNotExist
 from profiles.models import Keyword, Interest, Place, Website, Event

 def import_file(filename):
@@ -19,15 +20,14 @@ def import_interest(_interest):
     places = []
     websites = []
     for keyword in _interest.get("keywords", []):
-        if not Keyword.objects.get(keyword["keyword"]):
-            keywords.append(
-                Keyword(
-                    text=keyword["keyword"]
-                )
-            )
-            print("New keyword %s" % new_keywords)
-        else:
-            keywords.append(Keyword.objects.get(text=keyword["keyword"]))
+        try:
+            stored = Keyword.objects.get(text=keyword["keyword"])
+            keywords.append(stored)
+        except ObjectDoesNotExist:
+            new_keyword = Keyword(text=keyword["keyword"])
+            new_keyword.save()
+            keywords.append(new_keyword)
+            print("New keyword %s" % new_keyword)
     for place in _interest.get("places", []):
         places.append(Place.objects.get(name=place["place"]))
     for website in _interest.get("websites", []):
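The new try/except around Keyword.objects.get() is a hand-rolled get-or-create. Not part of the commit, but for comparison, Django's built-in QuerySet.get_or_create() expresses the same behaviour in one call, assuming the Keyword model only needs its text field here (as the diff suggests):

        # Sketch only: same loop variables and Keyword model as in the diff above.
        new_keyword, created = Keyword.objects.get_or_create(text=keyword["keyword"])
        keywords.append(new_keyword)
        if created:
            print("New keyword %s" % new_keyword)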
@@ -36,7 +36,9 @@ def import_interest(_interest):
     interest = Interest(
         name=_interest.get("name", ""),
     )
+    interest.save()
     for keyword in keywords:
+        print(keyword)
         interest.keywords.add(keyword)
     for place in places:
         interest.places.add(place)
@@ -46,4 +48,4 @@ def import_interest(_interest):

 class Command(BaseCommand):
     def handle(self, *args, **kwargs):
-        import_file("data/events.json")
+        import_file("data/interests.json")