Integration of crawl module in histories

parent 60bfc8cb77
commit bc7348f677

2 changed files with 34 additions and 16 deletions
crawl module:

@@ -1,4 +1,5 @@
 from threading import Thread
+from queue import Queue
 from urllib.robotparser import RobotFileParser
 from urllib.error import URLError
 from urllib.parse import urlparse
@@ -26,10 +27,10 @@ MAX_PER_PAGE = 10
 FOOTER_URL = re.compile(".*footer.*")

 class Settings:
-    USER_AGENT = 'Blah'
+    USER_AGENT = 'Default User'

 settings = Settings()
-startup_time = datetime.now()
+startup_time = datetime.min


 def url_getter(html, current_page, root_url):
@@ -161,16 +162,24 @@ class CrawlingThread(Thread):
     """ A separate thread for the crawling task. This is needed to use asyncio,
     since the thread will need its own event loop. """

-    def __init__(self):
+    def __init__(self, user, url, queue):
+        global settings
+        self.queue = queue
         super(CrawlingThread, self).__init__()
+        if user:
+            settings.USER_AGENT = user.serialize_headers()
+        self.url = url

     def run(self):
+        global startup_time
         tasks = []
         #tasks.append(async_crawler("http://plus.google.com/+Python"))
-        tasks.append(async_crawler('https://python.org/'))
+        #tasks.append(async_crawler('https://python.org/'))
+        tasks.append(async_crawler(self.url, self.queue))

         loop = asyncio.new_event_loop()
         asyncio.set_event_loop(loop)
+        startup_time = datetime.now()
         loop.run_until_complete(asyncio.wait(tasks))
         loop.close()
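A note on the run() changes above: a worker thread has no asyncio event loop of its own, so one must be created and installed before run_until_complete() can be used. A minimal, self-contained sketch of that pattern (illustrative names, standard library only, not project code):

import asyncio
from threading import Thread


async def say(text):
    # Placeholder coroutine standing in for the real crawling tasks.
    await asyncio.sleep(0.1)
    print(text)


class LoopThread(Thread):
    def run(self):
        # The default loop belongs to the main thread, so build and
        # install a private loop for this thread before running tasks.
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        loop.run_until_complete(say("hello from a worker thread"))
        loop.close()


if __name__ == '__main__':
    t = LoopThread()
    t.start()
    t.join()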
@@ -211,13 +220,13 @@ async def async_print(url):
         url,
         datetime.now() - startup_time))

-async def async_crawler(url):
-    queue = [url]
+async def async_crawler(url, queue):
+    queued = [url]
     crawled = []
-    while queue and (len(crawled) < HARD_LIMIT):
+    while queued and (len(crawled) < HARD_LIMIT):
         async with aiohttp.ClientSession() as session:
             try:
-                url = queue.pop(0)
+                url = queued.pop(0)
             except IndexError:
                 print("Error queue is empty")
                 return crawled
@@ -236,13 +245,14 @@ async def async_crawler(url):
                     new_urls,
                     randrange(min(MAX_PER_PAGE, len(new_urls)))
                 )
-                queue += [sample_url for sample_url in sampled if
-                          sample_url not in queue and sample_url not in
+                queued += [sample_url for sample_url in sampled if
+                           sample_url not in queued and sample_url not in
                           crawled]
     print(crawled)
-    return crawled
+    queue.put(crawled)

 if __name__ == '__main__':
-    crawl = CrawlingThread()
+    queue = Queue()
+    crawl = CrawlingThread(None, "https://python.org/", queue)
     crawl.start()
     crawl.join()
histories module:

@@ -5,9 +5,10 @@ interests, keywords...

 import random
 from math import floor
+from queue import Queue
 from django.db import models
 import profiles.models as profiles
-#from crawl import crawl
+from crawl import crawl
 from pinocchio.settings import HISTORY_MIN
@@ -63,7 +64,7 @@ class History(models.Model):
         self.save()


-def generate_partial_history(user, t_start, history):
+def generate_partial_history(user, t_start):
     """ Generate the part of the history resulting from the crawl starting at
     the given url.
     """
@@ -71,7 +72,14 @@ def generate_partial_history(user, t_start, history):
     basis = generate_first_url(user)
     result.append((basis, t_start))
     t_start += 5* random.weibullvariate(1, 1.5)
-    #crawler = crawl.CrawlingThread()
+    queue = Queue()
+    crawler = crawl.CrawlingThread(user, basis, queue)
+    crawler.start()
+    crawler.join()
+    urls = queue.get()
+    for url in urls:
+        t_start += 5* random.weibullvariate(1, 1.5)
+        result.append((url, t_start))
     return result

 def generate_first_url(user):
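For illustration, the new loop above spaces the (url, timestamp) pairs with Weibull-distributed offsets accumulated onto t_start; in isolation (with made-up URLs and a numeric start time) it behaves like this:

import random

urls = ["https://python.org/", "https://docs.python.org/", "https://pypi.org/"]
t_start = 0.0
result = []
for url in urls:
    t_start += 5 * random.weibullvariate(1, 1.5)  # same offset distribution as above
    result.append((url, t_start))
print(result)  # e.g. [('https://python.org/', 3.7), ('https://docs.python.org/', 8.1), ...]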
@@ -104,7 +112,7 @@ def generate_history(user, ts_start):

     while history_line < length:
         ts_start += 5 * random.weibullvariate(1, 2.8)
-        history_list = generate_partial_history(user, ts_start, history)
+        history_list = generate_partial_history(user, ts_start)
         ts_start = history_list[-1].timestamp + 5 * random.weibullvariate(1, 5)
         for (url, timestamp) in history_list:
             new_line = HistoryEntry(