Integration of the crawl module into histories

This commit is contained in:
Rémi Oudin 2018-02-24 23:17:24 +01:00
parent 60bfc8cb77
commit bc7348f677
2 changed files with 34 additions and 16 deletions

View File

@@ -1,4 +1,5 @@
from threading import Thread
from queue import Queue
from urllib.robotparser import RobotFileParser
from urllib.error import URLError
from urllib.parse import urlparse
@@ -26,10 +27,10 @@ MAX_PER_PAGE = 10
FOOTER_URL = re.compile(".*footer.*")
class Settings:
USER_AGENT = 'Blah'
USER_AGENT = 'Default User'
settings = Settings()
startup_time = datetime.now()
startup_time = datetime.min
def url_getter(html, current_page, root_url):
@@ -161,16 +162,24 @@ class CrawlingThread(Thread):
""" A separate thread for the crawling task. This is needed to use asyncio,
since the thread will need its own event loop. """
def __init__(self):
def __init__(self, user, url, queue):
global settings
self.queue = queue
super(CrawlingThread, self).__init__()
if user:
settings.USER_AGENT = user.serialize_headers()
self.url = url
def run(self):
global startup_time
tasks = []
#tasks.append(async_crawler("http://plus.google.com/+Python"))
tasks.append(async_crawler('https://python.org/'))
#tasks.append(async_crawler('https://python.org/'))
tasks.append(async_crawler(self.url, self.queue))
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
startup_time = datetime.now()
loop.run_until_complete(asyncio.wait(tasks))
loop.close()
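The new run() keeps the existing pattern of giving the crawler thread its own asyncio event loop; only the target URL and the result queue now come from the constructor. Below is a minimal, standalone sketch of that thread-plus-own-loop pattern; WorkerThread and fake_crawl are stand-ins for illustration, not the project's classes:

import asyncio
from queue import Queue
from threading import Thread

async def fake_crawl(url, out_queue):
    # Stand-in for async_crawler: pretend to fetch, then hand the result
    # back to the calling thread through the queue.
    await asyncio.sleep(0)
    out_queue.put([url])

class WorkerThread(Thread):
    def __init__(self, url, out_queue):
        super().__init__()
        self.url = url
        self.out_queue = out_queue

    def run(self):
        # A thread other than the main one has no default event loop,
        # so it must create and install its own before running coroutines.
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        loop.run_until_complete(fake_crawl(self.url, self.out_queue))
        loop.close()

queue = Queue()
worker = WorkerThread("https://python.org/", queue)
worker.start()
worker.join()
print(queue.get())  # ['https://python.org/']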
@@ -211,13 +220,13 @@ async def async_print(url):
url,
datetime.now() - startup_time))
async def async_crawler(url):
queue = [url]
async def async_crawler(url, queue):
queued = [url]
crawled = []
while queue and (len(crawled) < HARD_LIMIT):
while queued and (len(crawled) < HARD_LIMIT):
async with aiohttp.ClientSession() as session:
try:
url = queue.pop(0)
url = queued.pop(0)
except IndexError:
print("Error queue is empty")
return crawled
@@ -236,13 +245,14 @@ async def async_crawler(url):
new_urls,
randrange(min(MAX_PER_PAGE, len(new_urls)))
)
queue += [sample_url for sample_url in sampled if
sample_url not in queue and sample_url not in
queued += [sample_url for sample_url in sampled if
sample_url not in queued and sample_url not in
crawled]
print(crawled)
return crawled
queue.put(crawled)
if __name__ == '__main__':
crawl = CrawlingThread()
queue = Queue()
crawl = CrawlingThread(None, "https://python.org/", queue)
crawl.start()
crawl.join()
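Since async_crawler now ends with queue.put(crawled) instead of returning, the caller can read the crawled URLs back once the thread has joined. A short usage sketch of that hand-off, assuming the same CrawlingThread interface shown above:

from queue import Queue

queue = Queue()
crawler = CrawlingThread(None, "https://python.org/", queue)  # None: no user profile, default User-Agent
crawler.start()
crawler.join()              # wait for the crawl to finish
crawled_urls = queue.get()  # the list that async_crawler put on the queue
print(len(crawled_urls), "URLs crawled")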

View File

@@ -5,9 +5,10 @@ interests, keywords...
import random
from math import floor
from queue import Queue
from django.db import models
import profiles.models as profiles
#from crawl import crawl
from crawl import crawl
from pinocchio.settings import HISTORY_MIN
@@ -63,7 +64,7 @@ class History(models.Model):
self.save()
def generate_partial_history(user, t_start, history):
def generate_partial_history(user, t_start):
""" Generate the part of the history resulting from the crawl starting at
the given url.
"""
@@ -71,7 +72,14 @@ def generate_partial_history(user, t_start, history):
basis = generate_first_url(user)
result.append((basis, t_start))
t_start += 5 * random.weibullvariate(1, 1.5)
#crawler = crawl.CrawlingThread()
queue = Queue()
crawler = crawl.CrawlingThread(user, basis, queue)
crawler.start()
crawler.join()
urls = queue.get()
for url in urls:
t_start += 5 * random.weibullvariate(1, 1.5)
result.append((url, t_start))
return result
def generate_first_url(user):
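With the crawl integrated, generate_partial_history returns a list of (url, timestamp) pairs whose gaps follow the same 5 * random.weibullvariate(1, 1.5) dwell-time offset applied to the seed URL. A rough illustration with a stubbed crawl result (fake_partial_history and the sample URLs are made up for the example):

import random

def fake_partial_history(urls, t_start):
    # Mirrors only the timestamp logic: each visited URL is offset from
    # the previous one by a Weibull-distributed dwell time (seconds).
    result = []
    for url in urls:
        result.append((url, t_start))
        t_start += 5 * random.weibullvariate(1, 1.5)
    return result

for url, ts in fake_partial_history(["https://python.org/", "https://docs.python.org/3/"], 0.0):
    print("%8.2fs  %s" % (ts, url))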
@@ -104,7 +112,7 @@ def generate_history(user, ts_start):
while history_line < length:
ts_start += 5 * random.weibullvariate(1, 2.8)
history_list = generate_partial_history(user, ts_start, history)
history_list = generate_partial_history(user, ts_start)
ts_start = history_list[-1][1] + 5 * random.weibullvariate(1, 5)
for (url, timestamp) in history_list:
new_line = HistoryEntry(