Initial commit
This commit is contained in:
parent
9efd01daf2
commit
7e0f0271d5
8 changed files with 162 additions and 2 deletions
3
.gitignore
vendored
Normal file
3
.gitignore
vendored
Normal file
|
@ -0,0 +1,3 @@
|
||||||
|
*.pyc
|
||||||
|
__pycache__
|
||||||
|
trumprank.txt
|
3
.gitmodules
vendored
Normal file
3
.gitmodules
vendored
Normal file
|
@ -0,0 +1,3 @@
|
||||||
|
[submodule "trump_tweet_data_archive"]
|
||||||
|
path = trump_tweet_data_archive
|
||||||
|
url = https://github.com/bpb27/trump_tweet_data_archive.git
|
46
README.md
46
README.md
|
@ -1,3 +1,45 @@
|
||||||
# trump-vocabulary
|
# Trump vocabulary
|
||||||
|
|
||||||
Assesses the vocabulary of our beloved, greatest president.
|
**NOTE:** this was written in a few minutes without bothering with clean and robust
|
||||||
|
code.
|
||||||
|
|
||||||
|
This code goes through the tweets of Donald Trump and produces a ranked list of words
|
||||||
|
used.
|
||||||
|
|
||||||
|
The result (not much updated, though) can be found
|
||||||
|
[here](https://tobast.fr/files/trumprank.txt).
|
||||||
|
|
||||||
|
## Install
|
||||||
|
|
||||||
|
Clone this repository with submodules: `git clone --recurse-submodules`
|
||||||
|
|
||||||
|
Alternatively, if you already cloned the repo, you can run
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git submodule update --init --depth 1
|
||||||
|
```
|
||||||
|
|
||||||
|
## Get a shell
|
||||||
|
|
||||||
|
You can explore the data in a shell by using `count_words.py` as an init script for
|
||||||
|
your favorite shell, e.g.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
ipython -i count_words.py
|
||||||
|
```
|
||||||
|
|
||||||
|
The following will be available to you as variables:
|
||||||
|
|
||||||
|
* `tweets`: the list of all tweets ever,
|
||||||
|
* `occur`: python dictionary of occurrences of words in Trump's tweets
|
||||||
|
* `ranked`: ranked list of occurrences of words in Trump's tweets
|
||||||
|
|
||||||
|
## Generating the list
|
||||||
|
|
||||||
|
Simply run
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python ./generate_list.py [OUTPUT_FILE]
|
||||||
|
```
|
||||||
|
|
||||||
|
If you omit `OUTPUT_FILE`, the list will be generated to `trumprank.txt`.
|
||||||
|
|
0
__init__.py
Normal file
0
__init__.py
Normal file
18
bootstrap.py
Normal file
18
bootstrap.py
Normal file
|
@ -0,0 +1,18 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
import json
|
||||||
|
import zipfile
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
|
def load_archive(path):
    """Load one condensed tweet archive: a zip holding a single JSON file.

    The zip member is named like the archive itself minus the ".zip"
    suffix (i.e. ``path.stem``), e.g. ``condensed_2016.json``.
    Returns the parsed JSON content (a list of tweet dicts).
    """
    with zipfile.ZipFile(str(path), "r") as zipped:
        raw = zipped.read(path.stem)
    return json.loads(raw)
|
||||||
|
|
||||||
|
|
||||||
|
# Build the module-level `tweets` list at import time by concatenating the
# contents of every condensed per-year archive shipped in the submodule.
# NOTE(review): `iterdir()` raises FileNotFoundError if the submodule was
# not cloned (`git submodule update --init`) — confirm that is the intended
# failure mode.
tweets = []
for archive_path in Path("trump_tweet_data_archive/").iterdir():
    # Only condensed_*.json.zip archives are loaded; every other file in
    # the directory is ignored.
    if not archive_path.match("condensed_*.json.zip"):
        continue
    tweets += load_archive(archive_path)
|
78
count_words.py
Normal file
78
count_words.py
Normal file
|
@ -0,0 +1,78 @@
|
||||||
|
import re
|
||||||
|
|
||||||
|
|
||||||
|
class WordOccur:
    """A single word paired with the number of times it was seen.

    Instances render as ``[COUNT] word`` with the count zero-padded to
    five digits — the line format used in the generated ranking file.
    """

    def __init__(self, word, count):
        self.word = word
        self.count = count

    def __str__(self):
        return "[{0:05}] {1}".format(self.count, self.word)

    def __repr__(self):
        return str(self)
|
||||||
|
|
||||||
|
|
||||||
|
# Tokens are delimited by runs of spaces and hyphens.
WORD_RE = re.compile(r"[ -]+")
# A token is accepted as a word when it consists only of lowercase letters
# and apostrophes (see acceptword).  Note the `*` also matches "".
ACCEPT_WORD_RE = re.compile(r"[a-z']*")
|
||||||
|
|
||||||
|
|
||||||
|
def normalize(word):
    """Lowercase and trim *word*, dropping a possessive "'s" suffix."""
    cleaned = word.strip().lower()
    if cleaned.endswith("'s"):
        cleaned = cleaned[:-2]
    return cleaned
|
||||||
|
|
||||||
|
|
||||||
|
def splitwords(text):
    """Split *text* on WORD_RE separators and normalize every token."""
    return [normalize(token) for token in WORD_RE.split(text)]
|
||||||
|
|
||||||
|
|
||||||
|
def acceptword(word):
    """Return True when *word* is a non-empty run of a-z letters/apostrophes.

    Bug fix: ``ACCEPT_WORD_RE`` (``[a-z']*``) matches the empty string, and
    ``WORD_RE.split`` yields ``""`` tokens around leading/trailing
    separators, so without an explicit emptiness check the empty string was
    counted as a (heavily occurring) word.
    """
    return bool(word) and ACCEPT_WORD_RE.fullmatch(word) is not None
|
||||||
|
|
||||||
|
|
||||||
|
def process(tweets):
    """Count occurrences of accepted words across all tweet texts.

    Returns a plain dict mapping word -> occurrence count.
    """
    occurrences = {}
    for tweet in tweets:
        for word in splitwords(tweet["text"]):
            if acceptword(word):
                occurrences[word] = occurrences.get(word, 0) + 1
    return occurrences
|
||||||
|
|
||||||
|
|
||||||
|
def filter_unlikely(occur, threshold=4):
    """Return a dict keeping only words occurring at least *threshold* times."""
    return {word: count for word, count in occur.items() if count >= threshold}
|
||||||
|
|
||||||
|
|
||||||
|
def ranked_list(occur):
    """Turn an occurrence dict into WordOccur entries sorted by descending count."""
    entries = [WordOccur(word, count) for word, count in occur.items()]
    return sorted(entries, key=lambda entry: entry.count, reverse=True)
|
||||||
|
|
||||||
|
|
||||||
|
def writeout(ranked, path):
    """Write the ranked entries to *path*, one "<RANK> entry" line each."""
    header = "<RANK> [OCCUR] WORD\n===================\n"
    body = ["<{:04}> {}\n".format(position, entry)
            for position, entry in enumerate(ranked, start=1)]
    with open(path, "w") as handle:
        handle.write(header)
        handle.writelines(body)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Shell/bootstrap mode (see README: `ipython -i count_words.py`): load
    # the tweet archive and precompute the stats so that `tweets`, `occur`
    # and `ranked` are available as interactive variables.
    # Fix: import the one name actually used instead of a wildcard import
    # (`from bootstrap import *`), which pollutes the interactive namespace.
    from bootstrap import tweets

    occur = filter_unlikely(process(tweets))
    ranked = ranked_list(occur)
|
15
generate_list.py
Normal file
15
generate_list.py
Normal file
|
@ -0,0 +1,15 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import bootstrap
|
||||||
|
import count_words
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Rank every word used across the full tweet archive.
    word_counts = count_words.process(bootstrap.tweets)
    occur = count_words.filter_unlikely(word_counts)
    ranked = count_words.ranked_list(occur)

    # An optional first CLI argument overrides the default output file.
    out_file = sys.argv[1] if len(sys.argv) >= 2 else "trumprank.txt"
    print("Writing out to {}...".format(out_file))
    count_words.writeout(ranked, out_file)
|
1
trump_tweet_data_archive
Submodule
1
trump_tweet_data_archive
Submodule
|
@ -0,0 +1 @@
|
||||||
|
Subproject commit 4398599156418d650493b66a8c0898dfe9498062
|
Loading…
Reference in a new issue