Initial commit

2019-07-04 11:57:16 +02:00 · 2019-07-04 11:57:16 +02:00 · 7e0f0271d5
commit 7e0f0271d5
parent 9efd01daf2
8 changed files with 162 additions and 2 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,3 @@
+*.pyc
+__pycache__
+trumprank.txt
--- a/.gitmodules
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "trump_tweet_data_archive"]
+	path = trump_tweet_data_archive
+	url = https://github.com/bpb27/trump_tweet_data_archive.git
--- a/README.md
+++ b/README.md
@@ -1,3 +1,45 @@
-# trump-vocabulary
+# Trump vocabulary

-Assesses the vocabulary of our beloved, greatest president.
+**NOTE:** this was written in a few minutes without bothering with clean and robust
+code.
+
+This code goes through the tweets of Donald Trump and produces a ranked list of words
+used.
+
+The result (rarely updated, though) can be found
+[here](https://tobast.fr/files/trumprank.txt).
+
+## Install
+
+Clone this repository with submodules: `git clone --recurse-submodules`
+
+Alternatively, if you already cloned the repo, you can run
+
+```bash
+git submodule update --init --depth 1
+```
+
+## Get a shell
+
+You can explore the data in a shell by using `count_words.py` as an init script for
+your favorite shell, e.g.
+
+```bash
+ipython -i count_words.py
+```
+
+The following will be available to you as variables:
+
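+* `tweets`: the list of all tweets ever,
+* `occur`: Python dictionary mapping each word in Trump's tweets to its number of occurrences (words seen fewer than 4 times are dropped),
+* `ranked`: list of the words in `occur` with their counts, sorted by decreasing number of occurrences.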
+
+## Generating the list
+
+Simply run
+
+```bash
+python ./generate_list.py [OUTPUT_FILE]
+```
+
+If you omit `OUTPUT_FILE`, the list will be generated to `trumprank.txt`.
--- a/init.py
+++ b/init.py
--- a/bootstrap.py
+++ b/bootstrap.py
@@ -0,0 +1,18 @@
+#!/usr/bin/env python3
+
+import json
+import zipfile
+from pathlib import Path
+
+
+def load_archive(path):
+    with zipfile.ZipFile(str(path), "r") as archive:
+        json_data = archive.read(path.stem)
+    return json.loads(json_data)
+
+
+tweets = []
+for archive_path in Path("trump_tweet_data_archive/").iterdir():
+    if not archive_path.match("condensed_*.json.zip"):
+        continue
+    tweets += load_archive(archive_path)
--- a/count_words.py
+++ b/count_words.py
@@ -0,0 +1,78 @@
+import re
+
+
+class WordOccur:
+    def __init__(self, word, count):
+        self.word = word
+        self.count = count
+
+    def __str__(self):
+        return "[{:05}] {}".format(self.count, self.word)
+
+    def __repr__(self):
+        return self.__str__()
+
+
+WORD_RE = re.compile(r"[ -]+")
+ACCEPT_WORD_RE = re.compile(r"[a-z']*")
+
+
+def normalize(word):
+    norm = word.strip().lower()
+    if norm[-2:] == "'s":
+        norm = norm[:-2]
+    return norm
+
+
+def splitwords(text):
+    return list(map(normalize, WORD_RE.split(text)))
+
+
+def acceptword(word):
+    return bool(ACCEPT_WORD_RE.fullmatch(word))
+
+
+def process(tweets):
+    occurrences = {}
+    for tweet in tweets:
+        words = splitwords(tweet["text"])
+        for word in words:
+            if not acceptword(word):
+                continue
+
+            if word in occurrences:
+                occurrences[word] += 1
+            else:
+                occurrences[word] = 1
+    return occurrences
+
+
+def filter_unlikely(occur, threshold=4):
+    out = {}
+    for word in occur:
+        count = occur[word]
+        if count >= threshold:
+            out[word] = count
+    return out
+
+
+def ranked_list(occur):
+    out = []
+    for word in occur:
+        out.append(WordOccur(word, occur[word]))
+    out.sort(key=lambda x: x.count, reverse=True)
+    return out
+
+
+def writeout(ranked, path):
+    with open(path, "w") as handle:
+        handle.write("<RANK> [OCCUR] WORD\n===================\n")
+        for rank, entry in enumerate(ranked):
+            handle.write("<{:04}> {}\n".format(rank + 1, entry))
+
+
+if __name__ == "__main__":
+    from bootstrap import *
+
+    occur = filter_unlikely(process(tweets))
+    ranked = ranked_list(occur)
--- a/generate_list.py
+++ b/generate_list.py
@@ -0,0 +1,15 @@
+#!/usr/bin/env python3
+
+import sys
+import bootstrap
+import count_words
+
+if __name__ == "__main__":
+    occur = count_words.filter_unlikely(count_words.process(bootstrap.tweets))
+    ranked = count_words.ranked_list(occur)
+
+    out_file = "trumprank.txt"
+    if len(sys.argv) >= 2:
+        out_file = sys.argv[1]
+    print("Writing out to {}...".format(out_file))
+    count_words.writeout(ranked, out_file)
--- a/1
+++ b/1
@@ -0,0 +1 @@
+Subproject commit 4398599156418d650493b66a8c0898dfe9498062
				`@ -0,0 +1 @@`
				`Subproject commit 4398599156418d650493b66a8c0898dfe9498062`