From 7e0f0271d5a15b5a8fa0683ad53a8c6347d9c5b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9ophile=20Bastian?= Date: Thu, 4 Jul 2019 11:57:16 +0200 Subject: [PATCH] Initial commit --- .gitignore | 3 ++ .gitmodules | 3 ++ README.md | 46 ++++++++++++++++++++++-- __init__.py | 0 bootstrap.py | 18 ++++++++++ count_words.py | 78 ++++++++++++++++++++++++++++++++++++++++ generate_list.py | 15 ++++++++ trump_tweet_data_archive | 1 + 8 files changed, 162 insertions(+), 2 deletions(-) create mode 100644 .gitignore create mode 100644 .gitmodules create mode 100644 __init__.py create mode 100644 bootstrap.py create mode 100644 count_words.py create mode 100644 generate_list.py create mode 160000 trump_tweet_data_archive diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..5c3ed74 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +*.pyc +__pycache__ +trumprank.txt diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..d0aa3a0 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "trump_tweet_data_archive"] + path = trump_tweet_data_archive + url = https://github.com/bpb27/trump_tweet_data_archive.git diff --git a/README.md b/README.md index be3fab9..6751f11 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,45 @@ -# trump-vocabulary +# Trump vocabulary -Assesses the vocabulary of our beloved, greatest president. \ No newline at end of file +**NOTE:** this was written in a few minutes without bothering with clean and robust +code. + +This code goes through the tweets of Donald Trump and produces a ranked list of words +used. + +The result (not much updated, though) can be found +[here](https://tobast.fr/files/trumprank.txt). 
+ +## Install + +Clone this repository with submodules: `git clone --recurse-submodules` + +Alternatively, if you already cloned the repo, you can run + +```bash +git submodule update --init --depth 1 +``` + +## Get a shell + +You can explore the data in a shell by using `count_words.py` as an init script for +your favorite shell, e.g. + +```bash +ipython -i count_words.py +``` + +The following will be available to you as variables: + +* `tweets`: the list of all tweets ever, +* `occur`: python dictionary of occurrences of words in Trump's tweets +* `ranked`: ranked list of occurrences of words in Trump's tweets + +## Generating the list + +Simply run + +```bash +python ./generate_list.py [OUTPUT_FILE] +``` + +If you omit `OUTPUT_FILE`, the list will be generated to `trumprank.txt`. diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/bootstrap.py b/bootstrap.py new file mode 100644 index 0000000..98a452b --- /dev/null +++ b/bootstrap.py @@ -0,0 +1,18 @@ +#!/usr/bin/env python3 + +import json +import zipfile +from pathlib import Path + + +def load_archive(path): + with zipfile.ZipFile(str(path), "r") as archive: + json_data = archive.read(path.stem) + return json.loads(json_data) + + +tweets = [] +for archive_path in Path("trump_tweet_data_archive/").iterdir(): + if not archive_path.match("condensed_*.json.zip"): + continue + tweets += load_archive(archive_path) diff --git a/count_words.py b/count_words.py new file mode 100644 index 0000000..cba4484 --- /dev/null +++ b/count_words.py @@ -0,0 +1,78 @@ +import re + + +class WordOccur: + def __init__(self, word, count): + self.word = word + self.count = count + + def __str__(self): + return "[{:05}] {}".format(self.count, self.word) + + def __repr__(self): + return self.__str__() + + +WORD_RE = re.compile(r"[ -]+") +ACCEPT_WORD_RE = re.compile(r"[a-z']*") + + +def normalize(word): + norm = word.strip().lower() + if norm[-2:] == "'s": + norm = norm[:-2] + return norm + + +def 
splitwords(text): + return list(map(normalize, WORD_RE.split(text))) + + +def acceptword(word): + return bool(ACCEPT_WORD_RE.fullmatch(word)) + + +def process(tweets): + occurrences = {} + for tweet in tweets: + words = splitwords(tweet["text"]) + for word in words: + if not acceptword(word): + continue + + if word in occurrences: + occurrences[word] += 1 + else: + occurrences[word] = 1 + return occurrences + + +def filter_unlikely(occur, threshold=4): + out = {} + for word in occur: + count = occur[word] + if count >= threshold: + out[word] = count + return out + + +def ranked_list(occur): + out = [] + for word in occur: + out.append(WordOccur(word, occur[word])) + out.sort(key=lambda x: x.count, reverse=True) + return out + + +def writeout(ranked, path): + with open(path, "w") as handle: + handle.write(" [OCCUR] WORD\n===================\n") + for rank, entry in enumerate(ranked): + handle.write("<{:04}> {}\n".format(rank + 1, entry)) + + +if __name__ == "__main__": + from bootstrap import * + + occur = filter_unlikely(process(tweets)) + ranked = ranked_list(occur) diff --git a/generate_list.py b/generate_list.py new file mode 100644 index 0000000..a9cb6d1 --- /dev/null +++ b/generate_list.py @@ -0,0 +1,15 @@ +#!/usr/bin/env python3 + +import sys +import bootstrap +import count_words + +if __name__ == "__main__": + occur = count_words.filter_unlikely(count_words.process(bootstrap.tweets)) + ranked = count_words.ranked_list(occur) + + out_file = "trumprank.txt" + if len(sys.argv) >= 2: + out_file = sys.argv[1] + print("Writing out to {}...".format(out_file)) + count_words.writeout(ranked, out_file) diff --git a/trump_tweet_data_archive b/trump_tweet_data_archive new file mode 160000 index 0000000..4398599 --- /dev/null +++ b/trump_tweet_data_archive @@ -0,0 +1 @@ +Subproject commit 4398599156418d650493b66a8c0898dfe9498062