8 changed files with 162 additions and 2 deletions
@ -0,0 +1,3 @@
@@ -0,0 +1,3 @@
|
||||
*.pyc |
||||
__pycache__ |
||||
trumprank.txt |
@ -0,0 +1,3 @@
@@ -0,0 +1,3 @@
|
||||
[submodule "trump_tweet_data_archive"] |
||||
path = trump_tweet_data_archive |
||||
url = https://github.com/bpb27/trump_tweet_data_archive.git |
@ -1,3 +1,45 @@
@@ -1,3 +1,45 @@
|
||||
# trump-vocabulary |
||||
# Trump vocabulary |
||||
|
||||
Assesses the vocabulary of our beloved, greatest president. |
||||
**NOTE:** this was written in a few minutes without bothering with clean and robust |
||||
code. |
||||
|
||||
This code goes through the tweets of Donald Trump and produces a ranked list of words |
||||
used. |
||||
|
||||
The result (not much updated, though) can be found |
||||
[here](https://tobast.fr/files/trumprank.txt). |
||||
|
||||
## Install |
||||
|
||||
Clone this repository with submodules: `git clone --recurse-submodules <repository-url>` |
||||
|
||||
Alternatively, if you already cloned the repo, you can run |
||||
|
||||
```bash |
||||
git submodule update --init --depth 1 |
||||
``` |
||||
|
||||
## Get a shell |
||||
|
||||
You can explore the data in a shell by using `count_words.py` as an init script for |
||||
your favorite shell, e.g. |
||||
|
||||
```bash |
||||
ipython -i count_words.py |
||||
``` |
||||
|
||||
The following will be available to you as variables: |
||||
|
||||
* `tweets`: the list of all tweets loaded from the archive |
||||
* `occur`: python dictionary of occurrences of words in Trump's tweets |
||||
* `ranked`: ranked list of occurrences of words in Trump's tweets |
||||
|
||||
## Generating the list |
||||
|
||||
Simply run |
||||
|
||||
```bash |
||||
python ./generate_list.py [OUTPUT_FILE] |
||||
``` |
||||
|
||||
If you omit `OUTPUT_FILE`, the list will be generated to `trumprank.txt`. |
||||
|
@ -0,0 +1,18 @@
@@ -0,0 +1,18 @@
|
||||
#!/usr/bin/env python3 |
||||
|
||||
import json |
||||
import zipfile |
||||
from pathlib import Path |
||||
|
||||
|
||||
def load_archive(path):
    """Load the JSON document stored inside a zipped archive.

    The archive must contain a member named after the archive file itself
    minus its last suffix (``condensed_2017.json.zip`` is expected to hold
    ``condensed_2017.json``).

    Args:
        path: location of the ``.zip`` file, given as a ``str`` or any
            ``os.PathLike`` object (for instance a ``pathlib.Path``).

    Returns:
        The decoded JSON content of that member.

    Raises:
        KeyError: if the archive has no member with the expected name.
    """
    # Coerce first so plain strings work as well: only Path objects have
    # the `.stem` attribute used to derive the member name.
    path = Path(path)
    with zipfile.ZipFile(str(path), "r") as archive:
        json_data = archive.read(path.stem)
    return json.loads(json_data)
||||
|
||||
|
||||
# Flat list of every entry found in the "condensed" archives of the
# trump_tweet_data_archive submodule.
tweets = []
# iterdir() yields directory entries in arbitrary order; sorting them makes
# the order of `tweets`, and of anything derived from it, reproducible.
for archive_path in sorted(Path("trump_tweet_data_archive/").iterdir()):
    # Skip anything in the directory that is not a condensed archive.
    if archive_path.match("condensed_*.json.zip"):
        tweets.extend(load_archive(archive_path))
@ -0,0 +1,78 @@
@@ -0,0 +1,78 @@
|
||||
import re |
||||
|
||||
|
||||
class WordOccur:
    """A word paired with the number of times it was counted."""

    def __init__(self, word, count):
        self.word, self.count = word, count

    def __str__(self):
        # The count is zero-padded to five digits, followed by the word.
        template = "[{:05}] {}"
        return template.format(self.count, self.word)

    def __repr__(self):
        # Same rendering as str().
        return str(self)
||||
|
||||
|
||||
# Separator between words: any run of whitespace and/or hyphens. Matching \s
# rather than a literal space also splits on newlines and tabs, which would
# otherwise glue two words into a single token.
WORD_RE = re.compile(r"[\s-]+")
# A countable word: one or more lowercase ASCII letters or apostrophes. The
# `+` quantifier matters: the empty strings produced when a text starts or
# ends with a separator must not be accepted as a word.
ACCEPT_WORD_RE = re.compile(r"[a-z']+")
||||
|
||||
|
||||
def normalize(word):
    """Return the canonical form of *word* used for counting.

    Surrounding whitespace is removed, the word is lower-cased, typographic
    apostrophes (U+2019) are replaced by the ASCII apostrophe, and a trailing
    possessive ``'s`` is dropped. Any other punctuation is left untouched.

    Args:
        word: the raw token, as a string.

    Returns:
        The normalized string, which may be empty.
    """
    # Fold the curly apostrophe first so that both the possessive check
    # below and any later ASCII-only filtering see a plain "'".
    norm = word.strip().lower().replace("\u2019", "'")
    if norm.endswith("'s"):
        norm = norm[:-2]
    return norm
||||
|
||||
|
||||
def splitwords(text):
    """Split *text* at every match of WORD_RE and normalize each piece.

    Returns the list of normalized pieces, in their order of appearance.
    """
    return [normalize(piece) for piece in WORD_RE.split(text)]
||||
|
||||
|
||||
def acceptword(word):
    """Tell whether *word*, taken as a whole, matches ACCEPT_WORD_RE."""
    return ACCEPT_WORD_RE.fullmatch(word) is not None
||||
|
||||
|
||||
def process(tweets):
    """Count the accepted words found in the text of every tweet.

    Args:
        tweets: iterable of mappings, each providing its text under the
            "text" key.

    Returns:
        A dict mapping each accepted word to its number of occurrences.
    """
    counts = {}
    for tweet in tweets:
        for token in splitwords(tweet["text"]):
            # Tokens rejected by acceptword() are simply not counted.
            if acceptword(token):
                counts[token] = counts.get(token, 0) + 1
    return counts
||||
|
||||
|
||||
def filter_unlikely(occur, threshold=4):
    """Return the entries of *occur* whose count is at least *threshold*.

    Args:
        occur: dict mapping words to their number of occurrences.
        threshold: minimal count for a word to be kept (inclusive).

    Returns:
        A new dict holding only the retained words, in their original order.
    """
    return {
        word: count
        for word, count in occur.items()
        if count >= threshold
    }
||||
|
||||
|
||||
def ranked_list(occur):
    """Turn a word -> count dict into WordOccur entries, most frequent first.

    Entries with equal counts keep the relative order they had in *occur*.
    """
    entries = [WordOccur(word, count) for word, count in occur.items()]
    return sorted(entries, key=lambda entry: entry.count, reverse=True)
||||
|
||||
|
||||
def writeout(ranked, path):
    """Write the ranked entries to the text file at *path*.

    The file starts with a two-line header, followed by one line per entry
    made of its 1-based rank (zero-padded to four digits) and the entry's
    string form.

    Args:
        ranked: iterable of entries, already in ranking order.
        path: destination file; it is created or overwritten.
    """
    header = "<RANK> [OCCUR] WORD\n===================\n"
    with open(path, "w") as handle:
        handle.write(header)
        handle.writelines(
            "<{:04}> {}\n".format(position, entry)
            for position, entry in enumerate(ranked, start=1)
        )
||||
|
||||
|
||||
if __name__ == "__main__":
    # Run as a script (for instance `ipython -i count_words.py`): import
    # everything bootstrap exports, `tweets` included, then leave `occur`
    # and `ranked` defined at module level.
    from bootstrap import *

    occur = process(tweets)
    occur = filter_unlikely(occur)
    ranked = ranked_list(occur)
@ -0,0 +1,15 @@
@@ -0,0 +1,15 @@
|
||||
#!/usr/bin/env python3 |
||||
|
||||
import sys |
||||
import bootstrap |
||||
import count_words |
||||
|
||||
if __name__ == "__main__":
    # The first command-line argument, when given, replaces the default
    # output file name.
    out_file = sys.argv[1] if len(sys.argv) >= 2 else "trumprank.txt"

    occur = count_words.filter_unlikely(
        count_words.process(bootstrap.tweets)
    )
    ranked = count_words.ranked_list(occur)

    print("Writing out to {}...".format(out_file))
    count_words.writeout(ranked, out_file)
Loading…
Reference in new issue