Initial commit
This commit is contained in:
parent
9efd01daf2
commit
7e0f0271d5
8 changed files with 162 additions and 2 deletions
3
.gitignore
vendored
Normal file
3
.gitignore
vendored
Normal file
|
@ -0,0 +1,3 @@
|
||||||
|
*.pyc
|
||||||
|
__pycache__
|
||||||
|
trumprank.txt
|
3
.gitmodules
vendored
Normal file
3
.gitmodules
vendored
Normal file
|
@ -0,0 +1,3 @@
|
||||||
|
[submodule "trump_tweet_data_archive"]
|
||||||
|
path = trump_tweet_data_archive
|
||||||
|
url = https://github.com/bpb27/trump_tweet_data_archive.git
|
46
README.md
46
README.md
|
@ -1,3 +1,45 @@
|
||||||
# trump-vocabulary
|
# Trump vocabulary
|
||||||
|
|
||||||
Assesses the vocabulary of our beloved, greatest president.
|
**NOTE:** this was written in a few minutes without bothering with clean and robust
|
||||||
|
code.
|
||||||
|
|
||||||
|
This code goes through the tweets of Donald Trump and produces a ranked list of words
|
||||||
|
used.
|
||||||
|
|
||||||
|
The result (not much updated, though) can be found
|
||||||
|
[here](https://tobast.fr/files/trumprank.txt).
|
||||||
|
|
||||||
|
## Install
|
||||||
|
|
||||||
|
Clone this repository with submodules: `git clone --recurse-submodules`
|
||||||
|
|
||||||
|
Alternatively, if you already cloned the repo, you can run
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git submodule update --init --depth 1
|
||||||
|
```
|
||||||
|
|
||||||
|
## Get a shell
|
||||||
|
|
||||||
|
You can explore the data in a shell by using `count_words.py` as an init script for
|
||||||
|
your favorite shell, e.g.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
ipython -i count_words.py
|
||||||
|
```
|
||||||
|
|
||||||
|
The following will be available to you as variables:
|
||||||
|
|
||||||
|
* `tweets`: the list of all tweets ever,
|
||||||
|
* `occur`: python dictionary of occurrences of words in Trump's tweets
|
||||||
|
* `ranked`: ranked list of occurrences of words in Trump's tweets
|
||||||
|
|
||||||
|
## Generating the list
|
||||||
|
|
||||||
|
Simply run
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python ./generate_list.py [OUTPUT_FILE]
|
||||||
|
```
|
||||||
|
|
||||||
|
If you omit `OUTPUT_FILE`, the list will be generated to `trumprank.txt`.
|
||||||
|
|
0
__init__.py
Normal file
0
__init__.py
Normal file
18
bootstrap.py
Normal file
18
bootstrap.py
Normal file
|
@ -0,0 +1,18 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
import json
|
||||||
|
import zipfile
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
|
def load_archive(path):
    """Load one condensed tweet archive: a zip holding a single JSON file.

    The zip member is named like the archive itself minus the ".zip"
    suffix (i.e. ``path.stem``), e.g. ``condensed_2016.json``.
    Returns the parsed JSON content (a list of tweet dicts).
    """
    with zipfile.ZipFile(str(path), "r") as zipped:
        raw = zipped.read(path.stem)
    return json.loads(raw)
|
||||||
|
|
||||||
|
|
||||||
|
# Build the module-level `tweets` list at import time by concatenating the
# contents of every condensed per-year archive shipped in the submodule.
# NOTE(review): `iterdir()` raises FileNotFoundError if the submodule was
# not cloned (`git submodule update --init`) — confirm that is the intended
# failure mode.
tweets = []
for archive_path in Path("trump_tweet_data_archive/").iterdir():
    # Only condensed_*.json.zip archives are loaded; every other file in
    # the directory is ignored.
    if not archive_path.match("condensed_*.json.zip"):
        continue
    tweets += load_archive(archive_path)
|
78
count_words.py
Normal file
78
count_words.py
Normal file
|
@ -0,0 +1,78 @@
|
||||||
|
import re
|
||||||
|
|
||||||
|
|
||||||
|
class WordOccur:
    """A single word paired with the number of times it was seen.

    Instances render as ``[COUNT] word`` with the count zero-padded to
    five digits — the line format used in the generated ranking file.
    """

    def __init__(self, word, count):
        self.word = word
        self.count = count

    def __str__(self):
        return "[{0:05}] {1}".format(self.count, self.word)

    def __repr__(self):
        return str(self)
|
||||||
|
|
||||||
|
|
||||||
|
# Tokens are delimited by runs of spaces and hyphens.
WORD_RE = re.compile(r"[ -]+")
# A token is accepted as a word when it consists only of lowercase letters
# and apostrophes (see acceptword).  Note the `*` also matches "".
ACCEPT_WORD_RE = re.compile(r"[a-z']*")
|
||||||
|
|
||||||
|
|
||||||
|
def normalize(word):
    """Lowercase and trim *word*, dropping a possessive "'s" suffix."""
    cleaned = word.strip().lower()
    if cleaned.endswith("'s"):
        cleaned = cleaned[:-2]
    return cleaned
|
||||||
|
|
||||||
|
|
||||||
|
def splitwords(text):
    """Split *text* on WORD_RE separators and normalize every token."""
    return [normalize(token) for token in WORD_RE.split(text)]
|
||||||
|
|
||||||
|
|
||||||
|
def acceptword(word):
    """Return True when *word* is a non-empty run of a-z letters/apostrophes.

    Bug fix: ``ACCEPT_WORD_RE`` (``[a-z']*``) matches the empty string, and
    ``WORD_RE.split`` yields ``""`` tokens around leading/trailing
    separators, so without an explicit emptiness check the empty string was
    counted as a (heavily occurring) word.
    """
    return bool(word) and ACCEPT_WORD_RE.fullmatch(word) is not None
|
||||||
|
|
||||||
|
|
||||||
|
def process(tweets):
    """Count occurrences of accepted words across all tweet texts.

    Returns a plain dict mapping word -> occurrence count.
    """
    occurrences = {}
    for tweet in tweets:
        for word in splitwords(tweet["text"]):
            if acceptword(word):
                occurrences[word] = occurrences.get(word, 0) + 1
    return occurrences
|
||||||
|
|
||||||
|
|
||||||
|
def filter_unlikely(occur, threshold=4):
    """Return a dict keeping only words occurring at least *threshold* times."""
    return {word: count for word, count in occur.items() if count >= threshold}
|
||||||
|
|
||||||
|
|
||||||
|
def ranked_list(occur):
    """Turn an occurrence dict into WordOccur entries sorted by descending count."""
    entries = [WordOccur(word, count) for word, count in occur.items()]
    return sorted(entries, key=lambda entry: entry.count, reverse=True)
|
||||||
|
|
||||||
|
|
||||||
|
def writeout(ranked, path):
    """Write the ranked entries to *path*, one "<RANK> entry" line each."""
    header = "<RANK> [OCCUR] WORD\n===================\n"
    body = ["<{:04}> {}\n".format(position, entry)
            for position, entry in enumerate(ranked, start=1)]
    with open(path, "w") as handle:
        handle.write(header)
        handle.writelines(body)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Shell/bootstrap mode (see README: `ipython -i count_words.py`): load
    # the tweet archive and precompute the stats so that `tweets`, `occur`
    # and `ranked` are available as interactive variables.
    # Fix: import the one name actually used instead of a wildcard import
    # (`from bootstrap import *`), which pollutes the interactive namespace.
    from bootstrap import tweets

    occur = filter_unlikely(process(tweets))
    ranked = ranked_list(occur)
|
15
generate_list.py
Normal file
15
generate_list.py
Normal file
|
@ -0,0 +1,15 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import bootstrap
|
||||||
|
import count_words
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Rank every word used across the full tweet archive.
    word_counts = count_words.process(bootstrap.tweets)
    occur = count_words.filter_unlikely(word_counts)
    ranked = count_words.ranked_list(occur)

    # An optional first CLI argument overrides the default output file.
    out_file = sys.argv[1] if len(sys.argv) >= 2 else "trumprank.txt"
    print("Writing out to {}...".format(out_file))
    count_words.writeout(ranked, out_file)
|
1
trump_tweet_data_archive
Submodule
1
trump_tweet_data_archive
Submodule
|
@ -0,0 +1 @@
|
||||||
|
Subproject commit 4398599156418d650493b66a8c0898dfe9498062
|
Loading…
Reference in a new issue