Initial commit

This commit is contained in:
Théophile Bastian 2019-07-04 11:57:16 +02:00
parent 9efd01daf2
commit 7e0f0271d5
8 changed files with 162 additions and 2 deletions

3
.gitignore vendored Normal file
View File

@ -0,0 +1,3 @@
*.pyc
__pycache__
trumprank.txt

3
.gitmodules vendored Normal file
View File

@ -0,0 +1,3 @@
[submodule "trump_tweet_data_archive"]
path = trump_tweet_data_archive
url = https://github.com/bpb27/trump_tweet_data_archive.git

View File

@ -1,3 +1,45 @@
# trump-vocabulary
# Trump vocabulary
Assesses the vocabulary of our beloved, greatest president.
**NOTE:** this was written in a few minutes without bothering with clean and robust
code.
This code goes through the tweets of Donald Trump and produces a ranked list of words
used.
The result (not much updated, though) can be found
[here](https://tobast.fr/files/trumprank.txt).
## Install
Clone this repository with submodules: `git clone --recurse-submodules`
Alternatively, if you already cloned the repo, you can run
```bash
git submodule update --init --depth 1
```
## Get a shell
You can explore the data in a shell by using `count_words.py` as an init script for
your favorite shell, e.g.
```bash
ipython -i count_words.py
```
The following will be available to you as variables:
* `tweets`: the list of all tweets ever,
* `occur`: python dictionary of occurrences of words in Trump's tweets
* `ranked`: ranked list of occurrences of words in Trump's tweets
## Generating the list
Simply run
```bash
python ./generate_list.py [OUTPUT_FILE]
```
If you omit `OUTPUT_FILE`, the list will be generated to `trumprank.txt`.

0
__init__.py Normal file
View File

18
bootstrap.py Normal file
View File

@ -0,0 +1,18 @@
#!/usr/bin/env python3
import json
import zipfile
from pathlib import Path
def load_archive(path):
    """Load the JSON document stored inside a zipped archive.

    The member read from the archive is the one named after ``path`` without
    its last suffix (``path.stem``), e.g. ``foo.json`` for ``foo.json.zip``.

    :param path: ``pathlib.Path`` pointing to the zip archive.
    :return: the decoded JSON content of that member.
    """
    member_name = path.stem
    with zipfile.ZipFile(str(path), mode="r") as zipped:
        raw = zipped.read(member_name)
    return json.loads(raw)
# Collect the tweets of every condensed archive found in the submodule folder.
tweets = []
for archive_path in Path("trump_tweet_data_archive/").iterdir():
    # Only "condensed_*.json.zip" entries are loaded; anything else is skipped.
    if archive_path.match("condensed_*.json.zip"):
        tweets.extend(load_archive(archive_path))

78
count_words.py Normal file
View File

@ -0,0 +1,78 @@
import re
class WordOccur:
    """A word paired with the number of times it was counted."""

    def __init__(self, word, count):
        self.word, self.count = word, count

    def __str__(self):
        # The count, zero-padded to five digits, in brackets; then the word.
        template = "[{:05}] {}"
        return template.format(self.count, self.word)

    def __repr__(self):
        # The representation is the very same text as the string form.
        return str(self)
# Separator between two words: any run of whitespace (spaces, tabs, newlines)
# and/or hyphens. Splitting on every kind of whitespace prevents words that
# surround a line break from being glued into a single token, which
# ACCEPT_WORD_RE would then reject as a whole.
WORD_RE = re.compile(r"[\s-]+")
# A word is accepted when it is made only of lowercase ASCII letters and
# apostrophes AND contains at least one letter. Requiring a letter keeps the
# empty string (produced by leading/trailing separators) and lone apostrophes
# from being counted as words.
ACCEPT_WORD_RE = re.compile(r"'*[a-z][a-z']*")
def normalize(word):
    """Return ``word`` stripped, lowercased and without a trailing ``'s``."""
    cleaned = word.strip().lower()
    # Drop the possessive marker so that e.g. "trump's" becomes "trump".
    return cleaned[:-2] if cleaned.endswith("'s") else cleaned
def splitwords(text):
    """Split ``text`` on WORD_RE and return the list of normalized pieces."""
    return [normalize(piece) for piece in WORD_RE.split(text)]
def acceptword(word):
    """Tell whether the whole of ``word`` matches ACCEPT_WORD_RE."""
    return ACCEPT_WORD_RE.fullmatch(word) is not None
def process(tweets):
    """Count the accepted words over the ``"text"`` entry of every tweet.

    :param tweets: iterable of mappings, each providing a ``"text"`` entry.
    :return: dict mapping each accepted word to its number of occurrences.
    """
    counts = {}
    for tweet in tweets:
        for token in splitwords(tweet["text"]):
            if acceptword(token):
                counts[token] = counts.get(token, 0) + 1
    return counts
def filter_unlikely(occur, threshold=4):
    """Keep only the words counted at least ``threshold`` times.

    :param occur: dict mapping words to their occurrence count.
    :param threshold: minimal count (inclusive) for a word to be kept.
    :return: a new dict holding the surviving entries, in their original order.
    """
    return {word: hits for word, hits in occur.items() if hits >= threshold}
def ranked_list(occur):
    """Return the words of ``occur`` as WordOccur entries, most frequent first.

    The sort is stable, so entries with equal counts keep the relative order
    they have in ``occur``.

    :param occur: dict mapping words to their occurrence count.
    :return: list of WordOccur sorted by decreasing count.
    """
    entries = [WordOccur(word, hits) for word, hits in occur.items()]
    return sorted(entries, key=lambda entry: entry.count, reverse=True)
def writeout(ranked, path):
    """Write the ranked entries to ``path`` as a text table.

    The file starts with a two-line header; each entry then gets one line made
    of its 1-based rank, zero-padded to four digits, followed by the entry's
    string form.

    :param ranked: iterable of entries, already in ranking order.
    :param path: file to write (opened in ``"w"`` mode).
    """
    with open(path, "w") as out:
        out.write("<RANK> [OCCUR] WORD\n===================\n")
        for position, item in enumerate(ranked, start=1):
            out.write("<{:04}> {}\n".format(position, item))
if __name__ == "__main__":
    # Explicit import rather than a wildcard one: `tweets` is the only name
    # needed from bootstrap, and naming it shows where the variable comes from.
    from bootstrap import tweets

    # Kept at module level on purpose: when this file is used as an init script
    # for an interactive shell, `tweets`, `occur` and `ranked` stay available.
    occur = filter_unlikely(process(tweets))
    ranked = ranked_list(occur)

15
generate_list.py Normal file
View File

@ -0,0 +1,15 @@
#!/usr/bin/env python3
import sys
import bootstrap
import count_words
if __name__ == "__main__":
    # Destination: first command-line argument when given, default otherwise.
    out_file = sys.argv[1] if len(sys.argv) >= 2 else "trumprank.txt"

    # Count the words, filter the counts, then rank what is left.
    occur = count_words.process(bootstrap.tweets)
    occur = count_words.filter_unlikely(occur)
    ranked = count_words.ranked_list(occur)

    print("Writing out to {}...".format(out_file))
    count_words.writeout(ranked, out_file)

@ -0,0 +1 @@
Subproject commit 4398599156418d650493b66a8c0898dfe9498062