Initial commit

parent 9efd01daf2
commit 7e0f0271d5

8 changed files with 162 additions and 2 deletions
.gitignore (vendored, new file, 3 lines)
@@ -0,0 +1,3 @@
*.pyc
__pycache__
trumprank.txt
.gitmodules (vendored, new file, 3 lines)
@@ -0,0 +1,3 @@
[submodule "trump_tweet_data_archive"]
	path = trump_tweet_data_archive
	url = https://github.com/bpb27/trump_tweet_data_archive.git
README.md (46 lines changed)
@@ -1,3 +1,45 @@
# trump-vocabulary
# Trump vocabulary

Assesses the vocabulary of our beloved, greatest president.

**NOTE:** this was written in a few minutes without bothering with clean and robust
code.

This code goes through the tweets of Donald Trump and produces a ranked list of words
used.

The result (not updated very often, though) can be found
[here](https://tobast.fr/files/trumprank.txt).

## Install

Clone this repository with submodules: `git clone --recurse-submodules`

Alternatively, if you already cloned the repo, you can run

```bash
git submodule update --init --depth 1
```

## Get a shell

You can explore the data in a shell by using `count_words.py` as an init script for
your favorite shell, e.g.

```bash
ipython -i count_words.py
```

The following will be available to you as variables:

* `tweets`: the list of all tweets ever,
* `occur`: Python dictionary of occurrences of words in Trump's tweets,
* `ranked`: ranked list of occurrences of words in Trump's tweets.

## Generating the list

Simply run

```bash
python ./generate_list.py [OUTPUT_FILE]
```

If you omit `OUTPUT_FILE`, the list will be written to `trumprank.txt`.
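To illustrate the variables the README lists, here is a minimal sketch of what a session started with `ipython -i count_words.py` might look like; the words and counts in the comments are made up, not taken from the data:

```python
# `ranked` is a list of WordOccur objects sorted by descending count
# (built in count_words.py's __main__ block).
for entry in ranked[:10]:
    print(entry)            # prints e.g. "[04242] great" via WordOccur.__str__

# `occur` only keeps words seen at least 4 times (filter_unlikely's default
# threshold), so rare words may simply be absent.
print(occur.get("covfefe", 0))
```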
__init__.py (new file, empty)
bootstrap.py (new file, 18 lines)
@@ -0,0 +1,18 @@
#!/usr/bin/env python3

import json
import zipfile
from pathlib import Path


def load_archive(path):
    with zipfile.ZipFile(str(path), "r") as archive:
        json_data = archive.read(path.stem)
        return json.loads(json_data)


tweets = []
for archive_path in Path("trump_tweet_data_archive/").iterdir():
    if not archive_path.match("condensed_*.json.zip"):
        continue
    tweets += load_archive(archive_path)
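A minimal sketch of how the `tweets` list built by `bootstrap.py` can be consumed; it assumes the `trump_tweet_data_archive` submodule has been initialised, and relies only on the `"text"` field, since that is the only field the rest of this commit reads:

```python
# Quick sanity check of the data loaded by bootstrap.py (illustrative only).
from bootstrap import tweets

print("loaded {} tweets".format(len(tweets)))
if tweets:
    # count_words.process() only ever accesses tweet["text"]
    print(tweets[0]["text"])
```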
count_words.py (new file, 78 lines)
@@ -0,0 +1,78 @@
import re


class WordOccur:
    def __init__(self, word, count):
        self.word = word
        self.count = count

    def __str__(self):
        return "[{:05}] {}".format(self.count, self.word)

    def __repr__(self):
        return self.__str__()


WORD_RE = re.compile(r"[ -]+")
ACCEPT_WORD_RE = re.compile(r"[a-z']*")


def normalize(word):
    norm = word.strip().lower()
    if norm[-2:] == "'s":
        norm = norm[:-2]
    return norm


def splitwords(text):
    return list(map(normalize, WORD_RE.split(text)))


def acceptword(word):
    return bool(ACCEPT_WORD_RE.fullmatch(word))


def process(tweets):
    occurrences = {}
    for tweet in tweets:
        words = splitwords(tweet["text"])
        for word in words:
            if not acceptword(word):
                continue

            if word in occurrences:
                occurrences[word] += 1
            else:
                occurrences[word] = 1
    return occurrences


def filter_unlikely(occur, threshold=4):
    out = {}
    for word in occur:
        count = occur[word]
        if count >= threshold:
            out[word] = count
    return out


def ranked_list(occur):
    out = []
    for word in occur:
        out.append(WordOccur(word, occur[word]))
    out.sort(key=lambda x: x.count, reverse=True)
    return out


def writeout(ranked, path):
    with open(path, "w") as handle:
        handle.write("<RANK> [OCCUR] WORD\n===================\n")
        for rank, entry in enumerate(ranked):
            handle.write("<{:04}> {}\n".format(rank + 1, entry))


if __name__ == "__main__":
    from bootstrap import *

    occur = filter_unlikely(process(tweets))
    ranked = ranked_list(occur)
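To make the filtering rules in `count_words.py` concrete, here is a small sketch run against a made-up tweet; the sample text and the expected dictionary are illustrative, not taken from the archive:

```python
import count_words

# A fake tweet in the same shape process() expects: a dict with a "text" key.
sample = [{"text": "Make America's economy great great again!"}]

# splitwords() lowercases, strips a trailing "'s" and splits on spaces and
# hyphens; acceptword() then rejects anything outside [a-z'].
print(count_words.process(sample))
# expected: {'make': 1, 'america': 1, 'economy': 1, 'great': 2}
# "again!" is dropped because ACCEPT_WORD_RE does not allow "!".
```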
generate_list.py (new file, 15 lines)
@@ -0,0 +1,15 @@
#!/usr/bin/env python3

import sys
import bootstrap
import count_words

if __name__ == "__main__":
    occur = count_words.filter_unlikely(count_words.process(bootstrap.tweets))
    ranked = count_words.ranked_list(occur)

    out_file = "trumprank.txt"
    if len(sys.argv) >= 2:
        out_file = sys.argv[1]
    print("Writing out to {}...".format(out_file))
    count_words.writeout(ranked, out_file)
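The output format comes from `count_words.writeout()`: a two-line header followed by one `<RANK> [COUNT] word` entry per line, in descending order of count. A hypothetical peek at the generated file, assuming `generate_list.py` has been run with the default output path:

```python
# Print the header and the first few entries of the generated ranking.
with open("trumprank.txt") as handle:
    for line in list(handle)[:7]:
        print(line.rstrip("\n"))
```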
trump_tweet_data_archive (new submodule)
@@ -0,0 +1 @@
Subproject commit 4398599156418d650493b66a8c0898dfe9498062