# trump-vocabulary/count_words.py

import re

class WordOccur:
    def __init__(self, word, count):
        self.word = word
        self.count = count

    def __str__(self):
        return "[{:05}] {}".format(self.count, self.word)

    def __repr__(self):
        return self.__str__()

# Words are separated by spaces or hyphens.
WORD_RE = re.compile(r"[ -]+")
# Accepted words contain only lowercase letters and apostrophes; require at
# least one character so empty tokens from stray separators are rejected.
ACCEPT_WORD_RE = re.compile(r"[a-z']+")

def normalize(word):
    norm = word.strip().lower()
    # Fold possessives ("America's") into the base word.
    if norm[-2:] == "'s":
        norm = norm[:-2]
    return norm

def splitwords(text):
    return list(map(normalize, WORD_RE.split(text)))


def acceptword(word):
    return bool(ACCEPT_WORD_RE.fullmatch(word))
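
# A rough sketch of how a tweet's text flows through the helpers above
# (example values only, not part of the script):
#   splitwords("Make America Great Again!") -> ["make", "america", "great", "again!"]
#   acceptword("again!")  -> False   ("!" is neither a lowercase letter nor an apostrophe)
#   acceptword("america") -> True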

def process(tweets):
    # Tally how often each accepted word appears across all tweets.
    occurrences = {}
    for tweet in tweets:
        words = splitwords(tweet["text"])
        for word in words:
            if not acceptword(word):
                continue
            if word in occurrences:
                occurrences[word] += 1
            else:
                occurrences[word] = 1
    return occurrences

def filter_unlikely(occur, threshold=4):
    out = {}
    for word in occur:
        count = occur[word]
        if count >= threshold:
            out[word] = count
    return out

def ranked_list(occur):
    out = []
    for word in occur:
        out.append(WordOccur(word, occur[word]))
    out.sort(key=lambda x: x.count, reverse=True)
    return out

def writeout(ranked, path):
    with open(path, "w") as handle:
        handle.write("<RANK> [OCCUR] WORD\n===================\n")
        for rank, entry in enumerate(ranked):
            handle.write("<{:04}> {}\n".format(rank + 1, entry))
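
# writeout emits a fixed-width report, one line per ranked word, along the
# lines of (counts invented for illustration):
#   <RANK> [OCCUR] WORD
#   ===================
#   <0001> [00812] great
#   <0002> [00514] america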

if __name__ == "__main__":
    # bootstrap is expected to provide the `tweets` collection used below.
    from bootstrap import *

    occur = filter_unlikely(process(tweets))
    ranked = ranked_list(occur)