# trump-vocabulary/count_words.py

import re

class WordOccur:
    def __init__(self, word, count):
        self.word = word
        self.count = count

    def __str__(self):
        return "[{:05}] {}".format(self.count, self.word)

    def __repr__(self):
        return self.__str__()

# Words are separated by spaces or hyphens.
WORD_RE = re.compile(r"[ -]+")
# Accepted words contain only lowercase letters and apostrophes; require at
# least one character so empty tokens from stray separators are rejected.
ACCEPT_WORD_RE = re.compile(r"[a-z']+")

def normalize(word):
    norm = word.strip().lower()
    # Fold possessives ("America's") into the base word.
    if norm[-2:] == "'s":
        norm = norm[:-2]
    return norm

def splitwords(text):
    return list(map(normalize, WORD_RE.split(text)))


def acceptword(word):
    return bool(ACCEPT_WORD_RE.fullmatch(word))
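
# A rough sketch of how a tweet's text flows through the helpers above
# (example values only, not part of the script):
#   splitwords("Make America Great Again!") -> ["make", "america", "great", "again!"]
#   acceptword("again!")  -> False   ("!" is neither a lowercase letter nor an apostrophe)
#   acceptword("america") -> True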

def process(tweets):
    # Tally how often each accepted word appears across all tweets.
    occurrences = {}
    for tweet in tweets:
        words = splitwords(tweet["text"])
        for word in words:
            if not acceptword(word):
                continue
            if word in occurrences:
                occurrences[word] += 1
            else:
                occurrences[word] = 1
    return occurrences

def filter_unlikely(occur, threshold=4):
    out = {}
    for word in occur:
        count = occur[word]
        if count >= threshold:
            out[word] = count
    return out

def ranked_list(occur):
    out = []
    for word in occur:
        out.append(WordOccur(word, occur[word]))
    out.sort(key=lambda x: x.count, reverse=True)
    return out

def writeout(ranked, path):
    with open(path, "w") as handle:
        handle.write("<RANK> [OCCUR] WORD\n===================\n")
        for rank, entry in enumerate(ranked):
            handle.write("<{:04}> {}\n".format(rank + 1, entry))
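
# writeout emits a fixed-width report, one line per ranked word, along the
# lines of (counts invented for illustration):
#   <RANK> [OCCUR] WORD
#   ===================
#   <0001> [00812] great
#   <0002> [00514] america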

if __name__ == "__main__":
    # bootstrap is expected to provide the `tweets` collection used below.
    from bootstrap import *

    occur = filter_unlikely(process(tweets))
    ranked = ranked_list(occur)