import re


class WordOccur:
    """A word paired with the number of times it was seen."""

    def __init__(self, word, count):
        self.word = word
        self.count = count

    def __str__(self):
        # Count is zero-padded to five digits so report columns line up.
        return "[{:05}] {}".format(self.count, self.word)

    def __repr__(self):
        return str(self)


# Separator between words: one or more spaces and/or hyphens.
WORD_RE = re.compile(r"[ -]+")
# An acceptable word is a non-empty run of lowercase ASCII letters and
# apostrophes.  The quantifier is ``+`` (not ``*``) so that the empty
# strings produced by leading/trailing separators, or by normalizing a
# bare "'s", are rejected instead of being counted as a word.
ACCEPT_WORD_RE = re.compile(r"[a-z']+")


def normalize(word):
    """Return *word* stripped of surrounding whitespace, lowercased, and
    with a trailing possessive ``'s`` removed."""
    norm = word.strip().lower()
    if norm.endswith("'s"):
        norm = norm[:-2]
    return norm


def splitwords(text):
    """Split *text* on runs of spaces/hyphens and normalize every piece.

    The result may contain empty strings (e.g. when *text* starts or ends
    with a separator); callers filter those with :func:`acceptword`.
    """
    return [normalize(piece) for piece in WORD_RE.split(text)]


def acceptword(word):
    """Return True if *word* is non-empty and consists solely of lowercase
    ASCII letters and apostrophes."""
    return bool(ACCEPT_WORD_RE.fullmatch(word))


def process(tweets):
    """Count accepted words across *tweets*.

    Each item of *tweets* must support ``item["text"]`` yielding a string.
    Returns a dict mapping each accepted word to its occurrence count.
    """
    occurrences = {}
    for tweet in tweets:
        for word in splitwords(tweet["text"]):
            if not acceptword(word):
                continue
            occurrences[word] = occurrences.get(word, 0) + 1
    return occurrences


def filter_unlikely(occur, threshold=4):
    """Return a new dict holding only the entries of *occur* whose count is
    at least *threshold*."""
    return {word: count for word, count in occur.items() if count >= threshold}


def ranked_list(occur):
    """Return :class:`WordOccur` entries for *occur*, highest count first.

    The sort is stable, so words with equal counts keep the iteration
    order of *occur*.
    """
    entries = [WordOccur(word, count) for word, count in occur.items()]
    return sorted(entries, key=lambda entry: entry.count, reverse=True)


def writeout(ranked, path):
    """Write *ranked* to the file at *path* as a report with 1-based ranks.

    The file is opened in ``"w"`` mode, so existing content is replaced.
    """
    with open(path, "w") as handle:
        handle.write(" [OCCUR] WORD\n===================\n")
        for rank, entry in enumerate(ranked, start=1):
            handle.write("<{:04}> {}\n".format(rank, entry))


if __name__ == "__main__":
    # NOTE(review): `tweets` is not defined in this file; presumably it is
    # supplied by this star import -- confirm against the bootstrap module.
    from bootstrap import *
    occur = filter_unlikely(process(tweets))
    ranked = ranked_list(occur)