79 lines
1.7 KiB
Python
79 lines
1.7 KiB
Python
import re
|
|
|
|
|
|
class WordOccur:
    """A word together with the number of times it was counted."""

    def __init__(self, word, count):
        """Store *word* and its occurrence *count* on the instance."""
        self.word, self.count = word, count

    def __str__(self):
        """Render as ``[count] word``; an integer count is zero-padded to 5 digits."""
        template = "[{:05}] {}"
        return template.format(self.count, self.word)

    def __repr__(self):
        """Return the same text as ``__str__``."""
        return self.__str__()
|
|
|
|
|
|
# Separator between tokens: one or more spaces and/or hyphens.
WORD_RE = re.compile(r"[ -]+")
# A countable word: one or more lowercase ASCII letters or apostrophes.
# The quantifier is "+" rather than "*" so that the empty string (produced by
# leading/trailing separators, or by stripping a bare "'s") is never accepted.
ACCEPT_WORD_RE = re.compile(r"[a-z']+")
|
|
|
|
|
|
def normalize(word):
    """Return *word* trimmed, lowercased, and without a trailing ``'s``.

    Surrounding whitespace is stripped first, then the text is lowercased;
    if the result ends with ``'s`` those two characters are dropped.
    """
    cleaned = word.strip().lower()
    return cleaned[:-2] if cleaned.endswith("'s") else cleaned
|
|
|
|
|
|
def splitwords(text):
    """Split *text* on ``WORD_RE`` and return the list of normalized pieces."""
    return [normalize(piece) for piece in WORD_RE.split(text)]
|
|
|
|
|
|
def acceptword(word):
    """Return True when the whole of *word* matches ``ACCEPT_WORD_RE``."""
    return ACCEPT_WORD_RE.fullmatch(word) is not None
|
|
|
|
|
|
def process(tweets):
    """Count accepted words across *tweets*.

    Each element of *tweets* is indexed with the key ``"text"``; that value is
    split with ``splitwords`` and every piece that passes ``acceptword`` is
    tallied.

    Returns a dict mapping each accepted word to its number of occurrences.
    """
    tally = {}
    for item in tweets:
        for token in splitwords(item["text"]):
            if acceptword(token):
                tally[token] = tally.get(token, 0) + 1
    return tally
|
|
|
|
|
|
def filter_unlikely(occur, threshold=4):
    """Return a new dict with only the entries of *occur* counted at least *threshold* times.

    The input mapping is not modified; surviving entries keep their
    original relative order.
    """
    return {token: tally for token, tally in occur.items() if tally >= threshold}
|
|
|
|
|
|
def ranked_list(occur):
    """Return a list of ``WordOccur`` built from *occur*, highest count first.

    The sort is stable, so entries with equal counts stay in the iteration
    order of *occur*.
    """
    entries = [WordOccur(token, occur[token]) for token in occur]
    return sorted(entries, key=lambda entry: entry.count, reverse=True)
|
|
|
|
|
|
def writeout(ranked, path):
    """Write the ranking to the text file at *path*, overwriting it.

    The file starts with a two-line header, followed by one line per entry of
    *ranked*: a 1-based rank zero-padded to four digits, then the entry's
    string form.
    """
    header = "<RANK> [OCCUR] WORD\n===================\n"
    row = "<{:04}> {}\n"
    with open(path, "w") as out_file:
        out_file.write(header)
        for position, item in enumerate(ranked, start=1):
            out_file.write(row.format(position, item))
|
|
|
|
|
|
if __name__ == "__main__":
|
|
from bootstrap import *
|
|
|
|
occur = filter_unlikely(process(tweets))
|
|
ranked = ranked_list(occur)
|