You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
78 lines
1.7 KiB
78 lines
1.7 KiB
import re |
|
|
|
|
|
class WordOccur:
    """A word paired with the number of times it occurred.

    Both values are stored as plain public attributes: ``word`` and
    ``count``.
    """

    def __init__(self, word, count):
        """Store *word* and *count* unchanged on the instance."""
        self.count = count
        self.word = word

    def __str__(self):
        """Render as ``[NNNNN] word``; the count is padded to width 5."""
        template = "[{:05}] {}"
        return template.format(self.count, self.word)

    def __repr__(self):
        """Return the same text as ``__str__``."""
        return self.__str__()
|
|
|
|
|
# Separator between words: any run of whitespace and/or dashes.  Using ``\s``
# (rather than a literal space) keeps words that are separated by a newline
# or tab from staying glued together and being rejected as a single token.
WORD_RE = re.compile(r"[\s-]+")

# A countable word: ONE OR MORE lowercase ASCII letters or apostrophes.  The
# ``+`` quantifier matters: with ``*`` the empty string would fully match, so
# the empty tokens produced by leading/trailing separators would be counted.
ACCEPT_WORD_RE = re.compile(r"[a-z']+")
|
|
|
|
|
def normalize(word):
    """Return *word* trimmed, lowercased and without a trailing ``'s``.

    Surrounding whitespace is stripped first, then the text is lowercased;
    if the result ends in ``'s`` those two characters are removed.
    """
    cleaned = word.strip().lower()
    return cleaned[:-2] if cleaned.endswith("'s") else cleaned
|
|
|
|
|
def splitwords(text):
    """Split *text* on ``WORD_RE`` and return the normalized pieces as a list.

    Every piece produced by the split is passed through ``normalize``; no
    piece is dropped here.
    """
    return [normalize(piece) for piece in WORD_RE.split(text)]
|
|
|
|
|
def acceptword(word):
    """Return ``True`` when *word* matches ``ACCEPT_WORD_RE`` in its entirety."""
    return ACCEPT_WORD_RE.fullmatch(word) is not None
|
|
|
|
|
def process(tweets):
    """Count accepted words across *tweets*.

    Each item of *tweets* is indexed with the key ``"text"``; that text is
    split with ``splitwords`` and every piece that passes ``acceptword`` is
    tallied.

    Returns a dict mapping each accepted word to its number of occurrences.
    """
    tally = {}
    for tweet in tweets:
        for token in splitwords(tweet["text"]):
            if acceptword(token):
                tally[token] = tally.get(token, 0) + 1
    return tally
|
|
|
|
|
def filter_unlikely(occur, threshold=4):
    """Return a new dict with only the entries counted at least *threshold* times.

    *occur* maps words to counts and is left unmodified; entries keep their
    original relative order.
    """
    return {
        word: count
        for word, count in occur.items()
        if count >= threshold
    }
|
|
|
|
|
def ranked_list(occur):
    """Return ``WordOccur`` entries for *occur*, highest count first.

    *occur* maps words to counts.  The sort is stable, so entries with equal
    counts keep the order in which *occur* yields them.
    """
    entries = [WordOccur(word, occur[word]) for word in occur]
    return sorted(entries, key=lambda entry: entry.count, reverse=True)
|
|
|
|
|
def writeout(ranked, path):
    """Write *ranked* to the text file at *path*, one numbered entry per line.

    The file is opened in ``"w"`` mode, so existing content is replaced.  A
    two-line header comes first; each entry then appears as ``<NNNN> entry``
    where ``NNNN`` is its 1-based position padded to width 4 and ``entry`` is
    the entry's default string formatting.
    """
    with open(path, "w") as out:
        out.write("<RANK> [OCCUR] WORD\n===================\n")
        for position, item in enumerate(ranked, start=1):
            out.write("<{:04}> {}\n".format(position, item))
|
|
|
|
|
if __name__ == "__main__":
    # ``tweets`` is not defined in this file; it is expected to arrive via
    # this wildcard import.
    from bootstrap import *

    # Count words, drop those below the default threshold, then rank them.
    word_counts = process(tweets)
    occur = filter_unlikely(word_counts)
    # NOTE(review): ``ranked`` is built but never passed to writeout() here --
    # confirm whether that is intentional (e.g. for interactive inspection).
    ranked = ranked_list(occur)
|
|
|