|
1 | 1 | from collections import Counter
|
2 |
| -from string import punctuation |
| 2 | +from string import punctuation, whitespace |
3 | 3 | import sys
|
4 | 4 |
|
5 | 5 |
|
6 |
| -def strip_punctuation(word): |
7 |
| - '''Remove punctuation from a word''' |
8 |
| - return "".join(c for c in word if c not in punctuation) |
| 6 | +def most_common_str(s, n=None): |
| 7 | + words = s.lower().translate(str.maketrans('', '', punctuation)).split() |
| 8 | + return Counter(words).most_common(n) |
9 | 9 |
|
10 | 10 |
|
11 |
| -def get_words(text): |
12 |
| - '''Converts text into set of words without punctuation''' |
13 |
| - with open(text) as f: |
14 |
| - words = f.read().lower().split() |
15 |
| - words = [strip_punctuation(word) for word in words] |
16 |
| - # could remove stopwords but requires nltk.corpus |
17 |
| - return filter(None, words) |
| 11 | +def most_common_re(s, n=None): |
| 12 | + return Counter(re.findall(rf'[^{punctuation}{whitespace}]+', |
| 13 | + s.lower())).most_common(n) |
18 | 14 |
|
19 | 15 |
|
20 |
| -def get_most_common(words, n=None): |
21 |
| - '''Return n common words, if n is None, return all (also singles)''' |
22 |
| - return Counter(words).most_common(n) |
| 16 | +def most_common_iter(s, n=None): |
| 17 | + return Counter(''.join(c for c in w if c not in punctuation) |
| 18 | + for w in s.lower().split()).most_common(n) |
23 | 19 |
|
24 | 20 |
|
25 | 21 | if __name__ == "__main__":
|
26 | 22 | try:
|
27 |
| - harry = sys.argv[1] |
| 23 | + file = sys.argv[1] |
28 | 24 | except IndexError:
|
29 |
| - harry = 'harry.txt' |
| 25 | + file = 'harry.txt' |
30 | 26 |
|
31 |
| - words = get_words(harry) |
32 |
| - common_words = get_most_common(words, n=20) |
| 27 | + with open(file) as f: |
| 28 | + common_words = most_common_str(f.read(), n=20) |
33 | 29 |
|
34 | 30 | for word, count in common_words:
|
35 | 31 | print('{:<4} {}'.format(count, word))
|
0 commit comments