Contact
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutSign UpSign In
| Download

📚 The CoCalc Library - books, templates and other resources

Views: 96166
License: OTHER
1
"""This module contains a code example related to
2
3
Think Python, 2nd Edition
4
by Allen Downey
5
http://thinkpython2.com
6
7
Copyright 2015 Allen Downey
8
9
License: http://creativecommons.org/licenses/by/4.0/
10
"""
11
12
from __future__ import print_function, division
13
14
import random
15
import string
16
17
def process_file(filename, skip_header):
    """Makes a histogram that contains the words from a file.

    Reading stops at the first line that starts with '*** END OF THIS' or
    '*** END OF THE' (the Gutenberg footer marker, in either wording), so
    text after that marker is not counted.

    filename: string
    skip_header: boolean, whether to skip the Gutenberg header

    returns: map from each word to the number of times it appears.
    """
    hist = {}
    end_markers = ('*** END OF THIS', '*** END OF THE')

    # The with-block closes the file even if processing raises.
    with open(filename) as fp:
        if skip_header:
            skip_gutenberg_header(fp)

        for line in fp:
            if line.startswith(end_markers):
                break

            process_line(line, hist)

    return hist
38
39
40
def skip_gutenberg_header(fp):
    """Reads from fp until it finds the line that ends the header.

    The header ends at the first line that starts with '*** START OF THIS'
    or '*** START OF THE'; both wordings of the marker are accepted. The
    marker line itself is consumed, so the next line read from fp is the
    first line after it. If no marker is found, fp is read to the end.

    fp: open file object (any iterator over lines works)
    """
    start_markers = ('*** START OF THIS', '*** START OF THE')
    for line in fp:
        # str.startswith accepts a tuple and matches any of its prefixes.
        if line.startswith(start_markers):
            break
48
49
50
def process_line(line, hist):
    """Adds the words in the line to the histogram.

    Modifies hist.

    Hyphens are treated as word separators. Each word has leading and
    trailing punctuation and whitespace (as defined by the string module)
    stripped and is converted to lowercase. Tokens that consist only of
    punctuation, such as '***', strip down to the empty string and are
    skipped rather than counted as a word.

    line: string
    hist: histogram (map from word to frequency)
    """
    # TODO: rewrite using Counter

    # replace hyphens with spaces before splitting
    line = line.replace('-', ' ')
    strippables = string.punctuation + string.whitespace

    for word in line.split():
        # remove punctuation and convert to lowercase
        word = word.strip(strippables).lower()

        # nothing left after stripping: not a word, do not count it
        if not word:
            continue

        # update the histogram
        hist[word] = hist.get(word, 0) + 1
71
72
73
def most_common(hist):
    """Ranks the entries of a histogram from most to least frequent.

    Pairs are compared as tuples, so entries with the same frequency are
    ordered by word, in descending order.

    hist: map from word to frequency

    returns: list of (frequency, word) pairs, highest frequency first
    """
    pairs = ((freq, word) for word, freq in hist.items())
    return sorted(pairs, reverse=True)
87
88
89
def print_most_common(hist, num=10):
    """Prints the most common words in a histogram and their frequencies.

    Prints a heading line, then one line per word: the word, a tab, and
    its frequency, starting with the most frequent word.

    hist: histogram (map from word to frequency)
    num: number of words to print
    """
    ranked = most_common(hist)
    print('The most common words are:')
    for count, term in ranked[:num]:
        print(term, '\t', count)
99
100
101
def subtract(d1, d2):
    """Returns a dictionary with all keys that appear in d1 but not d2.

    Only the keys matter: every value in the result is None.

    d1, d2: dictionaries
    """
    # TODO: reimplement using Counter
    return {key: None for key in d1 if key not in d2}
112
113
114
def total_words(hist):
    """Returns the total of the frequencies in a histogram.

    hist: map from word to frequency
    """
    frequencies = hist.values()
    return sum(frequencies)
117
118
119
def different_words(hist):
    """Returns the number of different words in a histogram.

    This is the number of keys in hist, regardless of their frequencies.

    hist: map from word to frequency
    """
    return len(hist)
122
123
124
def random_word(hist):
    """Chooses a random word from a histogram.

    The probability of each word is proportional to its frequency: the
    function builds a list in which every word is repeated as many times
    as its frequency and picks one element uniformly at random.

    hist: map from word to frequency
    """
    # TODO: rewrite using Counter
    pool = [word for word, freq in hist.items() for _ in range(freq)]
    return random.choice(pool)
135
136
137
def main():
    """Runs the word-frequency demo and prints the results.

    Builds a histogram from '158-0.txt' (skipping the Gutenberg header),
    prints the total and distinct word counts and the 20 most common
    words, then prints the words of the book that do not appear in
    'words.txt', followed by 100 randomly chosen words from the book.
    """
    hist = process_file('158-0.txt', skip_header=True)
    print('Total number of words:', total_words(hist))
    print('Number of different words:', different_words(hist))

    # Reuse the helper instead of repeating its loop inline.
    print_most_common(hist, num=20)

    words = process_file('words.txt', skip_header=False)

    diff = subtract(hist, words)
    print("The words in the book that aren't in the word list are:")
    for word in diff:
        print(word, end=' ')

    print("\n\nHere are some random words from the book")
    for _ in range(100):
        print(random_word(hist), end=' ')
157
158
159
# Run the demo only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
161
162
163
164