CoCalc -- analyze

📚 The CoCalc Library - books, templates and other resources
Project: 📚 The Library - Shared Public Version
Path: cocalc-examples / think-python-2ed / code / analyze_book1.py
Views: ⁹⁶¹⁶⁶
License: OTHER
"""This module contains a code example related to

Think Python, 2nd Edition
by Allen Downey
http://thinkpython2.com

Copyright 2015 Allen Downey

License: http://creativecommons.org/licenses/by/4.0/
"""
11

12
from __future__ import print_function, division
13

14
import random
15
import string
16

17
def process_file(filename, skip_header):
    """Makes a histogram that contains the words from a file.

    Reading stops at the first line that starts with '*** END OF THIS',
    if there is one; otherwise the whole file is processed.

    filename: string
    skip_header: boolean, whether to skip the Gutenberg header

    returns: map from each word to the number of times it appears.
    """
    hist = {}

    # the with statement closes the file even if processing raises
    with open(filename) as fp:
        if skip_header:
            skip_gutenberg_header(fp)

        for line in fp:
            if line.startswith('*** END OF THIS'):
                break

            process_line(line, hist)

    return hist
38

39

40
def skip_gutenberg_header(fp):
    """Reads from fp until it finds the line that ends the header.

    Consumes lines up to and including the first one that starts with
    '*** START OF THIS'.  If no such line exists, fp is read to the end.

    fp: open file object
    """
    for line in fp:
        if line.startswith('*** START OF THIS'):
            break
48

49

50
def process_line(line, hist):
    """Adds the words in the line to the histogram.

    Modifies hist.

    Hyphens are treated as word separators.  Each word is stripped of
    leading and trailing punctuation and whitespace and converted to
    lowercase.  Tokens that consist only of punctuation are ignored.

    line: string
    hist: histogram (map from word to frequency)
    """
    # TODO: rewrite using Counter

    # replace hyphens with spaces before splitting
    line = line.replace('-', ' ')
    strippables = string.punctuation + string.whitespace

    for word in line.split():
        # remove punctuation and convert to lowercase
        word = word.strip(strippables).lower()

        # a token made only of punctuation (e.g. '...') strips to '';
        # skip it so the empty string is not counted as a word
        if not word:
            continue

        # update the histogram
        hist[word] = hist.get(word, 0) + 1
71

72

73
def most_common(hist):
    """Makes a list of freq-word pairs in descending order of frequency.

    Pairs with equal frequency are ordered by word, also descending,
    because the (frequency, word) tuples are compared as a whole.

    hist: map from word to frequency

    returns: list of (frequency, word) pairs
    """
    pairs = ((freq, word) for word, freq in hist.items())
    return sorted(pairs, reverse=True)
87

88

89
def print_most_common(hist, num=10):
    """Prints the most common words in a histogram and their frequencies.

    Prints a header line, then one line per word with the word and its
    frequency separated by a tab.

    hist: histogram (map from word to frequency)
    num: number of words to print
    """
    t = most_common(hist)
    print('The most common words are:')
    for freq, word in t[:num]:
        print(word, '\t', freq)
99

100

101
def subtract(d1, d2):
    """Returns a dictionary with all keys that appear in d1 but not d2.

    The values in the result are all None; only the keys are meaningful.

    d1, d2: dictionaries
    """
    # TODO: reimplement using Counter
    return {key: None for key in d1 if key not in d2}
112

113

114
def total_words(hist):
    """Returns the total of the frequencies in a histogram.

    hist: histogram (map from word to frequency)
    """
    return sum(hist.values())
117

118

119
def different_words(hist):
    """Returns the number of different words in a histogram.

    hist: histogram (map from word to frequency)
    """
    return len(hist)
122

123

124
def random_word(hist):
    """Chooses a random word from a histogram.

    The probability of each word is proportional to its frequency.

    hist: histogram (map from word to frequency)

    Raises IndexError if the histogram has no words to choose from.
    """
    # TODO: rewrite using Counter
    total = sum(hist.values())
    if total <= 0:
        raise IndexError('cannot choose a word from an empty histogram')

    # pick a position among all word occurrences, then walk the
    # cumulative frequencies to find the word that owns that position;
    # this avoids building a list with one entry per occurrence
    index = random.randrange(total)
    for word, freq in hist.items():
        index -= freq
        if index < 0:
            return word
135

136

137
def main():
    """Prints word statistics for the book stored in '158-0.txt'.

    Reports the total and distinct word counts, the 20 most common
    words, the words of the book that do not appear in 'words.txt',
    and 100 words chosen at random in proportion to their frequency.
    """
    hist = process_file('158-0.txt', skip_header=True)
    print('Total number of words:', total_words(hist))
    print('Number of different words:', different_words(hist))

    # prints the 'The most common words are:' header and the top 20 entries
    print_most_common(hist, num=20)

    words = process_file('words.txt', skip_header=False)

    diff = subtract(hist, words)
    print("The words in the book that aren't in the word list are:")
    for word in diff:
        print(word, end=' ')

    print("\n\nHere are some random words from the book")
    for _ in range(100):
        print(random_word(hist), end=' ')
157

158

159
# run the analysis only when this file is executed as a script
if __name__ == '__main__':
    main()
161

162

163

164
Product

Resources

Company