"""This module contains a code example related to
Think Python, 2nd Edition
by Allen Downey
http://thinkpython2.com
Copyright 2015 Allen Downey
License: http://creativecommons.org/licenses/by/4.0/
"""
from __future__ import print_function, division
import random
import string
def process_file(filename, skip_header):
    """Makes a histogram that contains the words from a file.

    Reading stops at the first line that starts with '*** END OF THIS',
    if there is one; otherwise the whole file is read.

    filename: string
    skip_header: boolean, whether to skip the Gutenberg header

    returns: map from each word to the number of times it appears.
    """
    hist = {}

    # The context manager closes the file even if processing raises.
    with open(filename) as fp:
        if skip_header:
            skip_gutenberg_header(fp)

        for line in fp:
            if line.startswith('*** END OF THIS'):
                break
            process_line(line, hist)

    return hist
def skip_gutenberg_header(fp):
    """Consumes lines from fp up to and including the header's last line.

    The header ends at the first line that starts with
    '*** START OF THIS'. If no such line exists, fp is read to the end.

    fp: open file object
    """
    marker = '*** START OF THIS'
    for header_line in fp:
        if header_line.startswith(marker):
            return
def process_line(line, hist):
    """Adds the words in the line to the histogram.

    Hyphens are treated as word separators. Each word is stripped of
    leading and trailing punctuation and whitespace, then lowercased.
    Tokens that are empty after stripping (for example '***' or '...')
    are not counted.

    Modifies hist.

    line: string
    hist: histogram (map from word to frequency)
    """
    strippables = string.punctuation + string.whitespace

    for token in line.replace('-', ' ').split():
        word = token.strip(strippables).lower()
        # A token made only of punctuation strips down to '', which is
        # not a word and must not get its own histogram entry.
        if not word:
            continue
        hist[word] = hist.get(word, 0) + 1
def most_common(hist):
    """Makes a list of (frequency, word) pairs, most frequent first.

    Pairs with equal frequency are ordered by word, descending.

    hist: map from word to frequency

    returns: list of (frequency, word) pairs
    """
    pairs = [(freq, word) for word, freq in hist.items()]
    # Sort ascending, then flip, to get descending order.
    return sorted(pairs)[::-1]
def print_most_common(hist, num=10):
    """Prints the most common words in a histogram with their frequencies.

    Each word is printed on its own line, followed by a tab and its
    frequency.

    hist: histogram (map from word to frequency)
    num: number of words to print
    """
    ranked = most_common(hist)
    print('The most common words are:')
    for count, term in ranked[:num]:
        print(term, '\t', count)
def subtract(d1, d2):
    """Returns a dictionary with the keys of d1 that are missing from d2.

    Every value in the result is None; only the keys carry information.

    d1, d2: dictionaries
    """
    return {key: None for key in d1 if key not in d2}
def total_words(hist):
    """Returns the sum of all the frequencies stored in a histogram.

    hist: histogram (map from word to frequency)
    """
    frequencies = hist.values()
    return sum(frequencies)
def different_words(hist):
    """Returns how many distinct words (keys) a histogram holds.

    hist: histogram (map from word to frequency)
    """
    distinct = len(hist)
    return distinct
def random_word(hist):
    """Picks a word at random from a histogram.

    A word's chance of being picked is proportional to its frequency,
    because each word is listed once per occurrence before choosing.

    hist: histogram (map from word to frequency)
    """
    pool = [word for word, freq in hist.items() for _ in range(freq)]
    return random.choice(pool)
def main():
    """Prints word statistics for the text file '158-0.txt'.

    Reports the total and distinct word counts, the 20 most common
    words, the words that do not appear in 'words.txt', and 100 words
    drawn at random in proportion to their frequency.
    """
    hist = process_file('158-0.txt', skip_header=True)
    print('Total number of words:', total_words(hist))
    print('Number of different words:', different_words(hist))

    # Use the shared helper instead of repeating its loop here.
    print_most_common(hist, 20)

    words = process_file('words.txt', skip_header=False)
    diff = subtract(hist, words)
    print("The words in the book that aren't in the word list are:")
    for word in diff:
        print(word, end=' ')

    print("\n\nHere are some random words from the book")
    for _ in range(100):
        print(random_word(hist), end=' ')
# Run the analysis only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()