Contact
CoCalc Logo Icon
StoreFeaturesDocsShareSupport News AboutSign UpSign In
| Download

Think Stats, by Allen B. Downey, is an introduction to probability and statistics for Python programmers.

This is the accompanying code for this book.

Website: http://greenteapress.com/wp/think-stats-2e/

Views: 7115
License: GPL3
1
"""This file contains code for use with "Think Stats",
by Allen B. Downey, available from greenteapress.com

Copyright 2010 Allen B. Downey
License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html
"""
7
8
from __future__ import print_function, division
9
10
import math
11
import sys
12
import pandas
13
import numpy as np
14
15
import thinkstats2
16
import thinkplot
17
18
19
def Summarize(df, column, title):
    """Print summary statistics male, female and all.

    df: DataFrame with a `sex` column (rows with sex == 1 are reported
        as 'male', rows with sex == 2 as 'female') and the column
        named by `column`
    column: string name of the column to summarize
    title: string printed as a heading above the table

    Prints one tab-separated row per group containing n, mean,
    variance, standard deviation and coefficient of variation
    (std / mean).  Returns None.
    """
    items = [
        ('all', df[column]),
        ('male', df[df.sex == 1][column]),
        ('female', df[df.sex == 2][column]),
    ]

    print(title)
    print('key\tn\tmean\tvar\tstd\tcv')
    for key, series in items:
        mean, var = series.mean(), series.var()
        std = math.sqrt(var)
        cv = std / mean
        # NOTE(review): len() counts every row in the group, including
        # missing values, whereas pandas mean()/var() skip NaN by default,
        # so n can exceed the number of values the statistics are based on.
        t = key, len(series), mean, var, std, cv
        print('%s\t%d\t%4.2f\t%4.2f\t%4.2f\t%4.4f' % t)
36
37
38
def CleanBrfssFrame(df):
    """Recodes BRFSS variables.

    df: DataFrame with columns age, htm3, wtkg2 and wtyrago

    Modifies df in place and returns None.  The listed sentinel codes
    are replaced with NaN, wtkg2 is divided by 100, and wtyrago is
    rescaled (see below).
    """
    nan = float('NaN')

    # Columns are reassigned explicitly rather than calling
    # df.col.replace(..., inplace=True): the chained in-place form acts on
    # an intermediate Series and is not guaranteed to update df (it warns
    # on recent pandas and is a no-op under Copy-on-Write).

    # clean age
    df['age'] = df['age'].replace([7, 9], nan)

    # clean height
    df['htm3'] = df['htm3'].replace([999], nan)

    # clean weight
    df['wtkg2'] = df['wtkg2'].replace([99999], nan) / 100.0

    # clean weight a year ago
    # Values below 9000 are divided by 2.2 (presumably pounds -> kg);
    # values of 9000 or more have the 9000 offset removed (presumably
    # already in kg -- confirm against the BRFSS codebook).  NaN fails
    # the `< 9000` test, takes the second branch and stays NaN.
    wtyrago = df['wtyrago'].replace([7777, 9999], nan)
    df['wtyrago'] = wtyrago.apply(lambda x: x/2.2 if x < 9000 else x-9000)
56
57
58
def ReadBrfss(filename='CDBRFS08.ASC.gz', compression='gzip', nrows=None):
    """Reads the BRFSS data.

    filename: string
    compression: string
    nrows: int number of rows to read, or None for all

    returns: DataFrame with columns age, sex, wtyrago, finalwt, wtkg2
             and htm3, already recoded by CleanBrfssFrame
    """
    # (name, start, end, type) for each fixed-width field; start/end are
    # 1-based column positions as written here.
    var_info = [
        ('age', 101, 102, int),
        ('sex', 143, 143, int),
        ('wtyrago', 127, 130, int),
        ('finalwt', 799, 808, int),
        ('wtkg2', 1254, 1258, int),
        ('htm3', 1251, 1253, int),
    ]
    columns = ['name', 'start', 'end', 'type']
    variables = pandas.DataFrame(var_info, columns=columns)

    # NOTE(review): presumably converts the inclusive end positions above
    # into the exclusive ends FixedWidthVariables expects -- confirm
    # against thinkstats2.
    variables['end'] += 1
    dct = thinkstats2.FixedWidthVariables(variables, index_base=1)

    df = dct.ReadFixedWidth(filename, compression=compression, nrows=nrows)
    CleanBrfssFrame(df)
    return df
83
84
85
def MakeNormalModel(weights):
    """Plots a CDF with a Normal model.

    weights: sequence

    Prints n, mean and std, then draws the model curve (light gray,
    thick line) over mean +/- 4 std followed by the CDF of the data.
    """
    cdf = thinkstats2.Cdf(weights, label='weights')

    # Model parameters come from thinkstats2.TrimmedMeanVar (presumably
    # an outlier-trimmed estimate -- see thinkstats2).
    mean, var = thinkstats2.TrimmedMeanVar(weights)
    std = math.sqrt(var)
    print('n, mean, std', len(weights), mean, std)

    xmin = mean - 4 * std
    xmax = mean + 4 * std

    xs, ps = thinkstats2.RenderNormalCdf(mean, std, xmin, xmax)
    thinkplot.Plot(xs, ps, label='model', linewidth=4, color='0.8')
    thinkplot.Cdf(cdf)
102
103
104
def MakeNormalPlot(weights):
    """Generates a normal probability plot of the given weights.

    weights: sequence

    Draws a fitted model line (light gray) over the range [-5, 5],
    then the normal probability plot of the data.
    """
    mean, var = thinkstats2.TrimmedMeanVar(weights, p=0.01)
    std = math.sqrt(var)

    # Model line: FitLine is called with mean as the intercept argument
    # and std as the slope argument.
    xs = [-5, 5]
    xs, ys = thinkstats2.FitLine(xs, mean, std)
    thinkplot.Plot(xs, ys, color='0.8', label='model')

    xs, ys = thinkstats2.NormalProbability(weights)
    thinkplot.Plot(xs, ys, label='weights')
118
119
120
def MakeFigures(df):
    """Generates CDFs and normal prob plots for weights and log weights.

    df: DataFrame with a wtkg2 column; missing values are dropped

    Saves two two-panel figures via thinkplot.Save, with roots
    'brfss_weight' and 'brfss_weight_normal'.
    """
    weights = df.wtkg2.dropna()
    log_weights = np.log10(weights)

    # plot weights on linear and log scales
    thinkplot.PrePlot(cols=2)
    MakeNormalModel(weights)
    thinkplot.Config(xlabel='adult weight (kg)', ylabel='CDF')

    thinkplot.SubPlot(2)
    MakeNormalModel(log_weights)
    thinkplot.Config(xlabel='adult weight (log10 kg)')

    thinkplot.Save(root='brfss_weight')

    # make normal probability plots on linear and log scales
    thinkplot.PrePlot(cols=2)
    MakeNormalPlot(weights)
    thinkplot.Config(xlabel='z', ylabel='weights (kg)')

    thinkplot.SubPlot(2)
    MakeNormalPlot(log_weights)
    thinkplot.Config(xlabel='z', ylabel='weights (log10 kg)')

    thinkplot.Save(root='brfss_weight_normal')
146
147
148
def main(script, nrows=1000):
    """Tests the functions in this module.

    script: string script name
    nrows: number of rows to read; passed through int(), so a
           command-line string is accepted
    """
    thinkstats2.RandomSeed(17)

    nrows = int(nrows)
    df = ReadBrfss(nrows=nrows)
    MakeFigures(df)

    Summarize(df, 'htm3', 'Height (cm):')
    Summarize(df, 'wtkg2', 'Weight (kg):')
    Summarize(df, 'wtyrago', 'Weight year ago (kg):')

    # The expected counts below are only checked for the first 1000 rows
    # of the data file.
    if nrows == 1000:
        assert(df.age.value_counts()[40] == 28)
        assert(df.sex.value_counts()[2] == 668)
        assert(df.wtkg2.value_counts()[90.91] == 49)
        assert(df.wtyrago.value_counts()[160/2.2] == 49)
        assert(df.htm3.value_counts()[163] == 103)
        assert(df.finalwt.value_counts()[185.870345] == 13)
        # NOTE(review): nesting reconstructed -- reported only when the
        # checks above actually ran.
        print('%s: All tests passed.' % script)
171
172
173
if __name__ == '__main__':
    # sys.argv[0] becomes `script`; an optional first argument becomes nrows.
    main(*sys.argv)
175
176