Contact
CoCalc Logo Icon
StoreFeaturesDocsShareSupport News AboutSign UpSign In
| Download

Think Stats by Allen B. Downey Think Stats is an introduction to Probability and Statistics for Python programmers.

This is the accompanying code for this book.

Website: http://greenteapress.com/wp/think-stats-2e/

Views: 7114
License: GPL3
1
"""This file contains code used in "Think Stats",
2
by Allen B. Downey, available from greenteapress.com
3
4
Copyright 2010 Allen B. Downey
5
License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html
6
"""
7
8
from __future__ import print_function, division
9
10
import math
11
12
import numpy as np
13
import pandas
14
15
import nsfg
16
import thinkplot
17
import thinkstats2
18
19
20
def ParetoMedian(xmin, alpha):
21
"""Computes the median of a Pareto distribution."""
22
return xmin * pow(2, 1/alpha)
23
24
25
def MakeExpoCdf():
26
"""Generates a plot of the exponential CDF."""
27
28
thinkplot.PrePlot(3)
29
for lam in [2.0, 1, 0.5]:
30
xs, ps = thinkstats2.RenderExpoCdf(lam, 0, 3.0, 50)
31
label = r'$\lambda=%g$' % lam
32
thinkplot.Plot(xs, ps, label=label)
33
34
thinkplot.Save(root='analytic_expo_cdf',
35
title='Exponential CDF',
36
xlabel='x',
37
ylabel='CDF')
38
39
40
def ReadBabyBoom(filename='babyboom.dat'):
41
"""Reads the babyboom data.
42
43
filename: string
44
45
returns: DataFrame
46
"""
47
var_info = [
48
('time', 1, 8, int),
49
('sex', 9, 16, int),
50
('weight_g', 17, 24, int),
51
('minutes', 25, 32, int),
52
]
53
columns = ['name', 'start', 'end', 'type']
54
variables = pandas.DataFrame(var_info, columns=columns)
55
variables.end += 1
56
dct = thinkstats2.FixedWidthVariables(variables, index_base=1)
57
58
df = dct.ReadFixedWidth(filename, skiprows=59)
59
return df
60
61
62
def MakeBabyBoom():
63
"""Plot CDF of interarrival time on log and linear scales.
64
"""
65
# compute the interarrival times
66
df = ReadBabyBoom()
67
diffs = df.minutes.diff()
68
cdf = thinkstats2.Cdf(diffs, label='actual')
69
70
thinkplot.PrePlot(cols=2)
71
thinkplot.Cdf(cdf)
72
thinkplot.Config(xlabel='minutes',
73
ylabel='CDF',
74
legend=False)
75
76
thinkplot.SubPlot(2)
77
thinkplot.Cdf(cdf, complement=True)
78
thinkplot.Config(xlabel='minutes',
79
ylabel='CCDF',
80
yscale='log',
81
legend=False)
82
83
thinkplot.Save(root='analytic_interarrivals',
84
legend=False)
85
86
87
def MakeParetoCdf():
88
"""Generates a plot of the Pareto CDF."""
89
xmin = 0.5
90
91
thinkplot.PrePlot(3)
92
for alpha in [2.0, 1.0, 0.5]:
93
xs, ps = thinkstats2.RenderParetoCdf(xmin, alpha, 0, 10.0, n=100)
94
thinkplot.Plot(xs, ps, label=r'$\alpha=%g$' % alpha)
95
96
thinkplot.Save(root='analytic_pareto_cdf',
97
title='Pareto CDF',
98
xlabel='x',
99
ylabel='CDF')
100
101
102
def MakeParetoCdf2():
103
"""Generates a plot of the CDF of height in Pareto World."""
104
xmin = 100
105
alpha = 1.7
106
xs, ps = thinkstats2.RenderParetoCdf(xmin, alpha, 0, 1000.0, n=100)
107
thinkplot.Plot(xs, ps)
108
109
thinkplot.Save(root='analytic_pareto_height',
110
title='Pareto CDF',
111
xlabel='height (cm)',
112
ylabel='CDF',
113
legend=False)
114
115
116
def MakeNormalCdf():
117
"""Generates a plot of the normal CDF."""
118
119
thinkplot.PrePlot(3)
120
121
mus = [1.0, 2.0, 3.0]
122
sigmas = [0.5, 0.4, 0.3]
123
for mu, sigma in zip(mus, sigmas):
124
xs, ps = thinkstats2.RenderNormalCdf(mu=mu, sigma=sigma,
125
low=-1.0, high=4.0)
126
label = r'$\mu=%g$, $\sigma=%g$' % (mu, sigma)
127
thinkplot.Plot(xs, ps, label=label)
128
129
thinkplot.Save(root='analytic_normal_cdf',
130
title='Normal CDF',
131
xlabel='x',
132
ylabel='CDF',
133
loc=2)
134
135
136
def MakeNormalModel(weights):
137
"""Plot the CDF of birthweights with a normal model."""
138
139
# estimate parameters: trimming outliers yields a better fit
140
mu, var = thinkstats2.TrimmedMeanVar(weights, p=0.01)
141
print('Mean, Var', mu, var)
142
143
# plot the model
144
sigma = math.sqrt(var)
145
print('Sigma', sigma)
146
xs, ps = thinkstats2.RenderNormalCdf(mu, sigma, low=0, high=12.5)
147
148
thinkplot.Plot(xs, ps, label='model', color='0.8')
149
150
# plot the data
151
cdf = thinkstats2.Cdf(weights, label='data')
152
153
thinkplot.PrePlot(1)
154
thinkplot.Cdf(cdf)
155
thinkplot.Save(root='analytic_birthwgt_model',
156
title='Birth weights',
157
xlabel='birth weight (lbs)',
158
ylabel='CDF')
159
160
161
def MakeExampleNormalPlot():
162
"""Generates a sample normal probability plot.
163
"""
164
n = 1000
165
thinkplot.PrePlot(3)
166
167
mus = [0, 1, 5]
168
sigmas = [1, 1, 2]
169
for mu, sigma in zip(mus, sigmas):
170
sample = np.random.normal(mu, sigma, n)
171
xs, ys = thinkstats2.NormalProbability(sample)
172
label = '$\mu=%d$, $\sigma=%d$' % (mu, sigma)
173
thinkplot.Plot(xs, ys, label=label)
174
175
thinkplot.Save(root='analytic_normal_prob_example',
176
title='Normal probability plot',
177
xlabel='standard normal sample',
178
ylabel='sample values')
179
180
181
def MakeNormalPlot(weights, term_weights):
182
"""Generates a normal probability plot of birth weights."""
183
184
mean, var = thinkstats2.TrimmedMeanVar(weights, p=0.01)
185
std = math.sqrt(var)
186
187
xs = [-4, 4]
188
fxs, fys = thinkstats2.FitLine(xs, mean, std)
189
thinkplot.Plot(fxs, fys, linewidth=4, color='0.8')
190
191
thinkplot.PrePlot(2)
192
xs, ys = thinkstats2.NormalProbability(weights)
193
thinkplot.Plot(xs, ys, label='all live')
194
195
xs, ys = thinkstats2.NormalProbability(term_weights)
196
thinkplot.Plot(xs, ys, label='full term')
197
thinkplot.Save(root='analytic_birthwgt_normal',
198
title='Normal probability plot',
199
xlabel='Standard deviations from mean',
200
ylabel='Birth weight (lbs)')
201
202
203
def main():
204
thinkstats2.RandomSeed(18)
205
MakeExampleNormalPlot()
206
207
# make the analytic CDFs
208
MakeExpoCdf()
209
MakeBabyBoom()
210
211
MakeParetoCdf()
212
MakeParetoCdf2()
213
MakeNormalCdf()
214
215
# test the distribution of birth weights for normality
216
preg = nsfg.ReadFemPreg()
217
full_term = preg[preg.prglngth >= 37]
218
219
weights = preg.totalwgt_lb.dropna()
220
term_weights = full_term.totalwgt_lb.dropna()
221
222
MakeNormalModel(weights)
223
MakeNormalPlot(weights, term_weights)
224
225
226
if __name__ == "__main__":
227
main()
228
229