Contact
CoCalc Logo Icon
StoreFeaturesDocsShareSupport News AboutSign UpSign In
| Download

Think Stats by Allen B. Downey Think Stats is an introduction to Probability and Statistics for Python programmers.

This is the accompanying code for this book.

Website: http://greenteapress.com/wp/think-stats-2e/

Views: 7119
License: GPL3
1
"""This file contains code used in "Think Stats",
2
by Allen B. Downey, available from greenteapress.com
3
4
Copyright 2010 Allen B. Downey
5
License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html
6
"""
7
8
from __future__ import print_function, division
9
10
import math
11
import numpy as np
12
13
import first
14
import thinkplot
15
import thinkstats2
16
17
18
def Summarize(estimates, actual=None):
19
"""Prints standard error and 90% confidence interval.
20
21
estimates: sequence of estimates
22
actual: float actual value
23
"""
24
mean = thinkstats2.Mean(estimates)
25
stderr = thinkstats2.Std(estimates, mu=actual)
26
cdf = thinkstats2.Cdf(estimates)
27
ci = cdf.ConfidenceInterval(90)
28
print('mean, SE, CI', mean, stderr, ci)
29
30
31
def SamplingDistributions(live, iters=101):
32
"""Estimates sampling distributions by resampling rows.
33
34
live: DataFrame
35
iters: number of times to run simulations
36
37
returns: pair of sequences (inters, slopes)
38
"""
39
t = []
40
for _ in range(iters):
41
sample = thinkstats2.ResampleRows(live)
42
ages = sample.agepreg
43
weights = sample.totalwgt_lb
44
estimates = thinkstats2.LeastSquares(ages, weights)
45
t.append(estimates)
46
47
inters, slopes = zip(*t)
48
return inters, slopes
49
50
51
def PlotConfidenceIntervals(xs, inters, slopes,
52
res=None, percent=90, **options):
53
"""Plots the 90% confidence intervals for weights based on ages.
54
55
xs: sequence
56
inters: estimated intercepts
57
slopes: estimated slopes
58
res: residuals
59
percent: what percentile range to show
60
"""
61
fys_seq = []
62
for inter, slope in zip(inters, slopes):
63
fxs, fys = thinkstats2.FitLine(xs, inter, slope)
64
if res is not None:
65
fys += np.random.permutation(res)
66
fys_seq.append(fys)
67
68
p = (100 - percent) / 2
69
percents = p, 100 - p
70
low, high = thinkstats2.PercentileRows(fys_seq, percents)
71
thinkplot.FillBetween(fxs, low, high, **options)
72
73
74
def PlotSamplingDistributions(live):
75
"""Plots confidence intervals for the fitted curve and sampling dists.
76
77
live: DataFrame
78
"""
79
ages = live.agepreg
80
weights = live.totalwgt_lb
81
inter, slope = thinkstats2.LeastSquares(ages, weights)
82
res = thinkstats2.Residuals(ages, weights, inter, slope)
83
r2 = thinkstats2.CoefDetermination(weights, res)
84
85
print('rho', thinkstats2.Corr(ages, weights))
86
print('R2', r2)
87
print('R', math.sqrt(r2))
88
print('Std(ys)', thinkstats2.Std(weights))
89
print('Std(res)', thinkstats2.Std(res))
90
91
# plot the confidence intervals
92
inters, slopes = SamplingDistributions(live, iters=1001)
93
PlotConfidenceIntervals(ages, inters, slopes, percent=90,
94
alpha=0.3, label='90% CI')
95
thinkplot.Text(42, 7.53, '90%')
96
PlotConfidenceIntervals(ages, inters, slopes, percent=50,
97
alpha=0.5, label='50% CI')
98
thinkplot.Text(42, 7.59, '50%')
99
100
thinkplot.Save(root='linear3',
101
xlabel='age (years)',
102
ylabel='birth weight (lbs)',
103
legend=False)
104
105
# plot the confidence intervals
106
thinkplot.PrePlot(2)
107
thinkplot.Scatter(ages, weights, color='gray', alpha=0.1)
108
PlotConfidenceIntervals(ages, inters, slopes, res=res, alpha=0.2)
109
PlotConfidenceIntervals(ages, inters, slopes)
110
thinkplot.Save(root='linear5',
111
xlabel='age (years)',
112
ylabel='birth weight (lbs)',
113
title='90% CI',
114
axis=[10, 45, 0, 15],
115
legend=False)
116
117
# plot the sampling distribution of slope under null hypothesis
118
# and alternate hypothesis
119
sampling_cdf = thinkstats2.Cdf(slopes)
120
print('p-value, sampling distribution', sampling_cdf[0])
121
122
ht = SlopeTest((ages, weights))
123
pvalue = ht.PValue()
124
print('p-value, slope test', pvalue)
125
126
print('inter', inter, thinkstats2.Mean(inters))
127
Summarize(inters, inter)
128
print('slope', slope, thinkstats2.Mean(slopes))
129
Summarize(slopes, slope)
130
131
thinkplot.PrePlot(2)
132
thinkplot.Plot([0, 0], [0, 1], color='0.8')
133
ht.PlotCdf(label='null hypothesis')
134
thinkplot.Cdf(sampling_cdf, label='sampling distribution')
135
thinkplot.Save(root='linear4',
136
xlabel='slope (lbs / year)',
137
ylabel='CDF',
138
xlim=[-0.03, 0.03],
139
loc='upper left')
140
141
142
def PlotFit(live):
143
"""Plots a scatter plot and fitted curve.
144
145
live: DataFrame
146
"""
147
ages = live.agepreg
148
weights = live.totalwgt_lb
149
inter, slope = thinkstats2.LeastSquares(ages, weights)
150
fit_xs, fit_ys = thinkstats2.FitLine(ages, inter, slope)
151
152
thinkplot.Scatter(ages, weights, color='gray', alpha=0.1)
153
thinkplot.Plot(fit_xs, fit_ys, color='white', linewidth=3)
154
thinkplot.Plot(fit_xs, fit_ys, color='blue', linewidth=2)
155
thinkplot.Save(root='linear1',
156
xlabel='age (years)',
157
ylabel='birth weight (lbs)',
158
axis=[10, 45, 0, 15],
159
legend=False)
160
161
162
def PlotResiduals(live):
163
"""Plots percentiles of the residuals.
164
165
live: DataFrame
166
"""
167
ages = live.agepreg
168
weights = live.totalwgt_lb
169
inter, slope = thinkstats2.LeastSquares(ages, weights)
170
live['residual'] = thinkstats2.Residuals(ages, weights, inter, slope)
171
172
bins = np.arange(10, 48, 3)
173
indices = np.digitize(live.agepreg, bins)
174
groups = live.groupby(indices)
175
176
ages = [group.agepreg.mean() for _, group in groups][1:-1]
177
cdfs = [thinkstats2.Cdf(group.residual) for _, group in groups][1:-1]
178
179
thinkplot.PrePlot(3)
180
for percent in [75, 50, 25]:
181
weights = [cdf.Percentile(percent) for cdf in cdfs]
182
label = '%dth' % percent
183
thinkplot.Plot(ages, weights, label=label)
184
185
thinkplot.Save(root='linear2',
186
xlabel='age (years)',
187
ylabel='residual (lbs)',
188
xlim=[10, 45])
189
190
191
class SlopeTest(thinkstats2.HypothesisTest):
192
"""Tests the slope of a linear least squares fit. """
193
194
def TestStatistic(self, data):
195
"""Computes the test statistic.
196
197
data: data in whatever form is relevant
198
"""
199
ages, weights = data
200
_, slope = thinkstats2.LeastSquares(ages, weights)
201
return slope
202
203
def MakeModel(self):
204
"""Builds a model of the null hypothesis.
205
"""
206
_, weights = self.data
207
self.ybar = weights.mean()
208
self.res = weights - self.ybar
209
210
def RunModel(self):
211
"""Runs the model of the null hypothesis.
212
213
returns: simulated data
214
"""
215
ages, _ = self.data
216
weights = self.ybar + np.random.permutation(self.res)
217
return ages, weights
218
219
220
def ResampleRowsWeighted(df, attr='finalwgt'):
221
"""Resamples a DataFrame using probabilities proportional to finalwgt.
222
223
df: DataFrame
224
attr: string column name to use as weights
225
226
returns: DataFrame
227
"""
228
weights = df[attr]
229
cdf = thinkstats2.Pmf(weights).MakeCdf()
230
indices = cdf.Sample(len(weights))
231
sample = df.loc[indices]
232
return sample
233
234
235
def EstimateBirthWeight(live, iters=1001):
236
"""Estimate mean birth weight by resampling, with and without weights.
237
238
live: DataFrame
239
iters: number of experiments to run
240
"""
241
242
mean = live.totalwgt_lb.mean()
243
print('mean', mean)
244
245
estimates = [thinkstats2.ResampleRows(live).totalwgt_lb.mean()
246
for _ in range(iters)]
247
Summarize(estimates)
248
249
estimates = [ResampleRowsWeighted(live).totalwgt_lb.mean()
250
for _ in range(iters)]
251
Summarize(estimates)
252
253
254
def main():
255
thinkstats2.RandomSeed(17)
256
257
live, _, _ = first.MakeFrames()
258
EstimateBirthWeight(live)
259
260
live = live.dropna(subset=['agepreg', 'totalwgt_lb'])
261
PlotSamplingDistributions(live)
262
263
PlotFit(live)
264
PlotResiduals(live)
265
266
267
if __name__ == '__main__':
268
main()
269
270