Contact
CoCalc Logo Icon
StoreFeaturesDocsShareSupport News AboutSign UpSign In
| Download

Think Stats by Allen B. Downey Think Stats is an introduction to Probability and Statistics for Python programmers.

This is the accompanying code for this book.

Website: http://greenteapress.com/wp/think-stats-2e/

Views: 7119
License: GPL3
1
"""This file contains code used in "Think Stats",
2
by Allen B. Downey, available from greenteapress.com
3
4
Copyright 2010 Allen B. Downey
5
License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html
6
"""
7
8
from __future__ import print_function, division
9
10
import nsfg
11
import nsfg2
12
import first
13
14
import thinkstats2
15
import thinkplot
16
17
import copy
18
import random
19
import numpy as np
20
import matplotlib.pyplot as pyplot
21
22
23
class CoinTest(thinkstats2.HypothesisTest):
24
"""Tests the hypothesis that a coin is fair."""
25
26
def TestStatistic(self, data):
27
"""Computes the test statistic.
28
29
data: data in whatever form is relevant
30
"""
31
heads, tails = data
32
test_stat = abs(heads - tails)
33
return test_stat
34
35
def RunModel(self):
36
"""Run the model of the null hypothesis.
37
38
returns: simulated data
39
"""
40
heads, tails = self.data
41
n = heads + tails
42
sample = [random.choice('HT') for _ in range(n)]
43
hist = thinkstats2.Hist(sample)
44
data = hist['H'], hist['T']
45
return data
46
47
48
class DiffMeansPermute(thinkstats2.HypothesisTest):
49
"""Tests a difference in means by permutation."""
50
51
def TestStatistic(self, data):
52
"""Computes the test statistic.
53
54
data: data in whatever form is relevant
55
"""
56
group1, group2 = data
57
test_stat = abs(group1.mean() - group2.mean())
58
return test_stat
59
60
def MakeModel(self):
61
"""Build a model of the null hypothesis.
62
"""
63
group1, group2 = self.data
64
self.n, self.m = len(group1), len(group2)
65
self.pool = np.hstack((group1, group2))
66
67
def RunModel(self):
68
"""Run the model of the null hypothesis.
69
70
returns: simulated data
71
"""
72
np.random.shuffle(self.pool)
73
data = self.pool[:self.n], self.pool[self.n:]
74
return data
75
76
77
class DiffMeansOneSided(DiffMeansPermute):
78
"""Tests a one-sided difference in means by permutation."""
79
80
def TestStatistic(self, data):
81
"""Computes the test statistic.
82
83
data: data in whatever form is relevant
84
"""
85
group1, group2 = data
86
test_stat = group1.mean() - group2.mean()
87
return test_stat
88
89
90
class DiffStdPermute(DiffMeansPermute):
91
"""Tests a one-sided difference in standard deviation by permutation."""
92
93
def TestStatistic(self, data):
94
"""Computes the test statistic.
95
96
data: data in whatever form is relevant
97
"""
98
group1, group2 = data
99
test_stat = group1.std() - group2.std()
100
return test_stat
101
102
103
class CorrelationPermute(thinkstats2.HypothesisTest):
104
"""Tests correlations by permutation."""
105
106
def TestStatistic(self, data):
107
"""Computes the test statistic.
108
109
data: tuple of xs and ys
110
"""
111
xs, ys = data
112
test_stat = abs(thinkstats2.Corr(xs, ys))
113
return test_stat
114
115
def RunModel(self):
116
"""Run the model of the null hypothesis.
117
118
returns: simulated data
119
"""
120
xs, ys = self.data
121
xs = np.random.permutation(xs)
122
return xs, ys
123
124
125
class DiceTest(thinkstats2.HypothesisTest):
126
"""Tests whether a six-sided die is fair."""
127
128
def TestStatistic(self, data):
129
"""Computes the test statistic.
130
131
data: list of frequencies
132
"""
133
observed = data
134
n = sum(observed)
135
expected = np.ones(6) * n / 6
136
test_stat = sum(abs(observed - expected))
137
return test_stat
138
139
def RunModel(self):
140
"""Run the model of the null hypothesis.
141
142
returns: simulated data
143
"""
144
n = sum(self.data)
145
values = [1,2,3,4,5,6]
146
rolls = np.random.choice(values, n, replace=True)
147
hist = thinkstats2.Hist(rolls)
148
freqs = hist.Freqs(values)
149
return freqs
150
151
152
class DiceChiTest(DiceTest):
153
"""Tests a six-sided die using a chi-squared statistic."""
154
155
def TestStatistic(self, data):
156
"""Computes the test statistic.
157
158
data: list of frequencies
159
"""
160
observed = data
161
n = sum(observed)
162
expected = np.ones(6) * n / 6
163
test_stat = sum((observed - expected)**2 / expected)
164
return test_stat
165
166
167
class PregLengthTest(thinkstats2.HypothesisTest):
168
"""Tests difference in pregnancy length using a chi-squared statistic."""
169
170
def TestStatistic(self, data):
171
"""Computes the test statistic.
172
173
data: pair of lists of pregnancy lengths
174
"""
175
firsts, others = data
176
stat = self.ChiSquared(firsts) + self.ChiSquared(others)
177
return stat
178
179
def ChiSquared(self, lengths):
180
"""Computes the chi-squared statistic.
181
182
lengths: sequence of lengths
183
184
returns: float
185
"""
186
hist = thinkstats2.Hist(lengths)
187
observed = np.array(hist.Freqs(self.values))
188
expected = self.expected_probs * len(lengths)
189
stat = sum((observed - expected)**2 / expected)
190
return stat
191
192
def MakeModel(self):
193
"""Build a model of the null hypothesis.
194
"""
195
firsts, others = self.data
196
self.n = len(firsts)
197
self.pool = np.hstack((firsts, others))
198
199
pmf = thinkstats2.Pmf(self.pool)
200
self.values = range(35, 44)
201
self.expected_probs = np.array(pmf.Probs(self.values))
202
203
def RunModel(self):
204
"""Run the model of the null hypothesis.
205
206
returns: simulated data
207
"""
208
np.random.shuffle(self.pool)
209
data = self.pool[:self.n], self.pool[self.n:]
210
return data
211
212
213
def RunDiceTest():
214
"""Tests whether a die is fair.
215
"""
216
data = [8, 9, 19, 5, 8, 11]
217
dt = DiceTest(data)
218
print('dice test', dt.PValue(iters=10000))
219
dt = DiceChiTest(data)
220
print('dice chi test', dt.PValue(iters=10000))
221
222
223
def FalseNegRate(data, num_runs=1000):
224
"""Computes the chance of a false negative based on resampling.
225
226
data: pair of sequences
227
num_runs: how many experiments to simulate
228
229
returns: float false negative rate
230
"""
231
group1, group2 = data
232
count = 0
233
234
for i in range(num_runs):
235
sample1 = thinkstats2.Resample(group1)
236
sample2 = thinkstats2.Resample(group2)
237
ht = DiffMeansPermute((sample1, sample2))
238
p_value = ht.PValue(iters=101)
239
if p_value > 0.05:
240
count += 1
241
242
return count / num_runs
243
244
245
def PrintTest(p_value, ht):
246
"""Prints results from a hypothesis test.
247
248
p_value: float
249
ht: HypothesisTest
250
"""
251
print('p-value =', p_value)
252
print('actual =', ht.actual)
253
print('ts max =', ht.MaxTestStat())
254
255
256
def RunTests(data, iters=1000):
257
"""Runs several tests on the given data.
258
259
data: pair of sequences
260
iters: number of iterations to run
261
"""
262
263
# test the difference in means
264
ht = DiffMeansPermute(data)
265
p_value = ht.PValue(iters=iters)
266
print('\nmeans permute two-sided')
267
PrintTest(p_value, ht)
268
269
ht.PlotCdf()
270
thinkplot.Save(root='hypothesis1',
271
title='Permutation test',
272
xlabel='difference in means (weeks)',
273
ylabel='CDF',
274
legend=False)
275
276
# test the difference in means one-sided
277
ht = DiffMeansOneSided(data)
278
p_value = ht.PValue(iters=iters)
279
print('\nmeans permute one-sided')
280
PrintTest(p_value, ht)
281
282
# test the difference in std
283
ht = DiffStdPermute(data)
284
p_value = ht.PValue(iters=iters)
285
print('\nstd permute one-sided')
286
PrintTest(p_value, ht)
287
288
289
def ReplicateTests():
290
"""Replicates tests with the new NSFG data."""
291
292
live, firsts, others = nsfg2.MakeFrames()
293
294
# compare pregnancy lengths
295
print('\nprglngth2')
296
data = firsts.prglngth.values, others.prglngth.values
297
ht = DiffMeansPermute(data)
298
p_value = ht.PValue(iters=1000)
299
print('means permute two-sided')
300
PrintTest(p_value, ht)
301
302
print('\nbirth weight 2')
303
data = (firsts.totalwgt_lb.dropna().values,
304
others.totalwgt_lb.dropna().values)
305
ht = DiffMeansPermute(data)
306
p_value = ht.PValue(iters=1000)
307
print('means permute two-sided')
308
PrintTest(p_value, ht)
309
310
# test correlation
311
live2 = live.dropna(subset=['agepreg', 'totalwgt_lb'])
312
data = live2.agepreg.values, live2.totalwgt_lb.values
313
ht = CorrelationPermute(data)
314
p_value = ht.PValue()
315
print('\nage weight correlation 2')
316
PrintTest(p_value, ht)
317
318
# compare pregnancy lengths (chi-squared)
319
data = firsts.prglngth.values, others.prglngth.values
320
ht = PregLengthTest(data)
321
p_value = ht.PValue()
322
print('\npregnancy length chi-squared 2')
323
PrintTest(p_value, ht)
324
325
326
def main():
327
thinkstats2.RandomSeed(17)
328
329
# run the coin test
330
ct = CoinTest((140, 110))
331
pvalue = ct.PValue()
332
print('coin test p-value', pvalue)
333
334
# compare pregnancy lengths
335
print('\nprglngth')
336
live, firsts, others = first.MakeFrames()
337
data = firsts.prglngth.values, others.prglngth.values
338
RunTests(data)
339
340
# compare birth weights
341
print('\nbirth weight')
342
data = (firsts.totalwgt_lb.dropna().values,
343
others.totalwgt_lb.dropna().values)
344
ht = DiffMeansPermute(data)
345
p_value = ht.PValue(iters=1000)
346
print('means permute two-sided')
347
PrintTest(p_value, ht)
348
349
# test correlation
350
live2 = live.dropna(subset=['agepreg', 'totalwgt_lb'])
351
data = live2.agepreg.values, live2.totalwgt_lb.values
352
ht = CorrelationPermute(data)
353
p_value = ht.PValue()
354
print('\nage weight correlation')
355
print('n=', len(live2))
356
PrintTest(p_value, ht)
357
358
# run the dice test
359
RunDiceTest()
360
361
# compare pregnancy lengths (chi-squared)
362
data = firsts.prglngth.values, others.prglngth.values
363
ht = PregLengthTest(data)
364
p_value = ht.PValue()
365
print('\npregnancy length chi-squared')
366
PrintTest(p_value, ht)
367
368
# compute the false negative rate for difference in pregnancy length
369
data = firsts.prglngth.values, others.prglngth.values
370
neg_rate = FalseNegRate(data)
371
print('false neg rate', neg_rate)
372
373
# run the tests with new nsfg data
374
ReplicateTests()
375
376
377
if __name__ == "__main__":
378
main()
379
380