CoCalc -- hypothesis.py

Think Stats by Allen B. Downey Think Stats is an introduction to Probability and Statistics for Python programmers.
This is the accompanying code for this book.
Website: http://greenteapress.com/wp/think-stats-2e/
Path: think-stats-code / hypothesis.py
Views: ⁷¹¹⁹
License: GPL3
1
"""This file contains code used in "Think Stats",
2
by Allen B. Downey, available from greenteapress.com
3

4
Copyright 2010 Allen B. Downey
5
License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html
6
"""
7

8
from __future__ import print_function, division
9

10
import nsfg
11
import nsfg2
12
import first
13

14
import thinkstats2
15
import thinkplot
16

17
import copy
18
import random
19
import numpy as np
20
import matplotlib.pyplot as pyplot
21

22

23
class CoinTest(thinkstats2.HypothesisTest):
24
    """Tests the hypothesis that a coin is fair."""
25

26
    def TestStatistic(self, data):
27
        """Computes the test statistic.
28

29
        data: data in whatever form is relevant        
30
        """
31
        heads, tails = data
32
        test_stat = abs(heads - tails)
33
        return test_stat
34

35
    def RunModel(self):
36
        """Run the model of the null hypothesis.
37

38
        returns: simulated data
39
        """
40
        heads, tails = self.data
41
        n = heads + tails
42
        sample = [random.choice('HT') for _ in range(n)]
43
        hist = thinkstats2.Hist(sample)
44
        data = hist['H'], hist['T']
45
        return data
46

47

48
class DiffMeansPermute(thinkstats2.HypothesisTest):
49
    """Tests a difference in means by permutation."""
50

51
    def TestStatistic(self, data):
52
        """Computes the test statistic.
53

54
        data: data in whatever form is relevant        
55
        """
56
        group1, group2 = data
57
        test_stat = abs(group1.mean() - group2.mean())
58
        return test_stat
59

60
    def MakeModel(self):
61
        """Build a model of the null hypothesis.
62
        """
63
        group1, group2 = self.data
64
        self.n, self.m = len(group1), len(group2)
65
        self.pool = np.hstack((group1, group2))
66

67
    def RunModel(self):
68
        """Run the model of the null hypothesis.
69

70
        returns: simulated data
71
        """
72
        np.random.shuffle(self.pool)
73
        data = self.pool[:self.n], self.pool[self.n:]
74
        return data
75

76

77
class DiffMeansOneSided(DiffMeansPermute):
78
    """Tests a one-sided difference in means by permutation."""
79

80
    def TestStatistic(self, data):
81
        """Computes the test statistic.
82

83
        data: data in whatever form is relevant        
84
        """
85
        group1, group2 = data
86
        test_stat = group1.mean() - group2.mean()
87
        return test_stat
88

89

90
class DiffStdPermute(DiffMeansPermute):
91
    """Tests a one-sided difference in standard deviation by permutation."""
92

93
    def TestStatistic(self, data):
94
        """Computes the test statistic.
95

96
        data: data in whatever form is relevant        
97
        """
98
        group1, group2 = data
99
        test_stat = group1.std() - group2.std()
100
        return test_stat
101

102

103
class CorrelationPermute(thinkstats2.HypothesisTest):
104
    """Tests correlations by permutation."""
105

106
    def TestStatistic(self, data):
107
        """Computes the test statistic.
108

109
        data: tuple of xs and ys
110
        """
111
        xs, ys = data
112
        test_stat = abs(thinkstats2.Corr(xs, ys))
113
        return test_stat
114

115
    def RunModel(self):
116
        """Run the model of the null hypothesis.
117

118
        returns: simulated data
119
        """
120
        xs, ys = self.data
121
        xs = np.random.permutation(xs)
122
        return xs, ys
123

124

125
class DiceTest(thinkstats2.HypothesisTest):
126
    """Tests whether a six-sided die is fair."""
127

128
    def TestStatistic(self, data):
129
        """Computes the test statistic.
130

131
        data: list of frequencies
132
        """
133
        observed = data
134
        n = sum(observed)
135
        expected = np.ones(6) * n / 6
136
        test_stat = sum(abs(observed - expected))
137
        return test_stat
138

139
    def RunModel(self):
140
        """Run the model of the null hypothesis.
141

142
        returns: simulated data
143
        """
144
        n = sum(self.data)
145
        values = [1,2,3,4,5,6]
146
        rolls = np.random.choice(values, n, replace=True)
147
        hist = thinkstats2.Hist(rolls)
148
        freqs = hist.Freqs(values)
149
        return freqs
150

151

152
class DiceChiTest(DiceTest):
153
    """Tests a six-sided die using a chi-squared statistic."""
154

155
    def TestStatistic(self, data):
156
        """Computes the test statistic.
157

158
        data: list of frequencies
159
        """
160
        observed = data
161
        n = sum(observed)
162
        expected = np.ones(6) * n / 6
163
        test_stat = sum((observed - expected)**2 / expected)
164
        return test_stat
165

166

167
class PregLengthTest(thinkstats2.HypothesisTest):
168
    """Tests difference in pregnancy length using a chi-squared statistic."""
169

170
    def TestStatistic(self, data):
171
        """Computes the test statistic.
172

173
        data: pair of lists of pregnancy lengths
174
        """
175
        firsts, others = data
176
        stat = self.ChiSquared(firsts) + self.ChiSquared(others)
177
        return stat
178

179
    def ChiSquared(self, lengths):
180
        """Computes the chi-squared statistic.
181
        
182
        lengths: sequence of lengths
183

184
        returns: float
185
        """
186
        hist = thinkstats2.Hist(lengths)
187
        observed = np.array(hist.Freqs(self.values))
188
        expected = self.expected_probs * len(lengths)
189
        stat = sum((observed - expected)**2 / expected)
190
        return stat
191

192
    def MakeModel(self):
193
        """Build a model of the null hypothesis.
194
        """
195
        firsts, others = self.data
196
        self.n = len(firsts)
197
        self.pool = np.hstack((firsts, others))
198

199
        pmf = thinkstats2.Pmf(self.pool)
200
        self.values = range(35, 44)
201
        self.expected_probs = np.array(pmf.Probs(self.values))
202

203
    def RunModel(self):
204
        """Run the model of the null hypothesis.
205

206
        returns: simulated data
207
        """
208
        np.random.shuffle(self.pool)
209
        data = self.pool[:self.n], self.pool[self.n:]
210
        return data
211

212

213
def RunDiceTest():
214
    """Tests whether a die is fair.
215
    """
216
    data = [8, 9, 19, 5, 8, 11]
217
    dt = DiceTest(data)
218
    print('dice test', dt.PValue(iters=10000))
219
    dt = DiceChiTest(data)
220
    print('dice chi test', dt.PValue(iters=10000))
221

222

223
def FalseNegRate(data, num_runs=1000):
224
    """Computes the chance of a false negative based on resampling.
225

226
    data: pair of sequences
227
    num_runs: how many experiments to simulate
228

229
    returns: float false negative rate
230
    """
231
    group1, group2 = data
232
    count = 0
233

234
    for i in range(num_runs):
235
        sample1 = thinkstats2.Resample(group1)
236
        sample2 = thinkstats2.Resample(group2)
237
        ht = DiffMeansPermute((sample1, sample2))
238
        p_value = ht.PValue(iters=101)
239
        if p_value > 0.05:
240
            count += 1
241

242
    return count / num_runs
243

244

245
def PrintTest(p_value, ht):
246
    """Prints results from a hypothesis test.
247

248
    p_value: float
249
    ht: HypothesisTest
250
    """
251
    print('p-value =', p_value)
252
    print('actual =', ht.actual)
253
    print('ts max =', ht.MaxTestStat())
254

255

256
def RunTests(data, iters=1000):
257
    """Runs several tests on the given data.
258

259
    data: pair of sequences
260
    iters: number of iterations to run
261
    """
262

263
    # test the difference in means
264
    ht = DiffMeansPermute(data)
265
    p_value = ht.PValue(iters=iters)
266
    print('\nmeans permute two-sided')
267
    PrintTest(p_value, ht)
268

269
    ht.PlotCdf()
270
    thinkplot.Save(root='hypothesis1',
271
                   title='Permutation test',
272
                   xlabel='difference in means (weeks)',
273
                   ylabel='CDF',
274
                   legend=False) 
275
    
276
    # test the difference in means one-sided
277
    ht = DiffMeansOneSided(data)
278
    p_value = ht.PValue(iters=iters)
279
    print('\nmeans permute one-sided')
280
    PrintTest(p_value, ht)
281

282
    # test the difference in std
283
    ht = DiffStdPermute(data)
284
    p_value = ht.PValue(iters=iters)
285
    print('\nstd permute one-sided')
286
    PrintTest(p_value, ht)
287

288

289
def ReplicateTests():    
290
    """Replicates tests with the new NSFG data."""
291

292
    live, firsts, others = nsfg2.MakeFrames()
293

294
    # compare pregnancy lengths
295
    print('\nprglngth2')
296
    data = firsts.prglngth.values, others.prglngth.values
297
    ht = DiffMeansPermute(data)
298
    p_value = ht.PValue(iters=1000)
299
    print('means permute two-sided')
300
    PrintTest(p_value, ht)
301

302
    print('\nbirth weight 2')
303
    data = (firsts.totalwgt_lb.dropna().values,
304
            others.totalwgt_lb.dropna().values)
305
    ht = DiffMeansPermute(data)
306
    p_value = ht.PValue(iters=1000)
307
    print('means permute two-sided')
308
    PrintTest(p_value, ht)
309

310
    # test correlation
311
    live2 = live.dropna(subset=['agepreg', 'totalwgt_lb'])
312
    data = live2.agepreg.values, live2.totalwgt_lb.values
313
    ht = CorrelationPermute(data)
314
    p_value = ht.PValue()
315
    print('\nage weight correlation 2')
316
    PrintTest(p_value, ht)
317

318
    # compare pregnancy lengths (chi-squared)
319
    data = firsts.prglngth.values, others.prglngth.values
320
    ht = PregLengthTest(data)
321
    p_value = ht.PValue()
322
    print('\npregnancy length chi-squared 2')
323
    PrintTest(p_value, ht)
324

325

326
def main():
327
    thinkstats2.RandomSeed(17)
328

329
    # run the coin test
330
    ct = CoinTest((140, 110))
331
    pvalue = ct.PValue()
332
    print('coin test p-value', pvalue)
333

334
    # compare pregnancy lengths
335
    print('\nprglngth')
336
    live, firsts, others = first.MakeFrames()
337
    data = firsts.prglngth.values, others.prglngth.values
338
    RunTests(data)
339

340
    # compare birth weights
341
    print('\nbirth weight')
342
    data = (firsts.totalwgt_lb.dropna().values,
343
            others.totalwgt_lb.dropna().values)
344
    ht = DiffMeansPermute(data)
345
    p_value = ht.PValue(iters=1000)
346
    print('means permute two-sided')
347
    PrintTest(p_value, ht)
348

349
    # test correlation
350
    live2 = live.dropna(subset=['agepreg', 'totalwgt_lb'])
351
    data = live2.agepreg.values, live2.totalwgt_lb.values
352
    ht = CorrelationPermute(data)
353
    p_value = ht.PValue()
354
    print('\nage weight correlation')
355
    print('n=', len(live2))
356
    PrintTest(p_value, ht)
357

358
    # run the dice test
359
    RunDiceTest()
360

361
    # compare pregnancy lengths (chi-squared)
362
    data = firsts.prglngth.values, others.prglngth.values
363
    ht = PregLengthTest(data)
364
    p_value = ht.PValue()
365
    print('\npregnancy length chi-squared')
366
    PrintTest(p_value, ht)
367

368
    # compute the false negative rate for difference in pregnancy length
369
    data = firsts.prglngth.values, others.prglngth.values
370
    neg_rate = FalseNegRate(data)
371
    print('false neg rate', neg_rate)
372

373
    # run the tests with new nsfg data
374
    ReplicateTests()
375

376

377
if __name__ == "__main__":
378
    main()
379

380