CoCalc -- chap14soln.py

Think Stats, by Allen B. Downey, is an introduction to Probability and Statistics for Python programmers.
This is the accompanying code for this book.
Website: http://greenteapress.com/wp/think-stats-2e/
Path: think-stats-code / chap14soln.py
Views: 7115
License: GPL3
1
"""This file contains code for use with "Think Stats",
2
by Allen B. Downey, available from greenteapress.com
3

4
Copyright 2014 Allen B. Downey
5
License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html
6
"""
7

8
from __future__ import print_function, division
9

10
import numpy as np
11
import random
12

13
import first
14
import normal
15
import thinkstats2
16
import thinkplot
17

18

19
def PlotPregLengths(live, firsts, others):
    """Plots sampling distributions of the difference in mean pregnancy length.

    Two distributions of (mean for firsts - mean for others) are built and
    plotted on the same axes: one from the pooled `live` data (the null
    hypothesis) and one from the per-group data (the estimated parameters).

    live, firsts, others: DataFrames with a `prglngth` column

    Recorded results:

    null hypothesis N(0, 0.00319708)
    0.0837707042554 0.0837707042554
        (dist.Prob(-delta) and 1 - dist.Prob(delta) under the null)

    estimated params N(0.0780373, 0.00321144)
    -0.0151758158699 0.171250349425
        (5th and 95th percentiles, i.e. a 90% interval)

    The sampling distribution under the null hypothesis is centered
    around 0.

    The sampling distribution based on the estimated parameters is
    centered around the observed difference, 0.078.

    The variance of the two distributions is very similar; in practice,
    you could reasonably compute whichever one is easier.
    """
    print('prglngth example')
    observed = firsts.prglngth.mean() - others.prglngth.mean()
    print(observed)

    n_firsts = len(firsts)
    n_others = len(others)

    # Null hypothesis: both group means are computed from the pooled data.
    null_dist = (normal.SamplingDistMean(live.prglngth, n_firsts) -
                 normal.SamplingDistMean(live.prglngth, n_others))
    print('null hypothesis', null_dist)
    lower_tail = null_dist.Prob(-observed)
    upper_tail = 1 - null_dist.Prob(observed)
    print(lower_tail, upper_tail)

    thinkplot.PrePlot(2)
    thinkplot.Plot(null_dist, label='null hypothesis')

    # Estimated parameters: each group mean is computed from its own data.
    est_dist = (normal.SamplingDistMean(firsts.prglngth, n_firsts) -
                normal.SamplingDistMean(others.prglngth, n_others))
    print('estimated params', est_dist)
    print(est_dist.Percentile(5), est_dist.Percentile(95))

    thinkplot.Plot(est_dist, label='estimated params')
    thinkplot.Show(xlabel='difference in means (weeks)', ylabel='CDF')
63

64

65
def GenerateAdultWeight(birth_weights, n, mean_factor=1.09, std_factor=0.03):
    """Generate a random adult weight by simulating annual gain.

    A birth weight is chosen at random and multiplied by `n` independent
    growth factors, each drawn from a normal distribution.

    birth_weights: non-empty sequence of birth weights in lbs
    n: number of years to simulate
    mean_factor: mean of the annual multiplicative growth factor
    std_factor: standard deviation of the annual growth factor

    returns: adult weight in lbs
    """
    bw = random.choice(birth_weights)
    factors = np.random.normal(mean_factor, std_factor, n)
    # np.prod of an empty array is 1, so n=0 returns the birth weight itself
    aw = bw * np.prod(factors)
    return aw
77

78

79
def PlotAdultWeights(live):
    """Makes a normal probability plot of log10 simulated adult weight.

    Simulates 1000 adult weights, each from 40 years of annual gain
    applied to a randomly chosen birth weight, and plots their log10
    values on a normal probability plot.

    live: DataFrame of live births with a `totalwgt_lb` column

    Results:

    With n=40 the distribution is approximately lognormal except for
    the lowest weights.

    The actual distribution might deviate from lognormal because it is
    a mixture of people at different ages, or because annual weight
    gains are correlated.
    """
    weights_at_birth = live.totalwgt_lb.dropna().values

    simulated = []
    for _ in range(1000):
        simulated.append(GenerateAdultWeight(weights_at_birth, 40))

    thinkstats2.NormalProbabilityPlot(np.log10(simulated))
    thinkplot.Show(xlabel='standard normal values',
                   ylabel='adult weight (log10 lbs)')
99

100

101
def TestIntervention():
    """Tests whether reported changes are statistically significant.

    For the gender gap before the intervention, the gap after it, and
    the change in the gap, prints the mean and a p-value, the 5th and
    95th percentiles, and sigma.

    Recorded results (mean, p-value):
    -1.66 4.73095323208e-05
    -0.26 0.125267987207
     1.4 0.00182694836898

    Conclusions:

    1) Gender gap before intervention was 1.66 points (p-value 5e-5)

    2) Gender gap after was 0.26 points (p-value 0.13, not significant)

    3) Change in gender gap was 1.4 points (p-value 0.002, significant).
    """
    def Report(dist, pvalue):
        """Prints the summary lines for one distribution."""
        print('mean, p-value', dist.mu, pvalue)
        print('CI', dist.Percentile(5), dist.Percentile(95))
        print('stderr', dist.sigma)

    # Second argument is a squared value; presumably normal.Normal takes
    # a variance, i.e. the squared standard error -- confirm in normal.py.
    men_pre = normal.Normal(3.57, 0.28**2)
    men_post = normal.Normal(3.44, 0.16**2)

    women_pre = normal.Normal(1.91, 0.32**2)
    women_post = normal.Normal(3.18, 0.16**2)

    gap_pre = women_pre - men_pre
    Report(gap_pre, 1 - gap_pre.Prob(0))

    gap_post = women_post - men_post
    Report(gap_post, 1 - gap_post.Prob(0))

    # The change has a positive mean, so the p-value here is Prob(0)
    # itself rather than its complement.
    gap_change = gap_post - gap_pre
    Report(gap_change, gap_change.Prob(0))
137

138

139
def main():
    """Runs the exercise solutions in this file in sequence.

    Seeds the random number generators, runs the intervention test,
    then loads the data frames and shows the adult-weight and
    pregnancy-length plots.
    """
    thinkstats2.RandomSeed(17)

    TestIntervention()

    # A stray early `return` here used to make everything below unreachable.
    live, firsts, others = first.MakeFrames()
    PlotAdultWeights(live)

    PlotPregLengths(live, firsts, others)


if __name__ == '__main__':
    main()
153

154