CoCalc -- linear.py

Think Stats by Allen B. Downey Think Stats is an introduction to Probability and Statistics for Python programmers.
This is the accompanying code for this book.
Website: http://greenteapress.com/wp/think-stats-2e/
Views: ⁷¹¹⁹
License: GPL3
1
"""This file contains code used in "Think Stats",
2
by Allen B. Downey, available from greenteapress.com
3

4
Copyright 2010 Allen B. Downey
5
License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html
6
"""
7

8
from __future__ import print_function, division
9

10
import math
11
import numpy as np
12

13
import first
14
import thinkplot
15
import thinkstats2
16

17

18
def Summarize(estimates, actual=None):
19
    """Prints standard error and 90% confidence interval.
20

21
    estimates: sequence of estimates
22
    actual: float actual value
23
    """
24
    mean = thinkstats2.Mean(estimates)
25
    stderr = thinkstats2.Std(estimates, mu=actual)
26
    cdf = thinkstats2.Cdf(estimates)
27
    ci = cdf.ConfidenceInterval(90)
28
    print('mean, SE, CI', mean, stderr, ci)
29

30

31
def SamplingDistributions(live, iters=101):
32
    """Estimates sampling distributions by resampling rows.
33

34
    live: DataFrame
35
    iters: number of times to run simulations
36

37
    returns: pair of sequences (inters, slopes)
38
    """
39
    t = []
40
    for _ in range(iters):
41
        sample = thinkstats2.ResampleRows(live)
42
        ages = sample.agepreg
43
        weights = sample.totalwgt_lb
44
        estimates = thinkstats2.LeastSquares(ages, weights)
45
        t.append(estimates)
46

47
    inters, slopes = zip(*t)
48
    return inters, slopes
49

50

51
def PlotConfidenceIntervals(xs, inters, slopes, 
52
                            res=None, percent=90, **options):
53
    """Plots the 90% confidence intervals for weights based on ages.
54

55
    xs: sequence
56
    inters: estimated intercepts
57
    slopes: estimated slopes
58
    res: residuals
59
    percent: what percentile range to show
60
    """
61
    fys_seq = []
62
    for inter, slope in zip(inters, slopes):
63
        fxs, fys = thinkstats2.FitLine(xs, inter, slope)
64
        if res is not None:
65
            fys += np.random.permutation(res)
66
        fys_seq.append(fys)
67

68
    p = (100 - percent) / 2
69
    percents = p, 100 - p
70
    low, high = thinkstats2.PercentileRows(fys_seq, percents)
71
    thinkplot.FillBetween(fxs, low, high, **options)
72

73

74
def PlotSamplingDistributions(live):
75
    """Plots confidence intervals for the fitted curve and sampling dists.
76

77
    live: DataFrame
78
    """
79
    ages = live.agepreg
80
    weights = live.totalwgt_lb
81
    inter, slope = thinkstats2.LeastSquares(ages, weights)
82
    res = thinkstats2.Residuals(ages, weights, inter, slope)
83
    r2 = thinkstats2.CoefDetermination(weights, res)
84

85
    print('rho', thinkstats2.Corr(ages, weights))
86
    print('R2', r2)
87
    print('R', math.sqrt(r2))
88
    print('Std(ys)', thinkstats2.Std(weights))
89
    print('Std(res)', thinkstats2.Std(res))
90

91
    # plot the confidence intervals
92
    inters, slopes = SamplingDistributions(live, iters=1001)
93
    PlotConfidenceIntervals(ages, inters, slopes, percent=90, 
94
                            alpha=0.3, label='90% CI')
95
    thinkplot.Text(42, 7.53, '90%')
96
    PlotConfidenceIntervals(ages, inters, slopes, percent=50,
97
                            alpha=0.5, label='50% CI')
98
    thinkplot.Text(42, 7.59, '50%')
99

100
    thinkplot.Save(root='linear3',
101
                   xlabel='age (years)',
102
                   ylabel='birth weight (lbs)',
103
                   legend=False)
104

105
    # plot the confidence intervals
106
    thinkplot.PrePlot(2)
107
    thinkplot.Scatter(ages, weights, color='gray', alpha=0.1)
108
    PlotConfidenceIntervals(ages, inters, slopes, res=res, alpha=0.2)
109
    PlotConfidenceIntervals(ages, inters, slopes)
110
    thinkplot.Save(root='linear5',
111
                   xlabel='age (years)',
112
                   ylabel='birth weight (lbs)',
113
                   title='90% CI',
114
                   axis=[10, 45, 0, 15],
115
                   legend=False)
116

117
    # plot the sampling distribution of slope under null hypothesis
118
    # and alternate hypothesis
119
    sampling_cdf = thinkstats2.Cdf(slopes)
120
    print('p-value, sampling distribution', sampling_cdf[0])
121

122
    ht = SlopeTest((ages, weights))
123
    pvalue = ht.PValue()
124
    print('p-value, slope test', pvalue)
125

126
    print('inter', inter, thinkstats2.Mean(inters))
127
    Summarize(inters, inter)
128
    print('slope', slope, thinkstats2.Mean(slopes))
129
    Summarize(slopes, slope)
130

131
    thinkplot.PrePlot(2)
132
    thinkplot.Plot([0, 0], [0, 1], color='0.8')
133
    ht.PlotCdf(label='null hypothesis')
134
    thinkplot.Cdf(sampling_cdf, label='sampling distribution')
135
    thinkplot.Save(root='linear4',
136
                   xlabel='slope (lbs / year)',
137
                   ylabel='CDF',
138
                   xlim=[-0.03, 0.03],
139
                   loc='upper left')
140

141

142
def PlotFit(live):
143
    """Plots a scatter plot and fitted curve.
144

145
    live: DataFrame
146
    """
147
    ages = live.agepreg
148
    weights = live.totalwgt_lb
149
    inter, slope = thinkstats2.LeastSquares(ages, weights)
150
    fit_xs, fit_ys = thinkstats2.FitLine(ages, inter, slope)
151

152
    thinkplot.Scatter(ages, weights, color='gray', alpha=0.1)
153
    thinkplot.Plot(fit_xs, fit_ys, color='white', linewidth=3)
154
    thinkplot.Plot(fit_xs, fit_ys, color='blue', linewidth=2)
155
    thinkplot.Save(root='linear1',
156
                   xlabel='age (years)',
157
                   ylabel='birth weight (lbs)',
158
                   axis=[10, 45, 0, 15],
159
                   legend=False)
160

161

162
def PlotResiduals(live):
163
    """Plots percentiles of the residuals.
164

165
    live: DataFrame
166
    """
167
    ages = live.agepreg
168
    weights = live.totalwgt_lb
169
    inter, slope = thinkstats2.LeastSquares(ages, weights)
170
    live['residual'] = thinkstats2.Residuals(ages, weights, inter, slope)
171

172
    bins = np.arange(10, 48, 3)
173
    indices = np.digitize(live.agepreg, bins)
174
    groups = live.groupby(indices)
175

176
    ages = [group.agepreg.mean() for _, group in groups][1:-1]
177
    cdfs = [thinkstats2.Cdf(group.residual) for _, group in groups][1:-1]
178

179
    thinkplot.PrePlot(3)
180
    for percent in [75, 50, 25]:
181
        weights = [cdf.Percentile(percent) for cdf in cdfs]
182
        label = '%dth' % percent
183
        thinkplot.Plot(ages, weights, label=label)
184

185
    thinkplot.Save(root='linear2',
186
                   xlabel='age (years)',
187
                   ylabel='residual (lbs)',
188
                   xlim=[10, 45])
189

190

191
class SlopeTest(thinkstats2.HypothesisTest):
192
    """Tests the slope of a linear least squares fit. """
193

194
    def TestStatistic(self, data):
195
        """Computes the test statistic.
196

197
        data: data in whatever form is relevant        
198
        """
199
        ages, weights = data
200
        _, slope = thinkstats2.LeastSquares(ages, weights)
201
        return slope
202

203
    def MakeModel(self):
204
        """Builds a model of the null hypothesis.
205
        """
206
        _, weights = self.data
207
        self.ybar = weights.mean()
208
        self.res = weights - self.ybar
209

210
    def RunModel(self):
211
        """Runs the model of the null hypothesis.
212

213
        returns: simulated data
214
        """
215
        ages, _ = self.data
216
        weights = self.ybar + np.random.permutation(self.res)
217
        return ages, weights
218

219

220
def ResampleRowsWeighted(df, attr='finalwgt'):
221
    """Resamples a DataFrame using probabilities proportional to finalwgt.
222

223
    df: DataFrame
224
    attr: string column name to use as weights
225

226
    returns: DataFrame
227
    """
228
    weights = df[attr]
229
    cdf = thinkstats2.Pmf(weights).MakeCdf()
230
    indices = cdf.Sample(len(weights))
231
    sample = df.loc[indices]
232
    return sample
233

234

235
def EstimateBirthWeight(live, iters=1001):
236
    """Estimate mean birth weight by resampling, with and without weights.
237

238
    live: DataFrame
239
    iters: number of experiments to run
240
    """
241

242
    mean = live.totalwgt_lb.mean()
243
    print('mean', mean)
244

245
    estimates = [thinkstats2.ResampleRows(live).totalwgt_lb.mean()
246
                 for _ in range(iters)]
247
    Summarize(estimates)
248

249
    estimates = [ResampleRowsWeighted(live).totalwgt_lb.mean()
250
                 for _ in range(iters)]
251
    Summarize(estimates)
252
    
253

254
def main():
255
    thinkstats2.RandomSeed(17)
256
    
257
    live, _, _ = first.MakeFrames()
258
    EstimateBirthWeight(live)
259

260
    live = live.dropna(subset=['agepreg', 'totalwgt_lb'])
261
    PlotSamplingDistributions(live)
262
    
263
    PlotFit(live)
264
    PlotResiduals(live)
265

266

267
if __name__ == '__main__':
268
    main()
269

270