| Download
Think Stats by Allen B. Downey Think Stats is an introduction to Probability and Statistics for Python programmers.
This is the accompanying code for this book.
Project: Support and Testing
Views: 7119License: GPL3
"""This file contains code used in "Think Stats",1by Allen B. Downey, available from greenteapress.com23Copyright 2010 Allen B. Downey4License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html5"""67from __future__ import print_function, division89import math10import numpy as np1112import first13import thinkplot14import thinkstats2151617def Summarize(estimates, actual=None):18"""Prints standard error and 90% confidence interval.1920estimates: sequence of estimates21actual: float actual value22"""23mean = thinkstats2.Mean(estimates)24stderr = thinkstats2.Std(estimates, mu=actual)25cdf = thinkstats2.Cdf(estimates)26ci = cdf.ConfidenceInterval(90)27print('mean, SE, CI', mean, stderr, ci)282930def SamplingDistributions(live, iters=101):31"""Estimates sampling distributions by resampling rows.3233live: DataFrame34iters: number of times to run simulations3536returns: pair of sequences (inters, slopes)37"""38t = []39for _ in range(iters):40sample = thinkstats2.ResampleRows(live)41ages = sample.agepreg42weights = sample.totalwgt_lb43estimates = thinkstats2.LeastSquares(ages, weights)44t.append(estimates)4546inters, slopes = zip(*t)47return inters, slopes484950def PlotConfidenceIntervals(xs, inters, slopes,51res=None, percent=90, **options):52"""Plots the 90% confidence intervals for weights based on ages.5354xs: sequence55inters: estimated intercepts56slopes: estimated slopes57res: residuals58percent: what percentile range to show59"""60fys_seq = []61for inter, slope in zip(inters, slopes):62fxs, fys = thinkstats2.FitLine(xs, inter, slope)63if res is not None:64fys += np.random.permutation(res)65fys_seq.append(fys)6667p = (100 - percent) / 268percents = p, 100 - p69low, high = thinkstats2.PercentileRows(fys_seq, percents)70thinkplot.FillBetween(fxs, low, high, **options)717273def PlotSamplingDistributions(live):74"""Plots confidence intervals for the fitted curve and sampling dists.7576live: DataFrame77"""78ages = live.agepreg79weights = live.totalwgt_lb80inter, slope = thinkstats2.LeastSquares(ages, weights)81res = thinkstats2.Residuals(ages, weights, inter, slope)82r2 = thinkstats2.CoefDetermination(weights, res)8384print('rho', thinkstats2.Corr(ages, weights))85print('R2', r2)86print('R', math.sqrt(r2))87print('Std(ys)', thinkstats2.Std(weights))88print('Std(res)', thinkstats2.Std(res))8990# plot the confidence intervals91inters, slopes = SamplingDistributions(live, iters=1001)92PlotConfidenceIntervals(ages, inters, slopes, percent=90,93alpha=0.3, label='90% CI')94thinkplot.Text(42, 7.53, '90%')95PlotConfidenceIntervals(ages, inters, slopes, percent=50,96alpha=0.5, label='50% CI')97thinkplot.Text(42, 7.59, '50%')9899thinkplot.Save(root='linear3',100xlabel='age (years)',101ylabel='birth weight (lbs)',102legend=False)103104# plot the confidence intervals105thinkplot.PrePlot(2)106thinkplot.Scatter(ages, weights, color='gray', alpha=0.1)107PlotConfidenceIntervals(ages, inters, slopes, res=res, alpha=0.2)108PlotConfidenceIntervals(ages, inters, slopes)109thinkplot.Save(root='linear5',110xlabel='age (years)',111ylabel='birth weight (lbs)',112title='90% CI',113axis=[10, 45, 0, 15],114legend=False)115116# plot the sampling distribution of slope under null hypothesis117# and alternate hypothesis118sampling_cdf = thinkstats2.Cdf(slopes)119print('p-value, sampling distribution', sampling_cdf[0])120121ht = SlopeTest((ages, weights))122pvalue = ht.PValue()123print('p-value, slope test', pvalue)124125print('inter', inter, thinkstats2.Mean(inters))126Summarize(inters, inter)127print('slope', slope, thinkstats2.Mean(slopes))128Summarize(slopes, slope)129130thinkplot.PrePlot(2)131thinkplot.Plot([0, 0], [0, 1], color='0.8')132ht.PlotCdf(label='null hypothesis')133thinkplot.Cdf(sampling_cdf, label='sampling distribution')134thinkplot.Save(root='linear4',135xlabel='slope (lbs / year)',136ylabel='CDF',137xlim=[-0.03, 0.03],138loc='upper left')139140141def PlotFit(live):142"""Plots a scatter plot and fitted curve.143144live: DataFrame145"""146ages = live.agepreg147weights = live.totalwgt_lb148inter, slope = thinkstats2.LeastSquares(ages, weights)149fit_xs, fit_ys = thinkstats2.FitLine(ages, inter, slope)150151thinkplot.Scatter(ages, weights, color='gray', alpha=0.1)152thinkplot.Plot(fit_xs, fit_ys, color='white', linewidth=3)153thinkplot.Plot(fit_xs, fit_ys, color='blue', linewidth=2)154thinkplot.Save(root='linear1',155xlabel='age (years)',156ylabel='birth weight (lbs)',157axis=[10, 45, 0, 15],158legend=False)159160161def PlotResiduals(live):162"""Plots percentiles of the residuals.163164live: DataFrame165"""166ages = live.agepreg167weights = live.totalwgt_lb168inter, slope = thinkstats2.LeastSquares(ages, weights)169live['residual'] = thinkstats2.Residuals(ages, weights, inter, slope)170171bins = np.arange(10, 48, 3)172indices = np.digitize(live.agepreg, bins)173groups = live.groupby(indices)174175ages = [group.agepreg.mean() for _, group in groups][1:-1]176cdfs = [thinkstats2.Cdf(group.residual) for _, group in groups][1:-1]177178thinkplot.PrePlot(3)179for percent in [75, 50, 25]:180weights = [cdf.Percentile(percent) for cdf in cdfs]181label = '%dth' % percent182thinkplot.Plot(ages, weights, label=label)183184thinkplot.Save(root='linear2',185xlabel='age (years)',186ylabel='residual (lbs)',187xlim=[10, 45])188189190class SlopeTest(thinkstats2.HypothesisTest):191"""Tests the slope of a linear least squares fit. """192193def TestStatistic(self, data):194"""Computes the test statistic.195196data: data in whatever form is relevant197"""198ages, weights = data199_, slope = thinkstats2.LeastSquares(ages, weights)200return slope201202def MakeModel(self):203"""Builds a model of the null hypothesis.204"""205_, weights = self.data206self.ybar = weights.mean()207self.res = weights - self.ybar208209def RunModel(self):210"""Runs the model of the null hypothesis.211212returns: simulated data213"""214ages, _ = self.data215weights = self.ybar + np.random.permutation(self.res)216return ages, weights217218219def ResampleRowsWeighted(df, attr='finalwgt'):220"""Resamples a DataFrame using probabilities proportional to finalwgt.221222df: DataFrame223attr: string column name to use as weights224225returns: DataFrame226"""227weights = df[attr]228cdf = thinkstats2.Pmf(weights).MakeCdf()229indices = cdf.Sample(len(weights))230sample = df.loc[indices]231return sample232233234def EstimateBirthWeight(live, iters=1001):235"""Estimate mean birth weight by resampling, with and without weights.236237live: DataFrame238iters: number of experiments to run239"""240241mean = live.totalwgt_lb.mean()242print('mean', mean)243244estimates = [thinkstats2.ResampleRows(live).totalwgt_lb.mean()245for _ in range(iters)]246Summarize(estimates)247248estimates = [ResampleRowsWeighted(live).totalwgt_lb.mean()249for _ in range(iters)]250Summarize(estimates)251252253def main():254thinkstats2.RandomSeed(17)255256live, _, _ = first.MakeFrames()257EstimateBirthWeight(live)258259live = live.dropna(subset=['agepreg', 'totalwgt_lb'])260PlotSamplingDistributions(live)261262PlotFit(live)263PlotResiduals(live)264265266if __name__ == '__main__':267main()268269270