| Download
Think Stats by Allen B. Downey Think Stats is an introduction to Probability and Statistics for Python programmers.
This is the accompanying code for this book.
Project: Support and Testing
Views: 7138License: GPL3
"""This file contains code for use with "Think Stats",1by Allen B. Downey, available from greenteapress.com23Copyright 2014 Allen B. Downey4License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html5"""67from __future__ import print_function, division89import math10import numpy as np11import random12import scipy.stats1314import first15import hypothesis16import thinkstats217import thinkplot181920class Normal(object):21"""Represents a Normal distribution"""2223def __init__(self, mu, sigma2, label=''):24"""Initializes.2526mu: mean27sigma2: variance28"""29self.mu = mu30self.sigma2 = sigma231self.label = label3233def __repr__(self):34"""Returns a string representation."""35if self.label:36return 'Normal(%g, %g, %s)' % (self.mu, self.sigma2, self.label)37else:38return 'Normal(%g, %g)' % (self.mu, self.sigma2)3940__str__ = __repr__4142@property43def sigma(self):44"""Returns the standard deviation."""45return math.sqrt(self.sigma2)4647def __add__(self, other):48"""Adds a number or other Normal.4950other: number or Normal5152returns: new Normal53"""54if isinstance(other, Normal):55return Normal(self.mu + other.mu, self.sigma2 + other.sigma2)56else:57return Normal(self.mu + other, self.sigma2)5859__radd__ = __add__6061def __sub__(self, other):62"""Subtracts a number or other Normal.6364other: number or Normal6566returns: new Normal67"""68if isinstance(other, Normal):69return Normal(self.mu - other.mu, self.sigma2 + other.sigma2)70else:71return Normal(self.mu - other, self.sigma2)7273__rsub__ = __sub__7475def __mul__(self, factor):76"""Multiplies by a scalar.7778factor: number7980returns: new Normal81"""82return Normal(factor * self.mu, factor**2 * self.sigma2)8384__rmul__ = __mul__8586def __div__(self, divisor):87"""Divides by a scalar.8889divisor: number9091returns: new Normal92"""93return 1.0 / divisor * self9495__truediv__ = __div__9697def Sum(self, n):98"""Returns the distribution of the sum of n values.99100n: int101102returns: new Normal103"""104return Normal(n * self.mu, n * self.sigma2)105106def Render(self):107"""Returns pair of xs, ys suitable for plotting.108"""109mean, std = self.mu, self.sigma110low, high = mean - 3 * std, mean + 3 * std111xs, ys = thinkstats2.RenderNormalCdf(mean, std, low, high)112return xs, ys113114def Prob(self, x):115"""Cumulative probability of x.116117x: numeric118"""119return thinkstats2.EvalNormalCdf(x, self.mu, self.sigma)120121def Percentile(self, p):122"""Inverse CDF of p.123124p: percentile rank 0-100125"""126return thinkstats2.EvalNormalCdfInverse(p/100, self.mu, self.sigma)127128129def NormalPlotSamples(samples, plot=1, ylabel=''):130"""Makes normal probability plots for samples.131132samples: list of samples133label: string134"""135for n, sample in samples:136thinkplot.SubPlot(plot)137thinkstats2.NormalProbabilityPlot(sample)138139thinkplot.Config(title='n=%d' % n,140legend=False,141xticks=[],142yticks=[],143ylabel=ylabel)144plot += 1145146147def MakeExpoSamples(beta=2.0, iters=1000):148"""Generates samples from an exponential distribution.149150beta: parameter151iters: number of samples to generate for each size152153returns: list of samples154"""155samples = []156for n in [1, 10, 100]:157sample = [np.sum(np.random.exponential(beta, n))158for _ in range(iters)]159samples.append((n, sample))160return samples161162163def MakeLognormalSamples(mu=1.0, sigma=1.0, iters=1000):164"""Generates samples from a lognormal distribution.165166mu: parmeter167sigma: parameter168iters: number of samples to generate for each size169170returns: list of samples171"""172samples = []173for n in [1, 10, 100]:174sample = [np.sum(np.random.lognormal(mu, sigma, n))175for _ in range(iters)]176samples.append((n, sample))177return samples178179180def MakeParetoSamples(alpha=1.0, iters=1000):181"""Generates samples from a Pareto distribution.182183alpha: parameter184iters: number of samples to generate for each size185186returns: list of samples187"""188samples = []189190for n in [1, 10, 100]:191sample = [np.sum(np.random.pareto(alpha, n))192for _ in range(iters)]193samples.append((n, sample))194return samples195196197def GenerateCorrelated(rho, n):198"""Generates a sequence of correlated values from a standard normal dist.199200rho: coefficient of correlation201n: length of sequence202203returns: iterator204"""205x = random.gauss(0, 1)206yield x207208sigma = math.sqrt(1 - rho**2)209for _ in range(n-1):210x = random.gauss(x * rho, sigma)211yield x212213214def GenerateExpoCorrelated(rho, n):215"""Generates a sequence of correlated values from an exponential dist.216217rho: coefficient of correlation218n: length of sequence219220returns: NumPy array221"""222normal = list(GenerateCorrelated(rho, n))223uniform = scipy.stats.norm.cdf(normal)224expo = scipy.stats.expon.ppf(uniform)225return expo226227228def MakeCorrelatedSamples(rho=0.9, iters=1000):229"""Generates samples from a correlated exponential distribution.230231rho: correlation232iters: number of samples to generate for each size233234returns: list of samples235"""236samples = []237for n in [1, 10, 100]:238sample = [np.sum(GenerateExpoCorrelated(rho, n))239for _ in range(iters)]240samples.append((n, sample))241return samples242243244def SamplingDistMean(data, n):245"""Computes the sampling distribution of the mean.246247data: sequence of values representing the population248n: sample size249250returns: Normal object251"""252mean, var = data.mean(), data.var()253dist = Normal(mean, var)254return dist.Sum(n) / n255256257def PlotPregLengths(live, firsts, others):258"""Plots sampling distribution of difference in means.259260live, firsts, others: DataFrames261"""262print('prglngth example')263delta = firsts.prglngth.mean() - others.prglngth.mean()264print(delta)265266dist1 = SamplingDistMean(live.prglngth, len(firsts))267dist2 = SamplingDistMean(live.prglngth, len(others))268dist = dist1 - dist2269print('null hypothesis', dist)270print(dist.Prob(-delta), 1 - dist.Prob(delta))271272thinkplot.Plot(dist, label='null hypothesis')273thinkplot.Save(root='normal3',274xlabel='difference in means (weeks)',275ylabel='CDF')276277278class CorrelationPermute(hypothesis.CorrelationPermute):279"""Tests correlations by permutation."""280281def TestStatistic(self, data):282"""Computes the test statistic.283284data: tuple of xs and ys285"""286xs, ys = data287return np.corrcoef(xs, ys)[0][1]288289290def ResampleCorrelations(live):291"""Tests the correlation between birth weight and mother's age.292293live: DataFrame for live births294295returns: sample size, observed correlation, CDF of resampled correlations296"""297live2 = live.dropna(subset=['agepreg', 'totalwgt_lb'])298data = live2.agepreg.values, live2.totalwgt_lb.values299ht = CorrelationPermute(data)300p_value = ht.PValue()301return len(live2), ht.actual, ht.test_cdf302303304def StudentCdf(n):305"""Computes the CDF correlations from uncorrelated variables.306307n: sample size308309returns: Cdf310"""311ts = np.linspace(-3, 3, 101)312ps = scipy.stats.t.cdf(ts, df=n-2)313rs = ts / np.sqrt(n - 2 + ts**2)314return thinkstats2.Cdf(rs, ps)315316317def TestCorrelation(live):318"""Tests correlation analytically.319320live: DataFrame for live births321322"""323n, r, cdf = ResampleCorrelations(live)324325model = StudentCdf(n)326thinkplot.Plot(model.xs, model.ps, color='gray',327alpha=0.3, label='Student t')328thinkplot.Cdf(cdf, label='sample')329330thinkplot.Save(root='normal4',331xlabel='correlation',332ylabel='CDF')333334t = r * math.sqrt((n-2) / (1-r**2))335p_value = 1 - scipy.stats.t.cdf(t, df=n-2)336print(r, p_value)337338339def ChiSquaredCdf(n):340"""Discrete approximation of the chi-squared CDF with df=n-1.341342n: sample size343344returns: Cdf345"""346xs = np.linspace(0, 25, 101)347ps = scipy.stats.chi2.cdf(xs, df=n-1)348return thinkstats2.Cdf(xs, ps)349350351def TestChiSquared():352"""Performs a chi-squared test analytically.353"""354data = [8, 9, 19, 5, 8, 11]355dt = hypothesis.DiceChiTest(data)356p_value = dt.PValue(iters=1000)357n, chi2, cdf = len(data), dt.actual, dt.test_cdf358359model = ChiSquaredCdf(n)360thinkplot.Plot(model.xs, model.ps, color='gray',361alpha=0.3, label='chi squared')362thinkplot.Cdf(cdf, label='sample')363364thinkplot.Save(root='normal5',365xlabel='chi-squared statistic',366ylabel='CDF',367loc='lower right')368369# compute the p-value370p_value = 1 - scipy.stats.chi2.cdf(chi2, df=n-1)371print(chi2, p_value)372373374def MakeCltPlots():375"""Makes plot showing distributions of sums converging to normal.376"""377thinkplot.PrePlot(num=3, rows=2, cols=3)378samples = MakeExpoSamples()379NormalPlotSamples(samples, plot=1, ylabel='sum of expo values')380381thinkplot.PrePlot(num=3)382samples = MakeLognormalSamples()383NormalPlotSamples(samples, plot=4, ylabel='sum of lognormal values')384thinkplot.Save(root='normal1', legend=False)385386387thinkplot.PrePlot(num=3, rows=2, cols=3)388samples = MakeParetoSamples()389NormalPlotSamples(samples, plot=1, ylabel='sum of Pareto values')390391thinkplot.PrePlot(num=3)392samples = MakeCorrelatedSamples()393NormalPlotSamples(samples, plot=4, ylabel='sum of correlated expo values')394thinkplot.Save(root='normal2', legend=False)395396397def main():398thinkstats2.RandomSeed(17)399400MakeCltPlots()401402print('Gorilla example')403dist = Normal(90, 7.5**2)404print(dist)405dist_xbar = dist.Sum(9) / 9406print(dist_xbar.sigma)407print(dist_xbar.Percentile(5), dist_xbar.Percentile(95))408409live, firsts, others = first.MakeFrames()410TestCorrelation(live)411PlotPregLengths(live, firsts, others)412413TestChiSquared()414415416if __name__ == '__main__':417main()418419420421