| Download
Think Stats by Allen B. Downey Think Stats is an introduction to Probability and Statistics for Python programmers.
This is the accompanying code for this book.
Project: Support and Testing
Views: 7119License: GPL3
"""This file contains code used in "Think Stats",1by Allen B. Downey, available from greenteapress.com23Copyright 2010 Allen B. Downey4License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html5"""67from __future__ import print_function, division89import nsfg10import nsfg211import first1213import thinkstats214import thinkplot1516import copy17import random18import numpy as np19import matplotlib.pyplot as pyplot202122class CoinTest(thinkstats2.HypothesisTest):23"""Tests the hypothesis that a coin is fair."""2425def TestStatistic(self, data):26"""Computes the test statistic.2728data: data in whatever form is relevant29"""30heads, tails = data31test_stat = abs(heads - tails)32return test_stat3334def RunModel(self):35"""Run the model of the null hypothesis.3637returns: simulated data38"""39heads, tails = self.data40n = heads + tails41sample = [random.choice('HT') for _ in range(n)]42hist = thinkstats2.Hist(sample)43data = hist['H'], hist['T']44return data454647class DiffMeansPermute(thinkstats2.HypothesisTest):48"""Tests a difference in means by permutation."""4950def TestStatistic(self, data):51"""Computes the test statistic.5253data: data in whatever form is relevant54"""55group1, group2 = data56test_stat = abs(group1.mean() - group2.mean())57return test_stat5859def MakeModel(self):60"""Build a model of the null hypothesis.61"""62group1, group2 = self.data63self.n, self.m = len(group1), len(group2)64self.pool = np.hstack((group1, group2))6566def RunModel(self):67"""Run the model of the null hypothesis.6869returns: simulated data70"""71np.random.shuffle(self.pool)72data = self.pool[:self.n], self.pool[self.n:]73return data747576class DiffMeansOneSided(DiffMeansPermute):77"""Tests a one-sided difference in means by permutation."""7879def TestStatistic(self, data):80"""Computes the test statistic.8182data: data in whatever form is relevant83"""84group1, group2 = data85test_stat = group1.mean() - group2.mean()86return test_stat878889class DiffStdPermute(DiffMeansPermute):90"""Tests a one-sided difference in standard deviation by permutation."""9192def TestStatistic(self, data):93"""Computes the test statistic.9495data: data in whatever form is relevant96"""97group1, group2 = data98test_stat = group1.std() - group2.std()99return test_stat100101102class CorrelationPermute(thinkstats2.HypothesisTest):103"""Tests correlations by permutation."""104105def TestStatistic(self, data):106"""Computes the test statistic.107108data: tuple of xs and ys109"""110xs, ys = data111test_stat = abs(thinkstats2.Corr(xs, ys))112return test_stat113114def RunModel(self):115"""Run the model of the null hypothesis.116117returns: simulated data118"""119xs, ys = self.data120xs = np.random.permutation(xs)121return xs, ys122123124class DiceTest(thinkstats2.HypothesisTest):125"""Tests whether a six-sided die is fair."""126127def TestStatistic(self, data):128"""Computes the test statistic.129130data: list of frequencies131"""132observed = data133n = sum(observed)134expected = np.ones(6) * n / 6135test_stat = sum(abs(observed - expected))136return test_stat137138def RunModel(self):139"""Run the model of the null hypothesis.140141returns: simulated data142"""143n = sum(self.data)144values = [1,2,3,4,5,6]145rolls = np.random.choice(values, n, replace=True)146hist = thinkstats2.Hist(rolls)147freqs = hist.Freqs(values)148return freqs149150151class DiceChiTest(DiceTest):152"""Tests a six-sided die using a chi-squared statistic."""153154def TestStatistic(self, data):155"""Computes the test statistic.156157data: list of frequencies158"""159observed = data160n = sum(observed)161expected = np.ones(6) * n / 6162test_stat = sum((observed - expected)**2 / expected)163return test_stat164165166class PregLengthTest(thinkstats2.HypothesisTest):167"""Tests difference in pregnancy length using a chi-squared statistic."""168169def TestStatistic(self, data):170"""Computes the test statistic.171172data: pair of lists of pregnancy lengths173"""174firsts, others = data175stat = self.ChiSquared(firsts) + self.ChiSquared(others)176return stat177178def ChiSquared(self, lengths):179"""Computes the chi-squared statistic.180181lengths: sequence of lengths182183returns: float184"""185hist = thinkstats2.Hist(lengths)186observed = np.array(hist.Freqs(self.values))187expected = self.expected_probs * len(lengths)188stat = sum((observed - expected)**2 / expected)189return stat190191def MakeModel(self):192"""Build a model of the null hypothesis.193"""194firsts, others = self.data195self.n = len(firsts)196self.pool = np.hstack((firsts, others))197198pmf = thinkstats2.Pmf(self.pool)199self.values = range(35, 44)200self.expected_probs = np.array(pmf.Probs(self.values))201202def RunModel(self):203"""Run the model of the null hypothesis.204205returns: simulated data206"""207np.random.shuffle(self.pool)208data = self.pool[:self.n], self.pool[self.n:]209return data210211212def RunDiceTest():213"""Tests whether a die is fair.214"""215data = [8, 9, 19, 5, 8, 11]216dt = DiceTest(data)217print('dice test', dt.PValue(iters=10000))218dt = DiceChiTest(data)219print('dice chi test', dt.PValue(iters=10000))220221222def FalseNegRate(data, num_runs=1000):223"""Computes the chance of a false negative based on resampling.224225data: pair of sequences226num_runs: how many experiments to simulate227228returns: float false negative rate229"""230group1, group2 = data231count = 0232233for i in range(num_runs):234sample1 = thinkstats2.Resample(group1)235sample2 = thinkstats2.Resample(group2)236ht = DiffMeansPermute((sample1, sample2))237p_value = ht.PValue(iters=101)238if p_value > 0.05:239count += 1240241return count / num_runs242243244def PrintTest(p_value, ht):245"""Prints results from a hypothesis test.246247p_value: float248ht: HypothesisTest249"""250print('p-value =', p_value)251print('actual =', ht.actual)252print('ts max =', ht.MaxTestStat())253254255def RunTests(data, iters=1000):256"""Runs several tests on the given data.257258data: pair of sequences259iters: number of iterations to run260"""261262# test the difference in means263ht = DiffMeansPermute(data)264p_value = ht.PValue(iters=iters)265print('\nmeans permute two-sided')266PrintTest(p_value, ht)267268ht.PlotCdf()269thinkplot.Save(root='hypothesis1',270title='Permutation test',271xlabel='difference in means (weeks)',272ylabel='CDF',273legend=False)274275# test the difference in means one-sided276ht = DiffMeansOneSided(data)277p_value = ht.PValue(iters=iters)278print('\nmeans permute one-sided')279PrintTest(p_value, ht)280281# test the difference in std282ht = DiffStdPermute(data)283p_value = ht.PValue(iters=iters)284print('\nstd permute one-sided')285PrintTest(p_value, ht)286287288def ReplicateTests():289"""Replicates tests with the new NSFG data."""290291live, firsts, others = nsfg2.MakeFrames()292293# compare pregnancy lengths294print('\nprglngth2')295data = firsts.prglngth.values, others.prglngth.values296ht = DiffMeansPermute(data)297p_value = ht.PValue(iters=1000)298print('means permute two-sided')299PrintTest(p_value, ht)300301print('\nbirth weight 2')302data = (firsts.totalwgt_lb.dropna().values,303others.totalwgt_lb.dropna().values)304ht = DiffMeansPermute(data)305p_value = ht.PValue(iters=1000)306print('means permute two-sided')307PrintTest(p_value, ht)308309# test correlation310live2 = live.dropna(subset=['agepreg', 'totalwgt_lb'])311data = live2.agepreg.values, live2.totalwgt_lb.values312ht = CorrelationPermute(data)313p_value = ht.PValue()314print('\nage weight correlation 2')315PrintTest(p_value, ht)316317# compare pregnancy lengths (chi-squared)318data = firsts.prglngth.values, others.prglngth.values319ht = PregLengthTest(data)320p_value = ht.PValue()321print('\npregnancy length chi-squared 2')322PrintTest(p_value, ht)323324325def main():326thinkstats2.RandomSeed(17)327328# run the coin test329ct = CoinTest((140, 110))330pvalue = ct.PValue()331print('coin test p-value', pvalue)332333# compare pregnancy lengths334print('\nprglngth')335live, firsts, others = first.MakeFrames()336data = firsts.prglngth.values, others.prglngth.values337RunTests(data)338339# compare birth weights340print('\nbirth weight')341data = (firsts.totalwgt_lb.dropna().values,342others.totalwgt_lb.dropna().values)343ht = DiffMeansPermute(data)344p_value = ht.PValue(iters=1000)345print('means permute two-sided')346PrintTest(p_value, ht)347348# test correlation349live2 = live.dropna(subset=['agepreg', 'totalwgt_lb'])350data = live2.agepreg.values, live2.totalwgt_lb.values351ht = CorrelationPermute(data)352p_value = ht.PValue()353print('\nage weight correlation')354print('n=', len(live2))355PrintTest(p_value, ht)356357# run the dice test358RunDiceTest()359360# compare pregnancy lengths (chi-squared)361data = firsts.prglngth.values, others.prglngth.values362ht = PregLengthTest(data)363p_value = ht.PValue()364print('\npregnancy length chi-squared')365PrintTest(p_value, ht)366367# compute the false negative rate for difference in pregnancy length368data = firsts.prglngth.values, others.prglngth.values369neg_rate = FalseNegRate(data)370print('false neg rate', neg_rate)371372# run the tests with new nsfg data373ReplicateTests()374375376if __name__ == "__main__":377main()378379380