| Download
Think Stats by Allen B. Downey Think Stats is an introduction to Probability and Statistics for Python programmers.
This is the accompanying code for this book.
Project: Support and Testing
Views: 7138License: GPL3
"""This file contains code used in "Think Stats",1by Allen B. Downey, available from greenteapress.com23Copyright 2010 Allen B. Downey4License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html5"""67from __future__ import print_function, division89import math1011import numpy as np12import pandas1314import nsfg15import thinkplot16import thinkstats2171819def ParetoMedian(xmin, alpha):20"""Computes the median of a Pareto distribution."""21return xmin * pow(2, 1/alpha)222324def MakeExpoCdf():25"""Generates a plot of the exponential CDF."""2627thinkplot.PrePlot(3)28for lam in [2.0, 1, 0.5]:29xs, ps = thinkstats2.RenderExpoCdf(lam, 0, 3.0, 50)30label = r'$\lambda=%g$' % lam31thinkplot.Plot(xs, ps, label=label)3233thinkplot.Save(root='analytic_expo_cdf',34title='Exponential CDF',35xlabel='x',36ylabel='CDF')373839def ReadBabyBoom(filename='babyboom.dat'):40"""Reads the babyboom data.4142filename: string4344returns: DataFrame45"""46var_info = [47('time', 1, 8, int),48('sex', 9, 16, int),49('weight_g', 17, 24, int),50('minutes', 25, 32, int),51]52columns = ['name', 'start', 'end', 'type']53variables = pandas.DataFrame(var_info, columns=columns)54variables.end += 155dct = thinkstats2.FixedWidthVariables(variables, index_base=1)5657df = dct.ReadFixedWidth(filename, skiprows=59)58return df596061def MakeBabyBoom():62"""Plot CDF of interarrival time on log and linear scales.63"""64# compute the interarrival times65df = ReadBabyBoom()66diffs = df.minutes.diff()67cdf = thinkstats2.Cdf(diffs, label='actual')6869thinkplot.PrePlot(cols=2)70thinkplot.Cdf(cdf)71thinkplot.Config(xlabel='minutes',72ylabel='CDF',73legend=False)7475thinkplot.SubPlot(2)76thinkplot.Cdf(cdf, complement=True)77thinkplot.Config(xlabel='minutes',78ylabel='CCDF',79yscale='log',80legend=False)8182thinkplot.Save(root='analytic_interarrivals',83legend=False)848586def MakeParetoCdf():87"""Generates a plot of the Pareto CDF."""88xmin = 0.58990thinkplot.PrePlot(3)91for alpha in [2.0, 1.0, 0.5]:92xs, ps = thinkstats2.RenderParetoCdf(xmin, alpha, 0, 10.0, n=100)93thinkplot.Plot(xs, ps, label=r'$\alpha=%g$' % alpha)9495thinkplot.Save(root='analytic_pareto_cdf',96title='Pareto CDF',97xlabel='x',98ylabel='CDF')99100101def MakeParetoCdf2():102"""Generates a plot of the CDF of height in Pareto World."""103xmin = 100104alpha = 1.7105xs, ps = thinkstats2.RenderParetoCdf(xmin, alpha, 0, 1000.0, n=100)106thinkplot.Plot(xs, ps)107108thinkplot.Save(root='analytic_pareto_height',109title='Pareto CDF',110xlabel='height (cm)',111ylabel='CDF',112legend=False)113114115def MakeNormalCdf():116"""Generates a plot of the normal CDF."""117118thinkplot.PrePlot(3)119120mus = [1.0, 2.0, 3.0]121sigmas = [0.5, 0.4, 0.3]122for mu, sigma in zip(mus, sigmas):123xs, ps = thinkstats2.RenderNormalCdf(mu=mu, sigma=sigma,124low=-1.0, high=4.0)125label = r'$\mu=%g$, $\sigma=%g$' % (mu, sigma)126thinkplot.Plot(xs, ps, label=label)127128thinkplot.Save(root='analytic_normal_cdf',129title='Normal CDF',130xlabel='x',131ylabel='CDF',132loc=2)133134135def MakeNormalModel(weights):136"""Plot the CDF of birthweights with a normal model."""137138# estimate parameters: trimming outliers yields a better fit139mu, var = thinkstats2.TrimmedMeanVar(weights, p=0.01)140print('Mean, Var', mu, var)141142# plot the model143sigma = math.sqrt(var)144print('Sigma', sigma)145xs, ps = thinkstats2.RenderNormalCdf(mu, sigma, low=0, high=12.5)146147thinkplot.Plot(xs, ps, label='model', color='0.8')148149# plot the data150cdf = thinkstats2.Cdf(weights, label='data')151152thinkplot.PrePlot(1)153thinkplot.Cdf(cdf)154thinkplot.Save(root='analytic_birthwgt_model',155title='Birth weights',156xlabel='birth weight (lbs)',157ylabel='CDF')158159160def MakeExampleNormalPlot():161"""Generates a sample normal probability plot.162"""163n = 1000164thinkplot.PrePlot(3)165166mus = [0, 1, 5]167sigmas = [1, 1, 2]168for mu, sigma in zip(mus, sigmas):169sample = np.random.normal(mu, sigma, n)170xs, ys = thinkstats2.NormalProbability(sample)171label = '$\mu=%d$, $\sigma=%d$' % (mu, sigma)172thinkplot.Plot(xs, ys, label=label)173174thinkplot.Save(root='analytic_normal_prob_example',175title='Normal probability plot',176xlabel='standard normal sample',177ylabel='sample values')178179180def MakeNormalPlot(weights, term_weights):181"""Generates a normal probability plot of birth weights."""182183mean, var = thinkstats2.TrimmedMeanVar(weights, p=0.01)184std = math.sqrt(var)185186xs = [-4, 4]187fxs, fys = thinkstats2.FitLine(xs, mean, std)188thinkplot.Plot(fxs, fys, linewidth=4, color='0.8')189190thinkplot.PrePlot(2)191xs, ys = thinkstats2.NormalProbability(weights)192thinkplot.Plot(xs, ys, label='all live')193194xs, ys = thinkstats2.NormalProbability(term_weights)195thinkplot.Plot(xs, ys, label='full term')196thinkplot.Save(root='analytic_birthwgt_normal',197title='Normal probability plot',198xlabel='Standard deviations from mean',199ylabel='Birth weight (lbs)')200201202def main():203thinkstats2.RandomSeed(18)204MakeExampleNormalPlot()205206# make the analytic CDFs207MakeExpoCdf()208MakeBabyBoom()209210MakeParetoCdf()211MakeParetoCdf2()212MakeNormalCdf()213214# test the distribution of birth weights for normality215preg = nsfg.ReadFemPreg()216full_term = preg[preg.prglngth >= 37]217218weights = preg.totalwgt_lb.dropna()219term_weights = full_term.totalwgt_lb.dropna()220221MakeNormalModel(weights)222MakeNormalPlot(weights, term_weights)223224225if __name__ == "__main__":226main()227228229