| Download
Think Stats by Allen B. Downey Think Stats is an introduction to Probability and Statistics for Python programmers.
This is the accompanying code for this book.
Project: Support and Testing
Views: 7115 | License: GPL3
"""This file contains code for use with "Think Stats",
by Allen B. Downey, available from greenteapress.com

Copyright 2010 Allen B. Downey
License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html
"""

from __future__ import print_function, division

import math
import sys
import pandas
import numpy as np

import thinkstats2
import thinkplot


def Summarize(df, column, title):
    """Prints summary statistics for all respondents, males and females.

    For each group prints the count, mean, variance, standard deviation
    and coefficient of variation (std / mean) of the chosen column.

    df: DataFrame with a `sex` column (1 selects 'male', 2 selects 'female')
    column: string name of the column to summarize
    title: string printed before the table
    """
    items = [
        ('all', df[column]),
        ('male', df[df.sex == 1][column]),
        ('female', df[df.sex == 2][column]),
    ]

    print(title)
    print('key\tn\tmean\tvar\tstd\tcv')
    for key, series in items:
        mean, var = series.mean(), series.var()
        std = math.sqrt(var)
        cv = std / mean
        t = key, len(series), mean, var, std, cv
        print('%s\t%d\t%4.2f\t%4.2f\t%4.2f\t%4.4f' % t)


def CleanBrfssFrame(df):
    """Recodes BRFSS variables.

    Replaces the sentinel codes in `age`, `htm3`, `wtkg2` and `wtyrago`
    with NaN, divides `wtkg2` by 100, and rescales `wtyrago`.  The
    DataFrame is modified in place; nothing is returned.

    df: DataFrame
    """
    nan = float('NaN')

    # Each cleaned column is assigned back to the frame explicitly.
    # Calling `df.col.replace(..., inplace=True)` acts on a temporary
    # Series; with pandas Copy-on-Write that never reaches `df`, so the
    # sentinel codes would silently survive.

    # clean age
    df['age'] = df['age'].replace([7, 9], nan)

    # clean height
    df['htm3'] = df['htm3'].replace([999], nan)

    # clean weight
    df['wtkg2'] = df['wtkg2'].replace([99999], nan) / 100.0

    # clean weight a year ago
    wtyrago = df['wtyrago'].replace([7777, 9999], nan)
    # Values below 9000 are divided by 2.2 (presumably pounds -> kg --
    # confirm against the codebook); values of 9000 and above have the
    # 9000 offset removed.  NaN compares False and stays NaN.
    df['wtyrago'] = np.where(wtyrago < 9000, wtyrago / 2.2, wtyrago - 9000)


def ReadBrfss(filename='CDBRFS08.ASC.gz', compression='gzip', nrows=None):
    """Reads the BRFSS data.

    Builds the fixed-width column specification, reads the file with
    thinkstats2.FixedWidthVariables and cleans the result with
    CleanBrfssFrame.

    filename: string
    compression: string
    nrows: int number of rows to read, or None for all

    returns: DataFrame
    """
    var_info = [
        ('age', 101, 102, int),
        ('sex', 143, 143, int),
        ('wtyrago', 127, 130, int),
        ('finalwt', 799, 808, int),
        ('wtkg2', 1254, 1258, int),
        ('htm3', 1251, 1253, int),
    ]
    columns = ['name', 'start', 'end', 'type']
    variables = pandas.DataFrame(var_info, columns=columns)

    # The end positions above are bumped by one before being handed to
    # FixedWidthVariables (presumably it expects an exclusive end --
    # confirm against thinkstats2).
    variables['end'] += 1
    dct = thinkstats2.FixedWidthVariables(variables, index_base=1)

    df = dct.ReadFixedWidth(filename, compression=compression, nrows=nrows)
    CleanBrfssFrame(df)
    return df


def MakeNormalModel(weights):
    """Plots a CDF with a Normal model.

    Prints n, mean and std, then plots a normal CDF rendered over
    mean +/- 4 std together with the empirical CDF of the data.

    weights: sequence
    """
    cdf = thinkstats2.Cdf(weights, label='weights')

    mean, var = thinkstats2.TrimmedMeanVar(weights)
    std = math.sqrt(var)
    print('n, mean, std', len(weights), mean, std)

    xmin = mean - 4 * std
    xmax = mean + 4 * std

    xs, ps = thinkstats2.RenderNormalCdf(mean, std, xmin, xmax)
    thinkplot.Plot(xs, ps, label='model', linewidth=4, color='0.8')
    thinkplot.Cdf(cdf)


def MakeNormalPlot(weights):
    """Generates a normal probability plot of weights.

    Plots a model line from thinkstats2.FitLine over xs = [-5, 5] using
    the trimmed mean and std, then the normal probability values of the
    data.

    weights: sequence
    """
    mean, var = thinkstats2.TrimmedMeanVar(weights, p=0.01)
    std = math.sqrt(var)

    xs = [-5, 5]
    xs, ys = thinkstats2.FitLine(xs, mean, std)
    thinkplot.Plot(xs, ys, color='0.8', label='model')

    xs, ys = thinkstats2.NormalProbability(weights)
    thinkplot.Plot(xs, ys, label='weights')


def MakeFigures(df):
    """Generates CDFs and normal prob plots for weights and log weights.

    Saves two figures with roots 'brfss_weight' and
    'brfss_weight_normal'.

    df: DataFrame with a `wtkg2` column
    """
    weights = df.wtkg2.dropna()
    log_weights = np.log10(weights)

    # plot weights on linear and log scales
    thinkplot.PrePlot(cols=2)
    MakeNormalModel(weights)
    thinkplot.Config(xlabel='adult weight (kg)', ylabel='CDF')

    thinkplot.SubPlot(2)
    MakeNormalModel(log_weights)
    thinkplot.Config(xlabel='adult weight (log10 kg)')

    thinkplot.Save(root='brfss_weight')

    # make normal probability plots on linear and log scales
    thinkplot.PrePlot(cols=2)
    MakeNormalPlot(weights)
    thinkplot.Config(xlabel='z', ylabel='weights (kg)')

    thinkplot.SubPlot(2)
    MakeNormalPlot(log_weights)
    thinkplot.Config(xlabel='z', ylabel='weights (log10 kg)')

    thinkplot.Save(root='brfss_weight_normal')


def main(script, nrows=1000):
    """Tests the functions in this module.

    script: string script name
    nrows: number of rows to read; converted with int(), so a
           command-line string is accepted
    """
    thinkstats2.RandomSeed(17)

    nrows = int(nrows)
    df = ReadBrfss(nrows=nrows)
    MakeFigures(df)

    Summarize(df, 'htm3', 'Height (cm):')
    Summarize(df, 'wtkg2', 'Weight (kg):')
    Summarize(df, 'wtyrago', 'Weight year ago (kg):')

    # The expected counts below are only checked for the default
    # 1000-row read.
    if nrows == 1000:
        assert(df.age.value_counts()[40] == 28)
        assert(df.sex.value_counts()[2] == 668)
        assert(df.wtkg2.value_counts()[90.91] == 49)
        assert(df.wtyrago.value_counts()[160/2.2] == 49)
        assert(df.htm3.value_counts()[163] == 103)
        assert(df.finalwt.value_counts()[185.870345] == 13)
        print('%s: All tests passed.' % script)


if __name__ == '__main__':
    main(*sys.argv)