| Download
Think Stats by Allen B. Downey. Think Stats is an introduction to Probability and Statistics for Python programmers.
This is the accompanying code for this book.
Project: Support and Testing
Views: 7119 | License: GPL3
"""This file contains code for use with "Think Stats",1by Allen B. Downey, available from greenteapress.com23Copyright 2010 Allen B. Downey4License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html5"""67from __future__ import print_function89import sys10import numpy as np11import math1213import brfss14import thinkplot15import thinkstats2161718def GetHeightWeight(df, hjitter=0.0, wjitter=0.0):19"""Get sequences of height and weight.2021df: DataFrame with htm3 and wtkg222hjitter: float magnitude of random noise added to heights23wjitter: float magnitude of random noise added to weights2425returns: tuple of sequences (heights, weights)26"""27heights = df.htm328if hjitter:29heights = thinkstats2.Jitter(heights, hjitter)3031weights = df.wtkg232if wjitter:33weights = thinkstats2.Jitter(weights, wjitter)3435return heights, weights363738def ScatterPlot(heights, weights, alpha=1.0):39"""Make a scatter plot and save it.4041heights: sequence of float42weights: sequence of float43alpha: float44"""45thinkplot.Scatter(heights, weights, alpha=alpha)46thinkplot.Config(xlabel='height (cm)',47ylabel='weight (kg)',48axis=[140, 210, 20, 200],49legend=False)505152def HexBin(heights, weights, bins=None):53"""Make a hexbin plot and save it.5455heights: sequence of float56weights: sequence of float57bins: 'log' or None for linear58"""59thinkplot.HexBin(heights, weights, bins=bins)60thinkplot.Config(xlabel='height (cm)',61ylabel='weight (kg)',62axis=[140, 210, 20, 200],63legend=False)646566def MakeFigures(df):67"""Make scatterplots.68"""69sample = thinkstats2.SampleRows(df, 5000)7071# simple scatter plot72thinkplot.PrePlot(cols=2)73heights, weights = GetHeightWeight(sample)74ScatterPlot(heights, weights)7576# scatter plot with jitter77thinkplot.SubPlot(2)78heights, weights = GetHeightWeight(sample, hjitter=1.3, wjitter=0.5)79ScatterPlot(heights, weights)8081thinkplot.Save(root='scatter1')8283# with jitter and transparency84thinkplot.PrePlot(cols=2)85ScatterPlot(heights, weights, 
alpha=0.1)8687# hexbin plot88thinkplot.SubPlot(2)89heights, weights = GetHeightWeight(df, hjitter=1.3, wjitter=0.5)90HexBin(heights, weights)91thinkplot.Save(root='scatter2')929394def BinnedPercentiles(df):95"""Bin the data by height and plot percentiles of weight for eachbin.9697df: DataFrame98"""99cdf = thinkstats2.Cdf(df.htm3)100print('Fraction between 140 and 200 cm', cdf[200] - cdf[140])101102bins = np.arange(135, 210, 5)103indices = np.digitize(df.htm3, bins)104groups = df.groupby(indices)105106heights = [group.htm3.mean() for i, group in groups][1:-1]107cdfs = [thinkstats2.Cdf(group.wtkg2) for i, group in groups][1:-1]108109thinkplot.PrePlot(3)110for percent in [75, 50, 25]:111weights = [cdf.Percentile(percent) for cdf in cdfs]112label = '%dth' % percent113thinkplot.Plot(heights, weights, label=label)114115thinkplot.Save(root='scatter3',116xlabel='height (cm)',117ylabel='weight (kg)')118119120def Correlations(df):121print('pandas cov', df.htm3.cov(df.wtkg2))122#print('NumPy cov', np.cov(df.htm3, df.wtkg2, ddof=0))123print('thinkstats2 Cov', thinkstats2.Cov(df.htm3, df.wtkg2))124print()125126print('pandas corr', df.htm3.corr(df.wtkg2))127#print('NumPy corrcoef', np.corrcoef(df.htm3, df.wtkg2, ddof=0))128print('thinkstats2 Corr', thinkstats2.Corr(df.htm3, df.wtkg2))129print()130131print('pandas corr spearman', df.htm3.corr(df.wtkg2, method='spearman'))132print('thinkstats2 SpearmanCorr',133thinkstats2.SpearmanCorr(df.htm3, df.wtkg2))134print('thinkstats2 SpearmanCorr log wtkg3',135thinkstats2.SpearmanCorr(df.htm3, np.log(df.wtkg2)))136print()137138print('thinkstats2 Corr log wtkg3',139thinkstats2.Corr(df.htm3, np.log(df.wtkg2)))140print()141142143def main(script):144thinkstats2.RandomSeed(17)145146df = brfss.ReadBrfss(nrows=None)147df = df.dropna(subset=['htm3', 'wtkg2'])148Correlations(df)149return150151MakeFigures(df)152BinnedPercentiles(df)153154155if __name__ == '__main__':156main(*sys.argv)157158159