| Download
Think Stats by Allen B. Downey Think Stats is an introduction to Probability and Statistics for Python programmers.
This is the accompanying code for this book.
Project: Support and Testing
Views: 7115 | License: GPL3
"""This file contains code for use with "Think Stats",1by Allen B. Downey, available from greenteapress.com23Copyright 2014 Allen B. Downey4License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html5"""67from __future__ import print_function89import sys10import numpy as np11import math1213import first14import thinkplot15import thinkstats2161718"""This file contains a solution to an exercise in Think Stats:1920Using data from the NSFG, make a scatter plot of birth weight21versus mother's age. Plot percentiles of birth weight22versus mother's age. Compute Pearson's and Spearman's correlations.23How would you characterize the relationship24between these variables?2526My conclusions:27281) The scatterplot shows a weak relationship between the variables.29302) The correlations support this. Pearson's is around 0.07, Spearman's31is around 0.09. The difference between them suggests some influence32of outliers or a non-linear relationsip.33343) Plotting percentiles of weight versus age suggests that the35relationship is non-linear. Birth weight increases more quickly36in the range of mother's age from 15 to 25. 
After that, the effect37is weaker.3839"""4041def ScatterPlot(ages, weights, alpha=1.0):42"""Make a scatter plot and save it.4344ages: sequence of float45weights: sequence of float46alpha: float47"""48thinkplot.Scatter(ages, weights, alpha=alpha)49thinkplot.Config(xlabel='age (years)',50ylabel='weight (lbs)',51xlim=[10, 45],52ylim=[0, 15],53legend=False)545556def HexBin(ages, weights, bins=None):57"""Make a hexbin plot and save it.5859ages: sequence of float60weights: sequence of float61bins: 'log' or None for linear62"""63thinkplot.HexBin(ages, weights, bins=bins)64thinkplot.Config(xlabel='age (years)',65ylabel='weight (lbs)',66legend=False)676869def BinnedPercentiles(df):70"""Bin the data by age and plot percentiles of weight for each bin.7172df: DataFrame73"""74bins = np.arange(10, 48, 3)75indices = np.digitize(df.agepreg, bins)76groups = df.groupby(indices)7778ages = [group.agepreg.mean() for i, group in groups][1:-1]79cdfs = [thinkstats2.Cdf(group.totalwgt_lb) for i, group in groups][1:-1]8081thinkplot.PrePlot(3)82for percent in [75, 50, 25]:83weights = [cdf.Percentile(percent) for cdf in cdfs]84label = '%dth' % percent85thinkplot.Plot(ages, weights, label=label)8687thinkplot.Save(root='chap07scatter3',88formats=['jpg'],89xlabel="mother's age (years)",90ylabel='birth weight (lbs)')91929394def main(script):95thinkstats2.RandomSeed(17)9697live, firsts, others = first.MakeFrames()98live = live.dropna(subset=['agepreg', 'totalwgt_lb'])99BinnedPercentiles(live)100101ages = live.agepreg102weights = live.totalwgt_lb103print('thinkstats2 Corr', thinkstats2.Corr(ages, weights))104print('thinkstats2 SpearmanCorr',105thinkstats2.SpearmanCorr(ages, weights))106107ScatterPlot(ages, weights, alpha=0.1)108thinkplot.Save(root='chap07scatter1',109legend=False,110formats=['jpg'])111112113if __name__ == '__main__':114main(*sys.argv)115116117