| Download
Think Stats by Allen B. Downey Think Stats is an introduction to Probability and Statistics for Python programmers.
This is the accompanying code for this book.
Project: Support and Testing
Views: 7115 | License: GPL3
"""This file contains code used in "Think Stats",
by Allen B. Downey, available from greenteapress.com

Copyright 2010 Allen B. Downey
License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html
"""

from __future__ import print_function, division

import numpy as np

import nsfg
import first

import thinkstats2
import thinkplot


def PercentileRank(scores, your_score):
    """Computes the percentile rank relative to a sample of scores.

    scores: sequence of comparable values
    your_score: value whose rank is computed

    returns: float percentage (0-100) of scores less than or equal
             to your_score

    Raises ZeroDivisionError if scores is empty.
    """
    count = sum(1 for score in scores if score <= your_score)
    return 100.0 * count / len(scores)


scores = [55, 66, 77, 88, 99]
your_score = 88

print('score, percentile rank')
for score in scores:
    print(score, PercentileRank(scores, score))
print()


def Percentile(scores, percentile_rank):
    """Computes the value that corresponds to a given percentile rank.

    scores: sequence of comparable values (not modified)
    percentile_rank: number in the range 0-100

    returns: the smallest score whose percentile rank is at least
             percentile_rank, or None if no score qualifies
    """
    # Work on a sorted copy so the caller's sequence is left untouched.
    ordered = sorted(scores)
    for score in ordered:
        if PercentileRank(ordered, score) >= percentile_rank:
            return score
    return None


def Percentile2(scores, percentile_rank):
    """Computes the value that corresponds to a given percentile rank.

    Slightly more efficient: indexes directly into the sorted scores
    instead of evaluating the percentile rank of each one.

    scores: sequence of comparable values (not modified)
    percentile_rank: number in the range 0-100

    returns: the score at position percentile_rank * (n-1) // 100 in
             the sorted scores
    """
    # Work on a sorted copy so the caller's sequence is left untouched.
    ordered = sorted(scores)
    # Floor division of a float rank yields a float, which cannot be
    # used as a list index; int() leaves integer ranks unchanged.
    index = int(percentile_rank * (len(ordered) - 1) // 100)
    return ordered[index]


print('prank, score, score')
for percentile_rank in [0, 20, 25, 40, 50, 60, 75, 80, 100]:
    print(percentile_rank,
          Percentile(scores, percentile_rank),
          Percentile2(scores, percentile_rank))


def EvalCdf(sample, x):
    """Computes CDF(x) in a sample.

    sample: sequence
    x: value

    returns: cumulative probability, the fraction of values in sample
             that are less than or equal to x

    Raises ZeroDivisionError if sample is empty.
    """
    # Start from 0.0 so the count (and the result) is always a float.
    count = sum((1.0 for value in sample if value <= x), 0.0)
    return count / len(sample)


sample = [1, 2, 2, 3, 5]

print('x', 'CDF(x)')
for x in range(0, 7):
    print(x, EvalCdf(sample, x))


def PositionToPercentile(position, field_size):
    """Converts from position in the field to percentile.

    position: int
    field_size: int

    returns: float percentile, 100 * (field_size - position + 1) / field_size
    """
    # Number of entrants at or behind this position (includes itself).
    beat = field_size - position + 1
    percentile = 100.0 * beat / field_size
    return percentile


def PercentileToPosition(percentile, field_size):
    """Converts from percentile to hypothetical position in the field.

    Inverse of PositionToPercentile; the result is not rounded, so it
    may be fractional.

    percentile: 0-100
    field_size: int

    returns: float position
    """
    beat = percentile * field_size / 100.0
    position = field_size - beat + 1
    return position


# my time 42:44
print('Percentile rank in field', PositionToPercentile(97, 1633))
print('Percentile rank in age group', PositionToPercentile(26, 256))

percentile = PositionToPercentile(26, 256)
print('Equivalent position in M50-59', PercentileToPosition(percentile, 171))
# 17th place = 46:05
print('Equivalent position in F20-29', PercentileToPosition(percentile, 448))
# 48:28


def MakeExample():
    """Makes a simple example CDF.

    Builds a thinkstats2.Cdf from a small hard-coded list and passes
    it to thinkplot.Save with root 'cumulative_example_cdf'.
    """
    t = [2, 1, 3, 2, 5]
    cdf = thinkstats2.Cdf(t)
    thinkplot.Clf()
    thinkplot.Cdf(cdf)
    thinkplot.Save(root='cumulative_example_cdf',
                   xlabel='x',
                   ylabel='CDF',
                   axis=[0, 6, 0, 1],
                   legend=False)


def MakeFigures(live, firsts, others):
    """Creates several figures for the book.

    Prints the number of birth weights before and after dropping
    missing values, then plots the PMFs and the CDFs of totalwgt_lb
    for first babies and others.

    live: DataFrame (not used here)
    firsts: DataFrame
    others: DataFrame
    """

    first_wgt = firsts.totalwgt_lb
    first_wgt_dropna = first_wgt.dropna()
    print('Firsts', len(first_wgt), len(first_wgt_dropna))
    #assert len(first_wgt_dropna) == 4381

    other_wgt = others.totalwgt_lb
    other_wgt_dropna = other_wgt.dropna()
    print('Others', len(other_wgt), len(other_wgt_dropna))
    #assert len(other_wgt_dropna) == 4706

    first_pmf = thinkstats2.Pmf(first_wgt_dropna, label='first')
    other_pmf = thinkstats2.Pmf(other_wgt_dropna, label='other')

    width = 0.4 / 16

    # plot PMFs of birth weights for first babies and others
    thinkplot.PrePlot(2)
    thinkplot.Hist(first_pmf, align='right', width=width)
    thinkplot.Hist(other_pmf, align='left', width=width)
    thinkplot.Save(root='cumulative_birthwgt_pmf',
                   title='Birth weight',
                   xlabel='weight (pounds)',
                   ylabel='PMF')

    # plot CDFs of birth weights for first babies and others
    first_cdf = thinkstats2.Cdf(firsts.totalwgt_lb, label='first')
    other_cdf = thinkstats2.Cdf(others.totalwgt_lb, label='other')

    thinkplot.PrePlot(2)
    thinkplot.Cdfs([first_cdf, other_cdf])
    thinkplot.Save(root='cumulative_birthwgt_cdf',
                   title='Birth weight',
                   xlabel='weight (pounds)',
                   ylabel='CDF',
                   axis=[0, 12.5, 0, 1]
                   )


def MakeCdf(live):
    """Plot the CDF of pregnancy lengths for live births.

    live: DataFrame for live births
    """
    cdf = thinkstats2.Cdf(live.prglngth, label='prglngth')
    thinkplot.Cdf(cdf)
    thinkplot.Save('cumulative_prglngth_cdf',
                   title='Pregnancy length',
                   xlabel='weeks',
                   ylabel='CDF')


def RandomFigure(live):
    """Plots the CDF of percentile ranks for a random sample of weights.

    Draws 100 values with replacement from live.totalwgt_lb, looks up
    the percentile rank of each in the CDF of all weights, and plots
    the CDF of those ranks.

    live: DataFrame for live births
    """
    weights = live.totalwgt_lb
    cdf = thinkstats2.Cdf(weights, label='totalwgt_lb')

    sample = np.random.choice(weights, 100, replace=True)
    ranks = [cdf.PercentileRank(x) for x in sample]

    rank_cdf = thinkstats2.Cdf(ranks, label='percentile ranks')
    thinkplot.Cdf(rank_cdf)
    thinkplot.Save(root='cumulative_random',
                   xlabel='percentile rank',
                   ylabel='CDF')


def TestSample(live):
    """Plots the distribution of weights against a random sample.

    The sample of 1000 values is drawn with cdf.Sample from the CDF of
    live.totalwgt_lb, and both CDFs are plotted together.

    live: DataFrame for live births
    """
    weights = live.totalwgt_lb
    cdf = thinkstats2.Cdf(weights, label='totalwgt_lb')

    sample = cdf.Sample(1000)
    sample_cdf = thinkstats2.Cdf(sample, label='sample')

    thinkplot.PrePlot(2)
    thinkplot.Cdfs([cdf, sample_cdf])
    thinkplot.Save(root='cumulative_sample',
                   xlabel='weight (pounds)',
                   ylabel='CDF')


def main(name, data_dir=''):
    """Generates all of the figures in this module.

    name: script name (sys.argv[0] when run from the command line);
          not used
    data_dir: optional second command-line argument; not used
    """
    # Fixed seed, set before any of the sampling-based figures are made.
    thinkstats2.RandomSeed(17)

    MakeExample()
    live, firsts, others = first.MakeFrames()
    RandomFigure(live)
    TestSample(live)
    MakeCdf(live)
    MakeFigures(live, firsts, others)


if __name__ == '__main__':
    import sys
    main(*sys.argv)