| Download
Think Stats by Allen B. Downey Think Stats is an introduction to Probability and Statistics for Python programmers.
This is the accompanying code for this book.
Project: Support and Testing
Views: 7119
License: GPL3
"""This file contains code used in "Think Stats",1by Allen B. Downey, available from greenteapress.com23Copyright 2010 Allen B. Downey4License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html5"""67from __future__ import print_function89import csv10import logging11import sys12import numpy as np13import pandas1415import thinkplot16import thinkstats2171819def ReadData(filename='PEP_2012_PEPANNRES_with_ann.csv'):20"""Reads filename and returns populations in thousands2122filename: string2324returns: pandas Series of populations in thousands25"""26df = pandas.read_csv(filename, header=None, skiprows=2,27encoding='iso-8859-1')28populations = df[7]29populations.replace(0, np.nan, inplace=True)30return populations.dropna()313233def MakeFigures():34"""Plots the CDF of populations in several forms.3536On a log-log scale the tail of the CCDF looks like a straight line,37which suggests a Pareto distribution, but that turns out to be misleading.3839On a log-x scale the distribution has the characteristic sigmoid of40a lognormal distribution.4142The normal probability plot of log(sizes) confirms that the data fit the43lognormal model very well.4445Many phenomena that have been described with Pareto models can be described46as well, or better, with lognormal models.47"""48pops = ReadData()49print('Number of cities/towns', len(pops))5051log_pops = np.log10(pops)52cdf = thinkstats2.Cdf(pops, label='data')53cdf_log = thinkstats2.Cdf(log_pops, label='data')5455# pareto plot56xs, ys = thinkstats2.RenderParetoCdf(xmin=5000, alpha=1.4, low=0, high=1e7)57thinkplot.Plot(np.log10(xs), 1-ys, label='model', color='0.8')5859thinkplot.Cdf(cdf_log, complement=True)60thinkplot.Config(xlabel='log10 population',61ylabel='CCDF',62yscale='log')63thinkplot.Save(root='populations_pareto')6465# lognormal plot66thinkplot.PrePlot(cols=2)6768mu, sigma = log_pops.mean(), log_pops.std()69xs, ps = thinkstats2.RenderNormalCdf(mu, sigma, low=0, high=8)70thinkplot.Plot(xs, ps, label='model', 
color='0.8')7172thinkplot.Cdf(cdf_log)73thinkplot.Config(xlabel='log10 population',74ylabel='CDF')7576thinkplot.SubPlot(2)77thinkstats2.NormalProbabilityPlot(log_pops, label='data')78thinkplot.Config(xlabel='z',79ylabel='log10 population',80xlim=[-5, 5])8182thinkplot.Save(root='populations_normal')838485def main():86thinkstats2.RandomSeed(17)87MakeFigures()888990if __name__ == "__main__":91main()929394