Contact
CoCalc Logo Icon
Store | Features | Docs | Share | Support | News | About | Sign Up | Sign In
| Download

Think Stats, by Allen B. Downey. Think Stats is an introduction to probability and statistics for Python programmers.

This is the accompanying code for this book.

Website: http://greenteapress.com/wp/think-stats-2e/

Views: 7119
License: GPL3
"""This file contains code used in "Think Stats",
by Allen B. Downey, available from greenteapress.com

Copyright 2010 Allen B. Downey
License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html
"""
7
8
from __future__ import print_function
9
10
import csv
11
import logging
12
import sys
13
import numpy as np
14
import pandas
15
16
import thinkplot
17
import thinkstats2
18
19
20
def ReadData(filename='PEP_2012_PEPANNRES_with_ann.csv'):
    """Reads population values from a CSV file.

    The first two rows of the file are skipped and the rest is read without
    a header row, so columns are addressed by position.  The values are
    taken from column 7; zeros are treated as missing and are dropped
    together with any other missing entries.

    NOTE(review): the values are returned exactly as stored in the file;
    no scaling is applied here.  Earlier documentation described the result
    as "populations in thousands" -- confirm the units against the data.

    filename: string

    returns: pandas Series of populations with zero/missing entries removed
    """
    df = pandas.read_csv(filename, header=None, skiprows=2,
                         encoding='iso-8859-1')
    # Build a new Series instead of replacing in place, so the column held
    # by the DataFrame is never mutated through an alias.
    populations = df[7].replace(0, np.nan)
    return populations.dropna()
32
33
34
def MakeFigures():
    """Plots the CDF of populations in several forms.

    On a log-log scale the tail of the CCDF looks like a straight line,
    which suggests a Pareto distribution, but that turns out to be misleading.

    On a log-x scale the distribution has the characteristic sigmoid of
    a lognormal distribution.

    The normal probability plot of log(sizes) confirms that the data fit the
    lognormal model very well.

    Many phenomena that have been described with Pareto models can be described
    as well, or better, with lognormal models.

    Two figures are saved through thinkplot.Save, with roots
    'populations_pareto' and 'populations_normal'.
    """
    pops = ReadData()
    print('Number of cities/towns', len(pops))

    # All plots below work with log10 of the populations.
    log_pops = np.log10(pops)
    cdf_log = thinkstats2.Cdf(log_pops, label='data')

    # pareto plot: the model's complementary CDF (1 - ys) is drawn against
    # log10(xs) so that it shares the x axis of the data's CCDF.
    xs, ys = thinkstats2.RenderParetoCdf(xmin=5000, alpha=1.4, low=0, high=1e7)
    thinkplot.Plot(np.log10(xs), 1-ys, label='model', color='0.8')

    thinkplot.Cdf(cdf_log, complement=True)
    thinkplot.Config(xlabel='log10 population',
                     ylabel='CCDF',
                     yscale='log')
    thinkplot.Save(root='populations_pareto')

    # lognormal plot: two-column figure, model-vs-data CDF first and the
    # normal probability plot second.
    thinkplot.PrePlot(cols=2)

    # A normal model of log10(population), using the sample mean and
    # standard deviation as its parameters.
    mu, sigma = log_pops.mean(), log_pops.std()
    xs, ps = thinkstats2.RenderNormalCdf(mu, sigma, low=0, high=8)
    thinkplot.Plot(xs, ps, label='model', color='0.8')

    thinkplot.Cdf(cdf_log)
    thinkplot.Config(xlabel='log10 population',
                     ylabel='CDF')

    thinkplot.SubPlot(2)
    thinkstats2.NormalProbabilityPlot(log_pops, label='data')
    thinkplot.Config(xlabel='z',
                     ylabel='log10 population',
                     xlim=[-5, 5])

    thinkplot.Save(root='populations_normal')
84
85
86
def main():
    """Script entry point: sets a fixed seed, then generates the figures.

    Calls thinkstats2.RandomSeed(17) before MakeFigures(); the fixed seed is
    presumably there to make the output reproducible between runs.
    """
    thinkstats2.RandomSeed(17)
    MakeFigures()
89
90
91
# Run main() only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
93
94