Contact
CoCalc Logo Icon
StoreFeaturesDocsShareSupport News AboutSign UpSign In
| Download

Think Stats, by Allen B. Downey. Think Stats is an introduction to Probability and Statistics for Python programmers.

This is the accompanying code for this book.

Website: http://greenteapress.com/wp/think-stats-2e/

Views: 7115
License: GPL3
1
"""This file contains code used in "Think Stats",
2
by Allen B. Downey, available from greenteapress.com
3
4
Copyright 2010 Allen B. Downey
5
License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html
6
"""
7
8
from __future__ import print_function, division
9
10
import numpy as np
11
12
import nsfg
13
import first
14
15
import thinkstats2
16
import thinkplot
17
18
19
def PercentileRank(scores, your_score):
    """Computes the percentile rank relative to a sample of scores.

    scores: sequence of comparable values
    your_score: value whose rank is wanted

    returns: percentage (0-100) of the values in scores that are less
             than or equal to your_score

    Raises ZeroDivisionError if scores is empty.
    """
    count = sum(1 for score in scores if score <= your_score)
    # 100.0 keeps the division in floating point
    return 100.0 * count / len(scores)
28
29
# Demo data: print the percentile rank of every score in the sample.
scores = [55, 66, 77, 88, 99]
your_score = 88

print('score, percentile rank')
for score in scores:
    print(score, PercentileRank(scores, score))
print()
36
37
def Percentile(scores, percentile_rank):
    """Computes the value that corresponds to a given percentile rank.

    scores: sequence of comparable values
    percentile_rank: 0-100

    returns: the smallest score whose percentile rank (as computed by
             PercentileRank) is at least percentile_rank, or None if
             no score qualifies (e.g. scores is empty)
    """
    # Sort a copy so the caller's sequence is not reordered.
    ordered = sorted(scores)
    for score in ordered:
        if PercentileRank(ordered, score) >= percentile_rank:
            return score
    return None
43
44
def Percentile2(scores, percentile_rank):
    """Computes the value that corresponds to a given percentile rank.

    Slightly more efficient: indexes directly into the sorted scores
    instead of computing a percentile rank for each one.

    scores: non-empty sequence of comparable values
    percentile_rank: 0-100 (int or float)

    returns: the score at the computed index of the sorted scores
    """
    # Sort a copy so the caller's sequence is not reordered.
    ordered = sorted(scores)
    # Floor division of a float yields a float; cast so a float
    # percentile_rank (e.g. 50.0) still produces a valid list index.
    index = int(percentile_rank * (len(ordered) - 1) // 100)
    return ordered[index]
52
53
# Compare the two percentile implementations over a range of ranks.
print('prank, score, score')
for percentile_rank in [0, 20, 25, 40, 50, 60, 75, 80, 100]:
    print(percentile_rank,
          Percentile(scores, percentile_rank),
          Percentile2(scores, percentile_rank))
58
59
60
def EvalCdf(sample, x):
    """Computes CDF(x) in a sample.

    sample: sequence
    x: value

    returns: cumulative probability, i.e. the fraction of values in
             sample that are less than or equal to x

    Raises ZeroDivisionError if sample is empty.
    """
    count = sum(1 for value in sample if value <= x)
    return float(count) / len(sample)
75
76
# Demo data: tabulate the empirical CDF of a small sample.
sample = [1, 2, 2, 3, 5]

print('x', 'CDF(x)')
for x in range(0, 7):
    print(x, EvalCdf(sample, x))
81
82
83
84
def PositionToPercentile(position, field_size):
    """Converts from position in the field to percentile.

    position: int (the arithmetic treats 1 as the best position)
    field_size: int

    returns: float percentile, 100.0 * (field_size - position + 1)
             / field_size
    """
    # Entrants at or behind this position, the entrant itself included.
    beat = field_size - position + 1
    return 100.0 * beat / field_size
93
94
95
def PercentileToPosition(percentile, field_size):
    """Converts from percentile to hypothetical position in the field.

    percentile: 0-100
    field_size: int

    returns: float position; it is hypothetical because the result is
             generally not a whole number
    """
    # Number of entrants that this percentile corresponds to beating
    # (self included), then count back from the end of the field.
    beat = percentile * field_size / 100.0
    return field_size - beat + 1
104
105
106
# my time 42:44
print('Percentile rank in field', PositionToPercentile(97, 1633))
print('Percentile rank in age group', PositionToPercentile(26, 256))

# Carry the age-group percentile over to fields of other sizes.
percentile = PositionToPercentile(26, 256)
print('Equivalent position in M50-59', PercentileToPosition(percentile, 171))
# 17th place = 46:05
print('Equivalent position in F20-29', PercentileToPosition(percentile, 448))
# 48:28
115
116
117
def MakeExample():
    """Makes a simple example CDF.

    Builds a Cdf from a five-element list, plots it on a cleared
    figure, and saves it under the root name 'cumulative_example_cdf'.
    """
    t = [2, 1, 3, 2, 5]
    cdf = thinkstats2.Cdf(t)
    thinkplot.Clf()
    thinkplot.Cdf(cdf)
    thinkplot.Save(root='cumulative_example_cdf',
                   xlabel='x',
                   ylabel='CDF',
                   axis=[0, 6, 0, 1],
                   legend=False)
128
129
130
def MakeFigures(live, firsts, others):
    """Creates several figures for the book.

    Saves a PMF comparison ('cumulative_birthwgt_pmf') and a CDF
    comparison ('cumulative_birthwgt_cdf') of totalwgt_lb for the two
    groups.

    live: DataFrame (not used by this function)
    firsts: DataFrame
    others: DataFrame
    """
    first_wgt = firsts.totalwgt_lb
    first_wgt_dropna = first_wgt.dropna()
    print('Firsts', len(first_wgt), len(first_wgt_dropna))
    #assert len(first_wgt_dropna) == 4381

    other_wgt = others.totalwgt_lb
    other_wgt_dropna = other_wgt.dropna()
    print('Others', len(other_wgt), len(other_wgt_dropna))
    #assert len(other_wgt_dropna) == 4706

    first_pmf = thinkstats2.Pmf(first_wgt_dropna, label='first')
    other_pmf = thinkstats2.Pmf(other_wgt_dropna, label='other')

    width = 0.4 / 16

    # plot PMFs of birth weights for first babies and others
    thinkplot.PrePlot(2)
    thinkplot.Hist(first_pmf, align='right', width=width)
    thinkplot.Hist(other_pmf, align='left', width=width)
    thinkplot.Save(root='cumulative_birthwgt_pmf',
                   title='Birth weight',
                   xlabel='weight (pounds)',
                   ylabel='PMF')

    # plot CDFs of birth weights for first babies and others
    # NOTE(review): the CDFs are built from the raw columns, not the
    # dropna() versions used for the PMFs -- confirm this is intended.
    first_cdf = thinkstats2.Cdf(firsts.totalwgt_lb, label='first')
    other_cdf = thinkstats2.Cdf(others.totalwgt_lb, label='other')

    thinkplot.PrePlot(2)
    thinkplot.Cdfs([first_cdf, other_cdf])
    thinkplot.Save(root='cumulative_birthwgt_cdf',
                   title='Birth weight',
                   xlabel='weight (pounds)',
                   ylabel='CDF',
                   axis=[0, 12.5, 0, 1]
                   )
174
175
176
def MakeCdf(live):
    """Plot the CDF of pregnancy lengths for live births.

    Saves the figure under the name 'cumulative_prglngth_cdf'.

    live: DataFrame for live births
    """
    cdf = thinkstats2.Cdf(live.prglngth, label='prglngth')
    thinkplot.Cdf(cdf)
    thinkplot.Save('cumulative_prglngth_cdf',
                   title='Pregnancy length',
                   xlabel='weeks',
                   ylabel='CDF')
187
188
189
def RandomFigure(live):
    """Plots the CDF of percentile ranks for a random sample of weights.

    Draws 100 values from live.totalwgt_lb with replacement, looks up
    each one's percentile rank in the CDF of all weights, and saves the
    CDF of those ranks under the root name 'cumulative_random'.

    live: DataFrame for live births
    """
    weights = live.totalwgt_lb
    cdf = thinkstats2.Cdf(weights, label='totalwgt_lb')

    sample = np.random.choice(weights, 100, replace=True)
    ranks = [cdf.PercentileRank(x) for x in sample]

    rank_cdf = thinkstats2.Cdf(ranks, label='percentile ranks')
    thinkplot.Cdf(rank_cdf)
    thinkplot.Save(root='cumulative_random',
                   xlabel='percentile rank',
                   ylabel='CDF')
201
202
203
def TestSample(live):
    """Plots the distribution of weights against a random sample.

    Draws 1000 values from the CDF of live.totalwgt_lb, plots the CDF
    of that sample alongside the original CDF, and saves the figure
    under the root name 'cumulative_sample'.

    live: DataFrame for live births
    """
    weights = live.totalwgt_lb
    cdf = thinkstats2.Cdf(weights, label='totalwgt_lb')

    sample = cdf.Sample(1000)
    sample_cdf = thinkstats2.Cdf(sample, label='sample')

    thinkplot.PrePlot(2)
    thinkplot.Cdfs([cdf, sample_cdf])
    thinkplot.Save(root='cumulative_sample',
                   xlabel='weight (pounds)',
                   ylabel='CDF')
219
220
221
def main(name, data_dir=''):
    """Generates all of the figures produced by this module.

    name: script name (first element of sys.argv); not used
    data_dir: optional second command-line argument; not used
    """
    # Fixed seed so the randomly sampled figures are reproducible.
    thinkstats2.RandomSeed(17)

    MakeExample()
    live, firsts, others = first.MakeFrames()
    RandomFigure(live)
    TestSample(live)
    MakeCdf(live)
    MakeFigures(live, firsts, others)
230
231
232
if __name__ == '__main__':
    import sys
    # Pass the command line through: argv[0] becomes name, an optional
    # argv[1] becomes data_dir.
    main(*sys.argv)
235
236