Contact
CoCalc Logo Icon
StoreFeaturesDocsShareSupport News AboutSign UpSign In
| Download

Think Stats by Allen B. Downey Think Stats is an introduction to Probability and Statistics for Python programmers.

This is the accompanying code for this book.

Website: http://greenteapress.com/wp/think-stats-2e/

Views: 7120
License: GPL3
1
"""This file contains code for use with "Think Stats",
2
by Allen B. Downey, available from greenteapress.com
3
4
Copyright 2014 Allen B. Downey
5
License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html
6
"""
7
8
from __future__ import print_function, division
9
10
import math
11
import numpy as np
12
import random
13
import scipy.stats
14
15
import first
16
import hypothesis
17
import thinkstats2
18
import thinkplot
19
20
21
class Normal(object):
22
"""Represents a Normal distribution"""
23
24
def __init__(self, mu, sigma2, label=''):
25
"""Initializes.
26
27
mu: mean
28
sigma2: variance
29
"""
30
self.mu = mu
31
self.sigma2 = sigma2
32
self.label = label
33
34
def __repr__(self):
35
"""Returns a string representation."""
36
if self.label:
37
return 'Normal(%g, %g, %s)' % (self.mu, self.sigma2, self.label)
38
else:
39
return 'Normal(%g, %g)' % (self.mu, self.sigma2)
40
41
__str__ = __repr__
42
43
@property
44
def sigma(self):
45
"""Returns the standard deviation."""
46
return math.sqrt(self.sigma2)
47
48
def __add__(self, other):
49
"""Adds a number or other Normal.
50
51
other: number or Normal
52
53
returns: new Normal
54
"""
55
if isinstance(other, Normal):
56
return Normal(self.mu + other.mu, self.sigma2 + other.sigma2)
57
else:
58
return Normal(self.mu + other, self.sigma2)
59
60
__radd__ = __add__
61
62
def __sub__(self, other):
63
"""Subtracts a number or other Normal.
64
65
other: number or Normal
66
67
returns: new Normal
68
"""
69
if isinstance(other, Normal):
70
return Normal(self.mu - other.mu, self.sigma2 + other.sigma2)
71
else:
72
return Normal(self.mu - other, self.sigma2)
73
74
__rsub__ = __sub__
75
76
def __mul__(self, factor):
77
"""Multiplies by a scalar.
78
79
factor: number
80
81
returns: new Normal
82
"""
83
return Normal(factor * self.mu, factor**2 * self.sigma2)
84
85
__rmul__ = __mul__
86
87
def __div__(self, divisor):
88
"""Divides by a scalar.
89
90
divisor: number
91
92
returns: new Normal
93
"""
94
return 1.0 / divisor * self
95
96
__truediv__ = __div__
97
98
def Sum(self, n):
99
"""Returns the distribution of the sum of n values.
100
101
n: int
102
103
returns: new Normal
104
"""
105
return Normal(n * self.mu, n * self.sigma2)
106
107
def Render(self):
108
"""Returns pair of xs, ys suitable for plotting.
109
"""
110
mean, std = self.mu, self.sigma
111
low, high = mean - 3 * std, mean + 3 * std
112
xs, ys = thinkstats2.RenderNormalCdf(mean, std, low, high)
113
return xs, ys
114
115
def Prob(self, x):
116
"""Cumulative probability of x.
117
118
x: numeric
119
"""
120
return thinkstats2.EvalNormalCdf(x, self.mu, self.sigma)
121
122
def Percentile(self, p):
123
"""Inverse CDF of p.
124
125
p: percentile rank 0-100
126
"""
127
return thinkstats2.EvalNormalCdfInverse(p/100, self.mu, self.sigma)
128
129
130
def NormalPlotSamples(samples, plot=1, ylabel=''):
131
"""Makes normal probability plots for samples.
132
133
samples: list of samples
134
label: string
135
"""
136
for n, sample in samples:
137
thinkplot.SubPlot(plot)
138
thinkstats2.NormalProbabilityPlot(sample)
139
140
thinkplot.Config(title='n=%d' % n,
141
legend=False,
142
xticks=[],
143
yticks=[],
144
ylabel=ylabel)
145
plot += 1
146
147
148
def MakeExpoSamples(beta=2.0, iters=1000):
149
"""Generates samples from an exponential distribution.
150
151
beta: parameter
152
iters: number of samples to generate for each size
153
154
returns: list of samples
155
"""
156
samples = []
157
for n in [1, 10, 100]:
158
sample = [np.sum(np.random.exponential(beta, n))
159
for _ in range(iters)]
160
samples.append((n, sample))
161
return samples
162
163
164
def MakeLognormalSamples(mu=1.0, sigma=1.0, iters=1000):
165
"""Generates samples from a lognormal distribution.
166
167
mu: parmeter
168
sigma: parameter
169
iters: number of samples to generate for each size
170
171
returns: list of samples
172
"""
173
samples = []
174
for n in [1, 10, 100]:
175
sample = [np.sum(np.random.lognormal(mu, sigma, n))
176
for _ in range(iters)]
177
samples.append((n, sample))
178
return samples
179
180
181
def MakeParetoSamples(alpha=1.0, iters=1000):
182
"""Generates samples from a Pareto distribution.
183
184
alpha: parameter
185
iters: number of samples to generate for each size
186
187
returns: list of samples
188
"""
189
samples = []
190
191
for n in [1, 10, 100]:
192
sample = [np.sum(np.random.pareto(alpha, n))
193
for _ in range(iters)]
194
samples.append((n, sample))
195
return samples
196
197
198
def GenerateCorrelated(rho, n):
199
"""Generates a sequence of correlated values from a standard normal dist.
200
201
rho: coefficient of correlation
202
n: length of sequence
203
204
returns: iterator
205
"""
206
x = random.gauss(0, 1)
207
yield x
208
209
sigma = math.sqrt(1 - rho**2)
210
for _ in range(n-1):
211
x = random.gauss(x * rho, sigma)
212
yield x
213
214
215
def GenerateExpoCorrelated(rho, n):
216
"""Generates a sequence of correlated values from an exponential dist.
217
218
rho: coefficient of correlation
219
n: length of sequence
220
221
returns: NumPy array
222
"""
223
normal = list(GenerateCorrelated(rho, n))
224
uniform = scipy.stats.norm.cdf(normal)
225
expo = scipy.stats.expon.ppf(uniform)
226
return expo
227
228
229
def MakeCorrelatedSamples(rho=0.9, iters=1000):
230
"""Generates samples from a correlated exponential distribution.
231
232
rho: correlation
233
iters: number of samples to generate for each size
234
235
returns: list of samples
236
"""
237
samples = []
238
for n in [1, 10, 100]:
239
sample = [np.sum(GenerateExpoCorrelated(rho, n))
240
for _ in range(iters)]
241
samples.append((n, sample))
242
return samples
243
244
245
def SamplingDistMean(data, n):
246
"""Computes the sampling distribution of the mean.
247
248
data: sequence of values representing the population
249
n: sample size
250
251
returns: Normal object
252
"""
253
mean, var = data.mean(), data.var()
254
dist = Normal(mean, var)
255
return dist.Sum(n) / n
256
257
258
def PlotPregLengths(live, firsts, others):
259
"""Plots sampling distribution of difference in means.
260
261
live, firsts, others: DataFrames
262
"""
263
print('prglngth example')
264
delta = firsts.prglngth.mean() - others.prglngth.mean()
265
print(delta)
266
267
dist1 = SamplingDistMean(live.prglngth, len(firsts))
268
dist2 = SamplingDistMean(live.prglngth, len(others))
269
dist = dist1 - dist2
270
print('null hypothesis', dist)
271
print(dist.Prob(-delta), 1 - dist.Prob(delta))
272
273
thinkplot.Plot(dist, label='null hypothesis')
274
thinkplot.Save(root='normal3',
275
xlabel='difference in means (weeks)',
276
ylabel='CDF')
277
278
279
class CorrelationPermute(hypothesis.CorrelationPermute):
280
"""Tests correlations by permutation."""
281
282
def TestStatistic(self, data):
283
"""Computes the test statistic.
284
285
data: tuple of xs and ys
286
"""
287
xs, ys = data
288
return np.corrcoef(xs, ys)[0][1]
289
290
291
def ResampleCorrelations(live):
292
"""Tests the correlation between birth weight and mother's age.
293
294
live: DataFrame for live births
295
296
returns: sample size, observed correlation, CDF of resampled correlations
297
"""
298
live2 = live.dropna(subset=['agepreg', 'totalwgt_lb'])
299
data = live2.agepreg.values, live2.totalwgt_lb.values
300
ht = CorrelationPermute(data)
301
p_value = ht.PValue()
302
return len(live2), ht.actual, ht.test_cdf
303
304
305
def StudentCdf(n):
306
"""Computes the CDF correlations from uncorrelated variables.
307
308
n: sample size
309
310
returns: Cdf
311
"""
312
ts = np.linspace(-3, 3, 101)
313
ps = scipy.stats.t.cdf(ts, df=n-2)
314
rs = ts / np.sqrt(n - 2 + ts**2)
315
return thinkstats2.Cdf(rs, ps)
316
317
318
def TestCorrelation(live):
319
"""Tests correlation analytically.
320
321
live: DataFrame for live births
322
323
"""
324
n, r, cdf = ResampleCorrelations(live)
325
326
model = StudentCdf(n)
327
thinkplot.Plot(model.xs, model.ps, color='gray',
328
alpha=0.3, label='Student t')
329
thinkplot.Cdf(cdf, label='sample')
330
331
thinkplot.Save(root='normal4',
332
xlabel='correlation',
333
ylabel='CDF')
334
335
t = r * math.sqrt((n-2) / (1-r**2))
336
p_value = 1 - scipy.stats.t.cdf(t, df=n-2)
337
print(r, p_value)
338
339
340
def ChiSquaredCdf(n):
341
"""Discrete approximation of the chi-squared CDF with df=n-1.
342
343
n: sample size
344
345
returns: Cdf
346
"""
347
xs = np.linspace(0, 25, 101)
348
ps = scipy.stats.chi2.cdf(xs, df=n-1)
349
return thinkstats2.Cdf(xs, ps)
350
351
352
def TestChiSquared():
353
"""Performs a chi-squared test analytically.
354
"""
355
data = [8, 9, 19, 5, 8, 11]
356
dt = hypothesis.DiceChiTest(data)
357
p_value = dt.PValue(iters=1000)
358
n, chi2, cdf = len(data), dt.actual, dt.test_cdf
359
360
model = ChiSquaredCdf(n)
361
thinkplot.Plot(model.xs, model.ps, color='gray',
362
alpha=0.3, label='chi squared')
363
thinkplot.Cdf(cdf, label='sample')
364
365
thinkplot.Save(root='normal5',
366
xlabel='chi-squared statistic',
367
ylabel='CDF',
368
loc='lower right')
369
370
# compute the p-value
371
p_value = 1 - scipy.stats.chi2.cdf(chi2, df=n-1)
372
print(chi2, p_value)
373
374
375
def MakeCltPlots():
376
"""Makes plot showing distributions of sums converging to normal.
377
"""
378
thinkplot.PrePlot(num=3, rows=2, cols=3)
379
samples = MakeExpoSamples()
380
NormalPlotSamples(samples, plot=1, ylabel='sum of expo values')
381
382
thinkplot.PrePlot(num=3)
383
samples = MakeLognormalSamples()
384
NormalPlotSamples(samples, plot=4, ylabel='sum of lognormal values')
385
thinkplot.Save(root='normal1', legend=False)
386
387
388
thinkplot.PrePlot(num=3, rows=2, cols=3)
389
samples = MakeParetoSamples()
390
NormalPlotSamples(samples, plot=1, ylabel='sum of Pareto values')
391
392
thinkplot.PrePlot(num=3)
393
samples = MakeCorrelatedSamples()
394
NormalPlotSamples(samples, plot=4, ylabel='sum of correlated expo values')
395
thinkplot.Save(root='normal2', legend=False)
396
397
398
def main():
399
thinkstats2.RandomSeed(17)
400
401
MakeCltPlots()
402
403
print('Gorilla example')
404
dist = Normal(90, 7.5**2)
405
print(dist)
406
dist_xbar = dist.Sum(9) / 9
407
print(dist_xbar.sigma)
408
print(dist_xbar.Percentile(5), dist_xbar.Percentile(95))
409
410
live, firsts, others = first.MakeFrames()
411
TestCorrelation(live)
412
PlotPregLengths(live, firsts, others)
413
414
TestChiSquared()
415
416
417
if __name__ == '__main__':
418
main()
419
420
421