"""This file contains code used in "Think Bayes",
by Allen B. Downey, available from greenteapress.com

Copyright 2012 Allen B. Downey
License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html
"""

import math
import numpy
import cPickle
import random
import scipy
import scipy.stats

import brfss

import thinkplot
import thinkbayes
import thinkstats

import matplotlib.pyplot as pyplot


NUM_SIGMAS = 1

class Height(thinkbayes.Suite, thinkbayes.Joint):
    """Hypotheses about parameters of the distribution of height."""

    def __init__(self, mus, sigmas, name=''):
        """Makes a prior distribution for mu and sigma based on a sample.

        mus: sequence of possible mus
        sigmas: sequence of possible sigmas
        name: string name for the Suite
        """
        pairs = [(mu, sigma)
                 for mu in mus
                 for sigma in sigmas]

        thinkbayes.Suite.__init__(self, pairs, name=name)

    def Likelihood(self, data, hypo):
        """Computes the likelihood of the data under the hypothesis.

        Args:
            hypo: tuple of hypothetical mu and sigma
            data: float sample

        Returns:
            likelihood of the sample given mu and sigma
        """
        x = data
        mu, sigma = hypo
        like = scipy.stats.norm.pdf(x, mu, sigma)
        return like

    def LogLikelihood(self, data, hypo):
        """Computes the log likelihood of the data under the hypothesis.

        Args:
            data: a list of values
            hypo: tuple of hypothetical mu and sigma

        Returns:
            log likelihood of the sample given mu and sigma (unnormalized)
        """
        x = data
        mu, sigma = hypo
        loglike = EvalGaussianLogPdf(x, mu, sigma)
        return loglike

    def LogUpdateSetFast(self, data):
        """Updates the suite using a faster implementation.

        Computes the sum of the log likelihoods directly.

        Args:
            data: sequence of values
        """
        xs = tuple(data)
        n = len(xs)

        for hypo in self.Values():
            mu, sigma = hypo
            total = Summation(xs, mu)
            loglike = -n * math.log(sigma) - total / 2 / sigma**2
            self.Incr(hypo, loglike)

    def LogUpdateSetMeanVar(self, data):
        """Updates the suite using ABC and mean/var.

        Args:
            data: sequence of values
        """
        xs = data
        n = len(xs)

        m = numpy.mean(xs)
        s = numpy.std(xs)

        self.LogUpdateSetABC(n, m, s)

    def LogUpdateSetMedianIPR(self, data):
        """Updates the suite using ABC and median/IPR.

        Args:
            data: sequence of values
        """
        xs = data
        n = len(xs)

        # compute summary stats
        median, s = MedianS(xs, num_sigmas=NUM_SIGMAS)
        print 'median, s', median, s

        self.LogUpdateSetABC(n, median, s)

    def LogUpdateSetABC(self, n, m, s):
        """Updates the suite using ABC.

        n: sample size
        m: estimated central tendency
        s: estimated spread
        """
        for hypo in sorted(self.Values()):
            mu, sigma = hypo

            # compute log likelihood of m, given hypo
            stderr_m = sigma / math.sqrt(n)
            loglike = EvalGaussianLogPdf(m, mu, stderr_m)

            # compute log likelihood of s, given hypo
            stderr_s = sigma / math.sqrt(2 * (n-1))
            loglike += EvalGaussianLogPdf(s, sigma, stderr_s)

            self.Incr(hypo, loglike)

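# Illustrative sketch: LogUpdateSetFast relies on the identity
#
#     sum_i log N(x_i | mu, sigma)
#         = -n*log(sigma) - sum_i (x_i - mu)**2 / (2*sigma**2)
#           - (n/2)*log(2*pi)
#
# so dropping the constant term (n/2)*log(2*pi) does not change which
# hypothesis is most likely, and the constant cancels when the suite is
# exponentiated and renormalized.  The hypothetical helper below compares
# the fast expression with the full sum of per-point log densities; their
# difference should be (n/2)*log(2*pi).  (Summation and EvalGaussianLogPdf
# are defined below; Python resolves them at call time.)

def _check_fast_loglike(xs, mu, sigma):
    """Returns (fast, full) log likelihoods; fast - full == n/2 * log(2*pi)."""
    n = len(xs)
    fast = -n * math.log(sigma) - Summation(tuple(xs), mu) / 2 / sigma**2
    full = sum(EvalGaussianLogPdf(x, mu, sigma) for x in xs)
    return fast, full

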
def EvalGaussianLogPdf(x, mu, sigma):
    """Computes the log PDF of x given mu and sigma.

    x: float values
    mu, sigma: parameters of Gaussian

    returns: float log-likelihood
    """
    return scipy.stats.norm.logpdf(x, mu, sigma)


def FindPriorRanges(xs, num_points, num_stderrs=3.0, median_flag=False):
    """Find ranges for mu and sigma with non-negligible likelihood.

    xs: sample
    num_points: number of values in each dimension
    num_stderrs: number of standard errors to include on either side
    median_flag: boolean, whether to use median-based estimators

    Returns: sequence of mus, sequence of sigmas
    """
    def MakeRange(estimate, stderr):
        """Makes a linear range around the estimate.

        estimate: central value
        stderr: standard error of the estimate

        returns: numpy array of float
        """
        spread = stderr * num_stderrs
        array = numpy.linspace(estimate-spread, estimate+spread, num_points)
        return array

    # estimate mean and stddev of xs
    n = len(xs)
    if median_flag:
        m, s = MedianS(xs, num_sigmas=NUM_SIGMAS)
    else:
        m = numpy.mean(xs)
        s = numpy.std(xs)

    print 'classical estimators', m, s

    # compute ranges for m and s
    stderr_m = s / math.sqrt(n)
    mus = MakeRange(m, stderr_m)

    stderr_s = s / math.sqrt(2 * (n-1))
    sigmas = MakeRange(s, stderr_s)

    return mus, sigmas

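# Illustrative sketch: the standard-error approximations used above and in
# LogUpdateSetABC,
#
#     stderr(m) ~= sigma / sqrt(n)
#     stderr(s) ~= sigma / sqrt(2*(n-1))
#
# can be checked by simulation.  The helper below is hypothetical; it draws
# repeated Gaussian samples and compares the observed spread of the sample
# mean and sample standard deviation with the formulas.

def _check_stderr_formulas(n=1000, mu=163.0, sigma=7.0, iters=200):
    """Returns (simulated, analytic) stderrs for the mean and the stddev."""
    means = []
    stds = []
    for _ in xrange(iters):
        sample = numpy.random.normal(mu, sigma, n)
        means.append(numpy.mean(sample))
        stds.append(numpy.std(sample))
    return ((numpy.std(means), sigma / math.sqrt(n)),
            (numpy.std(stds), sigma / math.sqrt(2 * (n - 1))))

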
def Summation(xs, mu, cache={}):
    """Computes the sum of (x-mu)**2 for x in xs.

    Caches previous results.

    xs: tuple of values
    mu: hypothetical mean
    cache: cache of previous results
    """
    try:
        return cache[xs, mu]
    except KeyError:
        ds = [(x-mu)**2 for x in xs]
        total = sum(ds)
        cache[xs, mu] = total
        return total


def CoefVariation(suite):
    """Computes the distribution of CV.

    suite: Pmf that maps (mu, sigma) to probability

    Returns: Pmf object for CV.
    """
    pmf = thinkbayes.Pmf()
    for (m, s), p in suite.Items():
        pmf.Incr(s/m, p)
    return pmf

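# Illustrative sketch: the coefficient of variation is sigma/mu, so a
# hypothesis (mu=163, sigma=7) contributes mass at 7/163 ~= 0.043.  The toy
# Pmf below is hypothetical and only shows how CoefVariation reshapes a joint
# distribution over (mu, sigma) into a distribution over CV, assuming the
# Set/Items interface of thinkbayes.Pmf from the book's library.

def _demo_coef_variation():
    """Builds a tiny joint Pmf over (mu, sigma) and returns the Pmf of CV."""
    suite = thinkbayes.Pmf()
    suite.Set((163.0, 7.0), 0.6)
    suite.Set((178.0, 8.0), 0.4)
    return CoefVariation(suite)    # mass 0.6 at ~0.043, 0.4 at ~0.045

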
def PlotCdfs(d, labels):
    """Plot CDFs for each sequence in a dictionary.

    Jitters the data and subtracts away the mean.

    d: map from key to sequence of values
    labels: map from key to string label
    """
    thinkplot.Clf()
    for key, xs in d.iteritems():
        mu = thinkstats.Mean(xs)
        xs = thinkstats.Jitter(xs, 1.3)
        xs = [x-mu for x in xs]
        cdf = thinkbayes.MakeCdfFromList(xs)
        thinkplot.Cdf(cdf, label=labels[key])
    thinkplot.Show()


def PlotPosterior(suite, pcolor=False, contour=True):
    """Makes a contour plot.

    suite: Suite that maps (mu, sigma) to probability
    pcolor: boolean, whether to make a pseudocolor plot
    contour: boolean, whether to make a contour plot
    """
    thinkplot.Clf()
    thinkplot.Contour(suite.GetDict(), pcolor=pcolor, contour=contour)

    thinkplot.Save(root='variability_posterior_%s' % suite.name,
                   title='Posterior joint distribution',
                   xlabel='Mean height (cm)',
                   ylabel='Stddev (cm)')


def PlotCoefVariation(suites):
    """Plot the posterior distributions for CV.

    suites: map from label to Pmf of CVs.
    """
    thinkplot.Clf()
    thinkplot.PrePlot(num=2)

    pmfs = {}
    for label, suite in suites.iteritems():
        pmf = CoefVariation(suite)
        print 'CV posterior mean', pmf.Mean()
        cdf = thinkbayes.MakeCdfFromPmf(pmf, label)
        thinkplot.Cdf(cdf)

        pmfs[label] = pmf

    thinkplot.Save(root='variability_cv',
                   xlabel='Coefficient of variation',
                   ylabel='Probability')

    print 'female bigger', thinkbayes.PmfProbGreater(pmfs['female'],
                                                     pmfs['male'])
    print 'male bigger', thinkbayes.PmfProbGreater(pmfs['male'],
                                                   pmfs['female'])


def PlotOutliers(samples):
    """Make CDFs showing the distribution of outliers."""
    cdfs = []
    for label, sample in samples.iteritems():
        outliers = [x for x in sample if x < 150]

        cdf = thinkbayes.MakeCdfFromList(outliers, label)
        cdfs.append(cdf)

    thinkplot.Clf()
    thinkplot.Cdfs(cdfs)
    thinkplot.Save(root='variability_cdfs',
                   title='CDF of height',
                   xlabel='Reported height (cm)',
                   ylabel='CDF')


def PlotMarginals(suite):
    """Plots marginal distributions from a joint distribution.

    suite: joint distribution of mu and sigma.
    """
    thinkplot.Clf()

    pyplot.subplot(1, 2, 1)
    pmf_m = suite.Marginal(0)
    cdf_m = thinkbayes.MakeCdfFromPmf(pmf_m)
    thinkplot.Cdf(cdf_m)

    pyplot.subplot(1, 2, 2)
    pmf_s = suite.Marginal(1)
    cdf_s = thinkbayes.MakeCdfFromPmf(pmf_s)
    thinkplot.Cdf(cdf_s)

    thinkplot.Show()


def DumpHeights(data_dir='.', n=10000):
    """Read the BRFSS dataset, extract the heights and pickle them."""
    resp = brfss.Respondents()
    resp.ReadRecords(data_dir, n)

    d = {1:[], 2:[]}
    for r in resp.records:
        if r.htm3 != 'NA':
            d[r.sex].append(r.htm3)

    fp = open('variability_data.pkl', 'wb')
    cPickle.dump(d, fp)
    fp.close()


def LoadHeights():
    """Read the pickled height data.

    returns: map from sex code to list of heights.
    """
    fp = open('variability_data.pkl', 'rb')
    d = cPickle.load(fp)
    fp.close()
    return d


def UpdateSuite1(suite, xs):
    """Computes the posterior distribution of mu and sigma.

    Computes untransformed likelihoods.

    suite: Suite that maps from (mu, sigma) to prob
    xs: sequence
    """
    suite.UpdateSet(xs)


def UpdateSuite2(suite, xs):
    """Computes the posterior distribution of mu and sigma.

    Computes log likelihoods.

    suite: Suite that maps from (mu, sigma) to prob
    xs: sequence
    """
    suite.Log()
    suite.LogUpdateSet(xs)
    suite.Exp()
    suite.Normalize()


def UpdateSuite3(suite, xs):
    """Computes the posterior distribution of mu and sigma.

    Computes log likelihoods efficiently, summing the log densities directly.

    suite: Suite that maps from (mu, sigma) to prob
    xs: sequence
    """
    suite.Log()
    suite.LogUpdateSetFast(xs)
    suite.Exp()
    suite.Normalize()


def UpdateSuite4(suite, xs):
    """Computes the posterior distribution of mu and sigma.

    Uses ABC with the sample mean and standard deviation.

    suite: Suite that maps from (mu, sigma) to prob
    xs: sequence
    """
    suite.Log()
    suite.LogUpdateSetMeanVar(xs)
    suite.Exp()
    suite.Normalize()


def UpdateSuite5(suite, xs):
    """Computes the posterior distribution of mu and sigma.

    Uses ABC with the sample median and interpercentile range.

    suite: Suite that maps from (mu, sigma) to prob
    xs: sequence
    """
    suite.Log()
    suite.LogUpdateSetMedianIPR(xs)
    suite.Exp()
    suite.Normalize()

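# Illustrative usage sketch: the update functions above can be exercised on
# synthetic data instead of the BRFSS heights.  The helper below is
# hypothetical; it mirrors the steps in RunEstimate on a single simulated
# sample and uses only functions defined in this file.

def _demo_synthetic(n=1000, mu=163.0, sigma=7.0, num_points=31):
    """Builds a prior grid, updates it on simulated heights, returns the MLE."""
    xs = list(numpy.random.normal(mu, sigma, n))
    mus, sigmas = FindPriorRanges(xs, num_points)
    suite = Height(mus, sigmas, name='synthetic')
    UpdateSuite3(suite, xs)
    return suite.MaximumLikelihood()    # (mu, sigma) pair with highest posterior

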
def MedianIPR(xs, p):
    """Computes the median and interpercentile range.

    xs: sequence of values
    p: range (0-1), 0.5 yields the interquartile range

    returns: tuple of float (median, IPR)
    """
    cdf = thinkbayes.MakeCdfFromList(xs)
    median = cdf.Percentile(50)

    alpha = (1-p) / 2
    ipr = cdf.Value(1-alpha) - cdf.Value(alpha)
    return median, ipr


def MedianS(xs, num_sigmas):
    """Computes the median and an estimate of sigma.

    Based on an interpercentile range (IPR).

    xs: sequence of values
    num_sigmas: number of standard deviations on either side of the
        median spanned by the IPR

    returns: tuple of float (median, s)
    """
    half_p = thinkbayes.StandardGaussianCdf(num_sigmas) - 0.5
    median, ipr = MedianIPR(xs, half_p * 2)
    s = ipr / 2 / num_sigmas

    return median, s

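# Illustrative sketch: for a Gaussian, the central interval within num_sigmas
# standard deviations of the mean contains a fraction 2*(Phi(num_sigmas) - 0.5)
# of the distribution and spans 2*num_sigmas*sigma, which is why MedianS
# divides the IPR by 2*num_sigmas.  With num_sigmas=1 the central ~68.3% of a
# standard Gaussian runs from about -1 to +1, giving IPR ~= 2 and s ~= 1.
# The hypothetical check below recovers (mu, sigma) from simulated data.

def _check_median_s(num_sigmas=NUM_SIGMAS, mu=163.0, sigma=7.0, n=100000):
    """Returns (median, s) from MedianS; should be close to (mu, sigma)."""
    xs = list(numpy.random.normal(mu, sigma, n))
    return MedianS(xs, num_sigmas)

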
def Summarize(xs):
    """Prints summary statistics from a sequence of values.

    xs: sequence of values
    """
    # print smallest and largest
    xs.sort()
    print 'smallest', xs[:10]
    print 'largest', xs[-10:]

    # print median and interquartile range
    cdf = thinkbayes.MakeCdfFromList(xs)
    print cdf.Percentile(25), cdf.Percentile(50), cdf.Percentile(75)


def RunEstimate(update_func, num_points=31, median_flag=False):
    """Runs the whole analysis.

    update_func: which of the update functions to use
    num_points: number of points in the Suite (in each dimension)
    median_flag: boolean, whether to use median-based estimators
    """
    DumpHeights(n=10000000)
    d = LoadHeights()
    labels = {1:'male', 2:'female'}

    # PlotCdfs(d, labels)

    suites = {}
    for key, xs in d.iteritems():
        name = labels[key]
        print name, len(xs)
        Summarize(xs)

        xs = thinkstats.Jitter(xs, 1.3)

        mus, sigmas = FindPriorRanges(xs, num_points, median_flag=median_flag)
        suite = Height(mus, sigmas, name)
        suites[name] = suite
        update_func(suite, xs)
        print 'MLE', suite.MaximumLikelihood()

        PlotPosterior(suite)

        pmf_m = suite.Marginal(0)
        pmf_s = suite.Marginal(1)
        print 'marginal mu', pmf_m.Mean(), pmf_m.Var()
        print 'marginal sigma', pmf_s.Mean(), pmf_s.Var()

        # PlotMarginals(suite)

    PlotCoefVariation(suites)

def main():
    random.seed(17)

    func = UpdateSuite5
    median_flag = (func == UpdateSuite5)
    RunEstimate(func, median_flag=median_flag)


if __name__ == '__main__':
    main()


""" Results:

UpdateSuite1 (100):
marginal mu 162.816901408 0.55779791443
marginal sigma 6.36966103214 0.277026082819

UpdateSuite2 (100):
marginal mu 162.816901408 0.55779791443
marginal sigma 6.36966103214 0.277026082819

UpdateSuite3 (100):
marginal mu 162.816901408 0.55779791443
marginal sigma 6.36966103214 0.277026082819

UpdateSuite4 (100):
marginal mu 162.816901408 0.547456009605
marginal sigma 6.30305516111 0.27544106054

UpdateSuite3 (1000):
marginal mu 163.722137405 0.0660294386397
marginal sigma 6.64453251495 0.0329935312671

UpdateSuite4 (1000):
marginal mu 163.722137405 0.0658920503302
marginal sigma 6.63692197049 0.0329689887609

UpdateSuite3 (all):
marginal mu 163.223475005 0.000203282582659
marginal sigma 7.26918836916 0.000101641131229

UpdateSuite4 (all):
marginal mu 163.223475004 0.000203281499857
marginal sigma 7.26916693422 0.000101640932082

UpdateSuite5 (all):
marginal mu 163.1805214 7.9399898468e-07
marginal sigma 7.29969524118 3.26257030869e-14

"""