Contact
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutSign UpSign In
| Download

📚 The CoCalc Library - books, templates and other resources

Views: 96161
License: OTHER
1
"""This file contains code used in "Think Bayes",
2
by Allen B. Downey, available from greenteapress.com
3
4
Copyright 2012 Allen B. Downey
5
License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html
6
"""
7
8
import csv
9
import math
10
import numpy
11
import sys
12
13
import matplotlib
14
import matplotlib.pyplot as pyplot
15
16
import thinkbayes
17
import thinkplot
18
19
20
def ReadScale(filename='sat_scale.csv', col=2):
21
"""Reads a CSV file of SAT scales (maps from raw score to standard score).
22
23
Args:
24
filename: string filename
25
col: which column to start with (0=Reading, 2=Math, 4=Writing)
26
27
Returns: thinkbayes.Interpolator object
28
"""
29
def ParseRange(s):
30
"""Parse a range of values in the form 123-456
31
32
s: string
33
"""
34
t = [int(x) for x in s.split('-')]
35
return 1.0 * sum(t) / len(t)
36
37
fp = open(filename)
38
reader = csv.reader(fp)
39
raws = []
40
scores = []
41
42
for t in reader:
43
try:
44
raw = int(t[col])
45
raws.append(raw)
46
score = ParseRange(t[col+1])
47
scores.append(score)
48
except ValueError:
49
pass
50
51
raws.sort()
52
scores.sort()
53
return thinkbayes.Interpolator(raws, scores)
54
55
56
def ReadRanks(filename='sat_ranks.csv'):
57
"""Reads a CSV file of SAT scores.
58
59
Args:
60
filename: string filename
61
62
Returns:
63
list of (score, freq) pairs
64
"""
65
fp = open(filename)
66
reader = csv.reader(fp)
67
res = []
68
69
for t in reader:
70
try:
71
score = int(t[0])
72
freq = int(t[1])
73
res.append((score, freq))
74
except ValueError:
75
pass
76
77
return res
78
79
80
def DivideValues(pmf, denom):
81
"""Divides the values in a Pmf by denom.
82
83
Returns a new Pmf.
84
"""
85
new = thinkbayes.Pmf()
86
denom = float(denom)
87
for val, prob in pmf.Items():
88
x = val / denom
89
new.Set(x, prob)
90
return new
91
92
93
class Exam(object):
94
"""Encapsulates information about an exam.
95
96
Contains the distribution of scaled scores and an
97
Interpolator that maps between scaled and raw scores.
98
"""
99
def __init__(self):
100
self.scale = ReadScale()
101
102
scores = ReadRanks()
103
score_pmf = thinkbayes.MakePmfFromDict(dict(scores))
104
105
self.raw = self.ReverseScale(score_pmf)
106
self.max_score = max(self.raw.Values())
107
self.prior = DivideValues(self.raw, denom=self.max_score)
108
109
center = -0.05
110
width = 1.8
111
self.difficulties = MakeDifficulties(center, width, self.max_score)
112
113
def CompareScores(self, a_score, b_score, constructor):
114
"""Computes posteriors for two test scores and the likelihood ratio.
115
116
a_score, b_score: scales SAT scores
117
constructor: function that instantiates an Sat or Sat2 object
118
"""
119
a_sat = constructor(self, a_score)
120
b_sat = constructor(self, b_score)
121
122
a_sat.PlotPosteriors(b_sat)
123
124
if constructor is Sat:
125
PlotJointDist(a_sat, b_sat)
126
127
top = TopLevel('AB')
128
top.Update((a_sat, b_sat))
129
top.Print()
130
131
ratio = top.Prob('A') / top.Prob('B')
132
133
print 'Likelihood ratio', ratio
134
135
posterior = ratio / (ratio + 1)
136
print 'Posterior', posterior
137
138
if constructor is Sat2:
139
ComparePosteriorPredictive(a_sat, b_sat)
140
141
def MakeRawScoreDist(self, efficacies):
142
"""Makes the distribution of raw scores for given difficulty.
143
144
efficacies: Pmf of efficacy
145
"""
146
pmfs = thinkbayes.Pmf()
147
for efficacy, prob in efficacies.Items():
148
scores = self.PmfCorrect(efficacy)
149
pmfs.Set(scores, prob)
150
151
mix = thinkbayes.MakeMixture(pmfs)
152
return mix
153
154
def CalibrateDifficulty(self):
155
"""Make a plot showing the model distribution of raw scores."""
156
thinkplot.Clf()
157
thinkplot.PrePlot(num=2)
158
159
cdf = thinkbayes.MakeCdfFromPmf(self.raw, name='data')
160
thinkplot.Cdf(cdf)
161
162
efficacies = thinkbayes.MakeGaussianPmf(0, 1.5, 3)
163
pmf = self.MakeRawScoreDist(efficacies)
164
cdf = thinkbayes.MakeCdfFromPmf(pmf, name='model')
165
thinkplot.Cdf(cdf)
166
167
thinkplot.Save(root='sat_calibrate',
168
xlabel='raw score',
169
ylabel='CDF',
170
formats=['pdf', 'eps'])
171
172
def PmfCorrect(self, efficacy):
173
"""Returns the PMF of number of correct responses.
174
175
efficacy: float
176
"""
177
pmf = PmfCorrect(efficacy, self.difficulties)
178
return pmf
179
180
def Lookup(self, raw):
181
"""Looks up a raw score and returns a scaled score."""
182
return self.scale.Lookup(raw)
183
184
def Reverse(self, score):
185
"""Looks up a scaled score and returns a raw score.
186
187
Since we ignore the penalty, negative scores round up to zero.
188
"""
189
raw = self.scale.Reverse(score)
190
return raw if raw > 0 else 0
191
192
def ReverseScale(self, pmf):
193
"""Applies the reverse scale to the values of a PMF.
194
195
Args:
196
pmf: Pmf object
197
scale: Interpolator object
198
199
Returns:
200
new Pmf
201
"""
202
new = thinkbayes.Pmf()
203
for val, prob in pmf.Items():
204
raw = self.Reverse(val)
205
new.Incr(raw, prob)
206
return new
207
208
209
class Sat(thinkbayes.Suite):
210
"""Represents the distribution of p_correct for a test-taker."""
211
212
def __init__(self, exam, score):
213
self.exam = exam
214
self.score = score
215
216
# start with the prior distribution
217
thinkbayes.Suite.__init__(self, exam.prior)
218
219
# update based on an exam score
220
self.Update(score)
221
222
def Likelihood(self, data, hypo):
223
"""Computes the likelihood of a test score, given efficacy."""
224
p_correct = hypo
225
score = data
226
227
k = self.exam.Reverse(score)
228
n = self.exam.max_score
229
like = thinkbayes.EvalBinomialPmf(k, n, p_correct)
230
return like
231
232
def PlotPosteriors(self, other):
233
"""Plots posterior distributions of efficacy.
234
235
self, other: Sat objects.
236
"""
237
thinkplot.Clf()
238
thinkplot.PrePlot(num=2)
239
240
cdf1 = thinkbayes.MakeCdfFromPmf(self, 'posterior %d' % self.score)
241
cdf2 = thinkbayes.MakeCdfFromPmf(other, 'posterior %d' % other.score)
242
243
thinkplot.Cdfs([cdf1, cdf2])
244
thinkplot.Save(xlabel='p_correct',
245
ylabel='CDF',
246
axis=[0.7, 1.0, 0.0, 1.0],
247
root='sat_posteriors_p_corr',
248
formats=['pdf', 'eps'])
249
250
251
class Sat2(thinkbayes.Suite):
252
"""Represents the distribution of efficacy for a test-taker."""
253
254
def __init__(self, exam, score):
255
self.exam = exam
256
self.score = score
257
258
# start with the Gaussian prior
259
efficacies = thinkbayes.MakeGaussianPmf(0, 1.5, 3)
260
thinkbayes.Suite.__init__(self, efficacies)
261
262
# update based on an exam score
263
self.Update(score)
264
265
def Likelihood(self, data, hypo):
266
"""Computes the likelihood of a test score, given efficacy."""
267
efficacy = hypo
268
score = data
269
raw = self.exam.Reverse(score)
270
271
pmf = self.exam.PmfCorrect(efficacy)
272
like = pmf.Prob(raw)
273
return like
274
275
def MakePredictiveDist(self):
276
"""Returns the distribution of raw scores expected on a re-test."""
277
raw_pmf = self.exam.MakeRawScoreDist(self)
278
return raw_pmf
279
280
def PlotPosteriors(self, other):
281
"""Plots posterior distributions of efficacy.
282
283
self, other: Sat objects.
284
"""
285
thinkplot.Clf()
286
thinkplot.PrePlot(num=2)
287
288
cdf1 = thinkbayes.MakeCdfFromPmf(self, 'posterior %d' % self.score)
289
cdf2 = thinkbayes.MakeCdfFromPmf(other, 'posterior %d' % other.score)
290
291
thinkplot.Cdfs([cdf1, cdf2])
292
thinkplot.Save(xlabel='efficacy',
293
ylabel='CDF',
294
axis=[0, 4.6, 0.0, 1.0],
295
root='sat_posteriors_eff',
296
formats=['pdf', 'eps'])
297
298
299
def PlotJointDist(pmf1, pmf2, thresh=0.8):
300
"""Plot the joint distribution of p_correct.
301
302
pmf1, pmf2: posterior distributions
303
thresh: lower bound of the range to be plotted
304
"""
305
def Clean(pmf):
306
"""Removes values below thresh."""
307
vals = [val for val in pmf.Values() if val < thresh]
308
[pmf.Remove(val) for val in vals]
309
310
Clean(pmf1)
311
Clean(pmf2)
312
pmf = thinkbayes.MakeJoint(pmf1, pmf2)
313
314
thinkplot.Figure(figsize=(6, 6))
315
thinkplot.Contour(pmf, contour=False, pcolor=True)
316
317
thinkplot.Plot([thresh, 1.0], [thresh, 1.0],
318
color='gray', alpha=0.2, linewidth=4)
319
320
thinkplot.Save(root='sat_joint',
321
xlabel='p_correct Alice',
322
ylabel='p_correct Bob',
323
axis=[thresh, 1.0, thresh, 1.0],
324
formats=['pdf', 'eps'])
325
326
327
def ComparePosteriorPredictive(a_sat, b_sat):
328
"""Compares the predictive distributions of raw scores.
329
330
a_sat: posterior distribution
331
b_sat:
332
"""
333
a_pred = a_sat.MakePredictiveDist()
334
b_pred = b_sat.MakePredictiveDist()
335
336
#thinkplot.Clf()
337
#thinkplot.Pmfs([a_pred, b_pred])
338
#thinkplot.Show()
339
340
a_like = thinkbayes.PmfProbGreater(a_pred, b_pred)
341
b_like = thinkbayes.PmfProbLess(a_pred, b_pred)
342
c_like = thinkbayes.PmfProbEqual(a_pred, b_pred)
343
344
print 'Posterior predictive'
345
print 'A', a_like
346
print 'B', b_like
347
print 'C', c_like
348
349
350
def PlotPriorDist(pmf):
351
"""Plot the prior distribution of p_correct.
352
353
pmf: prior
354
"""
355
thinkplot.Clf()
356
thinkplot.PrePlot(num=1)
357
358
cdf1 = thinkbayes.MakeCdfFromPmf(pmf, 'prior')
359
thinkplot.Cdf(cdf1)
360
thinkplot.Save(root='sat_prior',
361
xlabel='p_correct',
362
ylabel='CDF',
363
formats=['pdf', 'eps'])
364
365
366
class TopLevel(thinkbayes.Suite):
367
"""Evaluates the top-level hypotheses about Alice and Bob.
368
369
Uses the bottom-level posterior distribution about p_correct
370
(or efficacy).
371
"""
372
373
def Update(self, data):
374
a_sat, b_sat = data
375
376
a_like = thinkbayes.PmfProbGreater(a_sat, b_sat)
377
b_like = thinkbayes.PmfProbLess(a_sat, b_sat)
378
c_like = thinkbayes.PmfProbEqual(a_sat, b_sat)
379
380
a_like += c_like / 2
381
b_like += c_like / 2
382
383
self.Mult('A', a_like)
384
self.Mult('B', b_like)
385
386
self.Normalize()
387
388
389
def ProbCorrect(efficacy, difficulty, a=1):
390
"""Returns the probability that a person gets a question right.
391
392
efficacy: personal ability to answer questions
393
difficulty: how hard the question is
394
395
Returns: float prob
396
"""
397
return 1 / (1 + math.exp(-a * (efficacy - difficulty)))
398
399
400
def BinaryPmf(p):
401
"""Makes a Pmf with values 1 and 0.
402
403
p: probability given to 1
404
405
Returns: Pmf object
406
"""
407
pmf = thinkbayes.Pmf()
408
pmf.Set(1, p)
409
pmf.Set(0, 1-p)
410
return pmf
411
412
413
def PmfCorrect(efficacy, difficulties):
414
"""Computes the distribution of correct responses.
415
416
efficacy: personal ability to answer questions
417
difficulties: list of difficulties, one for each question
418
419
Returns: new Pmf object
420
"""
421
pmf0 = thinkbayes.Pmf([0])
422
423
ps = [ProbCorrect(efficacy, difficulty) for difficulty in difficulties]
424
pmfs = [BinaryPmf(p) for p in ps]
425
dist = sum(pmfs, pmf0)
426
return dist
427
428
429
def MakeDifficulties(center, width, n):
430
"""Makes a list of n difficulties with a given center and width.
431
432
Returns: list of n floats between center-width and center+width
433
"""
434
low, high = center-width, center+width
435
return numpy.linspace(low, high, n)
436
437
438
def ProbCorrectTable():
439
"""Makes a table of p_correct for a range of efficacy and difficulty."""
440
efficacies = [3, 1.5, 0, -1.5, -3]
441
difficulties = [-1.85, -0.05, 1.75]
442
443
for eff in efficacies:
444
print '%0.2f & ' % eff,
445
for diff in difficulties:
446
p = ProbCorrect(eff, diff)
447
print '%0.2f & ' % p,
448
print r'\\'
449
450
451
def main(script):
452
ProbCorrectTable()
453
454
exam = Exam()
455
456
PlotPriorDist(exam.prior)
457
exam.CalibrateDifficulty()
458
459
exam.CompareScores(780, 740, constructor=Sat)
460
461
exam.CompareScores(780, 740, constructor=Sat2)
462
463
464
if __name__ == '__main__':
465
main(*sys.argv)
466
467