CoCalc -- sat.py

📚 The CoCalc Library - books, templates and other resources
Project: 📚 The Library - Shared Public Version
Path: cocalc-examples / think-bayes / code / sat.py
Views: ⁹⁶¹⁶¹
License: OTHER
1
"""This file contains code used in "Think Bayes",
2
by Allen B. Downey, available from greenteapress.com
3

4
Copyright 2012 Allen B. Downey
5
License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html
6
"""
7

8
import csv
9
import math
10
import numpy
11
import sys
12

13
import matplotlib
14
import matplotlib.pyplot as pyplot
15

16
import thinkbayes
17
import thinkplot
18

19

20
def ReadScale(filename='sat_scale.csv', col=2):
21
    """Reads a CSV file of SAT scales (maps from raw score to standard score).
22

23
    Args:
24
      filename: string filename
25
      col: which column to start with (0=Reading, 2=Math, 4=Writing)
26

27
    Returns: thinkbayes.Interpolator object
28
    """
29
    def ParseRange(s):
30
        """Parse a range of values in the form 123-456
31

32
        s: string
33
        """
34
        t = [int(x) for x in s.split('-')]
35
        return 1.0 * sum(t) / len(t)
36

37
    fp = open(filename)
38
    reader = csv.reader(fp)
39
    raws = []
40
    scores = []
41

42
    for t in reader:
43
        try:
44
            raw = int(t[col])
45
            raws.append(raw)
46
            score = ParseRange(t[col+1])
47
            scores.append(score)
48
        except ValueError:
49
            pass
50

51
    raws.sort()
52
    scores.sort()
53
    return thinkbayes.Interpolator(raws, scores)
54

55

56
def ReadRanks(filename='sat_ranks.csv'):
57
    """Reads a CSV file of SAT scores.
58

59
    Args:
60
      filename: string filename
61

62
    Returns:
63
      list of (score, freq) pairs
64
    """
65
    fp = open(filename)
66
    reader = csv.reader(fp)
67
    res = []
68

69
    for t in reader:
70
        try:
71
            score = int(t[0])
72
            freq = int(t[1])
73
            res.append((score, freq))
74
        except ValueError:
75
            pass
76

77
    return res
78

79

80
def DivideValues(pmf, denom):
81
    """Divides the values in a Pmf by denom.
82

83
    Returns a new Pmf.
84
    """
85
    new = thinkbayes.Pmf()
86
    denom = float(denom)
87
    for val, prob in pmf.Items():
88
        x = val / denom
89
        new.Set(x, prob)
90
    return new
91

92

93
class Exam(object):
94
    """Encapsulates information about an exam.
95

96
    Contains the distribution of scaled scores and an
97
    Interpolator that maps between scaled and raw scores.
98
    """
99
    def __init__(self):
100
        self.scale = ReadScale()
101

102
        scores = ReadRanks()
103
        score_pmf = thinkbayes.MakePmfFromDict(dict(scores))
104

105
        self.raw = self.ReverseScale(score_pmf)
106
        self.max_score = max(self.raw.Values())
107
        self.prior = DivideValues(self.raw, denom=self.max_score)
108
        
109
        center = -0.05
110
        width = 1.8
111
        self.difficulties = MakeDifficulties(center, width, self.max_score)
112

113
    def CompareScores(self, a_score, b_score, constructor):
114
        """Computes posteriors for two test scores and the likelihood ratio.
115

116
        a_score, b_score: scales SAT scores
117
        constructor: function that instantiates an Sat or Sat2 object
118
        """
119
        a_sat = constructor(self, a_score)
120
        b_sat = constructor(self, b_score)
121

122
        a_sat.PlotPosteriors(b_sat)
123

124
        if constructor is Sat:
125
            PlotJointDist(a_sat, b_sat)
126

127
        top = TopLevel('AB')
128
        top.Update((a_sat, b_sat))
129
        top.Print()
130

131
        ratio = top.Prob('A') / top.Prob('B')
132
        
133
        print 'Likelihood ratio', ratio
134

135
        posterior = ratio / (ratio + 1)
136
        print 'Posterior', posterior
137

138
        if constructor is Sat2:
139
            ComparePosteriorPredictive(a_sat, b_sat)
140

141
    def MakeRawScoreDist(self, efficacies):
142
        """Makes the distribution of raw scores for given difficulty.
143

144
        efficacies: Pmf of efficacy
145
        """
146
        pmfs = thinkbayes.Pmf()
147
        for efficacy, prob in efficacies.Items():
148
            scores = self.PmfCorrect(efficacy)
149
            pmfs.Set(scores, prob)
150

151
        mix = thinkbayes.MakeMixture(pmfs)
152
        return mix
153

154
    def CalibrateDifficulty(self):
155
        """Make a plot showing the model distribution of raw scores."""
156
        thinkplot.Clf()
157
        thinkplot.PrePlot(num=2)
158

159
        cdf = thinkbayes.MakeCdfFromPmf(self.raw, name='data')
160
        thinkplot.Cdf(cdf)
161

162
        efficacies = thinkbayes.MakeGaussianPmf(0, 1.5, 3)
163
        pmf = self.MakeRawScoreDist(efficacies)
164
        cdf = thinkbayes.MakeCdfFromPmf(pmf, name='model')
165
        thinkplot.Cdf(cdf)
166
        
167
        thinkplot.Save(root='sat_calibrate',
168
                    xlabel='raw score',
169
                    ylabel='CDF',
170
                    formats=['pdf', 'eps'])
171

172
    def PmfCorrect(self, efficacy):
173
        """Returns the PMF of number of correct responses.
174

175
        efficacy: float
176
        """
177
        pmf = PmfCorrect(efficacy, self.difficulties)
178
        return pmf
179

180
    def Lookup(self, raw):
181
        """Looks up a raw score and returns a scaled score."""
182
        return self.scale.Lookup(raw)
183
        
184
    def Reverse(self, score):
185
        """Looks up a scaled score and returns a raw score.
186

187
        Since we ignore the penalty, negative scores round up to zero.
188
        """
189
        raw = self.scale.Reverse(score)
190
        return raw if raw > 0 else 0
191
        
192
    def ReverseScale(self, pmf):
193
        """Applies the reverse scale to the values of a PMF.
194

195
        Args:
196
            pmf: Pmf object
197
            scale: Interpolator object
198

199
        Returns:
200
            new Pmf
201
        """
202
        new = thinkbayes.Pmf()
203
        for val, prob in pmf.Items():
204
            raw = self.Reverse(val)
205
            new.Incr(raw, prob)
206
        return new
207

208

209
class Sat(thinkbayes.Suite):
210
    """Represents the distribution of p_correct for a test-taker."""
211

212
    def __init__(self, exam, score):
213
        self.exam = exam
214
        self.score = score
215

216
        # start with the prior distribution
217
        thinkbayes.Suite.__init__(self, exam.prior)
218

219
        # update based on an exam score
220
        self.Update(score)
221

222
    def Likelihood(self, data, hypo):
223
        """Computes the likelihood of a test score, given efficacy."""
224
        p_correct = hypo
225
        score = data
226

227
        k = self.exam.Reverse(score)
228
        n = self.exam.max_score
229
        like = thinkbayes.EvalBinomialPmf(k, n, p_correct)
230
        return like
231

232
    def PlotPosteriors(self, other):
233
        """Plots posterior distributions of efficacy.
234

235
        self, other: Sat objects.
236
        """
237
        thinkplot.Clf()
238
        thinkplot.PrePlot(num=2)
239

240
        cdf1 = thinkbayes.MakeCdfFromPmf(self, 'posterior %d' % self.score)
241
        cdf2 = thinkbayes.MakeCdfFromPmf(other, 'posterior %d' % other.score)
242

243
        thinkplot.Cdfs([cdf1, cdf2])
244
        thinkplot.Save(xlabel='p_correct', 
245
                    ylabel='CDF', 
246
                    axis=[0.7, 1.0, 0.0, 1.0],
247
                    root='sat_posteriors_p_corr',
248
                    formats=['pdf', 'eps'])
249

250

251
class Sat2(thinkbayes.Suite):
252
    """Represents the distribution of efficacy for a test-taker."""
253

254
    def __init__(self, exam, score):
255
        self.exam = exam
256
        self.score = score
257

258
        # start with the Gaussian prior
259
        efficacies = thinkbayes.MakeGaussianPmf(0, 1.5, 3)
260
        thinkbayes.Suite.__init__(self, efficacies)
261

262
        # update based on an exam score
263
        self.Update(score)
264

265
    def Likelihood(self, data, hypo):
266
        """Computes the likelihood of a test score, given efficacy."""
267
        efficacy = hypo
268
        score = data
269
        raw = self.exam.Reverse(score)
270

271
        pmf = self.exam.PmfCorrect(efficacy)
272
        like = pmf.Prob(raw)
273
        return like
274

275
    def MakePredictiveDist(self):
276
        """Returns the distribution of raw scores expected on a re-test."""
277
        raw_pmf = self.exam.MakeRawScoreDist(self)
278
        return raw_pmf
279
    
280
    def PlotPosteriors(self, other):
281
        """Plots posterior distributions of efficacy.
282

283
        self, other: Sat objects.
284
        """
285
        thinkplot.Clf()
286
        thinkplot.PrePlot(num=2)
287

288
        cdf1 = thinkbayes.MakeCdfFromPmf(self, 'posterior %d' % self.score)
289
        cdf2 = thinkbayes.MakeCdfFromPmf(other, 'posterior %d' % other.score)
290

291
        thinkplot.Cdfs([cdf1, cdf2])
292
        thinkplot.Save(xlabel='efficacy', 
293
                    ylabel='CDF', 
294
                    axis=[0, 4.6, 0.0, 1.0],
295
                    root='sat_posteriors_eff',
296
                    formats=['pdf', 'eps'])
297

298

299
def PlotJointDist(pmf1, pmf2, thresh=0.8):
300
    """Plot the joint distribution of p_correct.
301

302
    pmf1, pmf2: posterior distributions
303
    thresh: lower bound of the range to be plotted
304
    """
305
    def Clean(pmf):
306
        """Removes values below thresh."""
307
        vals = [val for val in pmf.Values() if val < thresh]
308
        [pmf.Remove(val) for val in vals]
309

310
    Clean(pmf1)
311
    Clean(pmf2)
312
    pmf = thinkbayes.MakeJoint(pmf1, pmf2)
313

314
    thinkplot.Figure(figsize=(6, 6))    
315
    thinkplot.Contour(pmf, contour=False, pcolor=True)
316

317
    thinkplot.Plot([thresh, 1.0], [thresh, 1.0],
318
                color='gray', alpha=0.2, linewidth=4)
319

320
    thinkplot.Save(root='sat_joint',
321
                   xlabel='p_correct Alice', 
322
                   ylabel='p_correct Bob',
323
                   axis=[thresh, 1.0, thresh, 1.0],
324
                   formats=['pdf', 'eps'])
325

326

327
def ComparePosteriorPredictive(a_sat, b_sat):
328
    """Compares the predictive distributions of raw scores.
329

330
    a_sat: posterior distribution
331
    b_sat:
332
    """
333
    a_pred = a_sat.MakePredictiveDist()
334
    b_pred = b_sat.MakePredictiveDist()
335

336
    #thinkplot.Clf()
337
    #thinkplot.Pmfs([a_pred, b_pred])
338
    #thinkplot.Show()
339

340
    a_like = thinkbayes.PmfProbGreater(a_pred, b_pred)
341
    b_like = thinkbayes.PmfProbLess(a_pred, b_pred)
342
    c_like = thinkbayes.PmfProbEqual(a_pred, b_pred)
343

344
    print 'Posterior predictive'
345
    print 'A', a_like
346
    print 'B', b_like
347
    print 'C', c_like
348

349

350
def PlotPriorDist(pmf):
351
    """Plot the prior distribution of p_correct.
352

353
    pmf: prior
354
    """
355
    thinkplot.Clf()
356
    thinkplot.PrePlot(num=1)
357

358
    cdf1 = thinkbayes.MakeCdfFromPmf(pmf, 'prior')
359
    thinkplot.Cdf(cdf1)
360
    thinkplot.Save(root='sat_prior',
361
                   xlabel='p_correct', 
362
                   ylabel='CDF',
363
                   formats=['pdf', 'eps'])
364

365

366
class TopLevel(thinkbayes.Suite):
367
    """Evaluates the top-level hypotheses about Alice and Bob.
368

369
    Uses the bottom-level posterior distribution about p_correct
370
    (or efficacy).
371
    """
372

373
    def Update(self, data):
374
        a_sat, b_sat = data
375

376
        a_like = thinkbayes.PmfProbGreater(a_sat, b_sat)
377
        b_like = thinkbayes.PmfProbLess(a_sat, b_sat)
378
        c_like = thinkbayes.PmfProbEqual(a_sat, b_sat)
379

380
        a_like += c_like / 2
381
        b_like += c_like / 2
382

383
        self.Mult('A', a_like)
384
        self.Mult('B', b_like)
385

386
        self.Normalize()
387

388

389
def ProbCorrect(efficacy, difficulty, a=1):
390
    """Returns the probability that a person gets a question right.
391

392
    efficacy: personal ability to answer questions
393
    difficulty: how hard the question is
394

395
    Returns: float prob
396
    """
397
    return 1 / (1 + math.exp(-a * (efficacy - difficulty)))
398

399

400
def BinaryPmf(p):
401
    """Makes a Pmf with values 1 and 0.
402
    
403
    p: probability given to 1
404
    
405
    Returns: Pmf object
406
    """
407
    pmf = thinkbayes.Pmf()
408
    pmf.Set(1, p)
409
    pmf.Set(0, 1-p)
410
    return pmf
411

412

413
def PmfCorrect(efficacy, difficulties):
414
    """Computes the distribution of correct responses.
415

416
    efficacy: personal ability to answer questions
417
    difficulties: list of difficulties, one for each question
418

419
    Returns: new Pmf object
420
    """
421
    pmf0 = thinkbayes.Pmf([0])
422

423
    ps = [ProbCorrect(efficacy, difficulty) for difficulty in difficulties]
424
    pmfs = [BinaryPmf(p) for p in ps]
425
    dist = sum(pmfs, pmf0)
426
    return dist
427

428

429
def MakeDifficulties(center, width, n):
430
    """Makes a list of n difficulties with a given center and width.
431

432
    Returns: list of n floats between center-width and center+width
433
    """
434
    low, high = center-width, center+width
435
    return numpy.linspace(low, high, n)
436

437

438
def ProbCorrectTable():
439
    """Makes a table of p_correct for a range of efficacy and difficulty."""
440
    efficacies = [3, 1.5, 0, -1.5, -3]
441
    difficulties = [-1.85, -0.05, 1.75]
442

443
    for eff in efficacies:
444
        print '%0.2f & ' % eff, 
445
        for diff in difficulties:
446
            p = ProbCorrect(eff, diff)
447
            print '%0.2f & ' % p, 
448
        print r'\\'
449

450

451
def main(script):
452
    ProbCorrectTable()
453

454
    exam = Exam()
455

456
    PlotPriorDist(exam.prior)
457
    exam.CalibrateDifficulty()
458

459
    exam.CompareScores(780, 740, constructor=Sat)
460

461
    exam.CompareScores(780, 740, constructor=Sat2)
462

463

464
if __name__ == '__main__':
465
    main(*sys.argv)
466

467
Product

Resources

Company