Contact
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutSign UpSign In
| Download

📚 The CoCalc Library - books, templates and other resources

Views: 96161
License: OTHER
1
"""This file contains code for use with "Think Bayes",
2
by Allen B. Downey, available from greenteapress.com
3
4
Copyright 2012 Allen B. Downey
5
License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html
6
"""
7
8
import math
9
import numpy
10
import random
11
import sys
12
13
import correlation
14
import thinkplot
15
import matplotlib.pyplot as pyplot
16
import thinkbayes
17
18
19
INTERVAL = 245/365.0
20
FORMATS = ['pdf', 'eps']
21
MINSIZE = 0.2
22
MAXSIZE = 20
23
BUCKET_FACTOR = 10
24
25
26
def log2(x, denom=math.log(2)):
27
"""Computes log base 2."""
28
return math.log(x) / denom
29
30
31
def SimpleModel():
32
"""Runs calculations based on a simple model."""
33
34
# time between discharge and diagnosis, in days
35
interval = 3291.0
36
37
# doubling time in linear measure is doubling time in volume * 3
38
dt = 811.0 * 3
39
40
# number of doublings since discharge
41
doublings = interval / dt
42
43
# how big was the tumor at time of discharge (diameter in cm)
44
d1 = 15.5
45
d0 = d1 / 2.0 ** doublings
46
47
print 'interval (days)', interval
48
print 'interval (years)', interval / 365
49
print 'dt', dt
50
print 'doublings', doublings
51
print 'd1', d1
52
print 'd0', d0
53
54
# assume an initial linear measure of 0.1 cm
55
d0 = 0.1
56
d1 = 15.5
57
58
# how many doublings would it take to get from d0 to d1
59
doublings = log2(d1 / d0)
60
61
# what linear doubling time does that imply?
62
dt = interval / doublings
63
64
print 'doublings', doublings
65
print 'dt', dt
66
67
# compute the volumetric doubling time and RDT
68
vdt = dt / 3
69
rdt = 365 / vdt
70
71
print 'vdt', vdt
72
print 'rdt', rdt
73
74
cdf = MakeCdf()
75
p = cdf.Prob(rdt)
76
print 'Prob{RDT > 2.4}', 1-p
77
78
79
def MakeCdf():
80
"""Uses the data from Zhang et al. to construct a CDF."""
81
n = 53.0
82
freqs = [0, 2, 31, 42, 48, 51, 52, 53]
83
ps = [freq/n for freq in freqs]
84
xs = numpy.arange(-1.5, 6.5, 1.0)
85
86
cdf = thinkbayes.Cdf(xs, ps)
87
return cdf
88
89
90
def PlotCdf(cdf):
91
"""Plots the actual and fitted distributions.
92
93
cdf: CDF object
94
"""
95
xs, ps = cdf.xs, cdf.ps
96
cps = [1-p for p in ps]
97
98
# CCDF on logy scale: shows exponential behavior
99
thinkplot.Clf()
100
thinkplot.Plot(xs, cps, 'bo-')
101
thinkplot.Save(root='kidney1',
102
formats=FORMATS,
103
xlabel='RDT',
104
ylabel='CCDF (log scale)',
105
yscale='log')
106
107
# CDF, model and data
108
109
thinkplot.Clf()
110
thinkplot.PrePlot(num=2)
111
mxs, mys = ModelCdf()
112
thinkplot.Plot(mxs, mys, label='model', linestyle='dashed')
113
114
thinkplot.Plot(xs, ps, 'gs', label='data')
115
thinkplot.Save(root='kidney2',
116
formats=FORMATS,
117
xlabel='RDT (volume doublings per year)',
118
ylabel='CDF',
119
title='Distribution of RDT',
120
axis=[-2, 7, 0, 1],
121
loc=4)
122
123
124
def QQPlot(cdf, fit):
125
"""Makes a QQPlot of the values from actual and fitted distributions.
126
127
cdf: actual Cdf of RDT
128
fit: model
129
"""
130
xs = [-1.5, 5.5]
131
thinkplot.Clf()
132
thinkplot.Plot(xs, xs, 'b-')
133
134
xs, ps = cdf.xs, cdf.ps
135
fs = [fit.Value(p) for p in ps]
136
137
thinkplot.Plot(xs, fs, 'gs')
138
thinkplot.Save(root = 'kidney3',
139
formats=FORMATS,
140
xlabel='Actual',
141
ylabel='Model')
142
143
144
def FitCdf(cdf):
145
"""Fits a line to the log CCDF and returns the slope.
146
147
cdf: Cdf of RDT
148
"""
149
xs, ps = cdf.xs, cdf.ps
150
cps = [1-p for p in ps]
151
152
xs = xs[1:-1]
153
lcps = [math.log(p) for p in cps[1:-1]]
154
155
_inter, slope = correlation.LeastSquares(xs, lcps)
156
return -slope
157
158
159
def CorrelatedGenerator(cdf, rho):
160
"""Generates a sequence of values from cdf with correlation.
161
162
Generates a correlated standard Gaussian series, then transforms to
163
values from cdf
164
165
cdf: distribution to choose from
166
rho: target coefficient of correlation
167
"""
168
def Transform(x):
169
"""Maps from a Gaussian variate to a variate with the given CDF."""
170
p = thinkbayes.GaussianCdf(x)
171
y = cdf.Value(p)
172
return y
173
174
# for the first value, choose from a Gaussian and transform it
175
x = random.gauss(0, 1)
176
yield Transform(x)
177
178
# for subsequent values, choose from the conditional distribution
179
# based on the previous value
180
sigma = math.sqrt(1 - rho**2)
181
while True:
182
x = random.gauss(x * rho, sigma)
183
yield Transform(x)
184
185
186
def UncorrelatedGenerator(cdf, _rho=None):
187
"""Generates a sequence of values from cdf with no correlation.
188
189
Ignores rho, which is accepted as a parameter to provide the
190
same interface as CorrelatedGenerator
191
192
cdf: distribution to choose from
193
rho: ignored
194
"""
195
while True:
196
x = cdf.Random()
197
yield x
198
199
200
def RdtGenerator(cdf, rho):
201
"""Returns an iterator with n values from cdf and the given correlation.
202
203
cdf: Cdf object
204
rho: coefficient of correlation
205
"""
206
if rho == 0.0:
207
return UncorrelatedGenerator(cdf)
208
else:
209
return CorrelatedGenerator(cdf, rho)
210
211
212
def GenerateRdt(pc, lam1, lam2):
213
"""Generate an RDT from a mixture of exponential distributions.
214
215
With prob pc, generate a negative value with param lam2;
216
otherwise generate a positive value with param lam1.
217
"""
218
if random.random() < pc:
219
return -random.expovariate(lam2)
220
else:
221
return random.expovariate(lam1)
222
223
224
def GenerateSample(n, pc, lam1, lam2):
225
"""Generates a sample of RDTs.
226
227
n: sample size
228
pc: probablity of negative growth
229
lam1: exponential parameter of positive growth
230
lam2: exponential parameter of negative growth
231
232
Returns: list of random variates
233
"""
234
xs = [GenerateRdt(pc, lam1, lam2) for _ in xrange(n)]
235
return xs
236
237
238
def GenerateCdf(n=1000, pc=0.35, lam1=0.79, lam2=5.0):
239
"""Generates a sample of RDTs and returns its CDF.
240
241
n: sample size
242
pc: probablity of negative growth
243
lam1: exponential parameter of positive growth
244
lam2: exponential parameter of negative growth
245
246
Returns: Cdf of generated sample
247
"""
248
xs = GenerateSample(n, pc, lam1, lam2)
249
cdf = thinkbayes.MakeCdfFromList(xs)
250
return cdf
251
252
253
def ModelCdf(pc=0.35, lam1=0.79, lam2=5.0):
254
"""
255
256
pc: probablity of negative growth
257
lam1: exponential parameter of positive growth
258
lam2: exponential parameter of negative growth
259
260
Returns: list of xs, list of ys
261
"""
262
cdf = thinkbayes.EvalExponentialCdf
263
x1 = numpy.arange(-2, 0, 0.1)
264
y1 = [pc * (1 - cdf(-x, lam2)) for x in x1]
265
x2 = numpy.arange(0, 7, 0.1)
266
y2 = [pc + (1-pc) * cdf(x, lam1) for x in x2]
267
return list(x1) + list(x2), y1+y2
268
269
270
def BucketToCm(y, factor=BUCKET_FACTOR):
271
"""Computes the linear dimension for a given bucket.
272
273
t: bucket number
274
factor: multiplicitive factor from one bucket to the next
275
276
Returns: linear dimension in cm
277
"""
278
return math.exp(y / factor)
279
280
281
def CmToBucket(x, factor=BUCKET_FACTOR):
282
"""Computes the bucket for a given linear dimension.
283
284
x: linear dimension in cm
285
factor: multiplicitive factor from one bucket to the next
286
287
Returns: float bucket number
288
"""
289
return round(factor * math.log(x))
290
291
292
def Diameter(volume, factor=3/math.pi/4, exp=1/3.0):
293
"""Converts a volume to a diameter.
294
295
d = 2r = 2 * (3/4/pi V)^1/3
296
"""
297
return 2 * (factor * volume) ** exp
298
299
300
def Volume(diameter, factor=4*math.pi/3):
301
"""Converts a diameter to a volume.
302
303
V = 4/3 pi (d/2)^3
304
"""
305
return factor * (diameter/2.0)**3
306
307
308
class Cache(object):
309
"""Records each observation point for each tumor."""
310
311
def __init__(self):
312
"""Initializes the cache.
313
314
joint: map from (age, bucket) to frequency
315
sequences: map from bucket to a list of sequences
316
initial_rdt: sequence of (V0, rdt) pairs
317
"""
318
self.joint = thinkbayes.Joint()
319
self.sequences = {}
320
self.initial_rdt = []
321
322
def GetBuckets(self):
323
"""Returns an iterator for the keys in the cache."""
324
return self.sequences.iterkeys()
325
326
def GetSequence(self, bucket):
327
"""Looks up a bucket in the cache."""
328
return self.sequences[bucket]
329
330
def ConditionalCdf(self, bucket, name=''):
331
"""Forms the cdf of ages for a given bucket.
332
333
bucket: int bucket number
334
name: string
335
"""
336
pmf = self.joint.Conditional(0, 1, bucket, name=name)
337
cdf = pmf.MakeCdf()
338
return cdf
339
340
def ProbOlder(self, cm, age):
341
"""Computes the probability of exceeding age, given size.
342
343
cm: size in cm
344
age: age in years
345
"""
346
bucket = CmToBucket(cm)
347
cdf = self.ConditionalCdf(bucket)
348
p = cdf.Prob(age)
349
return 1-p
350
351
def GetDistAgeSize(self, size_thresh=MAXSIZE):
352
"""Gets the joint distribution of age and size.
353
354
Map from (age, log size in cm) to log freq
355
356
Returns: new Pmf object
357
"""
358
joint = thinkbayes.Joint()
359
360
for val, freq in self.joint.Items():
361
age, bucket = val
362
cm = BucketToCm(bucket)
363
if cm > size_thresh:
364
continue
365
log_cm = math.log10(cm)
366
joint.Set((age, log_cm), math.log(freq) * 10)
367
368
return joint
369
370
def Add(self, age, seq, rdt):
371
"""Adds this observation point to the cache.
372
373
age: age of the tumor in years
374
seq: sequence of volumes
375
rdt: RDT during this interval
376
"""
377
final = seq[-1]
378
cm = Diameter(final)
379
bucket = CmToBucket(cm)
380
self.joint.Incr((age, bucket))
381
382
self.sequences.setdefault(bucket, []).append(seq)
383
384
initial = seq[-2]
385
self.initial_rdt.append((initial, rdt))
386
387
def Print(self):
388
"""Prints the size (cm) for each bucket, and the number of sequences."""
389
for bucket in sorted(self.GetBuckets()):
390
ss = self.GetSequence(bucket)
391
diameter = BucketToCm(bucket)
392
print diameter, len(ss)
393
394
def Correlation(self):
395
"""Computes the correlation between log volumes and rdts."""
396
vs, rdts = zip(*self.initial_rdt)
397
lvs = [math.log(v) for v in vs]
398
return correlation.Corr(lvs, rdts)
399
400
401
class Calculator(object):
402
"""Encapsulates the state of the computation."""
403
404
def __init__(self):
405
"""Initializes the cache."""
406
self.cache = Cache()
407
408
def MakeSequences(self, n, rho, cdf):
409
"""Returns a list of sequences of volumes.
410
411
n: number of sequences to make
412
rho: serial correlation
413
cdf: Cdf of rdts
414
415
Returns: list of n sequences of volumes
416
"""
417
sequences = []
418
for i in range(n):
419
rdt_seq = RdtGenerator(cdf, rho)
420
seq = self.MakeSequence(rdt_seq)
421
sequences.append(seq)
422
423
if i % 100 == 0:
424
print i
425
426
return sequences
427
428
def MakeSequence(self, rdt_seq, v0=0.01, interval=INTERVAL,
429
vmax=Volume(MAXSIZE)):
430
"""Simulate the growth of a tumor.
431
432
rdt_seq: sequence of rdts
433
v0: initial volume in mL (cm^3)
434
interval: timestep in years
435
vmax: volume to stop at
436
437
Returns: sequence of volumes
438
"""
439
seq = v0,
440
age = 0
441
442
for rdt in rdt_seq:
443
age += interval
444
final, seq = self.ExtendSequence(age, seq, rdt, interval)
445
if final > vmax:
446
break
447
448
return seq
449
450
def ExtendSequence(self, age, seq, rdt, interval):
451
"""Generates a new random value and adds it to the end of seq.
452
453
Side-effect: adds sub-sequences to the cache.
454
455
age: age of tumor at the end of this interval
456
seq: sequence of values so far
457
rdt: reciprocal doubling time in doublings per year
458
interval: timestep in years
459
460
Returns: final volume, extended sequence
461
"""
462
initial = seq[-1]
463
doublings = rdt * interval
464
final = initial * 2**doublings
465
new_seq = seq + (final,)
466
self.cache.Add(age, new_seq, rdt)
467
468
return final, new_seq
469
470
def PlotBucket(self, bucket, color='blue'):
471
"""Plots the set of sequences for the given bucket.
472
473
bucket: int bucket number
474
color: string
475
"""
476
sequences = self.cache.GetSequence(bucket)
477
for seq in sequences:
478
n = len(seq)
479
age = n * INTERVAL
480
ts = numpy.linspace(-age, 0, n)
481
PlotSequence(ts, seq, color)
482
483
def PlotBuckets(self):
484
"""Plots the set of sequences that ended in a given bucket."""
485
# 2.01, 4.95 cm, 9.97 cm
486
buckets = [7.0, 16.0, 23.0]
487
buckets = [23.0]
488
colors = ['blue', 'green', 'red', 'cyan']
489
490
thinkplot.Clf()
491
for bucket, color in zip(buckets, colors):
492
self.PlotBucket(bucket, color)
493
494
thinkplot.Save(root='kidney5',
495
formats=FORMATS,
496
title='History of simulated tumors',
497
axis=[-40, 1, MINSIZE, 12],
498
xlabel='years',
499
ylabel='diameter (cm, log scale)',
500
yscale='log')
501
502
def PlotJointDist(self):
503
"""Makes a pcolor plot of the age-size joint distribution."""
504
thinkplot.Clf()
505
506
joint = self.cache.GetDistAgeSize()
507
thinkplot.Contour(joint, contour=False, pcolor=True)
508
509
thinkplot.Save(root='kidney8',
510
formats=FORMATS,
511
axis=[0, 41, -0.7, 1.31],
512
yticks=MakeLogTicks([0.2, 0.5, 1, 2, 5, 10, 20]),
513
xlabel='ages',
514
ylabel='diameter (cm, log scale)')
515
516
def PlotConditionalCdfs(self):
517
"""Plots the cdf of ages for each bucket."""
518
buckets = [7.0, 16.0, 23.0, 27.0]
519
# 2.01, 4.95 cm, 9.97 cm, 14.879 cm
520
names = ['2 cm', '5 cm', '10 cm', '15 cm']
521
cdfs = []
522
523
for bucket, name in zip(buckets, names):
524
cdf = self.cache.ConditionalCdf(bucket, name)
525
cdfs.append(cdf)
526
527
thinkplot.Clf()
528
thinkplot.PrePlot(num=len(cdfs))
529
thinkplot.Cdfs(cdfs)
530
thinkplot.Save(root='kidney6',
531
title='Distribution of age for several diameters',
532
formats=FORMATS,
533
xlabel='tumor age (years)',
534
ylabel='CDF',
535
loc=4)
536
537
def PlotCredibleIntervals(self, xscale='linear'):
538
"""Plots the confidence interval for each bucket."""
539
xs = []
540
ts = []
541
percentiles = [95, 75, 50, 25, 5]
542
min_size = 0.3
543
544
# loop through the buckets, accumulate
545
# xs: sequence of sizes in cm
546
# ts: sequence of percentile tuples
547
for _, bucket in enumerate(sorted(self.cache.GetBuckets())):
548
cm = BucketToCm(bucket)
549
if cm < min_size or cm > 20.0:
550
continue
551
xs.append(cm)
552
cdf = self.cache.ConditionalCdf(bucket)
553
ps = [cdf.Percentile(p) for p in percentiles]
554
ts.append(ps)
555
556
# dump the results into a table
557
fp = open('kidney_table.tex', 'w')
558
PrintTable(fp, xs, ts)
559
fp.close()
560
561
# make the figure
562
linewidths = [1, 2, 3, 2, 1]
563
alphas = [0.3, 0.5, 1, 0.5, 0.3]
564
labels = ['95th', '75th', '50th', '25th', '5th']
565
566
# transpose the ts so we have sequences for each percentile rank
567
thinkplot.Clf()
568
yys = zip(*ts)
569
570
for ys, linewidth, alpha, label in zip(yys, linewidths, alphas, labels):
571
options = dict(color='blue', linewidth=linewidth,
572
alpha=alpha, label=label, markersize=2)
573
574
# plot the data points
575
thinkplot.Plot(xs, ys, 'bo', **options)
576
577
# plot the fit lines
578
fxs = [min_size, 20.0]
579
fys = FitLine(xs, ys, fxs)
580
581
thinkplot.Plot(fxs, fys, **options)
582
583
# put a label at the end of each line
584
x, y = fxs[-1], fys[-1]
585
pyplot.text(x*1.05, y, label, color='blue',
586
horizontalalignment='left',
587
verticalalignment='center')
588
589
# make the figure
590
thinkplot.Save(root='kidney7',
591
formats=FORMATS,
592
title='Credible interval for age vs diameter',
593
xlabel='diameter (cm, log scale)',
594
ylabel='tumor age (years)',
595
xscale=xscale,
596
xticks=MakeTicks([0.5, 1, 2, 5, 10, 20]),
597
axis=[0.25, 35, 0, 45],
598
legend=False,
599
)
600
601
602
def PlotSequences(sequences):
603
"""Plots linear measurement vs time.
604
605
sequences: list of sequences of volumes
606
"""
607
thinkplot.Clf()
608
609
options = dict(color='gray', linewidth=1, linestyle='dashed')
610
thinkplot.Plot([0, 40], [10, 10], **options)
611
612
for seq in sequences:
613
n = len(seq)
614
age = n * INTERVAL
615
ts = numpy.linspace(0, age, n)
616
PlotSequence(ts, seq)
617
618
thinkplot.Save(root='kidney4',
619
formats=FORMATS,
620
axis=[0, 40, MINSIZE, 20],
621
title='Simulations of tumor growth',
622
xlabel='tumor age (years)',
623
yticks=MakeTicks([0.2, 0.5, 1, 2, 5, 10, 20]),
624
ylabel='diameter (cm, log scale)',
625
yscale='log')
626
627
628
def PlotSequence(ts, seq, color='blue'):
629
"""Plots a time series of linear measurements.
630
631
ts: sequence of times in years
632
seq: sequence of columes
633
color: color string
634
"""
635
options = dict(color=color, linewidth=1, alpha=0.2)
636
xs = [Diameter(v) for v in seq]
637
638
thinkplot.Plot(ts, xs, **options)
639
640
641
def PrintCI(fp, cm, ps):
642
"""Writes a line in the LaTeX table.
643
644
fp: file pointer
645
cm: diameter in cm
646
ts: tuples of percentiles
647
"""
648
fp.write('%0.1f' % round(cm, 1))
649
for p in reversed(ps):
650
fp.write(' & %0.1f ' % round(p, 1))
651
fp.write(r'\\' '\n')
652
653
654
def PrintTable(fp, xs, ts):
655
"""Writes the data in a LaTeX table.
656
657
fp: file pointer
658
xs: diameters in cm
659
ts: sequence of tuples of percentiles
660
"""
661
fp.write(r'\begin{tabular}{|r||r|r|r|r|r|}' '\n')
662
fp.write(r'\hline' '\n')
663
fp.write(r'Diameter & \multicolumn{5}{c|}{Percentiles of age} \\' '\n')
664
fp.write(r'(cm) & 5th & 25th & 50th & 75th & 95th \\' '\n')
665
fp.write(r'\hline' '\n')
666
667
for i, (cm, ps) in enumerate(zip(xs, ts)):
668
#print cm, ps
669
if i % 3 == 0:
670
PrintCI(fp, cm, ps)
671
672
fp.write(r'\hline' '\n')
673
fp.write(r'\end{tabular}' '\n')
674
675
676
def FitLine(xs, ys, fxs):
677
"""Fits a line to the xs and ys, and returns fitted values for fxs.
678
679
Applies a log transform to the xs.
680
681
xs: diameter in cm
682
ys: age in years
683
fxs: diameter in cm
684
"""
685
lxs = [math.log(x) for x in xs]
686
inter, slope = correlation.LeastSquares(lxs, ys)
687
# res = correlation.Residuals(lxs, ys, inter, slope)
688
# r2 = correlation.CoefDetermination(ys, res)
689
690
lfxs = [math.log(x) for x in fxs]
691
fys = [inter + slope * x for x in lfxs]
692
return fys
693
694
695
def MakeTicks(xs):
696
"""Makes a pair of sequences for use as pyplot ticks.
697
698
xs: sequence of floats
699
700
Returns (xs, labels), where labels is a sequence of strings.
701
"""
702
labels = [str(x) for x in xs]
703
return xs, labels
704
705
706
def MakeLogTicks(xs):
707
"""Makes a pair of sequences for use as pyplot ticks.
708
709
xs: sequence of floats
710
711
Returns (xs, labels), where labels is a sequence of strings.
712
"""
713
lxs = [math.log10(x) for x in xs]
714
labels = [str(x) for x in xs]
715
return lxs, labels
716
717
718
def TestCorrelation(cdf):
719
"""Tests the correlated generator.
720
721
Makes sure that the sequence has the right distribution and correlation.
722
"""
723
n = 10000
724
rho = 0.4
725
726
rdt_seq = CorrelatedGenerator(cdf, rho)
727
xs = [rdt_seq.next() for _ in range(n)]
728
729
rho2 = correlation.SerialCorr(xs)
730
print rho, rho2
731
cdf2 = thinkbayes.MakeCdfFromList(xs)
732
733
thinkplot.Cdfs([cdf, cdf2])
734
thinkplot.Show()
735
736
737
def main(script):
738
for size in [1, 5, 10]:
739
bucket = CmToBucket(size)
740
print 'Size, bucket', size, bucket
741
742
SimpleModel()
743
744
random.seed(17)
745
746
cdf = MakeCdf()
747
748
lam1 = FitCdf(cdf)
749
fit = GenerateCdf(lam1=lam1)
750
751
# TestCorrelation(fit)
752
753
PlotCdf(cdf)
754
# QQPlot(cdf, fit)
755
756
calc = Calculator()
757
rho = 0.0
758
sequences = calc.MakeSequences(100, rho, fit)
759
PlotSequences(sequences)
760
761
calc.PlotBuckets()
762
763
_ = calc.MakeSequences(1900, rho, fit)
764
print 'V0-RDT correlation', calc.cache.Correlation()
765
766
print '15.5 Probability age > 8 year', calc.cache.ProbOlder(15.5, 8)
767
print '6.0 Probability age > 8 year', calc.cache.ProbOlder(6.0, 8)
768
769
calc.PlotConditionalCdfs()
770
771
calc.PlotCredibleIntervals(xscale='log')
772
773
calc.PlotJointDist()
774
775
776
if __name__ == '__main__':
777
main(*sys.argv)
778
779
780
781