"""This file contains code for use with "Think Bayes",
by Allen B. Downey, available from greenteapress.com
Copyright 2012 Allen B. Downey
License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html
"""
import math
import numpy
import random
import sys
import correlation
import thinkplot
import matplotlib.pyplot as pyplot
import thinkbayes
INTERVAL = 245/365.0
FORMATS = ['pdf', 'eps']
MINSIZE = 0.2
MAXSIZE = 20
BUCKET_FACTOR = 10
def log2(x, denom=math.log(2)):
"""Computes log base 2."""
return math.log(x) / denom
def SimpleModel():
"""Runs calculations based on a simple model."""
interval = 3291.0
dt = 811.0 * 3
doublings = interval / dt
d1 = 15.5
d0 = d1 / 2.0 ** doublings
print 'interval (days)', interval
print 'interval (years)', interval / 365
print 'dt', dt
print 'doublings', doublings
print 'd1', d1
print 'd0', d0
d0 = 0.1
d1 = 15.5
doublings = log2(d1 / d0)
dt = interval / doublings
print 'doublings', doublings
print 'dt', dt
vdt = dt / 3
rdt = 365 / vdt
print 'vdt', vdt
print 'rdt', rdt
cdf = MakeCdf()
p = cdf.Prob(rdt)
print 'Prob{RDT > 2.4}', 1-p
def MakeCdf():
"""Uses the data from Zhang et al. to construct a CDF."""
n = 53.0
freqs = [0, 2, 31, 42, 48, 51, 52, 53]
ps = [freq/n for freq in freqs]
xs = numpy.arange(-1.5, 6.5, 1.0)
cdf = thinkbayes.Cdf(xs, ps)
return cdf
def PlotCdf(cdf):
"""Plots the actual and fitted distributions.
cdf: CDF object
"""
xs, ps = cdf.xs, cdf.ps
cps = [1-p for p in ps]
thinkplot.Clf()
thinkplot.Plot(xs, cps, 'bo-')
thinkplot.Save(root='kidney1',
formats=FORMATS,
xlabel='RDT',
ylabel='CCDF (log scale)',
yscale='log')
thinkplot.Clf()
thinkplot.PrePlot(num=2)
mxs, mys = ModelCdf()
thinkplot.Plot(mxs, mys, label='model', linestyle='dashed')
thinkplot.Plot(xs, ps, 'gs', label='data')
thinkplot.Save(root='kidney2',
formats=FORMATS,
xlabel='RDT (volume doublings per year)',
ylabel='CDF',
title='Distribution of RDT',
axis=[-2, 7, 0, 1],
loc=4)
def QQPlot(cdf, fit):
"""Makes a QQPlot of the values from actual and fitted distributions.
cdf: actual Cdf of RDT
fit: model
"""
xs = [-1.5, 5.5]
thinkplot.Clf()
thinkplot.Plot(xs, xs, 'b-')
xs, ps = cdf.xs, cdf.ps
fs = [fit.Value(p) for p in ps]
thinkplot.Plot(xs, fs, 'gs')
thinkplot.Save(root = 'kidney3',
formats=FORMATS,
xlabel='Actual',
ylabel='Model')
def FitCdf(cdf):
"""Fits a line to the log CCDF and returns the slope.
cdf: Cdf of RDT
"""
xs, ps = cdf.xs, cdf.ps
cps = [1-p for p in ps]
xs = xs[1:-1]
lcps = [math.log(p) for p in cps[1:-1]]
_inter, slope = correlation.LeastSquares(xs, lcps)
return -slope
def CorrelatedGenerator(cdf, rho):
"""Generates a sequence of values from cdf with correlation.
Generates a correlated standard Gaussian series, then transforms to
values from cdf
cdf: distribution to choose from
rho: target coefficient of correlation
"""
def Transform(x):
"""Maps from a Gaussian variate to a variate with the given CDF."""
p = thinkbayes.GaussianCdf(x)
y = cdf.Value(p)
return y
x = random.gauss(0, 1)
yield Transform(x)
sigma = math.sqrt(1 - rho**2)
while True:
x = random.gauss(x * rho, sigma)
yield Transform(x)
def UncorrelatedGenerator(cdf, _rho=None):
"""Generates a sequence of values from cdf with no correlation.
Ignores rho, which is accepted as a parameter to provide the
same interface as CorrelatedGenerator
cdf: distribution to choose from
rho: ignored
"""
while True:
x = cdf.Random()
yield x
def RdtGenerator(cdf, rho):
"""Returns an iterator with n values from cdf and the given correlation.
cdf: Cdf object
rho: coefficient of correlation
"""
if rho == 0.0:
return UncorrelatedGenerator(cdf)
else:
return CorrelatedGenerator(cdf, rho)
def GenerateRdt(pc, lam1, lam2):
"""Generate an RDT from a mixture of exponential distributions.
With prob pc, generate a negative value with param lam2;
otherwise generate a positive value with param lam1.
"""
if random.random() < pc:
return -random.expovariate(lam2)
else:
return random.expovariate(lam1)
def GenerateSample(n, pc, lam1, lam2):
"""Generates a sample of RDTs.
n: sample size
pc: probablity of negative growth
lam1: exponential parameter of positive growth
lam2: exponential parameter of negative growth
Returns: list of random variates
"""
xs = [GenerateRdt(pc, lam1, lam2) for _ in xrange(n)]
return xs
def GenerateCdf(n=1000, pc=0.35, lam1=0.79, lam2=5.0):
"""Generates a sample of RDTs and returns its CDF.
n: sample size
pc: probablity of negative growth
lam1: exponential parameter of positive growth
lam2: exponential parameter of negative growth
Returns: Cdf of generated sample
"""
xs = GenerateSample(n, pc, lam1, lam2)
cdf = thinkbayes.MakeCdfFromList(xs)
return cdf
def ModelCdf(pc=0.35, lam1=0.79, lam2=5.0):
"""
pc: probablity of negative growth
lam1: exponential parameter of positive growth
lam2: exponential parameter of negative growth
Returns: list of xs, list of ys
"""
cdf = thinkbayes.EvalExponentialCdf
x1 = numpy.arange(-2, 0, 0.1)
y1 = [pc * (1 - cdf(-x, lam2)) for x in x1]
x2 = numpy.arange(0, 7, 0.1)
y2 = [pc + (1-pc) * cdf(x, lam1) for x in x2]
return list(x1) + list(x2), y1+y2
def BucketToCm(y, factor=BUCKET_FACTOR):
"""Computes the linear dimension for a given bucket.
t: bucket number
factor: multiplicitive factor from one bucket to the next
Returns: linear dimension in cm
"""
return math.exp(y / factor)
def CmToBucket(x, factor=BUCKET_FACTOR):
"""Computes the bucket for a given linear dimension.
x: linear dimension in cm
factor: multiplicitive factor from one bucket to the next
Returns: float bucket number
"""
return round(factor * math.log(x))
def Diameter(volume, factor=3/math.pi/4, exp=1/3.0):
"""Converts a volume to a diameter.
d = 2r = 2 * (3/4/pi V)^1/3
"""
return 2 * (factor * volume) ** exp
def Volume(diameter, factor=4*math.pi/3):
"""Converts a diameter to a volume.
V = 4/3 pi (d/2)^3
"""
return factor * (diameter/2.0)**3
class Cache(object):
"""Records each observation point for each tumor."""
def __init__(self):
"""Initializes the cache.
joint: map from (age, bucket) to frequency
sequences: map from bucket to a list of sequences
initial_rdt: sequence of (V0, rdt) pairs
"""
self.joint = thinkbayes.Joint()
self.sequences = {}
self.initial_rdt = []
def GetBuckets(self):
"""Returns an iterator for the keys in the cache."""
return self.sequences.iterkeys()
def GetSequence(self, bucket):
"""Looks up a bucket in the cache."""
return self.sequences[bucket]
def ConditionalCdf(self, bucket, name=''):
"""Forms the cdf of ages for a given bucket.
bucket: int bucket number
name: string
"""
pmf = self.joint.Conditional(0, 1, bucket, name=name)
cdf = pmf.MakeCdf()
return cdf
def ProbOlder(self, cm, age):
"""Computes the probability of exceeding age, given size.
cm: size in cm
age: age in years
"""
bucket = CmToBucket(cm)
cdf = self.ConditionalCdf(bucket)
p = cdf.Prob(age)
return 1-p
def GetDistAgeSize(self, size_thresh=MAXSIZE):
"""Gets the joint distribution of age and size.
Map from (age, log size in cm) to log freq
Returns: new Pmf object
"""
joint = thinkbayes.Joint()
for val, freq in self.joint.Items():
age, bucket = val
cm = BucketToCm(bucket)
if cm > size_thresh:
continue
log_cm = math.log10(cm)
joint.Set((age, log_cm), math.log(freq) * 10)
return joint
def Add(self, age, seq, rdt):
"""Adds this observation point to the cache.
age: age of the tumor in years
seq: sequence of volumes
rdt: RDT during this interval
"""
final = seq[-1]
cm = Diameter(final)
bucket = CmToBucket(cm)
self.joint.Incr((age, bucket))
self.sequences.setdefault(bucket, []).append(seq)
initial = seq[-2]
self.initial_rdt.append((initial, rdt))
def Print(self):
"""Prints the size (cm) for each bucket, and the number of sequences."""
for bucket in sorted(self.GetBuckets()):
ss = self.GetSequence(bucket)
diameter = BucketToCm(bucket)
print diameter, len(ss)
def Correlation(self):
"""Computes the correlation between log volumes and rdts."""
vs, rdts = zip(*self.initial_rdt)
lvs = [math.log(v) for v in vs]
return correlation.Corr(lvs, rdts)
class Calculator(object):
"""Encapsulates the state of the computation."""
def __init__(self):
"""Initializes the cache."""
self.cache = Cache()
def MakeSequences(self, n, rho, cdf):
"""Returns a list of sequences of volumes.
n: number of sequences to make
rho: serial correlation
cdf: Cdf of rdts
Returns: list of n sequences of volumes
"""
sequences = []
for i in range(n):
rdt_seq = RdtGenerator(cdf, rho)
seq = self.MakeSequence(rdt_seq)
sequences.append(seq)
if i % 100 == 0:
print i
return sequences
def MakeSequence(self, rdt_seq, v0=0.01, interval=INTERVAL,
vmax=Volume(MAXSIZE)):
"""Simulate the growth of a tumor.
rdt_seq: sequence of rdts
v0: initial volume in mL (cm^3)
interval: timestep in years
vmax: volume to stop at
Returns: sequence of volumes
"""
seq = v0,
age = 0
for rdt in rdt_seq:
age += interval
final, seq = self.ExtendSequence(age, seq, rdt, interval)
if final > vmax:
break
return seq
def ExtendSequence(self, age, seq, rdt, interval):
"""Generates a new random value and adds it to the end of seq.
Side-effect: adds sub-sequences to the cache.
age: age of tumor at the end of this interval
seq: sequence of values so far
rdt: reciprocal doubling time in doublings per year
interval: timestep in years
Returns: final volume, extended sequence
"""
initial = seq[-1]
doublings = rdt * interval
final = initial * 2**doublings
new_seq = seq + (final,)
self.cache.Add(age, new_seq, rdt)
return final, new_seq
def PlotBucket(self, bucket, color='blue'):
"""Plots the set of sequences for the given bucket.
bucket: int bucket number
color: string
"""
sequences = self.cache.GetSequence(bucket)
for seq in sequences:
n = len(seq)
age = n * INTERVAL
ts = numpy.linspace(-age, 0, n)
PlotSequence(ts, seq, color)
def PlotBuckets(self):
"""Plots the set of sequences that ended in a given bucket."""
buckets = [7.0, 16.0, 23.0]
buckets = [23.0]
colors = ['blue', 'green', 'red', 'cyan']
thinkplot.Clf()
for bucket, color in zip(buckets, colors):
self.PlotBucket(bucket, color)
thinkplot.Save(root='kidney5',
formats=FORMATS,
title='History of simulated tumors',
axis=[-40, 1, MINSIZE, 12],
xlabel='years',
ylabel='diameter (cm, log scale)',
yscale='log')
def PlotJointDist(self):
"""Makes a pcolor plot of the age-size joint distribution."""
thinkplot.Clf()
joint = self.cache.GetDistAgeSize()
thinkplot.Contour(joint, contour=False, pcolor=True)
thinkplot.Save(root='kidney8',
formats=FORMATS,
axis=[0, 41, -0.7, 1.31],
yticks=MakeLogTicks([0.2, 0.5, 1, 2, 5, 10, 20]),
xlabel='ages',
ylabel='diameter (cm, log scale)')
def PlotConditionalCdfs(self):
"""Plots the cdf of ages for each bucket."""
buckets = [7.0, 16.0, 23.0, 27.0]
names = ['2 cm', '5 cm', '10 cm', '15 cm']
cdfs = []
for bucket, name in zip(buckets, names):
cdf = self.cache.ConditionalCdf(bucket, name)
cdfs.append(cdf)
thinkplot.Clf()
thinkplot.PrePlot(num=len(cdfs))
thinkplot.Cdfs(cdfs)
thinkplot.Save(root='kidney6',
title='Distribution of age for several diameters',
formats=FORMATS,
xlabel='tumor age (years)',
ylabel='CDF',
loc=4)
def PlotCredibleIntervals(self, xscale='linear'):
"""Plots the confidence interval for each bucket."""
xs = []
ts = []
percentiles = [95, 75, 50, 25, 5]
min_size = 0.3
for _, bucket in enumerate(sorted(self.cache.GetBuckets())):
cm = BucketToCm(bucket)
if cm < min_size or cm > 20.0:
continue
xs.append(cm)
cdf = self.cache.ConditionalCdf(bucket)
ps = [cdf.Percentile(p) for p in percentiles]
ts.append(ps)
fp = open('kidney_table.tex', 'w')
PrintTable(fp, xs, ts)
fp.close()
linewidths = [1, 2, 3, 2, 1]
alphas = [0.3, 0.5, 1, 0.5, 0.3]
labels = ['95th', '75th', '50th', '25th', '5th']
thinkplot.Clf()
yys = zip(*ts)
for ys, linewidth, alpha, label in zip(yys, linewidths, alphas, labels):
options = dict(color='blue', linewidth=linewidth,
alpha=alpha, label=label, markersize=2)
thinkplot.Plot(xs, ys, 'bo', **options)
fxs = [min_size, 20.0]
fys = FitLine(xs, ys, fxs)
thinkplot.Plot(fxs, fys, **options)
x, y = fxs[-1], fys[-1]
pyplot.text(x*1.05, y, label, color='blue',
horizontalalignment='left',
verticalalignment='center')
thinkplot.Save(root='kidney7',
formats=FORMATS,
title='Credible interval for age vs diameter',
xlabel='diameter (cm, log scale)',
ylabel='tumor age (years)',
xscale=xscale,
xticks=MakeTicks([0.5, 1, 2, 5, 10, 20]),
axis=[0.25, 35, 0, 45],
legend=False,
)
def PlotSequences(sequences):
"""Plots linear measurement vs time.
sequences: list of sequences of volumes
"""
thinkplot.Clf()
options = dict(color='gray', linewidth=1, linestyle='dashed')
thinkplot.Plot([0, 40], [10, 10], **options)
for seq in sequences:
n = len(seq)
age = n * INTERVAL
ts = numpy.linspace(0, age, n)
PlotSequence(ts, seq)
thinkplot.Save(root='kidney4',
formats=FORMATS,
axis=[0, 40, MINSIZE, 20],
title='Simulations of tumor growth',
xlabel='tumor age (years)',
yticks=MakeTicks([0.2, 0.5, 1, 2, 5, 10, 20]),
ylabel='diameter (cm, log scale)',
yscale='log')
def PlotSequence(ts, seq, color='blue'):
"""Plots a time series of linear measurements.
ts: sequence of times in years
seq: sequence of columes
color: color string
"""
options = dict(color=color, linewidth=1, alpha=0.2)
xs = [Diameter(v) for v in seq]
thinkplot.Plot(ts, xs, **options)
def PrintCI(fp, cm, ps):
"""Writes a line in the LaTeX table.
fp: file pointer
cm: diameter in cm
ts: tuples of percentiles
"""
fp.write('%0.1f' % round(cm, 1))
for p in reversed(ps):
fp.write(' & %0.1f ' % round(p, 1))
fp.write(r'\\' '\n')
def PrintTable(fp, xs, ts):
"""Writes the data in a LaTeX table.
fp: file pointer
xs: diameters in cm
ts: sequence of tuples of percentiles
"""
fp.write(r'\begin{tabular}{|r||r|r|r|r|r|}' '\n')
fp.write(r'\hline' '\n')
fp.write(r'Diameter & \multicolumn{5}{c|}{Percentiles of age} \\' '\n')
fp.write(r'(cm) & 5th & 25th & 50th & 75th & 95th \\' '\n')
fp.write(r'\hline' '\n')
for i, (cm, ps) in enumerate(zip(xs, ts)):
if i % 3 == 0:
PrintCI(fp, cm, ps)
fp.write(r'\hline' '\n')
fp.write(r'\end{tabular}' '\n')
def FitLine(xs, ys, fxs):
"""Fits a line to the xs and ys, and returns fitted values for fxs.
Applies a log transform to the xs.
xs: diameter in cm
ys: age in years
fxs: diameter in cm
"""
lxs = [math.log(x) for x in xs]
inter, slope = correlation.LeastSquares(lxs, ys)
lfxs = [math.log(x) for x in fxs]
fys = [inter + slope * x for x in lfxs]
return fys
def MakeTicks(xs):
"""Makes a pair of sequences for use as pyplot ticks.
xs: sequence of floats
Returns (xs, labels), where labels is a sequence of strings.
"""
labels = [str(x) for x in xs]
return xs, labels
def MakeLogTicks(xs):
"""Makes a pair of sequences for use as pyplot ticks.
xs: sequence of floats
Returns (xs, labels), where labels is a sequence of strings.
"""
lxs = [math.log10(x) for x in xs]
labels = [str(x) for x in xs]
return lxs, labels
def TestCorrelation(cdf):
"""Tests the correlated generator.
Makes sure that the sequence has the right distribution and correlation.
"""
n = 10000
rho = 0.4
rdt_seq = CorrelatedGenerator(cdf, rho)
xs = [rdt_seq.next() for _ in range(n)]
rho2 = correlation.SerialCorr(xs)
print rho, rho2
cdf2 = thinkbayes.MakeCdfFromList(xs)
thinkplot.Cdfs([cdf, cdf2])
thinkplot.Show()
def main(script):
for size in [1, 5, 10]:
bucket = CmToBucket(size)
print 'Size, bucket', size, bucket
SimpleModel()
random.seed(17)
cdf = MakeCdf()
lam1 = FitCdf(cdf)
fit = GenerateCdf(lam1=lam1)
PlotCdf(cdf)
calc = Calculator()
rho = 0.0
sequences = calc.MakeSequences(100, rho, fit)
PlotSequences(sequences)
calc.PlotBuckets()
_ = calc.MakeSequences(1900, rho, fit)
print 'V0-RDT correlation', calc.cache.Correlation()
print '15.5 Probability age > 8 year', calc.cache.ProbOlder(15.5, 8)
print '6.0 Probability age > 8 year', calc.cache.ProbOlder(6.0, 8)
calc.PlotConditionalCdfs()
calc.PlotCredibleIntervals(xscale='log')
calc.PlotJointDist()
if __name__ == '__main__':
main(*sys.argv)