Contact
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutSign UpSign In
| Download

📚 The CoCalc Library - books, templates and other resources

Views: 96160
License: OTHER
1
"""This file contains code for use with "Think Bayes",
2
by Allen B. Downey, available from greenteapress.com
3
4
Copyright 2012 Allen B. Downey
5
License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html
6
"""
7
8
"""This file contains class definitions for:
9
10
Hist: represents a histogram (map from values to integer frequencies).
11
12
Pmf: represents a probability mass function (map from values to probs).
13
14
_DictWrapper: private parent class for Hist and Pmf.
15
16
Cdf: represents a discrete cumulative distribution function
17
18
Pdf: represents a continuous probability density function
19
20
"""
21
22
import bisect
23
import copy
24
import logging
25
import math
26
import numpy
27
import random
28
29
import scipy.stats
30
from scipy.special import erf, erfinv, gammaln
31
32
ROOT2 = math.sqrt(2)
33
34
def RandomSeed(x):
35
"""Initialize the random and numpy.random generators.
36
37
x: int seed
38
"""
39
random.seed(x)
40
numpy.random.seed(x)
41
42
43
def Odds(p):
44
"""Computes odds for a given probability.
45
46
Example: p=0.75 means 75 for and 25 against, or 3:1 odds in favor.
47
48
Note: when p=1, the formula for odds divides by zero, which is
49
normally undefined. But I think it is reasonable to define Odds(1)
50
to be infinity, so that's what this function does.
51
52
p: float 0-1
53
54
Returns: float odds
55
"""
56
if p == 1:
57
return float('inf')
58
return p / (1 - p)
59
60
61
def Probability(o):
62
"""Computes the probability corresponding to given odds.
63
64
Example: o=2 means 2:1 odds in favor, or 2/3 probability
65
66
o: float odds, strictly positive
67
68
Returns: float probability
69
"""
70
return o / (o + 1)
71
72
73
def Probability2(yes, no):
74
"""Computes the probability corresponding to given odds.
75
76
Example: yes=2, no=1 means 2:1 odds in favor, or 2/3 probability.
77
78
yes, no: int or float odds in favor
79
"""
80
return float(yes) / (yes + no)
81
82
83
class Interpolator(object):
84
"""Represents a mapping between sorted sequences; performs linear interp.
85
86
Attributes:
87
xs: sorted list
88
ys: sorted list
89
"""
90
91
def __init__(self, xs, ys):
92
self.xs = xs
93
self.ys = ys
94
95
def Lookup(self, x):
96
"""Looks up x and returns the corresponding value of y."""
97
return self._Bisect(x, self.xs, self.ys)
98
99
def Reverse(self, y):
100
"""Looks up y and returns the corresponding value of x."""
101
return self._Bisect(y, self.ys, self.xs)
102
103
def _Bisect(self, x, xs, ys):
104
"""Helper function."""
105
if x <= xs[0]:
106
return ys[0]
107
if x >= xs[-1]:
108
return ys[-1]
109
i = bisect.bisect(xs, x)
110
frac = 1.0 * (x - xs[i - 1]) / (xs[i] - xs[i - 1])
111
y = ys[i - 1] + frac * 1.0 * (ys[i] - ys[i - 1])
112
return y
113
114
115
class _DictWrapper(object):
116
"""An object that contains a dictionary."""
117
118
def __init__(self, values=None, name=''):
119
"""Initializes the distribution.
120
121
hypos: sequence of hypotheses
122
"""
123
self.name = name
124
self.d = {}
125
126
# flag whether the distribution is under a log transform
127
self.log = False
128
129
if values is None:
130
return
131
132
init_methods = [
133
self.InitPmf,
134
self.InitMapping,
135
self.InitSequence,
136
self.InitFailure,
137
]
138
139
for method in init_methods:
140
try:
141
method(values)
142
break
143
except AttributeError:
144
continue
145
146
if len(self) > 0:
147
self.Normalize()
148
149
def InitSequence(self, values):
150
"""Initializes with a sequence of equally-likely values.
151
152
values: sequence of values
153
"""
154
for value in values:
155
self.Set(value, 1)
156
157
def InitMapping(self, values):
158
"""Initializes with a map from value to probability.
159
160
values: map from value to probability
161
"""
162
for value, prob in values.iteritems():
163
self.Set(value, prob)
164
165
def InitPmf(self, values):
166
"""Initializes with a Pmf.
167
168
values: Pmf object
169
"""
170
for value, prob in values.Items():
171
self.Set(value, prob)
172
173
def InitFailure(self, values):
174
"""Raises an error."""
175
raise ValueError('None of the initialization methods worked.')
176
177
def __len__(self):
178
return len(self.d)
179
180
def __iter__(self):
181
return iter(self.d)
182
183
def iterkeys(self):
184
return iter(self.d)
185
186
def __contains__(self, value):
187
return value in self.d
188
189
def Copy(self, name=None):
190
"""Returns a copy.
191
192
Make a shallow copy of d. If you want a deep copy of d,
193
use copy.deepcopy on the whole object.
194
195
Args:
196
name: string name for the new Hist
197
"""
198
new = copy.copy(self)
199
new.d = copy.copy(self.d)
200
new.name = name if name is not None else self.name
201
return new
202
203
def Scale(self, factor):
204
"""Multiplies the values by a factor.
205
206
factor: what to multiply by
207
208
Returns: new object
209
"""
210
new = self.Copy()
211
new.d.clear()
212
213
for val, prob in self.Items():
214
new.Set(val * factor, prob)
215
return new
216
217
def Log(self, m=None):
218
"""Log transforms the probabilities.
219
220
Removes values with probability 0.
221
222
Normalizes so that the largest logprob is 0.
223
"""
224
if self.log:
225
raise ValueError("Pmf/Hist already under a log transform")
226
self.log = True
227
228
if m is None:
229
m = self.MaxLike()
230
231
for x, p in self.d.iteritems():
232
if p:
233
self.Set(x, math.log(p / m))
234
else:
235
self.Remove(x)
236
237
def Exp(self, m=None):
238
"""Exponentiates the probabilities.
239
240
m: how much to shift the ps before exponentiating
241
242
If m is None, normalizes so that the largest prob is 1.
243
"""
244
if not self.log:
245
raise ValueError("Pmf/Hist not under a log transform")
246
self.log = False
247
248
if m is None:
249
m = self.MaxLike()
250
251
for x, p in self.d.iteritems():
252
self.Set(x, math.exp(p - m))
253
254
def GetDict(self):
255
"""Gets the dictionary."""
256
return self.d
257
258
def SetDict(self, d):
259
"""Sets the dictionary."""
260
self.d = d
261
262
def Values(self):
263
"""Gets an unsorted sequence of values.
264
265
Note: one source of confusion is that the keys of this
266
dictionary are the values of the Hist/Pmf, and the
267
values of the dictionary are frequencies/probabilities.
268
"""
269
return self.d.keys()
270
271
def Items(self):
272
"""Gets an unsorted sequence of (value, freq/prob) pairs."""
273
return self.d.items()
274
275
def Render(self):
276
"""Generates a sequence of points suitable for plotting.
277
278
Returns:
279
tuple of (sorted value sequence, freq/prob sequence)
280
"""
281
return zip(*sorted(self.Items()))
282
283
def Print(self):
284
"""Prints the values and freqs/probs in ascending order."""
285
for val, prob in sorted(self.d.iteritems()):
286
print val, prob
287
288
def Set(self, x, y=0):
289
"""Sets the freq/prob associated with the value x.
290
291
Args:
292
x: number value
293
y: number freq or prob
294
"""
295
self.d[x] = y
296
297
def Incr(self, x, term=1):
298
"""Increments the freq/prob associated with the value x.
299
300
Args:
301
x: number value
302
term: how much to increment by
303
"""
304
self.d[x] = self.d.get(x, 0) + term
305
306
def Mult(self, x, factor):
307
"""Scales the freq/prob associated with the value x.
308
309
Args:
310
x: number value
311
factor: how much to multiply by
312
"""
313
self.d[x] = self.d.get(x, 0) * factor
314
315
def Remove(self, x):
316
"""Removes a value.
317
318
Throws an exception if the value is not there.
319
320
Args:
321
x: value to remove
322
"""
323
del self.d[x]
324
325
def Total(self):
326
"""Returns the total of the frequencies/probabilities in the map."""
327
total = sum(self.d.itervalues())
328
return total
329
330
def MaxLike(self):
331
"""Returns the largest frequency/probability in the map."""
332
return max(self.d.itervalues())
333
334
335
class Hist(_DictWrapper):
336
"""Represents a histogram, which is a map from values to frequencies.
337
338
Values can be any hashable type; frequencies are integer counters.
339
"""
340
341
def Freq(self, x):
342
"""Gets the frequency associated with the value x.
343
344
Args:
345
x: number value
346
347
Returns:
348
int frequency
349
"""
350
return self.d.get(x, 0)
351
352
def Freqs(self, xs):
353
"""Gets frequencies for a sequence of values."""
354
return [self.Freq(x) for x in xs]
355
356
def IsSubset(self, other):
357
"""Checks whether the values in this histogram are a subset of
358
the values in the given histogram."""
359
for val, freq in self.Items():
360
if freq > other.Freq(val):
361
return False
362
return True
363
364
def Subtract(self, other):
365
"""Subtracts the values in the given histogram from this histogram."""
366
for val, freq in other.Items():
367
self.Incr(val, -freq)
368
369
370
class Pmf(_DictWrapper):
371
"""Represents a probability mass function.
372
373
Values can be any hashable type; probabilities are floating-point.
374
Pmfs are not necessarily normalized.
375
"""
376
377
def Prob(self, x, default=0):
378
"""Gets the probability associated with the value x.
379
380
Args:
381
x: number value
382
default: value to return if the key is not there
383
384
Returns:
385
float probability
386
"""
387
return self.d.get(x, default)
388
389
def Probs(self, xs):
390
"""Gets probabilities for a sequence of values."""
391
return [self.Prob(x) for x in xs]
392
393
def MakeCdf(self, name=None):
394
"""Makes a Cdf."""
395
return MakeCdfFromPmf(self, name=name)
396
397
def ProbGreater(self, x):
398
"""Probability that a sample from this Pmf exceeds x.
399
400
x: number
401
402
returns: float probability
403
"""
404
t = [prob for (val, prob) in self.d.iteritems() if val > x]
405
return sum(t)
406
407
def ProbLess(self, x):
408
"""Probability that a sample from this Pmf is less than x.
409
410
x: number
411
412
returns: float probability
413
"""
414
t = [prob for (val, prob) in self.d.iteritems() if val < x]
415
return sum(t)
416
417
def __lt__(self, obj):
418
"""Less than.
419
420
obj: number or _DictWrapper
421
422
returns: float probability
423
"""
424
if isinstance(obj, _DictWrapper):
425
return PmfProbLess(self, obj)
426
else:
427
return self.ProbLess(obj)
428
429
def __gt__(self, obj):
430
"""Greater than.
431
432
obj: number or _DictWrapper
433
434
returns: float probability
435
"""
436
if isinstance(obj, _DictWrapper):
437
return PmfProbGreater(self, obj)
438
else:
439
return self.ProbGreater(obj)
440
441
def __ge__(self, obj):
442
"""Greater than or equal.
443
444
obj: number or _DictWrapper
445
446
returns: float probability
447
"""
448
return 1 - (self < obj)
449
450
def __le__(self, obj):
451
"""Less than or equal.
452
453
obj: number or _DictWrapper
454
455
returns: float probability
456
"""
457
return 1 - (self > obj)
458
459
def __eq__(self, obj):
460
"""Equal to.
461
462
obj: number or _DictWrapper
463
464
returns: float probability
465
"""
466
if isinstance(obj, _DictWrapper):
467
return PmfProbEqual(self, obj)
468
else:
469
return self.Prob(obj)
470
471
def __ne__(self, obj):
472
"""Not equal to.
473
474
obj: number or _DictWrapper
475
476
returns: float probability
477
"""
478
return 1 - (self == obj)
479
480
def Normalize(self, fraction=1.0):
481
"""Normalizes this PMF so the sum of all probs is fraction.
482
483
Args:
484
fraction: what the total should be after normalization
485
486
Returns: the total probability before normalizing
487
"""
488
if self.log:
489
raise ValueError("Pmf is under a log transform")
490
491
total = self.Total()
492
if total == 0.0:
493
raise ValueError('total probability is zero.')
494
logging.warning('Normalize: total probability is zero.')
495
return total
496
497
factor = float(fraction) / total
498
for x in self.d:
499
self.d[x] *= factor
500
501
return total
502
503
def Random(self):
504
"""Chooses a random element from this PMF.
505
506
Returns:
507
float value from the Pmf
508
"""
509
if len(self.d) == 0:
510
raise ValueError('Pmf contains no values.')
511
512
target = random.random()
513
total = 0.0
514
for x, p in self.d.iteritems():
515
total += p
516
if total >= target:
517
return x
518
519
# we shouldn't get here
520
assert False
521
522
def Mean(self):
523
"""Computes the mean of a PMF.
524
525
Returns:
526
float mean
527
"""
528
mu = 0.0
529
for x, p in self.d.iteritems():
530
mu += p * x
531
return mu
532
533
def Var(self, mu=None):
534
"""Computes the variance of a PMF.
535
536
Args:
537
mu: the point around which the variance is computed;
538
if omitted, computes the mean
539
540
Returns:
541
float variance
542
"""
543
if mu is None:
544
mu = self.Mean()
545
546
var = 0.0
547
for x, p in self.d.iteritems():
548
var += p * (x - mu) ** 2
549
return var
550
551
def MaximumLikelihood(self):
552
"""Returns the value with the highest probability.
553
554
Returns: float probability
555
"""
556
prob, val = max((prob, val) for val, prob in self.Items())
557
return val
558
559
def CredibleInterval(self, percentage=90):
560
"""Computes the central credible interval.
561
562
If percentage=90, computes the 90% CI.
563
564
Args:
565
percentage: float between 0 and 100
566
567
Returns:
568
sequence of two floats, low and high
569
"""
570
cdf = self.MakeCdf()
571
return cdf.CredibleInterval(percentage)
572
573
def __add__(self, other):
574
"""Computes the Pmf of the sum of values drawn from self and other.
575
576
other: another Pmf
577
578
returns: new Pmf
579
"""
580
try:
581
return self.AddPmf(other)
582
except AttributeError:
583
return self.AddConstant(other)
584
585
def AddPmf(self, other):
586
"""Computes the Pmf of the sum of values drawn from self and other.
587
588
other: another Pmf
589
590
returns: new Pmf
591
"""
592
pmf = Pmf()
593
for v1, p1 in self.Items():
594
for v2, p2 in other.Items():
595
pmf.Incr(v1 + v2, p1 * p2)
596
return pmf
597
598
def AddConstant(self, other):
599
"""Computes the Pmf of the sum a constant and values from self.
600
601
other: a number
602
603
returns: new Pmf
604
"""
605
pmf = Pmf()
606
for v1, p1 in self.Items():
607
pmf.Set(v1 + other, p1)
608
return pmf
609
610
def __sub__(self, other):
611
"""Computes the Pmf of the diff of values drawn from self and other.
612
613
other: another Pmf
614
615
returns: new Pmf
616
"""
617
pmf = Pmf()
618
for v1, p1 in self.Items():
619
for v2, p2 in other.Items():
620
pmf.Incr(v1 - v2, p1 * p2)
621
return pmf
622
623
def Max(self, k):
624
"""Computes the CDF of the maximum of k selections from this dist.
625
626
k: int
627
628
returns: new Cdf
629
"""
630
cdf = self.MakeCdf()
631
cdf.ps = [p ** k for p in cdf.ps]
632
return cdf
633
634
635
class Joint(Pmf):
636
"""Represents a joint distribution.
637
638
The values are sequences (usually tuples)
639
"""
640
641
def Marginal(self, i, name=''):
642
"""Gets the marginal distribution of the indicated variable.
643
644
i: index of the variable we want
645
646
Returns: Pmf
647
"""
648
pmf = Pmf(name=name)
649
for vs, prob in self.Items():
650
pmf.Incr(vs[i], prob)
651
return pmf
652
653
def Conditional(self, i, j, val, name=''):
654
"""Gets the conditional distribution of the indicated variable.
655
656
Distribution of vs[i], conditioned on vs[j] = val.
657
658
i: index of the variable we want
659
j: which variable is conditioned on
660
val: the value the jth variable has to have
661
662
Returns: Pmf
663
"""
664
pmf = Pmf(name=name)
665
for vs, prob in self.Items():
666
if vs[j] != val: continue
667
pmf.Incr(vs[i], prob)
668
669
pmf.Normalize()
670
return pmf
671
672
def MaxLikeInterval(self, percentage=90):
673
"""Returns the maximum-likelihood credible interval.
674
675
If percentage=90, computes a 90% CI containing the values
676
with the highest likelihoods.
677
678
percentage: float between 0 and 100
679
680
Returns: list of values from the suite
681
"""
682
interval = []
683
total = 0
684
685
t = [(prob, val) for val, prob in self.Items()]
686
t.sort(reverse=True)
687
688
for prob, val in t:
689
interval.append(val)
690
total += prob
691
if total >= percentage / 100.0:
692
break
693
694
return interval
695
696
697
def MakeJoint(pmf1, pmf2):
698
"""Joint distribution of values from pmf1 and pmf2.
699
700
Args:
701
pmf1: Pmf object
702
pmf2: Pmf object
703
704
Returns:
705
Joint pmf of value pairs
706
"""
707
joint = Joint()
708
for v1, p1 in pmf1.Items():
709
for v2, p2 in pmf2.Items():
710
joint.Set((v1, v2), p1 * p2)
711
return joint
712
713
714
def MakeHistFromList(t, name=''):
715
"""Makes a histogram from an unsorted sequence of values.
716
717
Args:
718
t: sequence of numbers
719
name: string name for this histogram
720
721
Returns:
722
Hist object
723
"""
724
hist = Hist(name=name)
725
[hist.Incr(x) for x in t]
726
return hist
727
728
729
def MakeHistFromDict(d, name=''):
730
"""Makes a histogram from a map from values to frequencies.
731
732
Args:
733
d: dictionary that maps values to frequencies
734
name: string name for this histogram
735
736
Returns:
737
Hist object
738
"""
739
return Hist(d, name)
740
741
742
def MakePmfFromList(t, name=''):
743
"""Makes a PMF from an unsorted sequence of values.
744
745
Args:
746
t: sequence of numbers
747
name: string name for this PMF
748
749
Returns:
750
Pmf object
751
"""
752
hist = MakeHistFromList(t)
753
d = hist.GetDict()
754
pmf = Pmf(d, name)
755
pmf.Normalize()
756
return pmf
757
758
759
def MakePmfFromDict(d, name=''):
760
"""Makes a PMF from a map from values to probabilities.
761
762
Args:
763
d: dictionary that maps values to probabilities
764
name: string name for this PMF
765
766
Returns:
767
Pmf object
768
"""
769
pmf = Pmf(d, name)
770
pmf.Normalize()
771
return pmf
772
773
774
def MakePmfFromItems(t, name=''):
775
"""Makes a PMF from a sequence of value-probability pairs
776
777
Args:
778
t: sequence of value-probability pairs
779
name: string name for this PMF
780
781
Returns:
782
Pmf object
783
"""
784
pmf = Pmf(dict(t), name)
785
pmf.Normalize()
786
return pmf
787
788
789
def MakePmfFromHist(hist, name=None):
790
"""Makes a normalized PMF from a Hist object.
791
792
Args:
793
hist: Hist object
794
name: string name
795
796
Returns:
797
Pmf object
798
"""
799
if name is None:
800
name = hist.name
801
802
# make a copy of the dictionary
803
d = dict(hist.GetDict())
804
pmf = Pmf(d, name)
805
pmf.Normalize()
806
return pmf
807
808
809
def MakePmfFromCdf(cdf, name=None):
810
"""Makes a normalized Pmf from a Cdf object.
811
812
Args:
813
cdf: Cdf object
814
name: string name for the new Pmf
815
816
Returns:
817
Pmf object
818
"""
819
if name is None:
820
name = cdf.name
821
822
pmf = Pmf(name=name)
823
824
prev = 0.0
825
for val, prob in cdf.Items():
826
pmf.Incr(val, prob - prev)
827
prev = prob
828
829
return pmf
830
831
832
def MakeMixture(metapmf, name='mix'):
833
"""Make a mixture distribution.
834
835
Args:
836
metapmf: Pmf that maps from Pmfs to probs.
837
name: string name for the new Pmf.
838
839
Returns: Pmf object.
840
"""
841
mix = Pmf(name=name)
842
for pmf, p1 in metapmf.Items():
843
for x, p2 in pmf.Items():
844
mix.Incr(x, p1 * p2)
845
return mix
846
847
848
def MakeUniformPmf(low, high, n):
849
"""Make a uniform Pmf.
850
851
low: lowest value (inclusive)
852
high: highest value (inclusize)
853
n: number of values
854
"""
855
pmf = Pmf()
856
for x in numpy.linspace(low, high, n):
857
pmf.Set(x, 1)
858
pmf.Normalize()
859
return pmf
860
861
862
class Cdf(object):
863
"""Represents a cumulative distribution function.
864
865
Attributes:
866
xs: sequence of values
867
ps: sequence of probabilities
868
name: string used as a graph label.
869
"""
870
871
def __init__(self, xs=None, ps=None, name=''):
872
self.xs = [] if xs is None else xs
873
self.ps = [] if ps is None else ps
874
self.name = name
875
876
def Copy(self, name=None):
877
"""Returns a copy of this Cdf.
878
879
Args:
880
name: string name for the new Cdf
881
"""
882
if name is None:
883
name = self.name
884
return Cdf(list(self.xs), list(self.ps), name)
885
886
def MakePmf(self, name=None):
887
"""Makes a Pmf."""
888
return MakePmfFromCdf(self, name=name)
889
890
def Values(self):
891
"""Returns a sorted list of values.
892
"""
893
return self.xs
894
895
def Items(self):
896
"""Returns a sorted sequence of (value, probability) pairs.
897
898
Note: in Python3, returns an iterator.
899
"""
900
return zip(self.xs, self.ps)
901
902
def Append(self, x, p):
903
"""Add an (x, p) pair to the end of this CDF.
904
905
Note: this us normally used to build a CDF from scratch, not
906
to modify existing CDFs. It is up to the caller to make sure
907
that the result is a legal CDF.
908
"""
909
self.xs.append(x)
910
self.ps.append(p)
911
912
def Shift(self, term):
913
"""Adds a term to the xs.
914
915
term: how much to add
916
"""
917
new = self.Copy()
918
new.xs = [x + term for x in self.xs]
919
return new
920
921
def Scale(self, factor):
922
"""Multiplies the xs by a factor.
923
924
factor: what to multiply by
925
"""
926
new = self.Copy()
927
new.xs = [x * factor for x in self.xs]
928
return new
929
930
def Prob(self, x):
931
"""Returns CDF(x), the probability that corresponds to value x.
932
933
Args:
934
x: number
935
936
Returns:
937
float probability
938
"""
939
if x < self.xs[0]: return 0.0
940
index = bisect.bisect(self.xs, x)
941
p = self.ps[index - 1]
942
return p
943
944
def Value(self, p):
945
"""Returns InverseCDF(p), the value that corresponds to probability p.
946
947
Args:
948
p: number in the range [0, 1]
949
950
Returns:
951
number value
952
"""
953
if p < 0 or p > 1:
954
raise ValueError('Probability p must be in range [0, 1]')
955
956
if p == 0: return self.xs[0]
957
if p == 1: return self.xs[-1]
958
index = bisect.bisect(self.ps, p)
959
if p == self.ps[index - 1]:
960
return self.xs[index - 1]
961
else:
962
return self.xs[index]
963
964
def Percentile(self, p):
965
"""Returns the value that corresponds to percentile p.
966
967
Args:
968
p: number in the range [0, 100]
969
970
Returns:
971
number value
972
"""
973
return self.Value(p / 100.0)
974
975
def Random(self):
976
"""Chooses a random value from this distribution."""
977
return self.Value(random.random())
978
979
def Sample(self, n):
980
"""Generates a random sample from this distribution.
981
982
Args:
983
n: int length of the sample
984
"""
985
return [self.Random() for i in range(n)]
986
987
def Mean(self):
988
"""Computes the mean of a CDF.
989
990
Returns:
991
float mean
992
"""
993
old_p = 0
994
total = 0.0
995
for x, new_p in zip(self.xs, self.ps):
996
p = new_p - old_p
997
total += p * x
998
old_p = new_p
999
return total
1000
1001
def CredibleInterval(self, percentage=90):
1002
"""Computes the central credible interval.
1003
1004
If percentage=90, computes the 90% CI.
1005
1006
Args:
1007
percentage: float between 0 and 100
1008
1009
Returns:
1010
sequence of two floats, low and high
1011
"""
1012
prob = (1 - percentage / 100.0) / 2
1013
interval = self.Value(prob), self.Value(1 - prob)
1014
return interval
1015
1016
def _Round(self, multiplier=1000.0):
1017
"""
1018
An entry is added to the cdf only if the percentile differs
1019
from the previous value in a significant digit, where the number
1020
of significant digits is determined by multiplier. The
1021
default is 1000, which keeps log10(1000) = 3 significant digits.
1022
"""
1023
# TODO(write this method)
1024
raise UnimplementedMethodException()
1025
1026
def Render(self):
1027
"""Generates a sequence of points suitable for plotting.
1028
1029
An empirical CDF is a step function; linear interpolation
1030
can be misleading.
1031
1032
Returns:
1033
tuple of (xs, ps)
1034
"""
1035
xs = [self.xs[0]]
1036
ps = [0.0]
1037
for i, p in enumerate(self.ps):
1038
xs.append(self.xs[i])
1039
ps.append(p)
1040
1041
try:
1042
xs.append(self.xs[i + 1])
1043
ps.append(p)
1044
except IndexError:
1045
pass
1046
return xs, ps
1047
1048
def Max(self, k):
1049
"""Computes the CDF of the maximum of k selections from this dist.
1050
1051
k: int
1052
1053
returns: new Cdf
1054
"""
1055
cdf = self.Copy()
1056
cdf.ps = [p ** k for p in cdf.ps]
1057
return cdf
1058
1059
1060
def MakeCdfFromItems(items, name=''):
1061
"""Makes a cdf from an unsorted sequence of (value, frequency) pairs.
1062
1063
Args:
1064
items: unsorted sequence of (value, frequency) pairs
1065
name: string name for this CDF
1066
1067
Returns:
1068
cdf: list of (value, fraction) pairs
1069
"""
1070
runsum = 0
1071
xs = []
1072
cs = []
1073
1074
for value, count in sorted(items):
1075
runsum += count
1076
xs.append(value)
1077
cs.append(runsum)
1078
1079
total = float(runsum)
1080
ps = [c / total for c in cs]
1081
1082
cdf = Cdf(xs, ps, name)
1083
return cdf
1084
1085
1086
def MakeCdfFromDict(d, name=''):
1087
"""Makes a CDF from a dictionary that maps values to frequencies.
1088
1089
Args:
1090
d: dictionary that maps values to frequencies.
1091
name: string name for the data.
1092
1093
Returns:
1094
Cdf object
1095
"""
1096
return MakeCdfFromItems(d.iteritems(), name)
1097
1098
1099
def MakeCdfFromHist(hist, name=''):
1100
"""Makes a CDF from a Hist object.
1101
1102
Args:
1103
hist: Pmf.Hist object
1104
name: string name for the data.
1105
1106
Returns:
1107
Cdf object
1108
"""
1109
return MakeCdfFromItems(hist.Items(), name)
1110
1111
1112
def MakeCdfFromPmf(pmf, name=None):
1113
"""Makes a CDF from a Pmf object.
1114
1115
Args:
1116
pmf: Pmf.Pmf object
1117
name: string name for the data.
1118
1119
Returns:
1120
Cdf object
1121
"""
1122
if name == None:
1123
name = pmf.name
1124
return MakeCdfFromItems(pmf.Items(), name)
1125
1126
1127
def MakeCdfFromList(seq, name=''):
1128
"""Creates a CDF from an unsorted sequence.
1129
1130
Args:
1131
seq: unsorted sequence of sortable values
1132
name: string name for the cdf
1133
1134
Returns:
1135
Cdf object
1136
"""
1137
hist = MakeHistFromList(seq)
1138
return MakeCdfFromHist(hist, name)
1139
1140
1141
class UnimplementedMethodException(Exception):
1142
"""Exception if someone calls a method that should be overridden."""
1143
1144
1145
class Suite(Pmf):
1146
"""Represents a suite of hypotheses and their probabilities."""
1147
1148
def Update(self, data):
1149
"""Updates each hypothesis based on the data.
1150
1151
data: any representation of the data
1152
1153
returns: the normalizing constant
1154
"""
1155
for hypo in self.Values():
1156
like = self.Likelihood(data, hypo)
1157
self.Mult(hypo, like)
1158
return self.Normalize()
1159
1160
def LogUpdate(self, data):
1161
"""Updates a suite of hypotheses based on new data.
1162
1163
Modifies the suite directly; if you want to keep the original, make
1164
a copy.
1165
1166
Note: unlike Update, LogUpdate does not normalize.
1167
1168
Args:
1169
data: any representation of the data
1170
"""
1171
for hypo in self.Values():
1172
like = self.LogLikelihood(data, hypo)
1173
self.Incr(hypo, like)
1174
1175
def UpdateSet(self, dataset):
1176
"""Updates each hypothesis based on the dataset.
1177
1178
This is more efficient than calling Update repeatedly because
1179
it waits until the end to Normalize.
1180
1181
Modifies the suite directly; if you want to keep the original, make
1182
a copy.
1183
1184
dataset: a sequence of data
1185
1186
returns: the normalizing constant
1187
"""
1188
for data in dataset:
1189
for hypo in self.Values():
1190
like = self.Likelihood(data, hypo)
1191
self.Mult(hypo, like)
1192
return self.Normalize()
1193
1194
def LogUpdateSet(self, dataset):
1195
"""Updates each hypothesis based on the dataset.
1196
1197
Modifies the suite directly; if you want to keep the original, make
1198
a copy.
1199
1200
dataset: a sequence of data
1201
1202
returns: None
1203
"""
1204
for data in dataset:
1205
self.LogUpdate(data)
1206
1207
def Likelihood(self, data, hypo):
1208
"""Computes the likelihood of the data under the hypothesis.
1209
1210
hypo: some representation of the hypothesis
1211
data: some representation of the data
1212
"""
1213
raise UnimplementedMethodException()
1214
1215
def LogLikelihood(self, data, hypo):
1216
"""Computes the log likelihood of the data under the hypothesis.
1217
1218
hypo: some representation of the hypothesis
1219
data: some representation of the data
1220
"""
1221
raise UnimplementedMethodException()
1222
1223
def Print(self):
1224
"""Prints the hypotheses and their probabilities."""
1225
for hypo, prob in sorted(self.Items()):
1226
print hypo, prob
1227
1228
def MakeOdds(self):
1229
"""Transforms from probabilities to odds.
1230
1231
Values with prob=0 are removed.
1232
"""
1233
for hypo, prob in self.Items():
1234
if prob:
1235
self.Set(hypo, Odds(prob))
1236
else:
1237
self.Remove(hypo)
1238
1239
def MakeProbs(self):
1240
"""Transforms from odds to probabilities."""
1241
for hypo, odds in self.Items():
1242
self.Set(hypo, Probability(odds))
1243
1244
1245
def MakeSuiteFromList(t, name=''):
1246
"""Makes a suite from an unsorted sequence of values.
1247
1248
Args:
1249
t: sequence of numbers
1250
name: string name for this suite
1251
1252
Returns:
1253
Suite object
1254
"""
1255
hist = MakeHistFromList(t)
1256
d = hist.GetDict()
1257
return MakeSuiteFromDict(d)
1258
1259
1260
def MakeSuiteFromHist(hist, name=None):
1261
"""Makes a normalized suite from a Hist object.
1262
1263
Args:
1264
hist: Hist object
1265
name: string name
1266
1267
Returns:
1268
Suite object
1269
"""
1270
if name is None:
1271
name = hist.name
1272
1273
# make a copy of the dictionary
1274
d = dict(hist.GetDict())
1275
return MakeSuiteFromDict(d, name)
1276
1277
1278
def MakeSuiteFromDict(d, name=''):
1279
"""Makes a suite from a map from values to probabilities.
1280
1281
Args:
1282
d: dictionary that maps values to probabilities
1283
name: string name for this suite
1284
1285
Returns:
1286
Suite object
1287
"""
1288
suite = Suite(name=name)
1289
suite.SetDict(d)
1290
suite.Normalize()
1291
return suite
1292
1293
1294
def MakeSuiteFromCdf(cdf, name=None):
1295
"""Makes a normalized Suite from a Cdf object.
1296
1297
Args:
1298
cdf: Cdf object
1299
name: string name for the new Suite
1300
1301
Returns:
1302
Suite object
1303
"""
1304
if name is None:
1305
name = cdf.name
1306
1307
suite = Suite(name=name)
1308
1309
prev = 0.0
1310
for val, prob in cdf.Items():
1311
suite.Incr(val, prob - prev)
1312
prev = prob
1313
1314
return suite
1315
1316
1317
class Pdf(object):
1318
"""Represents a probability density function (PDF)."""
1319
1320
def Density(self, x):
1321
"""Evaluates this Pdf at x.
1322
1323
Returns: float probability density
1324
"""
1325
raise UnimplementedMethodException()
1326
1327
def MakePmf(self, xs, name=''):
1328
"""Makes a discrete version of this Pdf, evaluated at xs.
1329
1330
xs: equally-spaced sequence of values
1331
1332
Returns: new Pmf
1333
"""
1334
pmf = Pmf(name=name)
1335
for x in xs:
1336
pmf.Set(x, self.Density(x))
1337
pmf.Normalize()
1338
return pmf
1339
1340
1341
class GaussianPdf(Pdf):
1342
"""Represents the PDF of a Gaussian distribution."""
1343
1344
def __init__(self, mu, sigma):
1345
"""Constructs a Gaussian Pdf with given mu and sigma.
1346
1347
mu: mean
1348
sigma: standard deviation
1349
"""
1350
self.mu = mu
1351
self.sigma = sigma
1352
1353
def Density(self, x):
1354
"""Evaluates this Pdf at x.
1355
1356
Returns: float probability density
1357
"""
1358
return EvalGaussianPdf(x, self.mu, self.sigma)
1359
1360
1361
class EstimatedPdf(Pdf):
1362
"""Represents a PDF estimated by KDE."""
1363
1364
def __init__(self, sample):
1365
"""Estimates the density function based on a sample.
1366
1367
sample: sequence of data
1368
"""
1369
self.kde = scipy.stats.gaussian_kde(sample)
1370
1371
def Density(self, x):
1372
"""Evaluates this Pdf at x.
1373
1374
Returns: float probability density
1375
"""
1376
return self.kde.evaluate(x)
1377
1378
def MakePmf(self, xs, name=''):
1379
ps = self.kde.evaluate(xs)
1380
pmf = MakePmfFromItems(zip(xs, ps), name=name)
1381
return pmf
1382
1383
1384
def Percentile(pmf, percentage):
1385
"""Computes a percentile of a given Pmf.
1386
1387
percentage: float 0-100
1388
"""
1389
p = percentage / 100.0
1390
total = 0
1391
for val, prob in pmf.Items():
1392
total += prob
1393
if total >= p:
1394
return val
1395
1396
1397
def CredibleInterval(pmf, percentage=90):
1398
"""Computes a credible interval for a given distribution.
1399
1400
If percentage=90, computes the 90% CI.
1401
1402
Args:
1403
pmf: Pmf object representing a posterior distribution
1404
percentage: float between 0 and 100
1405
1406
Returns:
1407
sequence of two floats, low and high
1408
"""
1409
cdf = pmf.MakeCdf()
1410
prob = (1 - percentage / 100.0) / 2
1411
interval = cdf.Value(prob), cdf.Value(1 - prob)
1412
return interval
1413
1414
1415
def PmfProbLess(pmf1, pmf2):
1416
"""Probability that a value from pmf1 is less than a value from pmf2.
1417
1418
Args:
1419
pmf1: Pmf object
1420
pmf2: Pmf object
1421
1422
Returns:
1423
float probability
1424
"""
1425
total = 0.0
1426
for v1, p1 in pmf1.Items():
1427
for v2, p2 in pmf2.Items():
1428
if v1 < v2:
1429
total += p1 * p2
1430
return total
1431
1432
1433
def PmfProbGreater(pmf1, pmf2):
1434
"""Probability that a value from pmf1 is less than a value from pmf2.
1435
1436
Args:
1437
pmf1: Pmf object
1438
pmf2: Pmf object
1439
1440
Returns:
1441
float probability
1442
"""
1443
total = 0.0
1444
for v1, p1 in pmf1.Items():
1445
for v2, p2 in pmf2.Items():
1446
if v1 > v2:
1447
total += p1 * p2
1448
return total
1449
1450
1451
def PmfProbEqual(pmf1, pmf2):
1452
"""Probability that a value from pmf1 equals a value from pmf2.
1453
1454
Args:
1455
pmf1: Pmf object
1456
pmf2: Pmf object
1457
1458
Returns:
1459
float probability
1460
"""
1461
total = 0.0
1462
for v1, p1 in pmf1.Items():
1463
for v2, p2 in pmf2.Items():
1464
if v1 == v2:
1465
total += p1 * p2
1466
return total
1467
1468
1469
def RandomSum(dists):
1470
"""Chooses a random value from each dist and returns the sum.
1471
1472
dists: sequence of Pmf or Cdf objects
1473
1474
returns: numerical sum
1475
"""
1476
total = sum(dist.Random() for dist in dists)
1477
return total
1478
1479
1480
def SampleSum(dists, n):
1481
"""Draws a sample of sums from a list of distributions.
1482
1483
dists: sequence of Pmf or Cdf objects
1484
n: sample size
1485
1486
returns: new Pmf of sums
1487
"""
1488
pmf = MakePmfFromList(RandomSum(dists) for i in xrange(n))
1489
return pmf
1490
1491
1492
def EvalGaussianPdf(x, mu, sigma):
1493
"""Computes the unnormalized PDF of the normal distribution.
1494
1495
x: value
1496
mu: mean
1497
sigma: standard deviation
1498
1499
returns: float probability density
1500
"""
1501
return scipy.stats.norm.pdf(x, mu, sigma)
1502
1503
1504
def MakeGaussianPmf(mu, sigma, num_sigmas, n=201):
1505
"""Makes a PMF discrete approx to a Gaussian distribution.
1506
1507
mu: float mean
1508
sigma: float standard deviation
1509
num_sigmas: how many sigmas to extend in each direction
1510
n: number of values in the Pmf
1511
1512
returns: normalized Pmf
1513
"""
1514
pmf = Pmf()
1515
low = mu - num_sigmas * sigma
1516
high = mu + num_sigmas * sigma
1517
1518
for x in numpy.linspace(low, high, n):
1519
p = EvalGaussianPdf(x, mu, sigma)
1520
pmf.Set(x, p)
1521
pmf.Normalize()
1522
return pmf
1523
1524
1525
def EvalBinomialPmf(k, n, p):
1526
"""Evaluates the binomial pmf.
1527
1528
Returns the probabily of k successes in n trials with probability p.
1529
"""
1530
return scipy.stats.binom.pmf(k, n, p)
1531
1532
1533
def EvalPoissonPmf(k, lam):
1534
"""Computes the Poisson PMF.
1535
1536
k: number of events
1537
lam: parameter lambda in events per unit time
1538
1539
returns: float probability
1540
"""
1541
return scipy.stats.poisson.pmf(k, lam)
1542
1543
1544
def MakePoissonPmf(lam, high, step=1):
1545
"""Makes a PMF discrete approx to a Poisson distribution.
1546
1547
lam: parameter lambda in events per unit time
1548
high: upper bound of the Pmf
1549
1550
returns: normalized Pmf
1551
"""
1552
pmf = Pmf()
1553
for k in xrange(0, high + 1, step):
1554
p = EvalPoissonPmf(k, lam)
1555
pmf.Set(k, p)
1556
pmf.Normalize()
1557
return pmf
1558
1559
1560
def EvalExponentialPdf(x, lam):
1561
"""Computes the exponential PDF.
1562
1563
x: value
1564
lam: parameter lambda in events per unit time
1565
1566
returns: float probability density
1567
"""
1568
return lam * math.exp(-lam * x)
1569
1570
1571
def EvalExponentialCdf(x, lam):
1572
"""Evaluates CDF of the exponential distribution with parameter lam."""
1573
return 1 - math.exp(-lam * x)
1574
1575
1576
def MakeExponentialPmf(lam, high, n=200):
1577
"""Makes a PMF discrete approx to an exponential distribution.
1578
1579
lam: parameter lambda in events per unit time
1580
high: upper bound
1581
n: number of values in the Pmf
1582
1583
returns: normalized Pmf
1584
"""
1585
pmf = Pmf()
1586
for x in numpy.linspace(0, high, n):
1587
p = EvalExponentialPdf(x, lam)
1588
pmf.Set(x, p)
1589
pmf.Normalize()
1590
return pmf
1591
1592
1593
def StandardGaussianCdf(x):
1594
"""Evaluates the CDF of the standard Gaussian distribution.
1595
1596
See http://en.wikipedia.org/wiki/Normal_distribution
1597
#Cumulative_distribution_function
1598
1599
Args:
1600
x: float
1601
1602
Returns:
1603
float
1604
"""
1605
return (erf(x / ROOT2) + 1) / 2
1606
1607
1608
def GaussianCdf(x, mu=0, sigma=1):
1609
"""Evaluates the CDF of the gaussian distribution.
1610
1611
Args:
1612
x: float
1613
1614
mu: mean parameter
1615
1616
sigma: standard deviation parameter
1617
1618
Returns:
1619
float
1620
"""
1621
return StandardGaussianCdf(float(x - mu) / sigma)
1622
1623
1624
def GaussianCdfInverse(p, mu=0, sigma=1):
1625
"""Evaluates the inverse CDF of the gaussian distribution.
1626
1627
See http://en.wikipedia.org/wiki/Normal_distribution#Quantile_function
1628
1629
Args:
1630
p: float
1631
1632
mu: mean parameter
1633
1634
sigma: standard deviation parameter
1635
1636
Returns:
1637
float
1638
"""
1639
x = ROOT2 * erfinv(2 * p - 1)
1640
return mu + x * sigma
1641
1642
1643
class Beta(object):
1644
"""Represents a Beta distribution.
1645
1646
See http://en.wikipedia.org/wiki/Beta_distribution
1647
"""
1648
def __init__(self, alpha=1, beta=1, name=''):
1649
"""Initializes a Beta distribution."""
1650
self.alpha = alpha
1651
self.beta = beta
1652
self.name = name
1653
1654
def Update(self, data):
1655
"""Updates a Beta distribution.
1656
1657
data: pair of int (heads, tails)
1658
"""
1659
heads, tails = data
1660
self.alpha += heads
1661
self.beta += tails
1662
1663
def Mean(self):
1664
"""Computes the mean of this distribution."""
1665
return float(self.alpha) / (self.alpha + self.beta)
1666
1667
def Random(self):
1668
"""Generates a random variate from this distribution."""
1669
return random.betavariate(self.alpha, self.beta)
1670
1671
def Sample(self, n):
1672
"""Generates a random sample from this distribution.
1673
1674
n: int sample size
1675
"""
1676
size = n,
1677
return numpy.random.beta(self.alpha, self.beta, size)
1678
1679
def EvalPdf(self, x):
1680
"""Evaluates the PDF at x."""
1681
return x ** (self.alpha - 1) * (1 - x) ** (self.beta - 1)
1682
1683
def MakePmf(self, steps=101, name=''):
1684
"""Returns a Pmf of this distribution.
1685
1686
Note: Normally, we just evaluate the PDF at a sequence
1687
of points and treat the probability density as a probability
1688
mass.
1689
1690
But if alpha or beta is less than one, we have to be
1691
more careful because the PDF goes to infinity at x=0
1692
and x=1. In that case we evaluate the CDF and compute
1693
differences.
1694
"""
1695
if self.alpha < 1 or self.beta < 1:
1696
cdf = self.MakeCdf()
1697
pmf = cdf.MakePmf()
1698
return pmf
1699
1700
xs = [i / (steps - 1.0) for i in xrange(steps)]
1701
probs = [self.EvalPdf(x) for x in xs]
1702
pmf = MakePmfFromDict(dict(zip(xs, probs)), name)
1703
return pmf
1704
1705
def MakeCdf(self, steps=101):
1706
"""Returns the CDF of this distribution."""
1707
xs = [i / (steps - 1.0) for i in xrange(steps)]
1708
ps = [scipy.special.betainc(self.alpha, self.beta, x) for x in xs]
1709
cdf = Cdf(xs, ps)
1710
return cdf
1711
1712
1713
class Dirichlet(object):
1714
"""Represents a Dirichlet distribution.
1715
1716
See http://en.wikipedia.org/wiki/Dirichlet_distribution
1717
"""
1718
1719
def __init__(self, n, conc=1, name=''):
1720
"""Initializes a Dirichlet distribution.
1721
1722
n: number of dimensions
1723
conc: concentration parameter (smaller yields more concentration)
1724
name: string name
1725
"""
1726
if n < 2:
1727
raise ValueError('A Dirichlet distribution with '
1728
'n<2 makes no sense')
1729
1730
self.n = n
1731
self.params = numpy.ones(n, dtype=numpy.float) * conc
1732
self.name = name
1733
1734
def Update(self, data):
1735
"""Updates a Dirichlet distribution.
1736
1737
data: sequence of observations, in order corresponding to params
1738
"""
1739
m = len(data)
1740
self.params[:m] += data
1741
1742
def Random(self):
1743
"""Generates a random variate from this distribution.
1744
1745
Returns: normalized vector of fractions
1746
"""
1747
p = numpy.random.gamma(self.params)
1748
return p / p.sum()
1749
1750
def Likelihood(self, data):
1751
"""Computes the likelihood of the data.
1752
1753
Selects a random vector of probabilities from this distribution.
1754
1755
Returns: float probability
1756
"""
1757
m = len(data)
1758
if self.n < m:
1759
return 0
1760
1761
x = data
1762
p = self.Random()
1763
q = p[:m] ** x
1764
return q.prod()
1765
1766
def LogLikelihood(self, data):
1767
"""Computes the log likelihood of the data.
1768
1769
Selects a random vector of probabilities from this distribution.
1770
1771
Returns: float log probability
1772
"""
1773
m = len(data)
1774
if self.n < m:
1775
return float('-inf')
1776
1777
x = self.Random()
1778
y = numpy.log(x[:m]) * data
1779
return y.sum()
1780
1781
def MarginalBeta(self, i):
1782
"""Computes the marginal distribution of the ith element.
1783
1784
See http://en.wikipedia.org/wiki/Dirichlet_distribution
1785
#Marginal_distributions
1786
1787
i: int
1788
1789
Returns: Beta object
1790
"""
1791
alpha0 = self.params.sum()
1792
alpha = self.params[i]
1793
return Beta(alpha, alpha0 - alpha)
1794
1795
def PredictivePmf(self, xs, name=''):
1796
"""Makes a predictive distribution.
1797
1798
xs: values to go into the Pmf
1799
1800
Returns: Pmf that maps from x to the mean prevalence of x
1801
"""
1802
alpha0 = self.params.sum()
1803
ps = self.params / alpha0
1804
return MakePmfFromItems(zip(xs, ps), name=name)
1805
1806
1807
def BinomialCoef(n, k):
1808
"""Compute the binomial coefficient "n choose k".
1809
1810
n: number of trials
1811
k: number of successes
1812
1813
Returns: float
1814
"""
1815
return scipy.misc.comb(n, k)
1816
1817
1818
def LogBinomialCoef(n, k):
1819
"""Computes the log of the binomial coefficient.
1820
1821
http://math.stackexchange.com/questions/64716/
1822
approximating-the-logarithm-of-the-binomial-coefficient
1823
1824
n: number of trials
1825
k: number of successes
1826
1827
Returns: float
1828
"""
1829
return n * log(n) - k * log(k) - (n - k) * log(n - k)
1830
1831
1832
1833