Contact
CoCalc Logo Icon
StoreFeaturesDocsShareSupport News AboutSign UpSign In
| Download

Think Stats by Allen B. Downey Think Stats is an introduction to Probability and Statistics for Python programmers.

This is the accompanying code for this book.

Website: http://greenteapress.com/wp/think-stats-2e/

Views: 7089
License: GPL3
1
"""This file contains code for use with "Think Stats" and
2
"Think Bayes", both by Allen B. Downey, available from greenteapress.com
3
4
Copyright 2014 Allen B. Downey
5
License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html
6
"""
7
8
from __future__ import print_function, division
9
10
"""This file contains class definitions for:
11
12
Hist: represents a histogram (map from values to integer frequencies).
13
14
Pmf: represents a probability mass function (map from values to probs).
15
16
_DictWrapper: private parent class for Hist and Pmf.
17
18
Cdf: represents a discrete cumulative distribution function
19
20
Pdf: represents a continuous probability density function
21
22
"""
23
24
import bisect
25
import copy
26
import logging
27
import math
28
import random
29
import re
30
31
from collections import Counter
32
from operator import itemgetter
33
34
import thinkplot
35
36
import numpy as np
37
import pandas
38
39
import scipy
40
from scipy import stats
41
from scipy import special
42
from scipy import ndimage
43
44
from scipy.special import gamma
45
46
from io import open
47
48
ROOT2 = math.sqrt(2)
49
50
def RandomSeed(x):
51
"""Initialize the random and np.random generators.
52
53
x: int seed
54
"""
55
random.seed(x)
56
np.random.seed(x)
57
58
59
def Odds(p):
60
"""Computes odds for a given probability.
61
62
Example: p=0.75 means 75 for and 25 against, or 3:1 odds in favor.
63
64
Note: when p=1, the formula for odds divides by zero, which is
65
normally undefined. But I think it is reasonable to define Odds(1)
66
to be infinity, so that's what this function does.
67
68
p: float 0-1
69
70
Returns: float odds
71
"""
72
if p == 1:
73
return float('inf')
74
return p / (1 - p)
75
76
77
def Probability(o):
78
"""Computes the probability corresponding to given odds.
79
80
Example: o=2 means 2:1 odds in favor, or 2/3 probability
81
82
o: float odds, strictly positive
83
84
Returns: float probability
85
"""
86
return o / (o + 1)
87
88
89
def Probability2(yes, no):
90
"""Computes the probability corresponding to given odds.
91
92
Example: yes=2, no=1 means 2:1 odds in favor, or 2/3 probability.
93
94
yes, no: int or float odds in favor
95
"""
96
return yes / (yes + no)
97
98
99
class Interpolator(object):
100
"""Represents a mapping between sorted sequences; performs linear interp.
101
102
Attributes:
103
xs: sorted list
104
ys: sorted list
105
"""
106
107
def __init__(self, xs, ys):
108
self.xs = xs
109
self.ys = ys
110
111
def Lookup(self, x):
112
"""Looks up x and returns the corresponding value of y."""
113
return self._Bisect(x, self.xs, self.ys)
114
115
def Reverse(self, y):
116
"""Looks up y and returns the corresponding value of x."""
117
return self._Bisect(y, self.ys, self.xs)
118
119
def _Bisect(self, x, xs, ys):
120
"""Helper function."""
121
if x <= xs[0]:
122
return ys[0]
123
if x >= xs[-1]:
124
return ys[-1]
125
i = bisect.bisect(xs, x)
126
frac = 1.0 * (x - xs[i - 1]) / (xs[i] - xs[i - 1])
127
y = ys[i - 1] + frac * 1.0 * (ys[i] - ys[i - 1])
128
return y
129
130
131
# When we plot Hist, Pmf and Cdf objects, they don't appear in
132
# the legend unless we override the default label.
133
DEFAULT_LABEL = '_nolegend_'
134
135
136
class _DictWrapper(object):
137
"""An object that contains a dictionary."""
138
139
def __init__(self, obj=None, label=None):
140
"""Initializes the distribution.
141
142
obj: Hist, Pmf, Cdf, Pdf, dict, pandas Series, list of pairs
143
label: string label
144
"""
145
self.label = label if label is not None else DEFAULT_LABEL
146
self.d = {}
147
148
# flag whether the distribution is under a log transform
149
self.log = False
150
151
if obj is None:
152
return
153
154
if isinstance(obj, (_DictWrapper, Cdf, Pdf)):
155
self.label = label if label is not None else obj.label
156
157
if isinstance(obj, dict):
158
self.d.update(obj.items())
159
elif isinstance(obj, (_DictWrapper, Cdf, Pdf)):
160
self.d.update(obj.Items())
161
elif isinstance(obj, pandas.Series):
162
self.d.update(obj.value_counts().iteritems())
163
else:
164
# finally, treat it like a list
165
self.d.update(Counter(obj))
166
167
if len(self) > 0 and isinstance(self, Pmf):
168
self.Normalize()
169
170
def __hash__(self):
171
return id(self)
172
173
def __str__(self):
174
cls = self.__class__.__name__
175
if self.label == DEFAULT_LABEL:
176
return '%s(%s)' % (cls, str(self.d))
177
else:
178
return self.label
179
180
def __repr__(self):
181
cls = self.__class__.__name__
182
if self.label == DEFAULT_LABEL:
183
return '%s(%s)' % (cls, repr(self.d))
184
else:
185
return '%s(%s, %s)' % (cls, repr(self.d), repr(self.label))
186
187
def __eq__(self, other):
188
try:
189
return self.d == other.d
190
except AttributeError:
191
return False
192
193
def __len__(self):
194
return len(self.d)
195
196
def __iter__(self):
197
return iter(self.d)
198
199
def iterkeys(self):
200
"""Returns an iterator over keys."""
201
return iter(self.d)
202
203
def __contains__(self, value):
204
return value in self.d
205
206
def __getitem__(self, value):
207
return self.d.get(value, 0)
208
209
def __setitem__(self, value, prob):
210
self.d[value] = prob
211
212
def __delitem__(self, value):
213
del self.d[value]
214
215
def Copy(self, label=None):
216
"""Returns a copy.
217
218
Make a shallow copy of d. If you want a deep copy of d,
219
use copy.deepcopy on the whole object.
220
221
label: string label for the new Hist
222
223
returns: new _DictWrapper with the same type
224
"""
225
new = copy.copy(self)
226
new.d = copy.copy(self.d)
227
new.label = label if label is not None else self.label
228
return new
229
230
def Scale(self, factor):
231
"""Multiplies the values by a factor.
232
233
factor: what to multiply by
234
235
Returns: new object
236
"""
237
new = self.Copy()
238
new.d.clear()
239
240
for val, prob in self.Items():
241
new.Set(val * factor, prob)
242
return new
243
244
def Log(self, m=None):
245
"""Log transforms the probabilities.
246
247
Removes values with probability 0.
248
249
Normalizes so that the largest logprob is 0.
250
"""
251
if self.log:
252
raise ValueError("Pmf/Hist already under a log transform")
253
self.log = True
254
255
if m is None:
256
m = self.MaxLike()
257
258
for x, p in self.d.items():
259
if p:
260
self.Set(x, math.log(p / m))
261
else:
262
self.Remove(x)
263
264
def Exp(self, m=None):
265
"""Exponentiates the probabilities.
266
267
m: how much to shift the ps before exponentiating
268
269
If m is None, normalizes so that the largest prob is 1.
270
"""
271
if not self.log:
272
raise ValueError("Pmf/Hist not under a log transform")
273
self.log = False
274
275
if m is None:
276
m = self.MaxLike()
277
278
for x, p in self.d.items():
279
self.Set(x, math.exp(p - m))
280
281
def GetDict(self):
282
"""Gets the dictionary."""
283
return self.d
284
285
def SetDict(self, d):
286
"""Sets the dictionary."""
287
self.d = d
288
289
def Values(self):
290
"""Gets an unsorted sequence of values.
291
292
Note: one source of confusion is that the keys of this
293
dictionary are the values of the Hist/Pmf, and the
294
values of the dictionary are frequencies/probabilities.
295
"""
296
return self.d.keys()
297
298
def Items(self):
299
"""Gets an unsorted sequence of (value, freq/prob) pairs."""
300
return self.d.items()
301
302
def SortedItems(self):
303
"""Gets a sorted sequence of (value, freq/prob) pairs.
304
305
It items are unsortable, the result is unsorted.
306
"""
307
def isnan(x):
308
try:
309
return math.isnan(x)
310
except TypeError:
311
return False
312
313
if any([isnan(x) for x in self.Values()]):
314
msg = 'Keys contain NaN, may not sort correctly.'
315
logging.warning(msg)
316
317
try:
318
return sorted(self.d.items())
319
except TypeError:
320
return self.d.items()
321
322
def Render(self, **options):
323
"""Generates a sequence of points suitable for plotting.
324
325
Note: options are ignored
326
327
Returns:
328
tuple of (sorted value sequence, freq/prob sequence)
329
"""
330
return zip(*self.SortedItems())
331
332
def MakeCdf(self, label=None):
333
"""Makes a Cdf."""
334
label = label if label is not None else self.label
335
return Cdf(self, label=label)
336
337
def Print(self):
338
"""Prints the values and freqs/probs in ascending order."""
339
for val, prob in self.SortedItems():
340
print(val, prob)
341
342
def Set(self, x, y=0):
343
"""Sets the freq/prob associated with the value x.
344
345
Args:
346
x: number value
347
y: number freq or prob
348
"""
349
self.d[x] = y
350
351
def Incr(self, x, term=1):
352
"""Increments the freq/prob associated with the value x.
353
354
Args:
355
x: number value
356
term: how much to increment by
357
"""
358
self.d[x] = self.d.get(x, 0) + term
359
360
def Mult(self, x, factor):
361
"""Scales the freq/prob associated with the value x.
362
363
Args:
364
x: number value
365
factor: how much to multiply by
366
"""
367
self.d[x] = self.d.get(x, 0) * factor
368
369
def Remove(self, x):
370
"""Removes a value.
371
372
Throws an exception if the value is not there.
373
374
Args:
375
x: value to remove
376
"""
377
del self.d[x]
378
379
def Total(self):
380
"""Returns the total of the frequencies/probabilities in the map."""
381
total = sum(self.d.values())
382
return total
383
384
def MaxLike(self):
385
"""Returns the largest frequency/probability in the map."""
386
return max(self.d.values())
387
388
def Largest(self, n=10):
389
"""Returns the largest n values, with frequency/probability.
390
391
n: number of items to return
392
"""
393
return sorted(self.d.items(), reverse=True)[:n]
394
395
def Smallest(self, n=10):
396
"""Returns the smallest n values, with frequency/probability.
397
398
n: number of items to return
399
"""
400
return sorted(self.d.items(), reverse=False)[:n]
401
402
403
class Hist(_DictWrapper):
404
"""Represents a histogram, which is a map from values to frequencies.
405
406
Values can be any hashable type; frequencies are integer counters.
407
"""
408
def Freq(self, x):
409
"""Gets the frequency associated with the value x.
410
411
Args:
412
x: number value
413
414
Returns:
415
int frequency
416
"""
417
return self.d.get(x, 0)
418
419
def Freqs(self, xs):
420
"""Gets frequencies for a sequence of values."""
421
return [self.Freq(x) for x in xs]
422
423
def IsSubset(self, other):
424
"""Checks whether the values in this histogram are a subset of
425
the values in the given histogram."""
426
for val, freq in self.Items():
427
if freq > other.Freq(val):
428
return False
429
return True
430
431
def Subtract(self, other):
432
"""Subtracts the values in the given histogram from this histogram."""
433
for val, freq in other.Items():
434
self.Incr(val, -freq)
435
436
437
class Pmf(_DictWrapper):
438
"""Represents a probability mass function.
439
440
Values can be any hashable type; probabilities are floating-point.
441
Pmfs are not necessarily normalized.
442
"""
443
444
def Prob(self, x, default=0):
445
"""Gets the probability associated with the value x.
446
447
Args:
448
x: number value
449
default: value to return if the key is not there
450
451
Returns:
452
float probability
453
"""
454
return self.d.get(x, default)
455
456
def Probs(self, xs):
457
"""Gets probabilities for a sequence of values."""
458
return [self.Prob(x) for x in xs]
459
460
def Percentile(self, percentage):
461
"""Computes a percentile of a given Pmf.
462
463
Note: this is not super efficient. If you are planning
464
to compute more than a few percentiles, compute the Cdf.
465
466
percentage: float 0-100
467
468
returns: value from the Pmf
469
"""
470
p = percentage / 100
471
total = 0
472
for val, prob in sorted(self.Items()):
473
total += prob
474
if total >= p:
475
return val
476
477
def ProbGreater(self, x):
478
"""Probability that a sample from this Pmf exceeds x.
479
480
x: number
481
482
returns: float probability
483
"""
484
if isinstance(x, _DictWrapper):
485
return PmfProbGreater(self, x)
486
else:
487
t = [prob for (val, prob) in self.d.items() if val > x]
488
return sum(t)
489
490
def ProbLess(self, x):
491
"""Probability that a sample from this Pmf is less than x.
492
493
x: number
494
495
returns: float probability
496
"""
497
if isinstance(x, _DictWrapper):
498
return PmfProbLess(self, x)
499
else:
500
t = [prob for (val, prob) in self.d.items() if val < x]
501
return sum(t)
502
503
def ProbEqual(self, x):
504
"""Probability that a sample from this Pmf is exactly x.
505
506
x: number
507
508
returns: float probability
509
"""
510
if isinstance(x, _DictWrapper):
511
return PmfProbEqual(self, x)
512
else:
513
return self[x]
514
515
# NOTE: I've decided to remove the magic comparators because they
516
# have the side-effect of making Pmf sortable, but in fact they
517
# don't support sorting.
518
519
def Normalize(self, fraction=1):
520
"""Normalizes this PMF so the sum of all probs is fraction.
521
522
Args:
523
fraction: what the total should be after normalization
524
525
Returns: the total probability before normalizing
526
"""
527
if self.log:
528
raise ValueError("Normalize: Pmf is under a log transform")
529
530
total = self.Total()
531
if total == 0:
532
raise ValueError('Normalize: total probability is zero.')
533
534
factor = fraction / total
535
for x in self.d:
536
self.d[x] *= factor
537
538
return total
539
540
def Random(self):
541
"""Chooses a random element from this PMF.
542
543
Note: this is not very efficient. If you plan to call
544
this more than a few times, consider converting to a CDF.
545
546
Returns:
547
float value from the Pmf
548
"""
549
target = random.random()
550
total = 0
551
for x, p in self.d.items():
552
total += p
553
if total >= target:
554
return x
555
556
# we shouldn't get here
557
raise ValueError('Random: Pmf might not be normalized.')
558
559
def Sample(self, n):
560
"""Generates a random sample from this distribution.
561
562
n: int length of the sample
563
returns: NumPy array
564
"""
565
return self.MakeCdf().Sample(n)
566
567
def Mean(self):
568
"""Computes the mean of a PMF.
569
570
Returns:
571
float mean
572
"""
573
return sum(p * x for x, p in self.Items())
574
575
def Median(self):
576
"""Computes the median of a PMF.
577
578
Returns:
579
float median
580
"""
581
return self.MakeCdf().Percentile(50)
582
583
def Var(self, mu=None):
584
"""Computes the variance of a PMF.
585
586
mu: the point around which the variance is computed;
587
if omitted, computes the mean
588
589
returns: float variance
590
"""
591
if mu is None:
592
mu = self.Mean()
593
594
return sum(p * (x-mu)**2 for x, p in self.Items())
595
596
def Expect(self, func):
597
"""Computes the expectation of func(x).
598
599
Returns:
600
expectation
601
"""
602
return np.sum(p * func(x) for x, p in self.Items())
603
604
def Std(self, mu=None):
605
"""Computes the standard deviation of a PMF.
606
607
mu: the point around which the variance is computed;
608
if omitted, computes the mean
609
610
returns: float standard deviation
611
"""
612
var = self.Var(mu)
613
return math.sqrt(var)
614
615
def Mode(self):
616
"""Returns the value with the highest probability.
617
618
Returns: float probability
619
"""
620
_, val = max((prob, val) for val, prob in self.Items())
621
return val
622
623
# The mode of a posterior is the maximum aposteori probability (MAP)
624
MAP = Mode
625
626
# If the distribution contains likelihoods only, the peak is the
627
# maximum likelihood estimator.
628
MaximumLikelihood = Mode
629
630
def CredibleInterval(self, percentage=90):
631
"""Computes the central credible interval.
632
633
If percentage=90, computes the 90% CI.
634
635
Args:
636
percentage: float between 0 and 100
637
638
Returns:
639
sequence of two floats, low and high
640
"""
641
cdf = self.MakeCdf()
642
return cdf.CredibleInterval(percentage)
643
644
def __add__(self, other):
645
"""Computes the Pmf of the sum of values drawn from self and other.
646
647
other: another Pmf or a scalar
648
649
returns: new Pmf
650
"""
651
try:
652
return self.AddPmf(other)
653
except AttributeError:
654
return self.AddConstant(other)
655
656
__radd__ = __add__
657
658
def AddPmf(self, other):
659
"""Computes the Pmf of the sum of values drawn from self and other.
660
661
other: another Pmf
662
663
returns: new Pmf
664
"""
665
pmf = Pmf()
666
for v1, p1 in self.Items():
667
for v2, p2 in other.Items():
668
pmf[v1 + v2] += p1 * p2
669
return pmf
670
671
def AddConstant(self, other):
672
"""Computes the Pmf of the sum a constant and values from self.
673
674
other: a number
675
676
returns: new Pmf
677
"""
678
if other == 0:
679
return self.Copy()
680
681
pmf = Pmf()
682
for v1, p1 in self.Items():
683
pmf.Set(v1 + other, p1)
684
return pmf
685
686
def __sub__(self, other):
687
"""Computes the Pmf of the diff of values drawn from self and other.
688
689
other: another Pmf
690
691
returns: new Pmf
692
"""
693
try:
694
return self.SubPmf(other)
695
except AttributeError:
696
return self.AddConstant(-other)
697
698
def SubPmf(self, other):
699
"""Computes the Pmf of the diff of values drawn from self and other.
700
701
other: another Pmf
702
703
returns: new Pmf
704
"""
705
pmf = Pmf()
706
for v1, p1 in self.Items():
707
for v2, p2 in other.Items():
708
pmf.Incr(v1 - v2, p1 * p2)
709
return pmf
710
711
def __mul__(self, other):
712
"""Computes the Pmf of the product of values drawn from self and other.
713
714
other: another Pmf
715
716
returns: new Pmf
717
"""
718
try:
719
return self.MulPmf(other)
720
except AttributeError:
721
return self.MulConstant(other)
722
723
def MulPmf(self, other):
724
"""Computes the Pmf of the diff of values drawn from self and other.
725
726
other: another Pmf
727
728
returns: new Pmf
729
"""
730
pmf = Pmf()
731
for v1, p1 in self.Items():
732
for v2, p2 in other.Items():
733
pmf.Incr(v1 * v2, p1 * p2)
734
return pmf
735
736
def MulConstant(self, other):
737
"""Computes the Pmf of the product of a constant and values from self.
738
739
other: a number
740
741
returns: new Pmf
742
"""
743
pmf = Pmf()
744
for v1, p1 in self.Items():
745
pmf.Set(v1 * other, p1)
746
return pmf
747
748
def __div__(self, other):
749
"""Computes the Pmf of the ratio of values drawn from self and other.
750
751
other: another Pmf
752
753
returns: new Pmf
754
"""
755
try:
756
return self.DivPmf(other)
757
except AttributeError:
758
return self.MulConstant(1/other)
759
760
__truediv__ = __div__
761
762
def DivPmf(self, other):
763
"""Computes the Pmf of the ratio of values drawn from self and other.
764
765
other: another Pmf
766
767
returns: new Pmf
768
"""
769
pmf = Pmf()
770
for v1, p1 in self.Items():
771
for v2, p2 in other.Items():
772
pmf.Incr(v1 / v2, p1 * p2)
773
return pmf
774
775
def Max(self, k):
776
"""Computes the CDF of the maximum of k selections from this dist.
777
778
k: int
779
780
returns: new Cdf
781
"""
782
cdf = self.MakeCdf()
783
cdf.ps **= k
784
return cdf
785
786
787
class Joint(Pmf):
788
"""Represents a joint distribution.
789
790
The values are sequences (usually tuples)
791
"""
792
793
def Marginal(self, i, label=None):
794
"""Gets the marginal distribution of the indicated variable.
795
796
i: index of the variable we want
797
798
Returns: Pmf
799
"""
800
pmf = Pmf(label=label)
801
for vs, prob in self.Items():
802
pmf.Incr(vs[i], prob)
803
return pmf
804
805
def Conditional(self, i, j, val, label=None):
806
"""Gets the conditional distribution of the indicated variable.
807
808
Distribution of vs[i], conditioned on vs[j] = val.
809
810
i: index of the variable we want
811
j: which variable is conditioned on
812
val: the value the jth variable has to have
813
814
Returns: Pmf
815
"""
816
pmf = Pmf(label=label)
817
for vs, prob in self.Items():
818
if vs[j] != val:
819
continue
820
pmf.Incr(vs[i], prob)
821
822
pmf.Normalize()
823
return pmf
824
825
def MaxLikeInterval(self, percentage=90):
826
"""Returns the maximum-likelihood credible interval.
827
828
If percentage=90, computes a 90% CI containing the values
829
with the highest likelihoods.
830
831
percentage: float between 0 and 100
832
833
Returns: list of values from the suite
834
"""
835
interval = []
836
total = 0
837
838
t = [(prob, val) for val, prob in self.Items()]
839
t.sort(reverse=True)
840
841
for prob, val in t:
842
interval.append(val)
843
total += prob
844
if total >= percentage / 100:
845
break
846
847
return interval
848
849
850
def MakeJoint(pmf1, pmf2):
851
"""Joint distribution of values from pmf1 and pmf2.
852
853
Assumes that the PMFs represent independent random variables.
854
855
Args:
856
pmf1: Pmf object
857
pmf2: Pmf object
858
859
Returns:
860
Joint pmf of value pairs
861
"""
862
joint = Joint()
863
for v1, p1 in pmf1.Items():
864
for v2, p2 in pmf2.Items():
865
joint.Set((v1, v2), p1 * p2)
866
return joint
867
868
869
def MakeHistFromList(t, label=None):
870
"""Makes a histogram from an unsorted sequence of values.
871
872
Args:
873
t: sequence of numbers
874
label: string label for this histogram
875
876
Returns:
877
Hist object
878
"""
879
return Hist(t, label=label)
880
881
882
def MakeHistFromDict(d, label=None):
883
"""Makes a histogram from a map from values to frequencies.
884
885
Args:
886
d: dictionary that maps values to frequencies
887
label: string label for this histogram
888
889
Returns:
890
Hist object
891
"""
892
return Hist(d, label)
893
894
895
def MakePmfFromList(t, label=None):
896
"""Makes a PMF from an unsorted sequence of values.
897
898
Args:
899
t: sequence of numbers
900
label: string label for this PMF
901
902
Returns:
903
Pmf object
904
"""
905
return Pmf(t, label=label)
906
907
908
def MakePmfFromDict(d, label=None):
909
"""Makes a PMF from a map from values to probabilities.
910
911
Args:
912
d: dictionary that maps values to probabilities
913
label: string label for this PMF
914
915
Returns:
916
Pmf object
917
"""
918
return Pmf(d, label=label)
919
920
921
def MakePmfFromItems(t, label=None):
922
"""Makes a PMF from a sequence of value-probability pairs
923
924
Args:
925
t: sequence of value-probability pairs
926
label: string label for this PMF
927
928
Returns:
929
Pmf object
930
"""
931
return Pmf(dict(t), label=label)
932
933
934
def MakePmfFromHist(hist, label=None):
935
"""Makes a normalized PMF from a Hist object.
936
937
Args:
938
hist: Hist object
939
label: string label
940
941
Returns:
942
Pmf object
943
"""
944
if label is None:
945
label = hist.label
946
947
return Pmf(hist, label=label)
948
949
950
def MakeMixture(metapmf, label='mix'):
951
"""Make a mixture distribution.
952
953
Args:
954
metapmf: Pmf that maps from Pmfs to probs.
955
label: string label for the new Pmf.
956
957
Returns: Pmf object.
958
"""
959
mix = Pmf(label=label)
960
for pmf, p1 in metapmf.Items():
961
for x, p2 in pmf.Items():
962
mix[x] += p1 * p2
963
return mix
964
965
966
def MakeUniformPmf(low, high, n):
967
"""Make a uniform Pmf.
968
969
low: lowest value (inclusive)
970
high: highest value (inclusize)
971
n: number of values
972
"""
973
pmf = Pmf()
974
for x in np.linspace(low, high, n):
975
pmf.Set(x, 1)
976
pmf.Normalize()
977
return pmf
978
979
980
class Cdf:
981
"""Represents a cumulative distribution function.
982
983
Attributes:
984
xs: sequence of values
985
ps: sequence of probabilities
986
label: string used as a graph label.
987
"""
988
def __init__(self, obj=None, ps=None, label=None):
989
"""Initializes.
990
991
If ps is provided, obj must be the corresponding list of values.
992
993
obj: Hist, Pmf, Cdf, Pdf, dict, pandas Series, list of pairs
994
ps: list of cumulative probabilities
995
label: string label
996
"""
997
self.label = label if label is not None else DEFAULT_LABEL
998
999
if isinstance(obj, (_DictWrapper, Cdf, Pdf)):
1000
if not label:
1001
self.label = label if label is not None else obj.label
1002
1003
if obj is None:
1004
# caller does not provide obj, make an empty Cdf
1005
self.xs = np.asarray([])
1006
self.ps = np.asarray([])
1007
if ps is not None:
1008
logging.warning("Cdf: can't pass ps without also passing xs.")
1009
return
1010
else:
1011
# if the caller provides xs and ps, just store them
1012
if ps is not None:
1013
if isinstance(ps, str):
1014
logging.warning("Cdf: ps can't be a string")
1015
1016
self.xs = np.asarray(obj)
1017
self.ps = np.asarray(ps)
1018
return
1019
1020
# caller has provided just obj, not ps
1021
if isinstance(obj, Cdf):
1022
self.xs = copy.copy(obj.xs)
1023
self.ps = copy.copy(obj.ps)
1024
return
1025
1026
if isinstance(obj, _DictWrapper):
1027
dw = obj
1028
else:
1029
dw = Hist(obj)
1030
1031
if len(dw) == 0:
1032
self.xs = np.asarray([])
1033
self.ps = np.asarray([])
1034
return
1035
1036
xs, freqs = zip(*sorted(dw.Items()))
1037
self.xs = np.asarray(xs)
1038
self.ps = np.cumsum(freqs, dtype=np.float)
1039
self.ps /= self.ps[-1]
1040
1041
def __str__(self):
1042
cls = self.__class__.__name__
1043
if self.label == DEFAULT_LABEL:
1044
return '%s(%s, %s)' % (cls, str(self.xs), str(self.ps))
1045
else:
1046
return self.label
1047
1048
def __repr__(self):
1049
cls = self.__class__.__name__
1050
if self.label == DEFAULT_LABEL:
1051
return '%s(%s, %s)' % (cls, str(self.xs), str(self.ps))
1052
else:
1053
return '%s(%s, %s, %s)' % (cls, str(self.xs), str(self.ps),
1054
repr(self.label))
1055
1056
def __len__(self):
1057
return len(self.xs)
1058
1059
def __getitem__(self, x):
1060
return self.Prob(x)
1061
1062
def __setitem__(self):
1063
raise UnimplementedMethodException()
1064
1065
def __delitem__(self):
1066
raise UnimplementedMethodException()
1067
1068
def __eq__(self, other):
1069
return np.all(self.xs == other.xs) and np.all(self.ps == other.ps)
1070
1071
def Print(self):
1072
"""Prints the values and freqs/probs in ascending order."""
1073
for val, prob in zip(self.xs, self.ps):
1074
print(val, prob)
1075
1076
def Copy(self, label=None):
1077
"""Returns a copy of this Cdf.
1078
1079
label: string label for the new Cdf
1080
"""
1081
if label is None:
1082
label = self.label
1083
return Cdf(list(self.xs), list(self.ps), label=label)
1084
1085
def MakePmf(self, label=None):
1086
"""Makes a Pmf."""
1087
if label is None:
1088
label = self.label
1089
return Pmf(self, label=label)
1090
1091
def Items(self):
1092
"""Returns a sorted sequence of (value, probability) pairs.
1093
1094
Note: in Python3, returns an iterator.
1095
"""
1096
a = self.ps
1097
b = np.roll(a, 1)
1098
b[0] = 0
1099
return zip(self.xs, a-b)
1100
1101
def Shift(self, term):
1102
"""Adds a term to the xs.
1103
1104
term: how much to add
1105
"""
1106
new = self.Copy()
1107
# don't use +=, or else an int array + float yields int array
1108
new.xs = new.xs + term
1109
return new
1110
1111
def Scale(self, factor):
1112
"""Multiplies the xs by a factor.
1113
1114
factor: what to multiply by
1115
"""
1116
new = self.Copy()
1117
# don't use *=, or else an int array * float yields int array
1118
new.xs = new.xs * factor
1119
return new
1120
1121
def Prob(self, x):
1122
"""Returns CDF(x), the probability that corresponds to value x.
1123
1124
Args:
1125
x: number
1126
1127
Returns:
1128
float probability
1129
"""
1130
if x < self.xs[0]:
1131
return 0
1132
index = bisect.bisect(self.xs, x)
1133
p = self.ps[index-1]
1134
return p
1135
1136
def Probs(self, xs):
1137
"""Gets probabilities for a sequence of values.
1138
1139
xs: any sequence that can be converted to NumPy array
1140
1141
returns: NumPy array of cumulative probabilities
1142
"""
1143
xs = np.asarray(xs)
1144
index = np.searchsorted(self.xs, xs, side='right')
1145
ps = self.ps[index-1]
1146
ps[xs < self.xs[0]] = 0
1147
return ps
1148
1149
ProbArray = Probs
1150
1151
def Value(self, p):
1152
"""Returns InverseCDF(p), the value that corresponds to probability p.
1153
1154
Args:
1155
p: number in the range [0, 1]
1156
1157
Returns:
1158
number value
1159
"""
1160
if p < 0 or p > 1:
1161
raise ValueError('Probability p must be in range [0, 1]')
1162
1163
index = bisect.bisect_left(self.ps, p)
1164
return self.xs[index]
1165
1166
def Values(self, ps=None):
1167
"""Returns InverseCDF(p), the value that corresponds to probability p.
1168
1169
If ps is not provided, returns all values.
1170
1171
Args:
1172
ps: NumPy array of numbers in the range [0, 1]
1173
1174
Returns:
1175
NumPy array of values
1176
"""
1177
if ps is None:
1178
return self.xs
1179
1180
ps = np.asarray(ps)
1181
if np.any(ps < 0) or np.any(ps > 1):
1182
raise ValueError('Probability p must be in range [0, 1]')
1183
1184
index = np.searchsorted(self.ps, ps, side='left')
1185
return self.xs[index]
1186
1187
ValueArray = Values
1188
1189
def Percentile(self, p):
1190
"""Returns the value that corresponds to percentile p.
1191
1192
Args:
1193
p: number in the range [0, 100]
1194
1195
Returns:
1196
number value
1197
"""
1198
return self.Value(p / 100)
1199
1200
def Percentiles(self, ps):
1201
"""Returns the value that corresponds to percentiles ps.
1202
1203
Args:
1204
ps: numbers in the range [0, 100]
1205
1206
Returns:
1207
array of values
1208
"""
1209
ps = np.asarray(ps)
1210
return self.Values(ps / 100)
1211
1212
def PercentileRank(self, x):
1213
"""Returns the percentile rank of the value x.
1214
1215
x: potential value in the CDF
1216
1217
returns: percentile rank in the range 0 to 100
1218
"""
1219
return self.Prob(x) * 100
1220
1221
def PercentileRanks(self, xs):
1222
"""Returns the percentile ranks of the values in xs.
1223
1224
xs: potential value in the CDF
1225
1226
returns: array of percentile ranks in the range 0 to 100
1227
"""
1228
return self.Probs(x) * 100
1229
1230
def Random(self):
1231
"""Chooses a random value from this distribution."""
1232
return self.Value(random.random())
1233
1234
def Sample(self, n):
1235
"""Generates a random sample from this distribution.
1236
1237
n: int length of the sample
1238
returns: NumPy array
1239
"""
1240
ps = np.random.random(n)
1241
return self.ValueArray(ps)
1242
1243
def Mean(self):
1244
"""Computes the mean of a CDF.
1245
1246
Returns:
1247
float mean
1248
"""
1249
old_p = 0
1250
total = 0
1251
for x, new_p in zip(self.xs, self.ps):
1252
p = new_p - old_p
1253
total += p * x
1254
old_p = new_p
1255
return total
1256
1257
def CredibleInterval(self, percentage=90):
1258
"""Computes the central credible interval.
1259
1260
If percentage=90, computes the 90% CI.
1261
1262
Args:
1263
percentage: float between 0 and 100
1264
1265
Returns:
1266
sequence of two floats, low and high
1267
"""
1268
prob = (1 - percentage / 100) / 2
1269
interval = self.Value(prob), self.Value(1 - prob)
1270
return interval
1271
1272
ConfidenceInterval = CredibleInterval
1273
1274
def _Round(self, multiplier=1000):
1275
"""
1276
An entry is added to the cdf only if the percentile differs
1277
from the previous value in a significant digit, where the number
1278
of significant digits is determined by multiplier. The
1279
default is 1000, which keeps log10(1000) = 3 significant digits.
1280
"""
1281
# TODO(write this method)
1282
raise UnimplementedMethodException()
1283
1284
def Render(self, **options):
1285
"""Generates a sequence of points suitable for plotting.
1286
1287
An empirical CDF is a step function; linear interpolation
1288
can be misleading.
1289
1290
Note: options are ignored
1291
1292
Returns:
1293
tuple of (xs, ps)
1294
"""
1295
def interleave(a, b):
1296
c = np.empty(a.shape[0] + b.shape[0])
1297
c[::2] = a
1298
c[1::2] = b
1299
return c
1300
1301
a = np.array(self.xs)
1302
xs = interleave(a, a)
1303
shift_ps = np.roll(self.ps, 1)
1304
shift_ps[0] = 0
1305
ps = interleave(shift_ps, self.ps)
1306
return xs, ps
1307
1308
def Max(self, k):
1309
"""Computes the CDF of the maximum of k selections from this dist.
1310
1311
k: int
1312
1313
returns: new Cdf
1314
"""
1315
cdf = self.Copy()
1316
cdf.ps **= k
1317
return cdf
1318
1319
1320
def MakeCdfFromItems(items, label=None):
1321
"""Makes a cdf from an unsorted sequence of (value, frequency) pairs.
1322
1323
Args:
1324
items: unsorted sequence of (value, frequency) pairs
1325
label: string label for this CDF
1326
1327
Returns:
1328
cdf: list of (value, fraction) pairs
1329
"""
1330
return Cdf(dict(items), label=label)
1331
1332
1333
def MakeCdfFromDict(d, label=None):
1334
"""Makes a CDF from a dictionary that maps values to frequencies.
1335
1336
Args:
1337
d: dictionary that maps values to frequencies.
1338
label: string label for the data.
1339
1340
Returns:
1341
Cdf object
1342
"""
1343
return Cdf(d, label=label)
1344
1345
1346
def MakeCdfFromList(seq, label=None):
1347
"""Creates a CDF from an unsorted sequence.
1348
1349
Args:
1350
seq: unsorted sequence of sortable values
1351
label: string label for the cdf
1352
1353
Returns:
1354
Cdf object
1355
"""
1356
return Cdf(seq, label=label)
1357
1358
1359
def MakeCdfFromHist(hist, label=None):
1360
"""Makes a CDF from a Hist object.
1361
1362
Args:
1363
hist: Pmf.Hist object
1364
label: string label for the data.
1365
1366
Returns:
1367
Cdf object
1368
"""
1369
if label is None:
1370
label = hist.label
1371
1372
return Cdf(hist, label=label)
1373
1374
1375
def MakeCdfFromPmf(pmf, label=None):
1376
"""Makes a CDF from a Pmf object.
1377
1378
Args:
1379
pmf: Pmf.Pmf object
1380
label: string label for the data.
1381
1382
Returns:
1383
Cdf object
1384
"""
1385
if label is None:
1386
label = pmf.label
1387
1388
return Cdf(pmf, label=label)
1389
1390
1391
class UnimplementedMethodException(Exception):
1392
"""Exception if someone calls a method that should be overridden."""
1393
1394
1395
class Suite(Pmf):
1396
"""Represents a suite of hypotheses and their probabilities."""
1397
1398
def Update(self, data):
1399
"""Updates each hypothesis based on the data.
1400
1401
data: any representation of the data
1402
1403
returns: the normalizing constant
1404
"""
1405
for hypo in self.Values():
1406
like = self.Likelihood(data, hypo)
1407
self.Mult(hypo, like)
1408
return self.Normalize()
1409
1410
def LogUpdate(self, data):
1411
"""Updates a suite of hypotheses based on new data.
1412
1413
Modifies the suite directly; if you want to keep the original, make
1414
a copy.
1415
1416
Note: unlike Update, LogUpdate does not normalize.
1417
1418
Args:
1419
data: any representation of the data
1420
"""
1421
for hypo in self.Values():
1422
like = self.LogLikelihood(data, hypo)
1423
self.Incr(hypo, like)
1424
1425
def UpdateSet(self, dataset):
1426
"""Updates each hypothesis based on the dataset.
1427
1428
This is more efficient than calling Update repeatedly because
1429
it waits until the end to Normalize.
1430
1431
Modifies the suite directly; if you want to keep the original, make
1432
a copy.
1433
1434
dataset: a sequence of data
1435
1436
returns: the normalizing constant
1437
"""
1438
for data in dataset:
1439
for hypo in self.Values():
1440
like = self.Likelihood(data, hypo)
1441
self.Mult(hypo, like)
1442
return self.Normalize()
1443
1444
def LogUpdateSet(self, dataset):
1445
"""Updates each hypothesis based on the dataset.
1446
1447
Modifies the suite directly; if you want to keep the original, make
1448
a copy.
1449
1450
dataset: a sequence of data
1451
1452
returns: None
1453
"""
1454
for data in dataset:
1455
self.LogUpdate(data)
1456
1457
def Likelihood(self, data, hypo):
1458
"""Computes the likelihood of the data under the hypothesis.
1459
1460
hypo: some representation of the hypothesis
1461
data: some representation of the data
1462
"""
1463
raise UnimplementedMethodException()
1464
1465
def LogLikelihood(self, data, hypo):
1466
"""Computes the log likelihood of the data under the hypothesis.
1467
1468
hypo: some representation of the hypothesis
1469
data: some representation of the data
1470
"""
1471
raise UnimplementedMethodException()
1472
1473
def Print(self):
1474
"""Prints the hypotheses and their probabilities."""
1475
for hypo, prob in sorted(self.Items()):
1476
print(hypo, prob)
1477
1478
def MakeOdds(self):
1479
"""Transforms from probabilities to odds.
1480
1481
Values with prob=0 are removed.
1482
"""
1483
for hypo, prob in self.Items():
1484
if prob:
1485
self.Set(hypo, Odds(prob))
1486
else:
1487
self.Remove(hypo)
1488
1489
def MakeProbs(self):
1490
"""Transforms from odds to probabilities."""
1491
for hypo, odds in self.Items():
1492
self.Set(hypo, Probability(odds))
1493
1494
1495
def MakeSuiteFromList(t, label=None):
1496
"""Makes a suite from an unsorted sequence of values.
1497
1498
Args:
1499
t: sequence of numbers
1500
label: string label for this suite
1501
1502
Returns:
1503
Suite object
1504
"""
1505
hist = MakeHistFromList(t, label=label)
1506
d = hist.GetDict()
1507
return MakeSuiteFromDict(d)
1508
1509
1510
def MakeSuiteFromHist(hist, label=None):
1511
"""Makes a normalized suite from a Hist object.
1512
1513
Args:
1514
hist: Hist object
1515
label: string label
1516
1517
Returns:
1518
Suite object
1519
"""
1520
if label is None:
1521
label = hist.label
1522
1523
# make a copy of the dictionary
1524
d = dict(hist.GetDict())
1525
return MakeSuiteFromDict(d, label)
1526
1527
1528
def MakeSuiteFromDict(d, label=None):
1529
"""Makes a suite from a map from values to probabilities.
1530
1531
Args:
1532
d: dictionary that maps values to probabilities
1533
label: string label for this suite
1534
1535
Returns:
1536
Suite object
1537
"""
1538
suite = Suite(label=label)
1539
suite.SetDict(d)
1540
suite.Normalize()
1541
return suite
1542
1543
1544
class Pdf(object):
1545
"""Represents a probability density function (PDF)."""
1546
1547
def Density(self, x):
1548
"""Evaluates this Pdf at x.
1549
1550
Returns: float or NumPy array of probability density
1551
"""
1552
raise UnimplementedMethodException()
1553
1554
def GetLinspace(self):
1555
"""Get a linspace for plotting.
1556
1557
Not all subclasses of Pdf implement this.
1558
1559
Returns: numpy array
1560
"""
1561
raise UnimplementedMethodException()
1562
1563
def MakePmf(self, **options):
1564
"""Makes a discrete version of this Pdf.
1565
1566
options can include
1567
label: string
1568
low: low end of range
1569
high: high end of range
1570
n: number of places to evaluate
1571
1572
Returns: new Pmf
1573
"""
1574
label = options.pop('label', '')
1575
xs, ds = self.Render(**options)
1576
return Pmf(dict(zip(xs, ds)), label=label)
1577
1578
def Render(self, **options):
1579
"""Generates a sequence of points suitable for plotting.
1580
1581
If options includes low and high, it must also include n;
1582
in that case the density is evaluated an n locations between
1583
low and high, including both.
1584
1585
If options includes xs, the density is evaluate at those location.
1586
1587
Otherwise, self.GetLinspace is invoked to provide the locations.
1588
1589
Returns:
1590
tuple of (xs, densities)
1591
"""
1592
low, high = options.pop('low', None), options.pop('high', None)
1593
if low is not None and high is not None:
1594
n = options.pop('n', 101)
1595
xs = np.linspace(low, high, n)
1596
else:
1597
xs = options.pop('xs', None)
1598
if xs is None:
1599
xs = self.GetLinspace()
1600
1601
ds = self.Density(xs)
1602
return xs, ds
1603
1604
def Items(self):
1605
"""Generates a sequence of (value, probability) pairs.
1606
"""
1607
return zip(*self.Render())
1608
1609
1610
class NormalPdf(Pdf):
1611
"""Represents the PDF of a Normal distribution."""
1612
1613
def __init__(self, mu=0, sigma=1, label=None):
1614
"""Constructs a Normal Pdf with given mu and sigma.
1615
1616
mu: mean
1617
sigma: standard deviation
1618
label: string
1619
"""
1620
self.mu = mu
1621
self.sigma = sigma
1622
self.label = label if label is not None else '_nolegend_'
1623
1624
def __str__(self):
1625
return 'NormalPdf(%f, %f)' % (self.mu, self.sigma)
1626
1627
def GetLinspace(self):
1628
"""Get a linspace for plotting.
1629
1630
Returns: numpy array
1631
"""
1632
low, high = self.mu-3*self.sigma, self.mu+3*self.sigma
1633
return np.linspace(low, high, 101)
1634
1635
def Density(self, xs):
1636
"""Evaluates this Pdf at xs.
1637
1638
xs: scalar or sequence of floats
1639
1640
returns: float or NumPy array of probability density
1641
"""
1642
return stats.norm.pdf(xs, self.mu, self.sigma)
1643
1644
1645
class ExponentialPdf(Pdf):
1646
"""Represents the PDF of an exponential distribution."""
1647
1648
def __init__(self, lam=1, label=None):
1649
"""Constructs an exponential Pdf with given parameter.
1650
1651
lam: rate parameter
1652
label: string
1653
"""
1654
self.lam = lam
1655
self.label = label if label is not None else '_nolegend_'
1656
1657
def __str__(self):
1658
return 'ExponentialPdf(%f)' % (self.lam)
1659
1660
def GetLinspace(self):
1661
"""Get a linspace for plotting.
1662
1663
Returns: numpy array
1664
"""
1665
low, high = 0, 5.0/self.lam
1666
return np.linspace(low, high, 101)
1667
1668
def Density(self, xs):
1669
"""Evaluates this Pdf at xs.
1670
1671
xs: scalar or sequence of floats
1672
1673
returns: float or NumPy array of probability density
1674
"""
1675
return stats.expon.pdf(xs, scale=1.0/self.lam)
1676
1677
1678
class EstimatedPdf(Pdf):
1679
"""Represents a PDF estimated by KDE."""
1680
1681
def __init__(self, sample, label=None):
1682
"""Estimates the density function based on a sample.
1683
1684
sample: sequence of data
1685
label: string
1686
"""
1687
self.label = label if label is not None else '_nolegend_'
1688
self.kde = stats.gaussian_kde(sample)
1689
low = min(sample)
1690
high = max(sample)
1691
self.linspace = np.linspace(low, high, 101)
1692
1693
def __str__(self):
1694
return 'EstimatedPdf(label=%s)' % str(self.label)
1695
1696
def GetLinspace(self):
1697
"""Get a linspace for plotting.
1698
1699
Returns: numpy array
1700
"""
1701
return self.linspace
1702
1703
def Density(self, xs):
1704
"""Evaluates this Pdf at xs.
1705
1706
returns: float or NumPy array of probability density
1707
"""
1708
return self.kde.evaluate(xs)
1709
1710
def Sample(self, n):
1711
"""Generates a random sample from the estimated Pdf.
1712
1713
n: size of sample
1714
"""
1715
# NOTE: we have to flatten because resample returns a 2-D
1716
# array for some reason.
1717
return self.kde.resample(n).flatten()
1718
1719
1720
def CredibleInterval(pmf, percentage=90):
1721
"""Computes a credible interval for a given distribution.
1722
1723
If percentage=90, computes the 90% CI.
1724
1725
Args:
1726
pmf: Pmf object representing a posterior distribution
1727
percentage: float between 0 and 100
1728
1729
Returns:
1730
sequence of two floats, low and high
1731
"""
1732
cdf = pmf.MakeCdf()
1733
prob = (1 - percentage / 100) / 2
1734
interval = cdf.Value(prob), cdf.Value(1 - prob)
1735
return interval
1736
1737
1738
def PmfProbLess(pmf1, pmf2):
1739
"""Probability that a value from pmf1 is less than a value from pmf2.
1740
1741
Args:
1742
pmf1: Pmf object
1743
pmf2: Pmf object
1744
1745
Returns:
1746
float probability
1747
"""
1748
total = 0
1749
for v1, p1 in pmf1.Items():
1750
for v2, p2 in pmf2.Items():
1751
if v1 < v2:
1752
total += p1 * p2
1753
return total
1754
1755
1756
def PmfProbGreater(pmf1, pmf2):
1757
"""Probability that a value from pmf1 is less than a value from pmf2.
1758
1759
Args:
1760
pmf1: Pmf object
1761
pmf2: Pmf object
1762
1763
Returns:
1764
float probability
1765
"""
1766
total = 0
1767
for v1, p1 in pmf1.Items():
1768
for v2, p2 in pmf2.Items():
1769
if v1 > v2:
1770
total += p1 * p2
1771
return total
1772
1773
1774
def PmfProbEqual(pmf1, pmf2):
1775
"""Probability that a value from pmf1 equals a value from pmf2.
1776
1777
Args:
1778
pmf1: Pmf object
1779
pmf2: Pmf object
1780
1781
Returns:
1782
float probability
1783
"""
1784
total = 0
1785
for v1, p1 in pmf1.Items():
1786
for v2, p2 in pmf2.Items():
1787
if v1 == v2:
1788
total += p1 * p2
1789
return total
1790
1791
1792
def RandomSum(dists):
1793
"""Chooses a random value from each dist and returns the sum.
1794
1795
dists: sequence of Pmf or Cdf objects
1796
1797
returns: numerical sum
1798
"""
1799
total = sum(dist.Random() for dist in dists)
1800
return total
1801
1802
1803
def SampleSum(dists, n):
1804
"""Draws a sample of sums from a list of distributions.
1805
1806
dists: sequence of Pmf or Cdf objects
1807
n: sample size
1808
1809
returns: new Pmf of sums
1810
"""
1811
pmf = Pmf(RandomSum(dists) for i in range(n))
1812
return pmf
1813
1814
1815
def EvalNormalPdf(x, mu, sigma):
1816
"""Computes the unnormalized PDF of the normal distribution.
1817
1818
x: value
1819
mu: mean
1820
sigma: standard deviation
1821
1822
returns: float probability density
1823
"""
1824
return stats.norm.pdf(x, mu, sigma)
1825
1826
1827
def MakeNormalPmf(mu, sigma, num_sigmas, n=201):
1828
"""Makes a PMF discrete approx to a Normal distribution.
1829
1830
mu: float mean
1831
sigma: float standard deviation
1832
num_sigmas: how many sigmas to extend in each direction
1833
n: number of values in the Pmf
1834
1835
returns: normalized Pmf
1836
"""
1837
pmf = Pmf()
1838
low = mu - num_sigmas * sigma
1839
high = mu + num_sigmas * sigma
1840
1841
for x in np.linspace(low, high, n):
1842
p = EvalNormalPdf(x, mu, sigma)
1843
pmf.Set(x, p)
1844
pmf.Normalize()
1845
return pmf
1846
1847
1848
def EvalBinomialPmf(k, n, p):
1849
"""Evaluates the binomial PMF.
1850
1851
Returns the probabily of k successes in n trials with probability p.
1852
"""
1853
return stats.binom.pmf(k, n, p)
1854
1855
1856
def MakeBinomialPmf(n, p):
1857
"""Evaluates the binomial PMF.
1858
1859
Returns the distribution of successes in n trials with probability p.
1860
"""
1861
pmf = Pmf()
1862
for k in range(n+1):
1863
pmf[k] = stats.binom.pmf(k, n, p)
1864
return pmf
1865
1866
1867
def EvalGammaPdf(x, a):
1868
"""Computes the Gamma PDF.
1869
1870
x: where to evaluate the PDF
1871
a: parameter of the gamma distribution
1872
1873
returns: float probability
1874
"""
1875
return x**(a-1) * np.exp(-x) / gamma(a)
1876
1877
1878
def MakeGammaPmf(xs, a):
1879
"""Makes a PMF discrete approx to a Gamma distribution.
1880
1881
lam: parameter lambda in events per unit time
1882
xs: upper bound of the Pmf
1883
1884
returns: normalized Pmf
1885
"""
1886
xs = np.asarray(xs)
1887
ps = EvalGammaPdf(xs, a)
1888
pmf = Pmf(dict(zip(xs, ps)))
1889
pmf.Normalize()
1890
return pmf
1891
1892
1893
def EvalGeometricPmf(k, p, loc=0):
1894
"""Evaluates the geometric PMF.
1895
1896
With loc=0: Probability of `k` trials to get one success.
1897
With loc=-1: Probability of `k` trials before first success.
1898
1899
k: number of trials
1900
p: probability of success on each trial
1901
"""
1902
return stats.geom.pmf(k, p, loc=loc)
1903
1904
1905
def MakeGeometricPmf(p, loc=0, high=10):
1906
"""Evaluates the binomial PMF.
1907
1908
With loc=0: PMF of trials to get one success.
1909
With loc=-1: PMF of trials before first success.
1910
1911
p: probability of success
1912
high: upper bound where PMF is truncated
1913
"""
1914
pmf = Pmf()
1915
for k in range(high):
1916
pmf[k] = stats.geom.pmf(k, p, loc=loc)
1917
pmf.Normalize()
1918
return pmf
1919
1920
1921
def EvalHypergeomPmf(k, N, K, n):
1922
"""Evaluates the hypergeometric PMF.
1923
1924
Returns the probabily of k successes in n trials from a population
1925
N with K successes in it.
1926
"""
1927
return stats.hypergeom.pmf(k, N, K, n)
1928
1929
1930
def EvalPoissonPmf(k, lam):
1931
"""Computes the Poisson PMF.
1932
1933
k: number of events
1934
lam: parameter lambda in events per unit time
1935
1936
returns: float probability
1937
"""
1938
return stats.poisson.pmf(k, lam)
1939
1940
1941
def MakePoissonPmf(lam, high, step=1):
1942
"""Makes a PMF discrete approx to a Poisson distribution.
1943
1944
lam: parameter lambda in events per unit time
1945
high: upper bound of the Pmf
1946
1947
returns: normalized Pmf
1948
"""
1949
pmf = Pmf()
1950
for k in range(0, high + 1, step):
1951
p = stats.poisson.pmf(k, lam)
1952
pmf.Set(k, p)
1953
pmf.Normalize()
1954
return pmf
1955
1956
1957
def EvalExponentialPdf(x, lam):
1958
"""Computes the exponential PDF.
1959
1960
x: value
1961
lam: parameter lambda in events per unit time
1962
1963
returns: float probability density
1964
"""
1965
return lam * math.exp(-lam * x)
1966
1967
1968
def EvalExponentialCdf(x, lam):
1969
"""Evaluates CDF of the exponential distribution with parameter lam."""
1970
return 1 - math.exp(-lam * x)
1971
1972
1973
def MakeExponentialPmf(lam, high, n=200):
1974
"""Makes a PMF discrete approx to an exponential distribution.
1975
1976
lam: parameter lambda in events per unit time
1977
high: upper bound
1978
n: number of values in the Pmf
1979
1980
returns: normalized Pmf
1981
"""
1982
pmf = Pmf()
1983
for x in np.linspace(0, high, n):
1984
p = EvalExponentialPdf(x, lam)
1985
pmf.Set(x, p)
1986
pmf.Normalize()
1987
return pmf
1988
1989
1990
def EvalWeibullPdf(x, lam, k):
1991
"""Computes the Weibull PDF.
1992
1993
x: value
1994
lam: parameter lambda in events per unit time
1995
k: parameter
1996
1997
returns: float probability density
1998
"""
1999
arg = (x / lam)
2000
return k / lam * arg**(k-1) * np.exp(-arg**k)
2001
2002
2003
def EvalWeibullCdf(x, lam, k):
2004
"""Evaluates CDF of the Weibull distribution."""
2005
arg = (x / lam)
2006
return 1 - np.exp(-arg**k)
2007
2008
2009
def MakeWeibullPmf(lam, k, high, n=200):
2010
"""Makes a PMF discrete approx to a Weibull distribution.
2011
2012
lam: parameter lambda in events per unit time
2013
k: parameter
2014
high: upper bound
2015
n: number of values in the Pmf
2016
2017
returns: normalized Pmf
2018
"""
2019
xs = np.linspace(0, high, n)
2020
ps = EvalWeibullPdf(xs, lam, k)
2021
ps[np.isinf(ps)] = 0
2022
return Pmf(dict(zip(xs, ps)))
2023
2024
2025
def EvalParetoPdf(x, xm, alpha):
2026
"""Computes the Pareto.
2027
2028
xm: minimum value (scale parameter)
2029
alpha: shape parameter
2030
2031
returns: float probability density
2032
"""
2033
return stats.pareto.pdf(x, alpha, scale=xm)
2034
2035
2036
def MakeParetoPmf(xm, alpha, high, num=101):
2037
"""Makes a PMF discrete approx to a Pareto distribution.
2038
2039
xm: minimum value (scale parameter)
2040
alpha: shape parameter
2041
high: upper bound value
2042
num: number of values
2043
2044
returns: normalized Pmf
2045
"""
2046
xs = np.linspace(xm, high, num)
2047
ps = stats.pareto.pdf(xs, alpha, scale=xm)
2048
pmf = Pmf(dict(zip(xs, ps)))
2049
return pmf
2050
2051
def StandardNormalCdf(x):
2052
"""Evaluates the CDF of the standard Normal distribution.
2053
2054
See http://en.wikipedia.org/wiki/Normal_distribution
2055
#Cumulative_distribution_function
2056
2057
Args:
2058
x: float
2059
2060
Returns:
2061
float
2062
"""
2063
return (math.erf(x / ROOT2) + 1) / 2
2064
2065
2066
def EvalNormalCdf(x, mu=0, sigma=1):
2067
"""Evaluates the CDF of the normal distribution.
2068
2069
Args:
2070
x: float
2071
2072
mu: mean parameter
2073
2074
sigma: standard deviation parameter
2075
2076
Returns:
2077
float
2078
"""
2079
return stats.norm.cdf(x, loc=mu, scale=sigma)
2080
2081
2082
def EvalNormalCdfInverse(p, mu=0, sigma=1):
2083
"""Evaluates the inverse CDF of the normal distribution.
2084
2085
See http://en.wikipedia.org/wiki/Normal_distribution#Quantile_function
2086
2087
Args:
2088
p: float
2089
2090
mu: mean parameter
2091
2092
sigma: standard deviation parameter
2093
2094
Returns:
2095
float
2096
"""
2097
return stats.norm.ppf(p, loc=mu, scale=sigma)
2098
2099
2100
def EvalLognormalCdf(x, mu=0, sigma=1):
2101
"""Evaluates the CDF of the lognormal distribution.
2102
2103
x: float or sequence
2104
mu: mean parameter
2105
sigma: standard deviation parameter
2106
2107
Returns: float or sequence
2108
"""
2109
return stats.lognorm.cdf(x, loc=mu, scale=sigma)
2110
2111
2112
def RenderExpoCdf(lam, low, high, n=101):
2113
"""Generates sequences of xs and ps for an exponential CDF.
2114
2115
lam: parameter
2116
low: float
2117
high: float
2118
n: number of points to render
2119
2120
returns: numpy arrays (xs, ps)
2121
"""
2122
xs = np.linspace(low, high, n)
2123
ps = 1 - np.exp(-lam * xs)
2124
#ps = stats.expon.cdf(xs, scale=1.0/lam)
2125
return xs, ps
2126
2127
2128
def RenderNormalCdf(mu, sigma, low, high, n=101):
2129
"""Generates sequences of xs and ps for a Normal CDF.
2130
2131
mu: parameter
2132
sigma: parameter
2133
low: float
2134
high: float
2135
n: number of points to render
2136
2137
returns: numpy arrays (xs, ps)
2138
"""
2139
xs = np.linspace(low, high, n)
2140
ps = stats.norm.cdf(xs, mu, sigma)
2141
return xs, ps
2142
2143
2144
def RenderParetoCdf(xmin, alpha, low, high, n=50):
2145
"""Generates sequences of xs and ps for a Pareto CDF.
2146
2147
xmin: parameter
2148
alpha: parameter
2149
low: float
2150
high: float
2151
n: number of points to render
2152
2153
returns: numpy arrays (xs, ps)
2154
"""
2155
if low < xmin:
2156
low = xmin
2157
xs = np.linspace(low, high, n)
2158
ps = 1 - (xs / xmin) ** -alpha
2159
#ps = stats.pareto.cdf(xs, scale=xmin, b=alpha)
2160
return xs, ps
2161
2162
2163
class Beta:
2164
"""Represents a Beta distribution.
2165
2166
See http://en.wikipedia.org/wiki/Beta_distribution
2167
"""
2168
def __init__(self, alpha=1, beta=1, label=None):
2169
"""Initializes a Beta distribution."""
2170
self.alpha = alpha
2171
self.beta = beta
2172
self.label = label if label is not None else '_nolegend_'
2173
2174
def Update(self, data):
2175
"""Updates a Beta distribution.
2176
2177
data: pair of int (heads, tails)
2178
"""
2179
heads, tails = data
2180
self.alpha += heads
2181
self.beta += tails
2182
2183
def Mean(self):
2184
"""Computes the mean of this distribution."""
2185
return self.alpha / (self.alpha + self.beta)
2186
2187
def MAP(self):
2188
"""Computes the value with maximum a posteori probability."""
2189
a = self.alpha - 1
2190
b = self.beta - 1
2191
return a / (a + b)
2192
2193
def Random(self):
2194
"""Generates a random variate from this distribution."""
2195
return random.betavariate(self.alpha, self.beta)
2196
2197
def Sample(self, n):
2198
"""Generates a random sample from this distribution.
2199
2200
n: int sample size
2201
"""
2202
size = n,
2203
return np.random.beta(self.alpha, self.beta, size)
2204
2205
def EvalPdf(self, x):
2206
"""Evaluates the PDF at x."""
2207
return x ** (self.alpha - 1) * (1 - x) ** (self.beta - 1)
2208
2209
def MakePmf(self, steps=101, label=None):
2210
"""Returns a Pmf of this distribution.
2211
2212
Note: Normally, we just evaluate the PDF at a sequence
2213
of points and treat the probability density as a probability
2214
mass.
2215
2216
But if alpha or beta is less than one, we have to be
2217
more careful because the PDF goes to infinity at x=0
2218
and x=1. In that case we evaluate the CDF and compute
2219
differences.
2220
2221
The result is a little funny, because the values at 0 and 1
2222
are not symmetric. Nevertheless, it is a reasonable discrete
2223
model of the continuous distribution, and behaves well as
2224
the number of values increases.
2225
"""
2226
if label is None and self.label is not None:
2227
label = self.label
2228
2229
if self.alpha < 1 or self.beta < 1:
2230
cdf = self.MakeCdf()
2231
pmf = cdf.MakePmf()
2232
return pmf
2233
2234
xs = [i / (steps - 1.0) for i in range(steps)]
2235
probs = [self.EvalPdf(x) for x in xs]
2236
pmf = Pmf(dict(zip(xs, probs)), label=label)
2237
return pmf
2238
2239
def MakeCdf(self, steps=101):
2240
"""Returns the CDF of this distribution."""
2241
xs = [i / (steps - 1.0) for i in range(steps)]
2242
ps = special.betainc(self.alpha, self.beta, xs)
2243
cdf = Cdf(xs, ps)
2244
return cdf
2245
2246
def Percentile(self, ps):
2247
"""Returns the given percentiles from this distribution.
2248
2249
ps: scalar, array, or list of [0-100]
2250
"""
2251
ps = np.asarray(ps) / 100
2252
xs = special.betaincinv(self.alpha, self.beta, ps)
2253
return xs
2254
2255
2256
class Dirichlet(object):
    """Represents a Dirichlet distribution.

    See http://en.wikipedia.org/wiki/Dirichlet_distribution
    """

    def __init__(self, n, conc=1, label=None):
        """Initializes a Dirichlet distribution.

        n: number of dimensions
        conc: concentration parameter (smaller yields more concentration)
        label: string label
        """
        if n < 2:
            raise ValueError('A Dirichlet distribution with '
                             'n<2 makes no sense')

        self.n = n
        # plain float avoids the np.float alias, which has been removed
        # from recent versions of NumPy
        self.params = np.ones(n, dtype=float) * conc
        self.label = label if label is not None else DEFAULT_LABEL

    def Update(self, data):
        """Updates a Dirichlet distribution.

        data: sequence of observations, in order corresponding to params
        """
        m = len(data)
        self.params[:m] += data

    def Random(self):
        """Generates a random variate from this distribution.

        Returns: normalized vector of fractions
        """
        p = np.random.gamma(self.params)
        return p / p.sum()

    def Likelihood(self, data):
        """Computes the likelihood of the data.

        Selects a random vector of probabilities from this distribution.

        Returns: float probability
        """
        m = len(data)
        if self.n < m:
            return 0

        x = data
        p = self.Random()
        q = p[:m] ** x
        return q.prod()

    def LogLikelihood(self, data):
        """Computes the log likelihood of the data.

        Selects a random vector of probabilities from this distribution.

        Returns: float log probability
        """
        m = len(data)
        if self.n < m:
            return float('-inf')

        x = self.Random()
        y = np.log(x[:m]) * data
        return y.sum()

    def MarginalBeta(self, i):
        """Computes the marginal distribution of the ith element.

        See http://en.wikipedia.org/wiki/Dirichlet_distribution
        #Marginal_distributions

        i: int

        Returns: Beta object
        """
        alpha0 = self.params.sum()
        alpha = self.params[i]
        return Beta(alpha, alpha0 - alpha)

    def PredictivePmf(self, xs, label=None):
        """Makes a predictive distribution.

        xs: values to go into the Pmf

        Returns: Pmf that maps from x to the mean prevalence of x
        """
        alpha0 = self.params.sum()
        ps = self.params / alpha0
        return Pmf(zip(xs, ps), label=label)

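# Illustrative sketch (not part of the original module): a conjugate update
# with a Dirichlet prior. The counts and the function name are made up.
def _ExampleDirichletUpdate():
    """Updates a uniform Dirichlet prior with observed category counts."""
    dirichlet = Dirichlet(3)              # uniform prior over 3 categories
    dirichlet.Update((8, 3, 1))           # observed counts per category
    marginal = dirichlet.MarginalBeta(0)  # posterior for category 0 alone
    return marginal.Mean()
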
def BinomialCoef(n, k):
    """Computes the binomial coefficient "n choose k".

    n: number of trials
    k: number of successes

    Returns: float
    """
    # scipy.misc.comb has been removed from SciPy; special.comb is the
    # equivalent function
    return special.comb(n, k)


def LogBinomialCoef(n, k):
    """Computes an approximation of the log of the binomial coefficient.

    http://math.stackexchange.com/questions/64716/
    approximating-the-logarithm-of-the-binomial-coefficient

    n: number of trials
    k: number of successes

    Returns: float
    """
    return n * math.log(n) - k * math.log(k) - (n - k) * math.log(n - k)

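# Illustrative sketch (not part of the original module): the Stirling-style
# approximation above versus the exact value computed with log-gamma.
def _ExampleLogBinomialCoef():
    """Compares approximate and exact log binomial coefficients."""
    n, k = 1000, 10
    approx = LogBinomialCoef(n, k)
    exact = math.lgamma(n + 1) - math.lgamma(k + 1) - math.lgamma(n - k + 1)
    return approx, exact
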
def NormalProbability(ys, jitter=0):
    """Generates data for a normal probability plot.

    ys: sequence of values
    jitter: float magnitude of jitter added to the ys

    returns: numpy arrays xs, ys
    """
    n = len(ys)
    xs = np.random.normal(0, 1, n)
    xs.sort()

    if jitter:
        ys = Jitter(ys, jitter)
    else:
        ys = np.array(ys)
    ys.sort()

    return xs, ys


def Jitter(values, jitter=0.5):
    """Jitters the values by adding normally distributed noise.

    values: sequence
    jitter: scalar standard deviation of the added noise

    returns: new numpy array
    """
    n = len(values)
    return np.random.normal(0, jitter, n) + values


def NormalProbabilityPlot(sample, fit_color='0.8', **options):
    """Makes a normal probability plot with a fitted line.

    sample: sequence of numbers
    fit_color: color string for the fitted line
    options: passed along to Plot
    """
    xs, ys = NormalProbability(sample)
    mean, var = MeanVar(sample)
    std = math.sqrt(var)

    fit = FitLine(xs, mean, std)
    thinkplot.Plot(*fit, color=fit_color, label='model')

    xs, ys = NormalProbability(sample)
    thinkplot.Plot(xs, ys, **options)

def Mean(xs):
    """Computes mean.

    xs: sequence of values

    returns: float mean
    """
    return np.mean(xs)


def Var(xs, mu=None, ddof=0):
    """Computes variance.

    xs: sequence of values
    mu: optional known mean
    ddof: delta degrees of freedom

    returns: float
    """
    xs = np.asarray(xs)

    if mu is None:
        mu = xs.mean()

    ds = xs - mu
    return np.dot(ds, ds) / (len(xs) - ddof)


def Std(xs, mu=None, ddof=0):
    """Computes standard deviation.

    xs: sequence of values
    mu: optional known mean
    ddof: delta degrees of freedom

    returns: float
    """
    var = Var(xs, mu, ddof)
    return math.sqrt(var)


def MeanVar(xs, ddof=0):
    """Computes mean and variance.

    Based on http://stackoverflow.com/questions/19391149/
    numpy-mean-and-variance-from-single-function

    xs: sequence of values
    ddof: delta degrees of freedom

    returns: pair of float, mean and var
    """
    xs = np.asarray(xs)
    mean = xs.mean()
    s2 = Var(xs, mean, ddof)
    return mean, s2

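# Illustrative sketch (not part of the original module): ddof=0 gives the
# population variance, ddof=1 the unbiased sample variance.
def _ExampleVar():
    """Compares the two variance conventions on a small sample."""
    xs = [1, 2, 3, 4, 5]
    return Var(xs, ddof=0), Var(xs, ddof=1)   # 2.0 and 2.5
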
def Trim(t, p=0.01):
    """Trims the largest and smallest elements of t.

    Args:
        t: sequence of numbers
        p: fraction of values to trim off each end

    Returns:
        sequence of values
    """
    n = int(p * len(t))
    t = sorted(t)
    # guard against n == 0, which would otherwise produce an empty slice
    if n:
        t = t[n:-n]
    return t


def TrimmedMean(t, p=0.01):
    """Computes the trimmed mean of a sequence of numbers.

    Args:
        t: sequence of numbers
        p: fraction of values to trim off each end

    Returns:
        float
    """
    t = Trim(t, p)
    return Mean(t)


def TrimmedMeanVar(t, p=0.01):
    """Computes the trimmed mean and variance of a sequence of numbers.

    Args:
        t: sequence of numbers
        p: fraction of values to trim off each end

    Returns:
        pair of float, mean and variance
    """
    t = Trim(t, p)
    mu, var = MeanVar(t)
    return mu, var

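# Illustrative sketch (not part of the original module): trimming 10% from
# each end discards the single extreme value in this made-up sample.
def _ExampleTrimmedMean():
    """Compares the mean with and without trimming an outlier."""
    t = [1, 2, 3, 4, 5, 6, 7, 8, 9, 1000]
    return Mean(t), TrimmedMean(t, p=0.1)   # 104.5 and 5.5
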
def CohenEffectSize(group1, group2):
    """Compute Cohen's d.

    group1: Series or NumPy array
    group2: Series or NumPy array

    returns: float
    """
    diff = group1.mean() - group2.mean()

    n1, n2 = len(group1), len(group2)
    var1 = group1.var()
    var2 = group2.var()

    pooled_var = (n1 * var1 + n2 * var2) / (n1 + n2)
    d = diff / math.sqrt(pooled_var)
    return d

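# Illustrative sketch (not part of the original module): effect size for two
# synthetic groups whose means differ by about half a pooled standard
# deviation.
def _ExampleCohenEffectSize():
    """Computes Cohen's d for two synthetic samples."""
    RandomSeed(18)
    group1 = pandas.Series(np.random.normal(0.0, 1.0, size=1000))
    group2 = pandas.Series(np.random.normal(0.5, 1.0, size=1000))
    return CohenEffectSize(group1, group2)   # roughly -0.5
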
def Cov(xs, ys, meanx=None, meany=None):
    """Computes Cov(X, Y).

    Args:
        xs: sequence of values
        ys: sequence of values
        meanx: optional float mean of xs
        meany: optional float mean of ys

    Returns:
        Cov(X, Y)
    """
    xs = np.asarray(xs)
    ys = np.asarray(ys)

    if meanx is None:
        meanx = np.mean(xs)
    if meany is None:
        meany = np.mean(ys)

    cov = np.dot(xs-meanx, ys-meany) / len(xs)
    return cov


def Corr(xs, ys):
    """Computes Corr(X, Y).

    Args:
        xs: sequence of values
        ys: sequence of values

    Returns:
        Corr(X, Y)
    """
    xs = np.asarray(xs)
    ys = np.asarray(ys)

    meanx, varx = MeanVar(xs)
    meany, vary = MeanVar(ys)

    corr = Cov(xs, ys, meanx, meany) / math.sqrt(varx * vary)

    return corr

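# Illustrative sketch (not part of the original module): a noiseless linear
# relationship has correlation 1 regardless of slope.
def _ExampleCorr():
    """Computes covariance and correlation for a perfectly linear relation."""
    xs = np.arange(10)
    ys = 3 * xs + 2
    return Cov(xs, ys), Corr(xs, ys)   # Corr is 1.0
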
def SerialCorr(series, lag=1):
    """Computes the serial correlation of a series.

    series: Series
    lag: integer number of intervals to shift

    returns: float correlation
    """
    xs = series[lag:]
    ys = series.shift(lag)[lag:]
    corr = Corr(xs, ys)
    return corr


def SpearmanCorr(xs, ys):
    """Computes Spearman's rank correlation.

    Args:
        xs: sequence of values
        ys: sequence of values

    Returns:
        float Spearman's correlation
    """
    xranks = pandas.Series(xs).rank()
    yranks = pandas.Series(ys).rank()
    return Corr(xranks, yranks)

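# Illustrative sketch (not part of the original module): Spearman correlation
# is rank-based, so it is unchanged by a monotonic transformation such as
# exponentiation; SerialCorr expects a pandas Series because it uses shift().
def _ExampleRankCorr():
    """Compares Pearson and Spearman correlation on a monotonic relation."""
    xs = np.arange(1, 11)
    ys = np.exp(xs)
    series = pandas.Series(ys)
    return Corr(xs, ys), SpearmanCorr(xs, ys), SerialCorr(series, lag=1)
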
def MapToRanks(t):
    """Returns a list of ranks corresponding to the elements in t.

    Args:
        t: sequence of numbers

    Returns:
        list of integer ranks, starting at 1
    """
    # pair up each value with its index
    pairs = enumerate(t)

    # sort by value
    sorted_pairs = sorted(pairs, key=itemgetter(1))

    # pair up each pair with its rank
    ranked = enumerate(sorted_pairs)

    # sort by index
    resorted = sorted(ranked, key=lambda trip: trip[1][0])

    # extract the ranks
    ranks = [trip[0]+1 for trip in resorted]
    return ranks

def LeastSquares(xs, ys):
    """Computes a linear least squares fit for ys as a function of xs.

    Args:
        xs: sequence of values
        ys: sequence of values

    Returns:
        tuple of (intercept, slope)
    """
    meanx, varx = MeanVar(xs)
    meany = Mean(ys)

    slope = Cov(xs, ys, meanx, meany) / varx
    inter = meany - slope * meanx

    return inter, slope


def FitLine(xs, inter, slope):
    """Fits a line to the given data.

    xs: sequence of x
    inter: float intercept
    slope: float slope

    returns: tuple of numpy arrays (sorted xs, fit ys)
    """
    fit_xs = np.sort(xs)
    fit_ys = inter + slope * fit_xs
    return fit_xs, fit_ys


def Residuals(xs, ys, inter, slope):
    """Computes residuals for a linear fit with parameters inter and slope.

    Args:
        xs: independent variable
        ys: dependent variable
        inter: float intercept
        slope: float slope

    Returns:
        list of residuals
    """
    xs = np.asarray(xs)
    ys = np.asarray(ys)
    res = ys - (inter + slope * xs)
    return res


def CoefDetermination(ys, res):
    """Computes the coefficient of determination (R^2) for given residuals.

    Args:
        ys: dependent variable
        res: residuals

    Returns:
        float coefficient of determination
    """
    return 1 - Var(res) / Var(ys)

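# Illustrative sketch (not part of the original module): fitting a noisy line
# and checking the fit with residuals and R^2. The data are synthetic.
def _ExampleLeastSquares():
    """Fits a line to noisy data and computes the coefficient of determination."""
    RandomSeed(19)
    xs = np.linspace(0, 10, 100)
    ys = 2 * xs + 1 + np.random.normal(0, 1, len(xs))
    inter, slope = LeastSquares(xs, ys)
    res = Residuals(xs, ys, inter, slope)
    r2 = CoefDetermination(ys, res)
    return inter, slope, r2
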
def CorrelatedGenerator(rho):
    """Generates standard normal variates with serial correlation.

    rho: target coefficient of correlation

    Returns: iterable
    """
    x = random.gauss(0, 1)
    yield x

    sigma = math.sqrt(1 - rho**2)
    while True:
        x = random.gauss(x * rho, sigma)
        yield x


def CorrelatedNormalGenerator(mu, sigma, rho):
    """Generates normal variates with serial correlation.

    mu: mean of variate
    sigma: standard deviation of variate
    rho: target coefficient of correlation

    Returns: iterable
    """
    for x in CorrelatedGenerator(rho):
        yield x * sigma + mu

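# Illustrative sketch (not part of the original module): the generators above
# are infinite, so take a finite prefix with itertools.islice.
def _ExampleCorrelatedSeries(n=1000, rho=0.8):
    """Draws n serially correlated variates and checks the lag-1 correlation."""
    import itertools
    xs = list(itertools.islice(CorrelatedNormalGenerator(10, 2, rho), n))
    series = pandas.Series(xs)
    return SerialCorr(series, lag=1)   # close to rho for large n
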
def RawMoment(xs, k):
    """Computes the kth raw moment of xs."""
    return sum(x**k for x in xs) / len(xs)


def CentralMoment(xs, k):
    """Computes the kth central moment of xs."""
    mean = RawMoment(xs, 1)
    return sum((x - mean)**k for x in xs) / len(xs)


def StandardizedMoment(xs, k):
    """Computes the kth standardized moment of xs."""
    var = CentralMoment(xs, 2)
    std = math.sqrt(var)
    return CentralMoment(xs, k) / std**k


def Skewness(xs):
    """Computes skewness."""
    return StandardizedMoment(xs, 3)


def Median(xs):
    """Computes the median (50th percentile) of a sequence.

    xs: sequence or anything else that can initialize a Cdf

    returns: float
    """
    cdf = Cdf(xs)
    return cdf.Value(0.5)


def IQR(xs):
    """Computes the 25th and 75th percentiles of a sequence.

    xs: sequence or anything else that can initialize a Cdf

    returns: pair of floats
    """
    cdf = Cdf(xs)
    return cdf.Value(0.25), cdf.Value(0.75)


def PearsonMedianSkewness(xs):
    """Computes the Pearson median skewness."""
    median = Median(xs)
    mean = RawMoment(xs, 1)
    var = CentralMoment(xs, 2)
    std = math.sqrt(var)
    gp = 3 * (mean - median) / std
    return gp

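# Illustrative sketch (not part of the original module): a right-skewed sample
# has positive skewness and a mean above the median, so the Pearson median
# skewness is also positive.
def _ExampleSkewness():
    """Computes both skewness statistics for a right-skewed sample."""
    RandomSeed(20)
    xs = np.random.exponential(1.0, size=10000)
    return Skewness(xs), PearsonMedianSkewness(xs)
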
class FixedWidthVariables(object):
    """Represents a set of variables in a fixed width file."""

    def __init__(self, variables, index_base=0):
        """Initializes.

        variables: DataFrame
        index_base: are the indices 0 or 1 based?

        Attributes:
        colspecs: list of (start, end) index tuples
        names: list of string variable names
        """
        self.variables = variables

        # note: by default, subtract 1 from colspecs
        self.colspecs = variables[['start', 'end']] - index_base

        # convert colspecs to a list of pair of int
        # (plain int avoids the removed np.int alias)
        self.colspecs = self.colspecs.astype(int).values.tolist()
        self.names = variables['name']

    def ReadFixedWidth(self, filename, **options):
        """Reads a fixed width ASCII file.

        filename: string filename

        returns: DataFrame
        """
        df = pandas.read_fwf(filename,
                             colspecs=self.colspecs,
                             names=self.names,
                             **options)
        return df

def ReadStataDct(dct_file, **options):
    """Reads a Stata dictionary file.

    dct_file: string filename
    options: dict of options passed to open()

    returns: FixedWidthVariables object
    """
    type_map = dict(byte=int, int=int, long=int, float=float,
                    double=float, numeric=float)

    var_info = []
    with open(dct_file, **options) as f:
        for line in f:
            match = re.search(r'_column\(([^)]*)\)', line)
            if not match:
                continue
            start = int(match.group(1))
            t = line.split()
            vtype, name, fstring = t[1:4]
            name = name.lower()
            if vtype.startswith('str'):
                vtype = str
            else:
                vtype = type_map[vtype]
            long_desc = ' '.join(t[4:]).strip('"')
            var_info.append((start, vtype, name, fstring, long_desc))

    columns = ['start', 'type', 'name', 'fstring', 'desc']
    variables = pandas.DataFrame(var_info, columns=columns)

    # fill in the end column by shifting the start column
    variables['end'] = variables.start.shift(-1)
    variables.loc[len(variables)-1, 'end'] = 0

    dct = FixedWidthVariables(variables, index_base=1)
    return dct

def Resample(xs, n=None):
    """Draw a sample from xs with the same length as xs.

    xs: sequence
    n: sample size (default: len(xs))

    returns: NumPy array
    """
    if n is None:
        n = len(xs)
    return np.random.choice(xs, n, replace=True)


def SampleRows(df, nrows, replace=False):
    """Choose a sample of rows from a DataFrame.

    df: DataFrame
    nrows: number of rows
    replace: whether to sample with replacement

    returns: DataFrame
    """
    indices = np.random.choice(df.index, nrows, replace=replace)
    sample = df.loc[indices]
    return sample


def ResampleRows(df):
    """Resamples rows from a DataFrame.

    df: DataFrame

    returns: DataFrame
    """
    return SampleRows(df, len(df), replace=True)


def ResampleRowsWeighted(df, column='finalwgt'):
    """Resamples a DataFrame using probabilities proportional to given column.

    df: DataFrame
    column: string column name to use as weights

    returns: DataFrame
    """
    weights = df[column].copy()
    weights /= sum(weights)
    indices = np.random.choice(df.index, len(df), replace=True, p=weights)
    sample = df.loc[indices]
    return sample

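# Illustrative sketch (not part of the original module): bootstrap resampling
# of a sample and of DataFrame rows. The column names are made up.
def _ExampleResample():
    """Draws bootstrap resamples from an array and from a DataFrame."""
    RandomSeed(21)
    xs = np.random.normal(0, 1, size=100)
    boot_means = [Mean(Resample(xs)) for _ in range(100)]

    df = pandas.DataFrame({'x': xs, 'finalwgt': np.ones(len(xs))})
    rows = ResampleRowsWeighted(df, column='finalwgt')
    return Std(boot_means), len(rows)
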
def PercentileRow(array, p):
    """Selects the row from a sorted array that maps to percentile p.

    p: float 0-100

    returns: NumPy array (one row)
    """
    rows, cols = array.shape
    index = int(rows * p / 100)
    return array[index,]


def PercentileRows(ys_seq, percents):
    """Given a collection of lines, selects percentiles along vertical axis.

    For example, if ys_seq contains simulation results like ys as a
    function of time, and percents contains (5, 95), the result would
    be a 90% CI for each vertical slice of the simulation results.

    ys_seq: sequence of lines (y values)
    percents: list of percentiles (0-100) to select

    returns: list of NumPy arrays, one for each percentile
    """
    nrows = len(ys_seq)
    ncols = len(ys_seq[0])
    array = np.zeros((nrows, ncols))

    for i, ys in enumerate(ys_seq):
        array[i,] = ys

    array = np.sort(array, axis=0)

    rows = [PercentileRow(array, p) for p in percents]
    return rows

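# Illustrative sketch (not part of the original module): percentile bands
# across a collection of simulated lines, as described above. The random
# walks are synthetic.
def _ExamplePercentileRows():
    """Computes a 90% band across 100 simulated random-walk lines."""
    RandomSeed(22)
    ys_seq = [np.cumsum(np.random.normal(0, 1, 50)) for _ in range(100)]
    low, high = PercentileRows(ys_seq, [5, 95])
    return low, high
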
def Smooth(xs, sigma=2, **options):
    """Smooths a NumPy array with a Gaussian filter.

    xs: sequence
    sigma: standard deviation of the filter
    """
    # ndimage.filters is deprecated; gaussian_filter1d is available at the
    # top level of scipy.ndimage
    return ndimage.gaussian_filter1d(xs, sigma, **options)

class HypothesisTest(object):
    """Represents a hypothesis test."""

    def __init__(self, data):
        """Initializes.

        data: data in whatever form is relevant
        """
        self.data = data
        self.MakeModel()
        self.actual = self.TestStatistic(data)
        self.test_stats = None
        self.test_cdf = None

    def PValue(self, iters=1000):
        """Computes the distribution of the test statistic and p-value.

        iters: number of iterations

        returns: float p-value
        """
        self.test_stats = [self.TestStatistic(self.RunModel())
                           for _ in range(iters)]
        self.test_cdf = Cdf(self.test_stats)

        count = sum(1 for x in self.test_stats if x >= self.actual)
        return count / iters

    def MaxTestStat(self):
        """Returns the largest test statistic seen during simulations."""
        return max(self.test_stats)

    def PlotCdf(self, label=None):
        """Draws a Cdf with vertical lines at the observed test stat."""
        def VertLine(x):
            """Draws a vertical line at x."""
            thinkplot.Plot([x, x], [0, 1], color='0.8')

        VertLine(self.actual)
        thinkplot.Cdf(self.test_cdf, label=label)

    def TestStatistic(self, data):
        """Computes the test statistic.

        data: data in whatever form is relevant
        """
        raise UnimplementedMethodException()

    def MakeModel(self):
        """Build a model of the null hypothesis."""
        pass

    def RunModel(self):
        """Run the model of the null hypothesis.

        returns: simulated data
        """
        raise UnimplementedMethodException()

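# Illustrative sketch (not part of the original module): a minimal
# HypothesisTest subclass that runs a permutation test for a difference in
# group means. The class name and the (group1, group2) data layout are
# assumptions made for this example.
#
# usage: p = _ExampleDiffMeansPermute((group1, group2)).PValue(iters=1000)
class _ExampleDiffMeansPermute(HypothesisTest):
    """Tests a difference in group means by permutation."""

    def TestStatistic(self, data):
        """Absolute difference in means between the two groups."""
        group1, group2 = data
        return abs(np.mean(group1) - np.mean(group2))

    def MakeModel(self):
        """Pools the two groups under the null hypothesis."""
        group1, group2 = self.data
        self.n = len(group1)
        self.pool = np.hstack((group1, group2))

    def RunModel(self):
        """Shuffles the pool and splits it into two groups of the original sizes."""
        np.random.shuffle(self.pool)
        return self.pool[:self.n], self.pool[self.n:]
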
def main():
    pass


if __name__ == '__main__':
    main()