| Download
Think Stats by Allen B. Downey Think Stats is an introduction to Probability and Statistics for Python programmers.
This is the accompanying code for this book.
Project: Support and Testing
Views: 7089License: GPL3
"""This file contains code for use with "Think Stats" and1"Think Bayes", both by Allen B. Downey, available from greenteapress.com23Copyright 2014 Allen B. Downey4License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html5"""67from __future__ import print_function, division89"""This file contains class definitions for:1011Hist: represents a histogram (map from values to integer frequencies).1213Pmf: represents a probability mass function (map from values to probs).1415_DictWrapper: private parent class for Hist and Pmf.1617Cdf: represents a discrete cumulative distribution function1819Pdf: represents a continuous probability density function2021"""2223import bisect24import copy25import logging26import math27import random28import re2930from collections import Counter31from operator import itemgetter3233import thinkplot3435import numpy as np36import pandas3738import scipy39from scipy import stats40from scipy import special41from scipy import ndimage4243from scipy.special import gamma4445from io import open4647ROOT2 = math.sqrt(2)4849def RandomSeed(x):50"""Initialize the random and np.random generators.5152x: int seed53"""54random.seed(x)55np.random.seed(x)565758def Odds(p):59"""Computes odds for a given probability.6061Example: p=0.75 means 75 for and 25 against, or 3:1 odds in favor.6263Note: when p=1, the formula for odds divides by zero, which is64normally undefined. But I think it is reasonable to define Odds(1)65to be infinity, so that's what this function does.6667p: float 0-16869Returns: float odds70"""71if p == 1:72return float('inf')73return p / (1 - p)747576def Probability(o):77"""Computes the probability corresponding to given odds.7879Example: o=2 means 2:1 odds in favor, or 2/3 probability8081o: float odds, strictly positive8283Returns: float probability84"""85return o / (o + 1)868788def Probability2(yes, no):89"""Computes the probability corresponding to given odds.9091Example: yes=2, no=1 means 2:1 odds in favor, or 2/3 probability.9293yes, no: int or float odds in favor94"""95return yes / (yes + no)969798class Interpolator(object):99"""Represents a mapping between sorted sequences; performs linear interp.100101Attributes:102xs: sorted list103ys: sorted list104"""105106def __init__(self, xs, ys):107self.xs = xs108self.ys = ys109110def Lookup(self, x):111"""Looks up x and returns the corresponding value of y."""112return self._Bisect(x, self.xs, self.ys)113114def Reverse(self, y):115"""Looks up y and returns the corresponding value of x."""116return self._Bisect(y, self.ys, self.xs)117118def _Bisect(self, x, xs, ys):119"""Helper function."""120if x <= xs[0]:121return ys[0]122if x >= xs[-1]:123return ys[-1]124i = bisect.bisect(xs, x)125frac = 1.0 * (x - xs[i - 1]) / (xs[i] - xs[i - 1])126y = ys[i - 1] + frac * 1.0 * (ys[i] - ys[i - 1])127return y128129130# When we plot Hist, Pmf and Cdf objects, they don't appear in131# the legend unless we override the default label.132DEFAULT_LABEL = '_nolegend_'133134135class _DictWrapper(object):136"""An object that contains a dictionary."""137138def __init__(self, obj=None, label=None):139"""Initializes the distribution.140141obj: Hist, Pmf, Cdf, Pdf, dict, pandas Series, list of pairs142label: string label143"""144self.label = label if label is not None else DEFAULT_LABEL145self.d = {}146147# flag whether the distribution is under a log transform148self.log = False149150if obj is None:151return152153if isinstance(obj, (_DictWrapper, Cdf, Pdf)):154self.label = label if label is not None else obj.label155156if isinstance(obj, dict):157self.d.update(obj.items())158elif isinstance(obj, (_DictWrapper, Cdf, Pdf)):159self.d.update(obj.Items())160elif isinstance(obj, pandas.Series):161self.d.update(obj.value_counts().iteritems())162else:163# finally, treat it like a list164self.d.update(Counter(obj))165166if len(self) > 0 and isinstance(self, Pmf):167self.Normalize()168169def __hash__(self):170return id(self)171172def __str__(self):173cls = self.__class__.__name__174if self.label == DEFAULT_LABEL:175return '%s(%s)' % (cls, str(self.d))176else:177return self.label178179def __repr__(self):180cls = self.__class__.__name__181if self.label == DEFAULT_LABEL:182return '%s(%s)' % (cls, repr(self.d))183else:184return '%s(%s, %s)' % (cls, repr(self.d), repr(self.label))185186def __eq__(self, other):187try:188return self.d == other.d189except AttributeError:190return False191192def __len__(self):193return len(self.d)194195def __iter__(self):196return iter(self.d)197198def iterkeys(self):199"""Returns an iterator over keys."""200return iter(self.d)201202def __contains__(self, value):203return value in self.d204205def __getitem__(self, value):206return self.d.get(value, 0)207208def __setitem__(self, value, prob):209self.d[value] = prob210211def __delitem__(self, value):212del self.d[value]213214def Copy(self, label=None):215"""Returns a copy.216217Make a shallow copy of d. If you want a deep copy of d,218use copy.deepcopy on the whole object.219220label: string label for the new Hist221222returns: new _DictWrapper with the same type223"""224new = copy.copy(self)225new.d = copy.copy(self.d)226new.label = label if label is not None else self.label227return new228229def Scale(self, factor):230"""Multiplies the values by a factor.231232factor: what to multiply by233234Returns: new object235"""236new = self.Copy()237new.d.clear()238239for val, prob in self.Items():240new.Set(val * factor, prob)241return new242243def Log(self, m=None):244"""Log transforms the probabilities.245246Removes values with probability 0.247248Normalizes so that the largest logprob is 0.249"""250if self.log:251raise ValueError("Pmf/Hist already under a log transform")252self.log = True253254if m is None:255m = self.MaxLike()256257for x, p in self.d.items():258if p:259self.Set(x, math.log(p / m))260else:261self.Remove(x)262263def Exp(self, m=None):264"""Exponentiates the probabilities.265266m: how much to shift the ps before exponentiating267268If m is None, normalizes so that the largest prob is 1.269"""270if not self.log:271raise ValueError("Pmf/Hist not under a log transform")272self.log = False273274if m is None:275m = self.MaxLike()276277for x, p in self.d.items():278self.Set(x, math.exp(p - m))279280def GetDict(self):281"""Gets the dictionary."""282return self.d283284def SetDict(self, d):285"""Sets the dictionary."""286self.d = d287288def Values(self):289"""Gets an unsorted sequence of values.290291Note: one source of confusion is that the keys of this292dictionary are the values of the Hist/Pmf, and the293values of the dictionary are frequencies/probabilities.294"""295return self.d.keys()296297def Items(self):298"""Gets an unsorted sequence of (value, freq/prob) pairs."""299return self.d.items()300301def SortedItems(self):302"""Gets a sorted sequence of (value, freq/prob) pairs.303304It items are unsortable, the result is unsorted.305"""306def isnan(x):307try:308return math.isnan(x)309except TypeError:310return False311312if any([isnan(x) for x in self.Values()]):313msg = 'Keys contain NaN, may not sort correctly.'314logging.warning(msg)315316try:317return sorted(self.d.items())318except TypeError:319return self.d.items()320321def Render(self, **options):322"""Generates a sequence of points suitable for plotting.323324Note: options are ignored325326Returns:327tuple of (sorted value sequence, freq/prob sequence)328"""329return zip(*self.SortedItems())330331def MakeCdf(self, label=None):332"""Makes a Cdf."""333label = label if label is not None else self.label334return Cdf(self, label=label)335336def Print(self):337"""Prints the values and freqs/probs in ascending order."""338for val, prob in self.SortedItems():339print(val, prob)340341def Set(self, x, y=0):342"""Sets the freq/prob associated with the value x.343344Args:345x: number value346y: number freq or prob347"""348self.d[x] = y349350def Incr(self, x, term=1):351"""Increments the freq/prob associated with the value x.352353Args:354x: number value355term: how much to increment by356"""357self.d[x] = self.d.get(x, 0) + term358359def Mult(self, x, factor):360"""Scales the freq/prob associated with the value x.361362Args:363x: number value364factor: how much to multiply by365"""366self.d[x] = self.d.get(x, 0) * factor367368def Remove(self, x):369"""Removes a value.370371Throws an exception if the value is not there.372373Args:374x: value to remove375"""376del self.d[x]377378def Total(self):379"""Returns the total of the frequencies/probabilities in the map."""380total = sum(self.d.values())381return total382383def MaxLike(self):384"""Returns the largest frequency/probability in the map."""385return max(self.d.values())386387def Largest(self, n=10):388"""Returns the largest n values, with frequency/probability.389390n: number of items to return391"""392return sorted(self.d.items(), reverse=True)[:n]393394def Smallest(self, n=10):395"""Returns the smallest n values, with frequency/probability.396397n: number of items to return398"""399return sorted(self.d.items(), reverse=False)[:n]400401402class Hist(_DictWrapper):403"""Represents a histogram, which is a map from values to frequencies.404405Values can be any hashable type; frequencies are integer counters.406"""407def Freq(self, x):408"""Gets the frequency associated with the value x.409410Args:411x: number value412413Returns:414int frequency415"""416return self.d.get(x, 0)417418def Freqs(self, xs):419"""Gets frequencies for a sequence of values."""420return [self.Freq(x) for x in xs]421422def IsSubset(self, other):423"""Checks whether the values in this histogram are a subset of424the values in the given histogram."""425for val, freq in self.Items():426if freq > other.Freq(val):427return False428return True429430def Subtract(self, other):431"""Subtracts the values in the given histogram from this histogram."""432for val, freq in other.Items():433self.Incr(val, -freq)434435436class Pmf(_DictWrapper):437"""Represents a probability mass function.438439Values can be any hashable type; probabilities are floating-point.440Pmfs are not necessarily normalized.441"""442443def Prob(self, x, default=0):444"""Gets the probability associated with the value x.445446Args:447x: number value448default: value to return if the key is not there449450Returns:451float probability452"""453return self.d.get(x, default)454455def Probs(self, xs):456"""Gets probabilities for a sequence of values."""457return [self.Prob(x) for x in xs]458459def Percentile(self, percentage):460"""Computes a percentile of a given Pmf.461462Note: this is not super efficient. If you are planning463to compute more than a few percentiles, compute the Cdf.464465percentage: float 0-100466467returns: value from the Pmf468"""469p = percentage / 100470total = 0471for val, prob in sorted(self.Items()):472total += prob473if total >= p:474return val475476def ProbGreater(self, x):477"""Probability that a sample from this Pmf exceeds x.478479x: number480481returns: float probability482"""483if isinstance(x, _DictWrapper):484return PmfProbGreater(self, x)485else:486t = [prob for (val, prob) in self.d.items() if val > x]487return sum(t)488489def ProbLess(self, x):490"""Probability that a sample from this Pmf is less than x.491492x: number493494returns: float probability495"""496if isinstance(x, _DictWrapper):497return PmfProbLess(self, x)498else:499t = [prob for (val, prob) in self.d.items() if val < x]500return sum(t)501502def ProbEqual(self, x):503"""Probability that a sample from this Pmf is exactly x.504505x: number506507returns: float probability508"""509if isinstance(x, _DictWrapper):510return PmfProbEqual(self, x)511else:512return self[x]513514# NOTE: I've decided to remove the magic comparators because they515# have the side-effect of making Pmf sortable, but in fact they516# don't support sorting.517518def Normalize(self, fraction=1):519"""Normalizes this PMF so the sum of all probs is fraction.520521Args:522fraction: what the total should be after normalization523524Returns: the total probability before normalizing525"""526if self.log:527raise ValueError("Normalize: Pmf is under a log transform")528529total = self.Total()530if total == 0:531raise ValueError('Normalize: total probability is zero.')532533factor = fraction / total534for x in self.d:535self.d[x] *= factor536537return total538539def Random(self):540"""Chooses a random element from this PMF.541542Note: this is not very efficient. If you plan to call543this more than a few times, consider converting to a CDF.544545Returns:546float value from the Pmf547"""548target = random.random()549total = 0550for x, p in self.d.items():551total += p552if total >= target:553return x554555# we shouldn't get here556raise ValueError('Random: Pmf might not be normalized.')557558def Sample(self, n):559"""Generates a random sample from this distribution.560561n: int length of the sample562returns: NumPy array563"""564return self.MakeCdf().Sample(n)565566def Mean(self):567"""Computes the mean of a PMF.568569Returns:570float mean571"""572return sum(p * x for x, p in self.Items())573574def Median(self):575"""Computes the median of a PMF.576577Returns:578float median579"""580return self.MakeCdf().Percentile(50)581582def Var(self, mu=None):583"""Computes the variance of a PMF.584585mu: the point around which the variance is computed;586if omitted, computes the mean587588returns: float variance589"""590if mu is None:591mu = self.Mean()592593return sum(p * (x-mu)**2 for x, p in self.Items())594595def Expect(self, func):596"""Computes the expectation of func(x).597598Returns:599expectation600"""601return np.sum(p * func(x) for x, p in self.Items())602603def Std(self, mu=None):604"""Computes the standard deviation of a PMF.605606mu: the point around which the variance is computed;607if omitted, computes the mean608609returns: float standard deviation610"""611var = self.Var(mu)612return math.sqrt(var)613614def Mode(self):615"""Returns the value with the highest probability.616617Returns: float probability618"""619_, val = max((prob, val) for val, prob in self.Items())620return val621622# The mode of a posterior is the maximum aposteori probability (MAP)623MAP = Mode624625# If the distribution contains likelihoods only, the peak is the626# maximum likelihood estimator.627MaximumLikelihood = Mode628629def CredibleInterval(self, percentage=90):630"""Computes the central credible interval.631632If percentage=90, computes the 90% CI.633634Args:635percentage: float between 0 and 100636637Returns:638sequence of two floats, low and high639"""640cdf = self.MakeCdf()641return cdf.CredibleInterval(percentage)642643def __add__(self, other):644"""Computes the Pmf of the sum of values drawn from self and other.645646other: another Pmf or a scalar647648returns: new Pmf649"""650try:651return self.AddPmf(other)652except AttributeError:653return self.AddConstant(other)654655__radd__ = __add__656657def AddPmf(self, other):658"""Computes the Pmf of the sum of values drawn from self and other.659660other: another Pmf661662returns: new Pmf663"""664pmf = Pmf()665for v1, p1 in self.Items():666for v2, p2 in other.Items():667pmf[v1 + v2] += p1 * p2668return pmf669670def AddConstant(self, other):671"""Computes the Pmf of the sum a constant and values from self.672673other: a number674675returns: new Pmf676"""677if other == 0:678return self.Copy()679680pmf = Pmf()681for v1, p1 in self.Items():682pmf.Set(v1 + other, p1)683return pmf684685def __sub__(self, other):686"""Computes the Pmf of the diff of values drawn from self and other.687688other: another Pmf689690returns: new Pmf691"""692try:693return self.SubPmf(other)694except AttributeError:695return self.AddConstant(-other)696697def SubPmf(self, other):698"""Computes the Pmf of the diff of values drawn from self and other.699700other: another Pmf701702returns: new Pmf703"""704pmf = Pmf()705for v1, p1 in self.Items():706for v2, p2 in other.Items():707pmf.Incr(v1 - v2, p1 * p2)708return pmf709710def __mul__(self, other):711"""Computes the Pmf of the product of values drawn from self and other.712713other: another Pmf714715returns: new Pmf716"""717try:718return self.MulPmf(other)719except AttributeError:720return self.MulConstant(other)721722def MulPmf(self, other):723"""Computes the Pmf of the diff of values drawn from self and other.724725other: another Pmf726727returns: new Pmf728"""729pmf = Pmf()730for v1, p1 in self.Items():731for v2, p2 in other.Items():732pmf.Incr(v1 * v2, p1 * p2)733return pmf734735def MulConstant(self, other):736"""Computes the Pmf of the product of a constant and values from self.737738other: a number739740returns: new Pmf741"""742pmf = Pmf()743for v1, p1 in self.Items():744pmf.Set(v1 * other, p1)745return pmf746747def __div__(self, other):748"""Computes the Pmf of the ratio of values drawn from self and other.749750other: another Pmf751752returns: new Pmf753"""754try:755return self.DivPmf(other)756except AttributeError:757return self.MulConstant(1/other)758759__truediv__ = __div__760761def DivPmf(self, other):762"""Computes the Pmf of the ratio of values drawn from self and other.763764other: another Pmf765766returns: new Pmf767"""768pmf = Pmf()769for v1, p1 in self.Items():770for v2, p2 in other.Items():771pmf.Incr(v1 / v2, p1 * p2)772return pmf773774def Max(self, k):775"""Computes the CDF of the maximum of k selections from this dist.776777k: int778779returns: new Cdf780"""781cdf = self.MakeCdf()782cdf.ps **= k783return cdf784785786class Joint(Pmf):787"""Represents a joint distribution.788789The values are sequences (usually tuples)790"""791792def Marginal(self, i, label=None):793"""Gets the marginal distribution of the indicated variable.794795i: index of the variable we want796797Returns: Pmf798"""799pmf = Pmf(label=label)800for vs, prob in self.Items():801pmf.Incr(vs[i], prob)802return pmf803804def Conditional(self, i, j, val, label=None):805"""Gets the conditional distribution of the indicated variable.806807Distribution of vs[i], conditioned on vs[j] = val.808809i: index of the variable we want810j: which variable is conditioned on811val: the value the jth variable has to have812813Returns: Pmf814"""815pmf = Pmf(label=label)816for vs, prob in self.Items():817if vs[j] != val:818continue819pmf.Incr(vs[i], prob)820821pmf.Normalize()822return pmf823824def MaxLikeInterval(self, percentage=90):825"""Returns the maximum-likelihood credible interval.826827If percentage=90, computes a 90% CI containing the values828with the highest likelihoods.829830percentage: float between 0 and 100831832Returns: list of values from the suite833"""834interval = []835total = 0836837t = [(prob, val) for val, prob in self.Items()]838t.sort(reverse=True)839840for prob, val in t:841interval.append(val)842total += prob843if total >= percentage / 100:844break845846return interval847848849def MakeJoint(pmf1, pmf2):850"""Joint distribution of values from pmf1 and pmf2.851852Assumes that the PMFs represent independent random variables.853854Args:855pmf1: Pmf object856pmf2: Pmf object857858Returns:859Joint pmf of value pairs860"""861joint = Joint()862for v1, p1 in pmf1.Items():863for v2, p2 in pmf2.Items():864joint.Set((v1, v2), p1 * p2)865return joint866867868def MakeHistFromList(t, label=None):869"""Makes a histogram from an unsorted sequence of values.870871Args:872t: sequence of numbers873label: string label for this histogram874875Returns:876Hist object877"""878return Hist(t, label=label)879880881def MakeHistFromDict(d, label=None):882"""Makes a histogram from a map from values to frequencies.883884Args:885d: dictionary that maps values to frequencies886label: string label for this histogram887888Returns:889Hist object890"""891return Hist(d, label)892893894def MakePmfFromList(t, label=None):895"""Makes a PMF from an unsorted sequence of values.896897Args:898t: sequence of numbers899label: string label for this PMF900901Returns:902Pmf object903"""904return Pmf(t, label=label)905906907def MakePmfFromDict(d, label=None):908"""Makes a PMF from a map from values to probabilities.909910Args:911d: dictionary that maps values to probabilities912label: string label for this PMF913914Returns:915Pmf object916"""917return Pmf(d, label=label)918919920def MakePmfFromItems(t, label=None):921"""Makes a PMF from a sequence of value-probability pairs922923Args:924t: sequence of value-probability pairs925label: string label for this PMF926927Returns:928Pmf object929"""930return Pmf(dict(t), label=label)931932933def MakePmfFromHist(hist, label=None):934"""Makes a normalized PMF from a Hist object.935936Args:937hist: Hist object938label: string label939940Returns:941Pmf object942"""943if label is None:944label = hist.label945946return Pmf(hist, label=label)947948949def MakeMixture(metapmf, label='mix'):950"""Make a mixture distribution.951952Args:953metapmf: Pmf that maps from Pmfs to probs.954label: string label for the new Pmf.955956Returns: Pmf object.957"""958mix = Pmf(label=label)959for pmf, p1 in metapmf.Items():960for x, p2 in pmf.Items():961mix[x] += p1 * p2962return mix963964965def MakeUniformPmf(low, high, n):966"""Make a uniform Pmf.967968low: lowest value (inclusive)969high: highest value (inclusize)970n: number of values971"""972pmf = Pmf()973for x in np.linspace(low, high, n):974pmf.Set(x, 1)975pmf.Normalize()976return pmf977978979class Cdf:980"""Represents a cumulative distribution function.981982Attributes:983xs: sequence of values984ps: sequence of probabilities985label: string used as a graph label.986"""987def __init__(self, obj=None, ps=None, label=None):988"""Initializes.989990If ps is provided, obj must be the corresponding list of values.991992obj: Hist, Pmf, Cdf, Pdf, dict, pandas Series, list of pairs993ps: list of cumulative probabilities994label: string label995"""996self.label = label if label is not None else DEFAULT_LABEL997998if isinstance(obj, (_DictWrapper, Cdf, Pdf)):999if not label:1000self.label = label if label is not None else obj.label10011002if obj is None:1003# caller does not provide obj, make an empty Cdf1004self.xs = np.asarray([])1005self.ps = np.asarray([])1006if ps is not None:1007logging.warning("Cdf: can't pass ps without also passing xs.")1008return1009else:1010# if the caller provides xs and ps, just store them1011if ps is not None:1012if isinstance(ps, str):1013logging.warning("Cdf: ps can't be a string")10141015self.xs = np.asarray(obj)1016self.ps = np.asarray(ps)1017return10181019# caller has provided just obj, not ps1020if isinstance(obj, Cdf):1021self.xs = copy.copy(obj.xs)1022self.ps = copy.copy(obj.ps)1023return10241025if isinstance(obj, _DictWrapper):1026dw = obj1027else:1028dw = Hist(obj)10291030if len(dw) == 0:1031self.xs = np.asarray([])1032self.ps = np.asarray([])1033return10341035xs, freqs = zip(*sorted(dw.Items()))1036self.xs = np.asarray(xs)1037self.ps = np.cumsum(freqs, dtype=np.float)1038self.ps /= self.ps[-1]10391040def __str__(self):1041cls = self.__class__.__name__1042if self.label == DEFAULT_LABEL:1043return '%s(%s, %s)' % (cls, str(self.xs), str(self.ps))1044else:1045return self.label10461047def __repr__(self):1048cls = self.__class__.__name__1049if self.label == DEFAULT_LABEL:1050return '%s(%s, %s)' % (cls, str(self.xs), str(self.ps))1051else:1052return '%s(%s, %s, %s)' % (cls, str(self.xs), str(self.ps),1053repr(self.label))10541055def __len__(self):1056return len(self.xs)10571058def __getitem__(self, x):1059return self.Prob(x)10601061def __setitem__(self):1062raise UnimplementedMethodException()10631064def __delitem__(self):1065raise UnimplementedMethodException()10661067def __eq__(self, other):1068return np.all(self.xs == other.xs) and np.all(self.ps == other.ps)10691070def Print(self):1071"""Prints the values and freqs/probs in ascending order."""1072for val, prob in zip(self.xs, self.ps):1073print(val, prob)10741075def Copy(self, label=None):1076"""Returns a copy of this Cdf.10771078label: string label for the new Cdf1079"""1080if label is None:1081label = self.label1082return Cdf(list(self.xs), list(self.ps), label=label)10831084def MakePmf(self, label=None):1085"""Makes a Pmf."""1086if label is None:1087label = self.label1088return Pmf(self, label=label)10891090def Items(self):1091"""Returns a sorted sequence of (value, probability) pairs.10921093Note: in Python3, returns an iterator.1094"""1095a = self.ps1096b = np.roll(a, 1)1097b[0] = 01098return zip(self.xs, a-b)10991100def Shift(self, term):1101"""Adds a term to the xs.11021103term: how much to add1104"""1105new = self.Copy()1106# don't use +=, or else an int array + float yields int array1107new.xs = new.xs + term1108return new11091110def Scale(self, factor):1111"""Multiplies the xs by a factor.11121113factor: what to multiply by1114"""1115new = self.Copy()1116# don't use *=, or else an int array * float yields int array1117new.xs = new.xs * factor1118return new11191120def Prob(self, x):1121"""Returns CDF(x), the probability that corresponds to value x.11221123Args:1124x: number11251126Returns:1127float probability1128"""1129if x < self.xs[0]:1130return 01131index = bisect.bisect(self.xs, x)1132p = self.ps[index-1]1133return p11341135def Probs(self, xs):1136"""Gets probabilities for a sequence of values.11371138xs: any sequence that can be converted to NumPy array11391140returns: NumPy array of cumulative probabilities1141"""1142xs = np.asarray(xs)1143index = np.searchsorted(self.xs, xs, side='right')1144ps = self.ps[index-1]1145ps[xs < self.xs[0]] = 01146return ps11471148ProbArray = Probs11491150def Value(self, p):1151"""Returns InverseCDF(p), the value that corresponds to probability p.11521153Args:1154p: number in the range [0, 1]11551156Returns:1157number value1158"""1159if p < 0 or p > 1:1160raise ValueError('Probability p must be in range [0, 1]')11611162index = bisect.bisect_left(self.ps, p)1163return self.xs[index]11641165def Values(self, ps=None):1166"""Returns InverseCDF(p), the value that corresponds to probability p.11671168If ps is not provided, returns all values.11691170Args:1171ps: NumPy array of numbers in the range [0, 1]11721173Returns:1174NumPy array of values1175"""1176if ps is None:1177return self.xs11781179ps = np.asarray(ps)1180if np.any(ps < 0) or np.any(ps > 1):1181raise ValueError('Probability p must be in range [0, 1]')11821183index = np.searchsorted(self.ps, ps, side='left')1184return self.xs[index]11851186ValueArray = Values11871188def Percentile(self, p):1189"""Returns the value that corresponds to percentile p.11901191Args:1192p: number in the range [0, 100]11931194Returns:1195number value1196"""1197return self.Value(p / 100)11981199def Percentiles(self, ps):1200"""Returns the value that corresponds to percentiles ps.12011202Args:1203ps: numbers in the range [0, 100]12041205Returns:1206array of values1207"""1208ps = np.asarray(ps)1209return self.Values(ps / 100)12101211def PercentileRank(self, x):1212"""Returns the percentile rank of the value x.12131214x: potential value in the CDF12151216returns: percentile rank in the range 0 to 1001217"""1218return self.Prob(x) * 10012191220def PercentileRanks(self, xs):1221"""Returns the percentile ranks of the values in xs.12221223xs: potential value in the CDF12241225returns: array of percentile ranks in the range 0 to 1001226"""1227return self.Probs(x) * 10012281229def Random(self):1230"""Chooses a random value from this distribution."""1231return self.Value(random.random())12321233def Sample(self, n):1234"""Generates a random sample from this distribution.12351236n: int length of the sample1237returns: NumPy array1238"""1239ps = np.random.random(n)1240return self.ValueArray(ps)12411242def Mean(self):1243"""Computes the mean of a CDF.12441245Returns:1246float mean1247"""1248old_p = 01249total = 01250for x, new_p in zip(self.xs, self.ps):1251p = new_p - old_p1252total += p * x1253old_p = new_p1254return total12551256def CredibleInterval(self, percentage=90):1257"""Computes the central credible interval.12581259If percentage=90, computes the 90% CI.12601261Args:1262percentage: float between 0 and 10012631264Returns:1265sequence of two floats, low and high1266"""1267prob = (1 - percentage / 100) / 21268interval = self.Value(prob), self.Value(1 - prob)1269return interval12701271ConfidenceInterval = CredibleInterval12721273def _Round(self, multiplier=1000):1274"""1275An entry is added to the cdf only if the percentile differs1276from the previous value in a significant digit, where the number1277of significant digits is determined by multiplier. The1278default is 1000, which keeps log10(1000) = 3 significant digits.1279"""1280# TODO(write this method)1281raise UnimplementedMethodException()12821283def Render(self, **options):1284"""Generates a sequence of points suitable for plotting.12851286An empirical CDF is a step function; linear interpolation1287can be misleading.12881289Note: options are ignored12901291Returns:1292tuple of (xs, ps)1293"""1294def interleave(a, b):1295c = np.empty(a.shape[0] + b.shape[0])1296c[::2] = a1297c[1::2] = b1298return c12991300a = np.array(self.xs)1301xs = interleave(a, a)1302shift_ps = np.roll(self.ps, 1)1303shift_ps[0] = 01304ps = interleave(shift_ps, self.ps)1305return xs, ps13061307def Max(self, k):1308"""Computes the CDF of the maximum of k selections from this dist.13091310k: int13111312returns: new Cdf1313"""1314cdf = self.Copy()1315cdf.ps **= k1316return cdf131713181319def MakeCdfFromItems(items, label=None):1320"""Makes a cdf from an unsorted sequence of (value, frequency) pairs.13211322Args:1323items: unsorted sequence of (value, frequency) pairs1324label: string label for this CDF13251326Returns:1327cdf: list of (value, fraction) pairs1328"""1329return Cdf(dict(items), label=label)133013311332def MakeCdfFromDict(d, label=None):1333"""Makes a CDF from a dictionary that maps values to frequencies.13341335Args:1336d: dictionary that maps values to frequencies.1337label: string label for the data.13381339Returns:1340Cdf object1341"""1342return Cdf(d, label=label)134313441345def MakeCdfFromList(seq, label=None):1346"""Creates a CDF from an unsorted sequence.13471348Args:1349seq: unsorted sequence of sortable values1350label: string label for the cdf13511352Returns:1353Cdf object1354"""1355return Cdf(seq, label=label)135613571358def MakeCdfFromHist(hist, label=None):1359"""Makes a CDF from a Hist object.13601361Args:1362hist: Pmf.Hist object1363label: string label for the data.13641365Returns:1366Cdf object1367"""1368if label is None:1369label = hist.label13701371return Cdf(hist, label=label)137213731374def MakeCdfFromPmf(pmf, label=None):1375"""Makes a CDF from a Pmf object.13761377Args:1378pmf: Pmf.Pmf object1379label: string label for the data.13801381Returns:1382Cdf object1383"""1384if label is None:1385label = pmf.label13861387return Cdf(pmf, label=label)138813891390class UnimplementedMethodException(Exception):1391"""Exception if someone calls a method that should be overridden."""139213931394class Suite(Pmf):1395"""Represents a suite of hypotheses and their probabilities."""13961397def Update(self, data):1398"""Updates each hypothesis based on the data.13991400data: any representation of the data14011402returns: the normalizing constant1403"""1404for hypo in self.Values():1405like = self.Likelihood(data, hypo)1406self.Mult(hypo, like)1407return self.Normalize()14081409def LogUpdate(self, data):1410"""Updates a suite of hypotheses based on new data.14111412Modifies the suite directly; if you want to keep the original, make1413a copy.14141415Note: unlike Update, LogUpdate does not normalize.14161417Args:1418data: any representation of the data1419"""1420for hypo in self.Values():1421like = self.LogLikelihood(data, hypo)1422self.Incr(hypo, like)14231424def UpdateSet(self, dataset):1425"""Updates each hypothesis based on the dataset.14261427This is more efficient than calling Update repeatedly because1428it waits until the end to Normalize.14291430Modifies the suite directly; if you want to keep the original, make1431a copy.14321433dataset: a sequence of data14341435returns: the normalizing constant1436"""1437for data in dataset:1438for hypo in self.Values():1439like = self.Likelihood(data, hypo)1440self.Mult(hypo, like)1441return self.Normalize()14421443def LogUpdateSet(self, dataset):1444"""Updates each hypothesis based on the dataset.14451446Modifies the suite directly; if you want to keep the original, make1447a copy.14481449dataset: a sequence of data14501451returns: None1452"""1453for data in dataset:1454self.LogUpdate(data)14551456def Likelihood(self, data, hypo):1457"""Computes the likelihood of the data under the hypothesis.14581459hypo: some representation of the hypothesis1460data: some representation of the data1461"""1462raise UnimplementedMethodException()14631464def LogLikelihood(self, data, hypo):1465"""Computes the log likelihood of the data under the hypothesis.14661467hypo: some representation of the hypothesis1468data: some representation of the data1469"""1470raise UnimplementedMethodException()14711472def Print(self):1473"""Prints the hypotheses and their probabilities."""1474for hypo, prob in sorted(self.Items()):1475print(hypo, prob)14761477def MakeOdds(self):1478"""Transforms from probabilities to odds.14791480Values with prob=0 are removed.1481"""1482for hypo, prob in self.Items():1483if prob:1484self.Set(hypo, Odds(prob))1485else:1486self.Remove(hypo)14871488def MakeProbs(self):1489"""Transforms from odds to probabilities."""1490for hypo, odds in self.Items():1491self.Set(hypo, Probability(odds))149214931494def MakeSuiteFromList(t, label=None):1495"""Makes a suite from an unsorted sequence of values.14961497Args:1498t: sequence of numbers1499label: string label for this suite15001501Returns:1502Suite object1503"""1504hist = MakeHistFromList(t, label=label)1505d = hist.GetDict()1506return MakeSuiteFromDict(d)150715081509def MakeSuiteFromHist(hist, label=None):1510"""Makes a normalized suite from a Hist object.15111512Args:1513hist: Hist object1514label: string label15151516Returns:1517Suite object1518"""1519if label is None:1520label = hist.label15211522# make a copy of the dictionary1523d = dict(hist.GetDict())1524return MakeSuiteFromDict(d, label)152515261527def MakeSuiteFromDict(d, label=None):1528"""Makes a suite from a map from values to probabilities.15291530Args:1531d: dictionary that maps values to probabilities1532label: string label for this suite15331534Returns:1535Suite object1536"""1537suite = Suite(label=label)1538suite.SetDict(d)1539suite.Normalize()1540return suite154115421543class Pdf(object):1544"""Represents a probability density function (PDF)."""15451546def Density(self, x):1547"""Evaluates this Pdf at x.15481549Returns: float or NumPy array of probability density1550"""1551raise UnimplementedMethodException()15521553def GetLinspace(self):1554"""Get a linspace for plotting.15551556Not all subclasses of Pdf implement this.15571558Returns: numpy array1559"""1560raise UnimplementedMethodException()15611562def MakePmf(self, **options):1563"""Makes a discrete version of this Pdf.15641565options can include1566label: string1567low: low end of range1568high: high end of range1569n: number of places to evaluate15701571Returns: new Pmf1572"""1573label = options.pop('label', '')1574xs, ds = self.Render(**options)1575return Pmf(dict(zip(xs, ds)), label=label)15761577def Render(self, **options):1578"""Generates a sequence of points suitable for plotting.15791580If options includes low and high, it must also include n;1581in that case the density is evaluated an n locations between1582low and high, including both.15831584If options includes xs, the density is evaluate at those location.15851586Otherwise, self.GetLinspace is invoked to provide the locations.15871588Returns:1589tuple of (xs, densities)1590"""1591low, high = options.pop('low', None), options.pop('high', None)1592if low is not None and high is not None:1593n = options.pop('n', 101)1594xs = np.linspace(low, high, n)1595else:1596xs = options.pop('xs', None)1597if xs is None:1598xs = self.GetLinspace()15991600ds = self.Density(xs)1601return xs, ds16021603def Items(self):1604"""Generates a sequence of (value, probability) pairs.1605"""1606return zip(*self.Render())160716081609class NormalPdf(Pdf):1610"""Represents the PDF of a Normal distribution."""16111612def __init__(self, mu=0, sigma=1, label=None):1613"""Constructs a Normal Pdf with given mu and sigma.16141615mu: mean1616sigma: standard deviation1617label: string1618"""1619self.mu = mu1620self.sigma = sigma1621self.label = label if label is not None else '_nolegend_'16221623def __str__(self):1624return 'NormalPdf(%f, %f)' % (self.mu, self.sigma)16251626def GetLinspace(self):1627"""Get a linspace for plotting.16281629Returns: numpy array1630"""1631low, high = self.mu-3*self.sigma, self.mu+3*self.sigma1632return np.linspace(low, high, 101)16331634def Density(self, xs):1635"""Evaluates this Pdf at xs.16361637xs: scalar or sequence of floats16381639returns: float or NumPy array of probability density1640"""1641return stats.norm.pdf(xs, self.mu, self.sigma)164216431644class ExponentialPdf(Pdf):1645"""Represents the PDF of an exponential distribution."""16461647def __init__(self, lam=1, label=None):1648"""Constructs an exponential Pdf with given parameter.16491650lam: rate parameter1651label: string1652"""1653self.lam = lam1654self.label = label if label is not None else '_nolegend_'16551656def __str__(self):1657return 'ExponentialPdf(%f)' % (self.lam)16581659def GetLinspace(self):1660"""Get a linspace for plotting.16611662Returns: numpy array1663"""1664low, high = 0, 5.0/self.lam1665return np.linspace(low, high, 101)16661667def Density(self, xs):1668"""Evaluates this Pdf at xs.16691670xs: scalar or sequence of floats16711672returns: float or NumPy array of probability density1673"""1674return stats.expon.pdf(xs, scale=1.0/self.lam)167516761677class EstimatedPdf(Pdf):1678"""Represents a PDF estimated by KDE."""16791680def __init__(self, sample, label=None):1681"""Estimates the density function based on a sample.16821683sample: sequence of data1684label: string1685"""1686self.label = label if label is not None else '_nolegend_'1687self.kde = stats.gaussian_kde(sample)1688low = min(sample)1689high = max(sample)1690self.linspace = np.linspace(low, high, 101)16911692def __str__(self):1693return 'EstimatedPdf(label=%s)' % str(self.label)16941695def GetLinspace(self):1696"""Get a linspace for plotting.16971698Returns: numpy array1699"""1700return self.linspace17011702def Density(self, xs):1703"""Evaluates this Pdf at xs.17041705returns: float or NumPy array of probability density1706"""1707return self.kde.evaluate(xs)17081709def Sample(self, n):1710"""Generates a random sample from the estimated Pdf.17111712n: size of sample1713"""1714# NOTE: we have to flatten because resample returns a 2-D1715# array for some reason.1716return self.kde.resample(n).flatten()171717181719def CredibleInterval(pmf, percentage=90):1720"""Computes a credible interval for a given distribution.17211722If percentage=90, computes the 90% CI.17231724Args:1725pmf: Pmf object representing a posterior distribution1726percentage: float between 0 and 10017271728Returns:1729sequence of two floats, low and high1730"""1731cdf = pmf.MakeCdf()1732prob = (1 - percentage / 100) / 21733interval = cdf.Value(prob), cdf.Value(1 - prob)1734return interval173517361737def PmfProbLess(pmf1, pmf2):1738"""Probability that a value from pmf1 is less than a value from pmf2.17391740Args:1741pmf1: Pmf object1742pmf2: Pmf object17431744Returns:1745float probability1746"""1747total = 01748for v1, p1 in pmf1.Items():1749for v2, p2 in pmf2.Items():1750if v1 < v2:1751total += p1 * p21752return total175317541755def PmfProbGreater(pmf1, pmf2):1756"""Probability that a value from pmf1 is less than a value from pmf2.17571758Args:1759pmf1: Pmf object1760pmf2: Pmf object17611762Returns:1763float probability1764"""1765total = 01766for v1, p1 in pmf1.Items():1767for v2, p2 in pmf2.Items():1768if v1 > v2:1769total += p1 * p21770return total177117721773def PmfProbEqual(pmf1, pmf2):1774"""Probability that a value from pmf1 equals a value from pmf2.17751776Args:1777pmf1: Pmf object1778pmf2: Pmf object17791780Returns:1781float probability1782"""1783total = 01784for v1, p1 in pmf1.Items():1785for v2, p2 in pmf2.Items():1786if v1 == v2:1787total += p1 * p21788return total178917901791def RandomSum(dists):1792"""Chooses a random value from each dist and returns the sum.17931794dists: sequence of Pmf or Cdf objects17951796returns: numerical sum1797"""1798total = sum(dist.Random() for dist in dists)1799return total180018011802def SampleSum(dists, n):1803"""Draws a sample of sums from a list of distributions.18041805dists: sequence of Pmf or Cdf objects1806n: sample size18071808returns: new Pmf of sums1809"""1810pmf = Pmf(RandomSum(dists) for i in range(n))1811return pmf181218131814def EvalNormalPdf(x, mu, sigma):1815"""Computes the unnormalized PDF of the normal distribution.18161817x: value1818mu: mean1819sigma: standard deviation18201821returns: float probability density1822"""1823return stats.norm.pdf(x, mu, sigma)182418251826def MakeNormalPmf(mu, sigma, num_sigmas, n=201):1827"""Makes a PMF discrete approx to a Normal distribution.18281829mu: float mean1830sigma: float standard deviation1831num_sigmas: how many sigmas to extend in each direction1832n: number of values in the Pmf18331834returns: normalized Pmf1835"""1836pmf = Pmf()1837low = mu - num_sigmas * sigma1838high = mu + num_sigmas * sigma18391840for x in np.linspace(low, high, n):1841p = EvalNormalPdf(x, mu, sigma)1842pmf.Set(x, p)1843pmf.Normalize()1844return pmf184518461847def EvalBinomialPmf(k, n, p):1848"""Evaluates the binomial PMF.18491850Returns the probabily of k successes in n trials with probability p.1851"""1852return stats.binom.pmf(k, n, p)185318541855def MakeBinomialPmf(n, p):1856"""Evaluates the binomial PMF.18571858Returns the distribution of successes in n trials with probability p.1859"""1860pmf = Pmf()1861for k in range(n+1):1862pmf[k] = stats.binom.pmf(k, n, p)1863return pmf186418651866def EvalGammaPdf(x, a):1867"""Computes the Gamma PDF.18681869x: where to evaluate the PDF1870a: parameter of the gamma distribution18711872returns: float probability1873"""1874return x**(a-1) * np.exp(-x) / gamma(a)187518761877def MakeGammaPmf(xs, a):1878"""Makes a PMF discrete approx to a Gamma distribution.18791880lam: parameter lambda in events per unit time1881xs: upper bound of the Pmf18821883returns: normalized Pmf1884"""1885xs = np.asarray(xs)1886ps = EvalGammaPdf(xs, a)1887pmf = Pmf(dict(zip(xs, ps)))1888pmf.Normalize()1889return pmf189018911892def EvalGeometricPmf(k, p, loc=0):1893"""Evaluates the geometric PMF.18941895With loc=0: Probability of `k` trials to get one success.1896With loc=-1: Probability of `k` trials before first success.18971898k: number of trials1899p: probability of success on each trial1900"""1901return stats.geom.pmf(k, p, loc=loc)190219031904def MakeGeometricPmf(p, loc=0, high=10):1905"""Evaluates the binomial PMF.19061907With loc=0: PMF of trials to get one success.1908With loc=-1: PMF of trials before first success.19091910p: probability of success1911high: upper bound where PMF is truncated1912"""1913pmf = Pmf()1914for k in range(high):1915pmf[k] = stats.geom.pmf(k, p, loc=loc)1916pmf.Normalize()1917return pmf191819191920def EvalHypergeomPmf(k, N, K, n):1921"""Evaluates the hypergeometric PMF.19221923Returns the probabily of k successes in n trials from a population1924N with K successes in it.1925"""1926return stats.hypergeom.pmf(k, N, K, n)192719281929def EvalPoissonPmf(k, lam):1930"""Computes the Poisson PMF.19311932k: number of events1933lam: parameter lambda in events per unit time19341935returns: float probability1936"""1937return stats.poisson.pmf(k, lam)193819391940def MakePoissonPmf(lam, high, step=1):1941"""Makes a PMF discrete approx to a Poisson distribution.19421943lam: parameter lambda in events per unit time1944high: upper bound of the Pmf19451946returns: normalized Pmf1947"""1948pmf = Pmf()1949for k in range(0, high + 1, step):1950p = stats.poisson.pmf(k, lam)1951pmf.Set(k, p)1952pmf.Normalize()1953return pmf195419551956def EvalExponentialPdf(x, lam):1957"""Computes the exponential PDF.19581959x: value1960lam: parameter lambda in events per unit time19611962returns: float probability density1963"""1964return lam * math.exp(-lam * x)196519661967def EvalExponentialCdf(x, lam):1968"""Evaluates CDF of the exponential distribution with parameter lam."""1969return 1 - math.exp(-lam * x)197019711972def MakeExponentialPmf(lam, high, n=200):1973"""Makes a PMF discrete approx to an exponential distribution.19741975lam: parameter lambda in events per unit time1976high: upper bound1977n: number of values in the Pmf19781979returns: normalized Pmf1980"""1981pmf = Pmf()1982for x in np.linspace(0, high, n):1983p = EvalExponentialPdf(x, lam)1984pmf.Set(x, p)1985pmf.Normalize()1986return pmf198719881989def EvalWeibullPdf(x, lam, k):1990"""Computes the Weibull PDF.19911992x: value1993lam: parameter lambda in events per unit time1994k: parameter19951996returns: float probability density1997"""1998arg = (x / lam)1999return k / lam * arg**(k-1) * np.exp(-arg**k)200020012002def EvalWeibullCdf(x, lam, k):2003"""Evaluates CDF of the Weibull distribution."""2004arg = (x / lam)2005return 1 - np.exp(-arg**k)200620072008def MakeWeibullPmf(lam, k, high, n=200):2009"""Makes a PMF discrete approx to a Weibull distribution.20102011lam: parameter lambda in events per unit time2012k: parameter2013high: upper bound2014n: number of values in the Pmf20152016returns: normalized Pmf2017"""2018xs = np.linspace(0, high, n)2019ps = EvalWeibullPdf(xs, lam, k)2020ps[np.isinf(ps)] = 02021return Pmf(dict(zip(xs, ps)))202220232024def EvalParetoPdf(x, xm, alpha):2025"""Computes the Pareto.20262027xm: minimum value (scale parameter)2028alpha: shape parameter20292030returns: float probability density2031"""2032return stats.pareto.pdf(x, alpha, scale=xm)203320342035def MakeParetoPmf(xm, alpha, high, num=101):2036"""Makes a PMF discrete approx to a Pareto distribution.20372038xm: minimum value (scale parameter)2039alpha: shape parameter2040high: upper bound value2041num: number of values20422043returns: normalized Pmf2044"""2045xs = np.linspace(xm, high, num)2046ps = stats.pareto.pdf(xs, alpha, scale=xm)2047pmf = Pmf(dict(zip(xs, ps)))2048return pmf20492050def StandardNormalCdf(x):2051"""Evaluates the CDF of the standard Normal distribution.20522053See http://en.wikipedia.org/wiki/Normal_distribution2054#Cumulative_distribution_function20552056Args:2057x: float20582059Returns:2060float2061"""2062return (math.erf(x / ROOT2) + 1) / 2206320642065def EvalNormalCdf(x, mu=0, sigma=1):2066"""Evaluates the CDF of the normal distribution.20672068Args:2069x: float20702071mu: mean parameter20722073sigma: standard deviation parameter20742075Returns:2076float2077"""2078return stats.norm.cdf(x, loc=mu, scale=sigma)207920802081def EvalNormalCdfInverse(p, mu=0, sigma=1):2082"""Evaluates the inverse CDF of the normal distribution.20832084See http://en.wikipedia.org/wiki/Normal_distribution#Quantile_function20852086Args:2087p: float20882089mu: mean parameter20902091sigma: standard deviation parameter20922093Returns:2094float2095"""2096return stats.norm.ppf(p, loc=mu, scale=sigma)209720982099def EvalLognormalCdf(x, mu=0, sigma=1):2100"""Evaluates the CDF of the lognormal distribution.21012102x: float or sequence2103mu: mean parameter2104sigma: standard deviation parameter21052106Returns: float or sequence2107"""2108return stats.lognorm.cdf(x, loc=mu, scale=sigma)210921102111def RenderExpoCdf(lam, low, high, n=101):2112"""Generates sequences of xs and ps for an exponential CDF.21132114lam: parameter2115low: float2116high: float2117n: number of points to render21182119returns: numpy arrays (xs, ps)2120"""2121xs = np.linspace(low, high, n)2122ps = 1 - np.exp(-lam * xs)2123#ps = stats.expon.cdf(xs, scale=1.0/lam)2124return xs, ps212521262127def RenderNormalCdf(mu, sigma, low, high, n=101):2128"""Generates sequences of xs and ps for a Normal CDF.21292130mu: parameter2131sigma: parameter2132low: float2133high: float2134n: number of points to render21352136returns: numpy arrays (xs, ps)2137"""2138xs = np.linspace(low, high, n)2139ps = stats.norm.cdf(xs, mu, sigma)2140return xs, ps214121422143def RenderParetoCdf(xmin, alpha, low, high, n=50):2144"""Generates sequences of xs and ps for a Pareto CDF.21452146xmin: parameter2147alpha: parameter2148low: float2149high: float2150n: number of points to render21512152returns: numpy arrays (xs, ps)2153"""2154if low < xmin:2155low = xmin2156xs = np.linspace(low, high, n)2157ps = 1 - (xs / xmin) ** -alpha2158#ps = stats.pareto.cdf(xs, scale=xmin, b=alpha)2159return xs, ps216021612162class Beta:2163"""Represents a Beta distribution.21642165See http://en.wikipedia.org/wiki/Beta_distribution2166"""2167def __init__(self, alpha=1, beta=1, label=None):2168"""Initializes a Beta distribution."""2169self.alpha = alpha2170self.beta = beta2171self.label = label if label is not None else '_nolegend_'21722173def Update(self, data):2174"""Updates a Beta distribution.21752176data: pair of int (heads, tails)2177"""2178heads, tails = data2179self.alpha += heads2180self.beta += tails21812182def Mean(self):2183"""Computes the mean of this distribution."""2184return self.alpha / (self.alpha + self.beta)21852186def MAP(self):2187"""Computes the value with maximum a posteori probability."""2188a = self.alpha - 12189b = self.beta - 12190return a / (a + b)21912192def Random(self):2193"""Generates a random variate from this distribution."""2194return random.betavariate(self.alpha, self.beta)21952196def Sample(self, n):2197"""Generates a random sample from this distribution.21982199n: int sample size2200"""2201size = n,2202return np.random.beta(self.alpha, self.beta, size)22032204def EvalPdf(self, x):2205"""Evaluates the PDF at x."""2206return x ** (self.alpha - 1) * (1 - x) ** (self.beta - 1)22072208def MakePmf(self, steps=101, label=None):2209"""Returns a Pmf of this distribution.22102211Note: Normally, we just evaluate the PDF at a sequence2212of points and treat the probability density as a probability2213mass.22142215But if alpha or beta is less than one, we have to be2216more careful because the PDF goes to infinity at x=02217and x=1. In that case we evaluate the CDF and compute2218differences.22192220The result is a little funny, because the values at 0 and 12221are not symmetric. Nevertheless, it is a reasonable discrete2222model of the continuous distribution, and behaves well as2223the number of values increases.2224"""2225if label is None and self.label is not None:2226label = self.label22272228if self.alpha < 1 or self.beta < 1:2229cdf = self.MakeCdf()2230pmf = cdf.MakePmf()2231return pmf22322233xs = [i / (steps - 1.0) for i in range(steps)]2234probs = [self.EvalPdf(x) for x in xs]2235pmf = Pmf(dict(zip(xs, probs)), label=label)2236return pmf22372238def MakeCdf(self, steps=101):2239"""Returns the CDF of this distribution."""2240xs = [i / (steps - 1.0) for i in range(steps)]2241ps = special.betainc(self.alpha, self.beta, xs)2242cdf = Cdf(xs, ps)2243return cdf22442245def Percentile(self, ps):2246"""Returns the given percentiles from this distribution.22472248ps: scalar, array, or list of [0-100]2249"""2250ps = np.asarray(ps) / 1002251xs = special.betaincinv(self.alpha, self.beta, ps)2252return xs225322542255class Dirichlet(object):2256"""Represents a Dirichlet distribution.22572258See http://en.wikipedia.org/wiki/Dirichlet_distribution2259"""22602261def __init__(self, n, conc=1, label=None):2262"""Initializes a Dirichlet distribution.22632264n: number of dimensions2265conc: concentration parameter (smaller yields more concentration)2266label: string label2267"""2268if n < 2:2269raise ValueError('A Dirichlet distribution with '2270'n<2 makes no sense')22712272self.n = n2273self.params = np.ones(n, dtype=np.float) * conc2274self.label = label if label is not None else '_nolegend_'22752276def Update(self, data):2277"""Updates a Dirichlet distribution.22782279data: sequence of observations, in order corresponding to params2280"""2281m = len(data)2282self.params[:m] += data22832284def Random(self):2285"""Generates a random variate from this distribution.22862287Returns: normalized vector of fractions2288"""2289p = np.random.gamma(self.params)2290return p / p.sum()22912292def Likelihood(self, data):2293"""Computes the likelihood of the data.22942295Selects a random vector of probabilities from this distribution.22962297Returns: float probability2298"""2299m = len(data)2300if self.n < m:2301return 023022303x = data2304p = self.Random()2305q = p[:m] ** x2306return q.prod()23072308def LogLikelihood(self, data):2309"""Computes the log likelihood of the data.23102311Selects a random vector of probabilities from this distribution.23122313Returns: float log probability2314"""2315m = len(data)2316if self.n < m:2317return float('-inf')23182319x = self.Random()2320y = np.log(x[:m]) * data2321return y.sum()23222323def MarginalBeta(self, i):2324"""Computes the marginal distribution of the ith element.23252326See http://en.wikipedia.org/wiki/Dirichlet_distribution2327#Marginal_distributions23282329i: int23302331Returns: Beta object2332"""2333alpha0 = self.params.sum()2334alpha = self.params[i]2335return Beta(alpha, alpha0 - alpha)23362337def PredictivePmf(self, xs, label=None):2338"""Makes a predictive distribution.23392340xs: values to go into the Pmf23412342Returns: Pmf that maps from x to the mean prevalence of x2343"""2344alpha0 = self.params.sum()2345ps = self.params / alpha02346return Pmf(zip(xs, ps), label=label)234723482349def BinomialCoef(n, k):2350"""Compute the binomial coefficient "n choose k".23512352n: number of trials2353k: number of successes23542355Returns: float2356"""2357return scipy.misc.comb(n, k)235823592360def LogBinomialCoef(n, k):2361"""Computes the log of the binomial coefficient.23622363http://math.stackexchange.com/questions/64716/2364approximating-the-logarithm-of-the-binomial-coefficient23652366n: number of trials2367k: number of successes23682369Returns: float2370"""2371return n * math.log(n) - k * math.log(k) - (n - k) * math.log(n - k)237223732374def NormalProbability(ys, jitter=0):2375"""Generates data for a normal probability plot.23762377ys: sequence of values2378jitter: float magnitude of jitter added to the ys23792380returns: numpy arrays xs, ys2381"""2382n = len(ys)2383xs = np.random.normal(0, 1, n)2384xs.sort()23852386if jitter:2387ys = Jitter(ys, jitter)2388else:2389ys = np.array(ys)2390ys.sort()23912392return xs, ys239323942395def Jitter(values, jitter=0.5):2396"""Jitters the values by adding a uniform variate in (-jitter, jitter).23972398values: sequence2399jitter: scalar magnitude of jitter24002401returns: new numpy array2402"""2403n = len(values)2404return np.random.normal(0, jitter, n) + values240524062407def NormalProbabilityPlot(sample, fit_color='0.8', **options):2408"""Makes a normal probability plot with a fitted line.24092410sample: sequence of numbers2411fit_color: color string for the fitted line2412options: passed along to Plot2413"""2414xs, ys = NormalProbability(sample)2415mean, var = MeanVar(sample)2416std = math.sqrt(var)24172418fit = FitLine(xs, mean, std)2419thinkplot.Plot(*fit, color=fit_color, label='model')24202421xs, ys = NormalProbability(sample)2422thinkplot.Plot(xs, ys, **options)242324242425def Mean(xs):2426"""Computes mean.24272428xs: sequence of values24292430returns: float mean2431"""2432return np.mean(xs)243324342435def Var(xs, mu=None, ddof=0):2436"""Computes variance.24372438xs: sequence of values2439mu: option known mean2440ddof: delta degrees of freedom24412442returns: float2443"""2444xs = np.asarray(xs)24452446if mu is None:2447mu = xs.mean()24482449ds = xs - mu2450return np.dot(ds, ds) / (len(xs) - ddof)245124522453def Std(xs, mu=None, ddof=0):2454"""Computes standard deviation.24552456xs: sequence of values2457mu: option known mean2458ddof: delta degrees of freedom24592460returns: float2461"""2462var = Var(xs, mu, ddof)2463return math.sqrt(var)246424652466def MeanVar(xs, ddof=0):2467"""Computes mean and variance.24682469Based on http://stackoverflow.com/questions/19391149/2470numpy-mean-and-variance-from-single-function24712472xs: sequence of values2473ddof: delta degrees of freedom24742475returns: pair of float, mean and var2476"""2477xs = np.asarray(xs)2478mean = xs.mean()2479s2 = Var(xs, mean, ddof)2480return mean, s2248124822483def Trim(t, p=0.01):2484"""Trims the largest and smallest elements of t.24852486Args:2487t: sequence of numbers2488p: fraction of values to trim off each end24892490Returns:2491sequence of values2492"""2493n = int(p * len(t))2494t = sorted(t)[n:-n]2495return t249624972498def TrimmedMean(t, p=0.01):2499"""Computes the trimmed mean of a sequence of numbers.25002501Args:2502t: sequence of numbers2503p: fraction of values to trim off each end25042505Returns:2506float2507"""2508t = Trim(t, p)2509return Mean(t)251025112512def TrimmedMeanVar(t, p=0.01):2513"""Computes the trimmed mean and variance of a sequence of numbers.25142515Side effect: sorts the list.25162517Args:2518t: sequence of numbers2519p: fraction of values to trim off each end25202521Returns:2522float2523"""2524t = Trim(t, p)2525mu, var = MeanVar(t)2526return mu, var252725282529def CohenEffectSize(group1, group2):2530"""Compute Cohen's d.25312532group1: Series or NumPy array2533group2: Series or NumPy array25342535returns: float2536"""2537diff = group1.mean() - group2.mean()25382539n1, n2 = len(group1), len(group2)2540var1 = group1.var()2541var2 = group2.var()25422543pooled_var = (n1 * var1 + n2 * var2) / (n1 + n2)2544d = diff / math.sqrt(pooled_var)2545return d254625472548def Cov(xs, ys, meanx=None, meany=None):2549"""Computes Cov(X, Y).25502551Args:2552xs: sequence of values2553ys: sequence of values2554meanx: optional float mean of xs2555meany: optional float mean of ys25562557Returns:2558Cov(X, Y)2559"""2560xs = np.asarray(xs)2561ys = np.asarray(ys)25622563if meanx is None:2564meanx = np.mean(xs)2565if meany is None:2566meany = np.mean(ys)25672568cov = np.dot(xs-meanx, ys-meany) / len(xs)2569return cov257025712572def Corr(xs, ys):2573"""Computes Corr(X, Y).25742575Args:2576xs: sequence of values2577ys: sequence of values25782579Returns:2580Corr(X, Y)2581"""2582xs = np.asarray(xs)2583ys = np.asarray(ys)25842585meanx, varx = MeanVar(xs)2586meany, vary = MeanVar(ys)25872588corr = Cov(xs, ys, meanx, meany) / math.sqrt(varx * vary)25892590return corr259125922593def SerialCorr(series, lag=1):2594"""Computes the serial correlation of a series.25952596series: Series2597lag: integer number of intervals to shift25982599returns: float correlation2600"""2601xs = series[lag:]2602ys = series.shift(lag)[lag:]2603corr = Corr(xs, ys)2604return corr260526062607def SpearmanCorr(xs, ys):2608"""Computes Spearman's rank correlation.26092610Args:2611xs: sequence of values2612ys: sequence of values26132614Returns:2615float Spearman's correlation2616"""2617xranks = pandas.Series(xs).rank()2618yranks = pandas.Series(ys).rank()2619return Corr(xranks, yranks)262026212622def MapToRanks(t):2623"""Returns a list of ranks corresponding to the elements in t.26242625Args:2626t: sequence of numbers26272628Returns:2629list of integer ranks, starting at 12630"""2631# pair up each value with its index2632pairs = enumerate(t)26332634# sort by value2635sorted_pairs = sorted(pairs, key=itemgetter(1))26362637# pair up each pair with its rank2638ranked = enumerate(sorted_pairs)26392640# sort by index2641resorted = sorted(ranked, key=lambda trip: trip[1][0])26422643# extract the ranks2644ranks = [trip[0]+1 for trip in resorted]2645return ranks264626472648def LeastSquares(xs, ys):2649"""Computes a linear least squares fit for ys as a function of xs.26502651Args:2652xs: sequence of values2653ys: sequence of values26542655Returns:2656tuple of (intercept, slope)2657"""2658meanx, varx = MeanVar(xs)2659meany = Mean(ys)26602661slope = Cov(xs, ys, meanx, meany) / varx2662inter = meany - slope * meanx26632664return inter, slope266526662667def FitLine(xs, inter, slope):2668"""Fits a line to the given data.26692670xs: sequence of x26712672returns: tuple of numpy arrays (sorted xs, fit ys)2673"""2674fit_xs = np.sort(xs)2675fit_ys = inter + slope * fit_xs2676return fit_xs, fit_ys267726782679def Residuals(xs, ys, inter, slope):2680"""Computes residuals for a linear fit with parameters inter and slope.26812682Args:2683xs: independent variable2684ys: dependent variable2685inter: float intercept2686slope: float slope26872688Returns:2689list of residuals2690"""2691xs = np.asarray(xs)2692ys = np.asarray(ys)2693res = ys - (inter + slope * xs)2694return res269526962697def CoefDetermination(ys, res):2698"""Computes the coefficient of determination (R^2) for given residuals.26992700Args:2701ys: dependent variable2702res: residuals27032704Returns:2705float coefficient of determination2706"""2707return 1 - Var(res) / Var(ys)270827092710def CorrelatedGenerator(rho):2711"""Generates standard normal variates with serial correlation.27122713rho: target coefficient of correlation27142715Returns: iterable2716"""2717x = random.gauss(0, 1)2718yield x27192720sigma = math.sqrt(1 - rho**2)2721while True:2722x = random.gauss(x * rho, sigma)2723yield x272427252726def CorrelatedNormalGenerator(mu, sigma, rho):2727"""Generates normal variates with serial correlation.27282729mu: mean of variate2730sigma: standard deviation of variate2731rho: target coefficient of correlation27322733Returns: iterable2734"""2735for x in CorrelatedGenerator(rho):2736yield x * sigma + mu273727382739def RawMoment(xs, k):2740"""Computes the kth raw moment of xs.2741"""2742return sum(x**k for x in xs) / len(xs)274327442745def CentralMoment(xs, k):2746"""Computes the kth central moment of xs.2747"""2748mean = RawMoment(xs, 1)2749return sum((x - mean)**k for x in xs) / len(xs)275027512752def StandardizedMoment(xs, k):2753"""Computes the kth standardized moment of xs.2754"""2755var = CentralMoment(xs, 2)2756std = math.sqrt(var)2757return CentralMoment(xs, k) / std**k275827592760def Skewness(xs):2761"""Computes skewness.2762"""2763return StandardizedMoment(xs, 3)276427652766def Median(xs):2767"""Computes the median (50th percentile) of a sequence.27682769xs: sequence or anything else that can initialize a Cdf27702771returns: float2772"""2773cdf = Cdf(xs)2774return cdf.Value(0.5)277527762777def IQR(xs):2778"""Computes the interquartile of a sequence.27792780xs: sequence or anything else that can initialize a Cdf27812782returns: pair of floats2783"""2784cdf = Cdf(xs)2785return cdf.Value(0.25), cdf.Value(0.75)278627872788def PearsonMedianSkewness(xs):2789"""Computes the Pearson median skewness.2790"""2791median = Median(xs)2792mean = RawMoment(xs, 1)2793var = CentralMoment(xs, 2)2794std = math.sqrt(var)2795gp = 3 * (mean - median) / std2796return gp279727982799class FixedWidthVariables(object):2800"""Represents a set of variables in a fixed width file."""28012802def __init__(self, variables, index_base=0):2803"""Initializes.28042805variables: DataFrame2806index_base: are the indices 0 or 1 based?28072808Attributes:2809colspecs: list of (start, end) index tuples2810names: list of string variable names2811"""2812self.variables = variables28132814# note: by default, subtract 1 from colspecs2815self.colspecs = variables[['start', 'end']] - index_base28162817# convert colspecs to a list of pair of int2818self.colspecs = self.colspecs.astype(np.int).values.tolist()2819self.names = variables['name']28202821def ReadFixedWidth(self, filename, **options):2822"""Reads a fixed width ASCII file.28232824filename: string filename28252826returns: DataFrame2827"""2828df = pandas.read_fwf(filename,2829colspecs=self.colspecs,2830names=self.names,2831**options)2832return df283328342835def ReadStataDct(dct_file, **options):2836"""Reads a Stata dictionary file.28372838dct_file: string filename2839options: dict of options passed to open()28402841returns: FixedWidthVariables object2842"""2843type_map = dict(byte=int, int=int, long=int, float=float,2844double=float, numeric=float)28452846var_info = []2847with open(dct_file, **options) as f:2848for line in f:2849match = re.search( r'_column\(([^)]*)\)', line)2850if not match:2851continue2852start = int(match.group(1))2853t = line.split()2854vtype, name, fstring = t[1:4]2855name = name.lower()2856if vtype.startswith('str'):2857vtype = str2858else:2859vtype = type_map[vtype]2860long_desc = ' '.join(t[4:]).strip('"')2861var_info.append((start, vtype, name, fstring, long_desc))28622863columns = ['start', 'type', 'name', 'fstring', 'desc']2864variables = pandas.DataFrame(var_info, columns=columns)28652866# fill in the end column by shifting the start column2867variables['end'] = variables.start.shift(-1)2868variables.loc[len(variables)-1, 'end'] = 028692870dct = FixedWidthVariables(variables, index_base=1)2871return dct287228732874def Resample(xs, n=None):2875"""Draw a sample from xs with the same length as xs.28762877xs: sequence2878n: sample size (default: len(xs))28792880returns: NumPy array2881"""2882if n is None:2883n = len(xs)2884return np.random.choice(xs, n, replace=True)288528862887def SampleRows(df, nrows, replace=False):2888"""Choose a sample of rows from a DataFrame.28892890df: DataFrame2891nrows: number of rows2892replace: whether to sample with replacement28932894returns: DataDf2895"""2896indices = np.random.choice(df.index, nrows, replace=replace)2897sample = df.loc[indices]2898return sample289929002901def ResampleRows(df):2902"""Resamples rows from a DataFrame.29032904df: DataFrame29052906returns: DataFrame2907"""2908return SampleRows(df, len(df), replace=True)290929102911def ResampleRowsWeighted(df, column='finalwgt'):2912"""Resamples a DataFrame using probabilities proportional to given column.29132914df: DataFrame2915column: string column name to use as weights29162917returns: DataFrame2918"""2919weights = df[column].copy()2920weights /= sum(weights)2921indices = np.random.choice(df.index, len(df), replace=True, p=weights)2922sample = df.loc[indices]2923return sample292429252926def PercentileRow(array, p):2927"""Selects the row from a sorted array that maps to percentile p.29282929p: float 0--10029302931returns: NumPy array (one row)2932"""2933rows, cols = array.shape2934index = int(rows * p / 100)2935return array[index,]293629372938def PercentileRows(ys_seq, percents):2939"""Given a collection of lines, selects percentiles along vertical axis.29402941For example, if ys_seq contains simulation results like ys as a2942function of time, and percents contains (5, 95), the result would2943be a 90% CI for each vertical slice of the simulation results.29442945ys_seq: sequence of lines (y values)2946percents: list of percentiles (0-100) to select29472948returns: list of NumPy arrays, one for each percentile2949"""2950nrows = len(ys_seq)2951ncols = len(ys_seq[0])2952array = np.zeros((nrows, ncols))29532954for i, ys in enumerate(ys_seq):2955array[i,] = ys29562957array = np.sort(array, axis=0)29582959rows = [PercentileRow(array, p) for p in percents]2960return rows296129622963def Smooth(xs, sigma=2, **options):2964"""Smooths a NumPy array with a Gaussian filter.29652966xs: sequence2967sigma: standard deviation of the filter2968"""2969return ndimage.filters.gaussian_filter1d(xs, sigma, **options)297029712972class HypothesisTest(object):2973"""Represents a hypothesis test."""29742975def __init__(self, data):2976"""Initializes.29772978data: data in whatever form is relevant2979"""2980self.data = data2981self.MakeModel()2982self.actual = self.TestStatistic(data)2983self.test_stats = None2984self.test_cdf = None29852986def PValue(self, iters=1000):2987"""Computes the distribution of the test statistic and p-value.29882989iters: number of iterations29902991returns: float p-value2992"""2993self.test_stats = [self.TestStatistic(self.RunModel())2994for _ in range(iters)]2995self.test_cdf = Cdf(self.test_stats)29962997count = sum(1 for x in self.test_stats if x >= self.actual)2998return count / iters29993000def MaxTestStat(self):3001"""Returns the largest test statistic seen during simulations.3002"""3003return max(self.test_stats)30043005def PlotCdf(self, label=None):3006"""Draws a Cdf with vertical lines at the observed test stat.3007"""3008def VertLine(x):3009"""Draws a vertical line at x."""3010thinkplot.Plot([x, x], [0, 1], color='0.8')30113012VertLine(self.actual)3013thinkplot.Cdf(self.test_cdf, label=label)30143015def TestStatistic(self, data):3016"""Computes the test statistic.30173018data: data in whatever form is relevant3019"""3020raise UnimplementedMethodException()30213022def MakeModel(self):3023"""Build a model of the null hypothesis.3024"""3025pass30263027def RunModel(self):3028"""Run the model of the null hypothesis.30293030returns: simulated data3031"""3032raise UnimplementedMethodException()303330343035def main():3036pass303730383039if __name__ == '__main__':3040main()304130423043