In [11]:

import pystan import numpy as np import matplotlib.pyplot as plt

In [12]:

# For Stan we provide all known quantities as data, namely the observed data # and our prior hyperparameters. eczema_data = { 'treatment': { 'alpha': 1, # fixed prior hyperparameters for the 'beta': 1, # beta distribution 'num_trials': 6, # number of trials in the data set 'patients': [23, 16, 16, 45, 31, 10], # number of patients per trial 'improved': [20, 10, 13, 35, 22, 7]}, # number of improved patients per trial 'control': { 'alpha': 1, 'beta': 1, 'num_trials': 6, 'patients': [15, 18, 10, 39, 29, 10], 'improved': [9, 11, 4, 21, 12, 0]}}

In [13]:

# Below is the Stan code for the medical trial data set. Note that the Stan # code is a string that is passed to the StanModel object below. # We have to tell Stan what data to expect, what our parameters are and what # the likelihood and prior are. Since the posterior is just proportional to # the product of the likelihood and the prior, we don't distinguish between # them explicitly in the model below. Every distribution we specify is # automatically incorporated into the product of likelihood * prior. stan_code = """ // The data block contains all known quantities - typically the observed // data and any constant hyperparameters. data { int<lower=1> num_trials; // number of trials in the data set int<lower=0> patients[num_trials]; // number of patients per trial int<lower=0> improved[num_trials]; // number of improved patients per trial real<lower=0> alpha; // fixed prior hyperparameter real<lower=0> beta; // fixed prior hyperparameter } // The parameters block contains all unknown quantities - typically the // parameters of the model. Stan will generate samples from the posterior // distributions over all parameters. parameters { real<lower=0,upper=1> p; // probability of improvement - the // parameter of the binomial likelihood } // The model block contains all probability distributions in the model. // This of this as specifying the generative model for the scenario. model { p ~ beta(alpha, beta); // prior over p for(i in 1:num_trials) { improved[i] ~ binomial(patients[i], p); // likelihood function } } """

In [14]:

# This cell takes a while to run. Compiling a Stan model will feel slow even # on simple models, but it isn't much slower for really complex models. Stan # is translating the model specified above to C++ code and compiling the C++ # code to a binary that it can executed. The advantage is that the model needs # to be compiled only once. Once that is done, the same code can be reused # to generate samples for different data sets really quickly. stan_model = pystan.StanModel(model_code=stan_code)

INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_4822bea325d0250e03828b3bc1bb8bdd NOW.

In [15]:

# Fit the model to the data. This will generate samples from the posterior over # all parameters of the model. We start by computing posteriors for the treatment # data. stan_results = stan_model.sampling(data=eczema_data['treatment'])

In [16]:

# Print out the mean, standard deviation and quantiles of all parameters. # These are approximate values derived from the samples generated by Stan. # You can ignore the "lp__" row for now. Pay attention to the row for # the "p" parameter of the model. # # The columns in the summary are # # * mean: The expected value of the posterior over the parameter # * se_mean: The estimated error in the posterior mean # * sd: The standard deviation of the posterior over the parameter # * 2.5%, etc.: Percentiles of the posterior over the parameter # * n_eff: The effective number of samples generated by Stan. The # larger this value, the better. # * Rhat: An estimate of the quality of the samples. This should be # close to 1.0, otherwise there might be a problem with the # convergence of the sampler. print(stan_results)

Inference for Stan model: anon_model_4822bea325d0250e03828b3bc1bb8bdd.
4 chains, each with iter=2000; warmup=1000; thin=1;
post-warmup draws per chain=1000, total post-warmup draws=4000.
mean se_mean sd 2.5% 25% 50% 75% 97.5% n_eff Rhat
p 0.76 8.8e-4 0.03 0.69 0.73 0.76 0.78 0.82 1530 1.0
lp__ -80.05 0.02 0.69 -82.12 -80.18 -79.78 -79.62 -79.58 1312 1.0
Samples were drawn using NUTS at Mon Oct 8 12:20:25 2018.
For each parameter, n_eff is a crude measure of effective sample size,
and Rhat is the potential scale reduction factor on split chains (at
convergence, Rhat=1).

In [17]:

# Finally, we can extract the samples generated by Stan so that we # can plot them or calculate any other functions or expected values # we might be interested in. posterior_samples = stan_results.extract() plt.hist(posterior_samples['p'], bins=50, density=True) plt.title('Sampled posterior probability density for p') print( "Posterior 95% confidence interval for p:", np.percentile(posterior_samples['p'], [2.5, 97.5])) plt.show()

Posterior 95% confidence interval for p: [0.685807 0.82461639]

Task 1

In [18]:

stan_results2 = stan_model.sampling(data=eczema_data['control'])

In [19]:

print(stan_results2)

Inference for Stan model: anon_model_4822bea325d0250e03828b3bc1bb8bdd.
4 chains, each with iter=2000; warmup=1000; thin=1;
post-warmup draws per chain=1000, total post-warmup draws=4000.
mean se_mean sd 2.5% 25% 50% 75% 97.5% n_eff Rhat
p 0.47 1.1e-3 0.04 0.39 0.44 0.47 0.5 0.56 1551 1.0
lp__ -85.56 0.02 0.7 -87.53 -85.72 -85.28 -85.11 -85.06 1247 1.0
Samples were drawn using NUTS at Mon Oct 8 12:20:25 2018.
For each parameter, n_eff is a crude measure of effective sample size,
and Rhat is the potential scale reduction factor on split chains (at
convergence, Rhat=1).

In [20]:

print(stan_results2.stansummary(pars=['p'], probs=[0.025, 0.5, 0.975]))

Inference for Stan model: anon_model_4822bea325d0250e03828b3bc1bb8bdd.
4 chains, each with iter=2000; warmup=1000; thin=1;
post-warmup draws per chain=1000, total post-warmup draws=4000.
mean se_mean sd 2.5% 50% 97.5% n_eff Rhat
p 0.47 1.1e-3 0.04 0.39 0.47 0.56 1551 1.0
Samples were drawn using NUTS at Mon Oct 8 12:20:25 2018.
For each parameter, n_eff is a crude measure of effective sample size,
and Rhat is the potential scale reduction factor on split chains (at
convergence, Rhat=1).

In [21]:

posterior_samples = stan_results.extract() posterior_samples2 = stan_results2.extract() plt.hist(posterior_samples['p'], bins=50, density=True, color = 'r') plt.hist(posterior_samples2['p'], bins=50, density=True) plt.title('Sampled posterior probability density for p') print( "Posterior 95% confidence interval for p (treatment)", np.percentile(posterior_samples['p'], [2.5, 97.5])) print( "Posterior 95% confidence interval for p (control):", np.percentile(posterior_samples2['p'], [2.5, 97.5])) plt.show()

Posterior 95% confidence interval for p (treatment) [0.685807 0.82461639]
Posterior 95% confidence interval for p (control): [0.38511728 0.56132309]

In [26]:

treatment = posterior_samples['p'] control = posterior_samples2['p'] print (treatment) p_treatment_better = np.mean(treatment > 0.19 + control) print(p_treatment_better)

[0.70743952 0.75391949 0.67022999 ... 0.7690026 0.75461525 0.70544399]
0.9525