CoCalc -- Chp_05.ipynb

📚 The CoCalc Library - books, templates and other resources

Project: 📚 The Library - Shared Public Version

Path: cocalc-examples / statistical-rethinking-python-PyMC3 / Chp_05.ipynb

Views: ⁹⁵⁶⁴⁵
License: OTHER

Kernel: Python 3

In [1]:

%matplotlib inline
import pymc3 as pm
import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf

%config InlineBackend.figure_format = 'retina'
plt.style.use(['seaborn-colorblind', 'seaborn-darkgrid'])

/home/osvaldo/anaconda3/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  from ._conv import register_converters as _register_converters

Code 5.1

In [2]:

# load data
d = pd.read_csv('Data/WaffleDivorce.csv', sep=';')
# standardize predictor
d['MedianAgeMarriage_s'] = (d.MedianAgeMarriage - d.MedianAgeMarriage.mean()) / d.MedianAgeMarriage.std()

In [3]:

with pm.Model() as model_5_1:
    a = pm.Normal('a', mu=10, sd=10)
    bA = pm.Normal('bA', mu=0, sd=1)
    sigma = pm.Uniform('sigma', lower=0, upper=10)
    # good (default) alternatives for sigma (in this and other models) are
    # sigma = pm.HalfNormal('sigma', 5)
    # sigma = pm.HalfCauchy('sigma', 5)
    # some people recomed avoiding "hard" boundaries unless they have a theoretical/data-based justification, like a correlation that is restricted to be [-1, 1].
    mu = pm.Deterministic('mu', a + bA * d.MedianAgeMarriage_s)
    Divorce = pm.Normal('Divorce', mu=mu, sd=sigma, observed=d.Divorce)
    trace_5_1 = pm.sample(1000, tune=1000)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (2 chains in 2 jobs)
NUTS: [sigma_interval__, bA, a]
100%|██████████| 2000/2000 [00:01<00:00, 1027.91it/s]

In [4]:

varnames = ['a', 'bA', 'sigma']
pm.traceplot(trace_5_1, varnames);

Code 5.2

In [5]:

mu_mean = trace_5_1['mu']
mu_hpd = pm.hpd(mu_mean)

plt.plot(d.MedianAgeMarriage_s, d.Divorce, 'C0o')
plt.plot(d.MedianAgeMarriage_s, mu_mean.mean(0), 'C2')

idx = np.argsort(d.MedianAgeMarriage_s)
plt.fill_between(d.MedianAgeMarriage_s[idx],
                 mu_hpd[:,0][idx], mu_hpd[:,1][idx], color='C2', alpha=0.25)

plt.xlabel('Meadian Age Marriage', fontsize=14)
plt.ylabel('Divorce', fontsize=14);

Code 5.3

In [6]:

d['Marriage_s'] = (d.Marriage - d.Marriage.mean()) / d.Marriage.std()

In [7]:

with pm.Model() as model_5_2:
    a = pm.Normal('a', mu=10, sd=10)
    bA = pm.Normal('bA', mu=0, sd=1)
    sigma = pm.Uniform('sigma', lower=0, upper=10)
    mu = pm.Deterministic('mu', a + bA * d.Marriage_s)
    Divorce = pm.Normal('Divorce', mu=mu, sd=sigma, observed=d.Divorce)
    trace_5_2 = pm.sample(1000, tune=1000)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (2 chains in 2 jobs)
NUTS: [sigma_interval__, bA, a]
100%|██████████| 2000/2000 [00:01<00:00, 1166.50it/s]

In [8]:

pm.traceplot(trace_5_2, varnames);

In [9]:

mu_mean = trace_5_2['mu']
mu_hpd = pm.hpd(mu_mean)

d.plot('Marriage_s', 'Divorce', kind='scatter', xlim = (-2, 3))
plt.plot(d.Marriage_s, mu_mean.mean(0), 'C2')

idx = np.argsort(d.Marriage_s)
plt.fill_between(d.Marriage_s[idx], mu_hpd[:,0][idx], mu_hpd[:,1][idx],
                 color='C2', alpha=0.25);

Code 5.4

In [10]:

with pm.Model() as model_5_3:
    a = pm.Normal('a', mu=10, sd=10)
    bA = pm.Normal('bA', mu=0, sd=1, shape=2)
    sigma = pm.Uniform('sigma', lower=0, upper=10)
    mu = pm.Deterministic('mu', a + bA[0] * d.Marriage_s + bA[1] * d.MedianAgeMarriage_s)
    
    Divorce = pm.Normal('Divorce', mu=mu, sd=sigma, observed=d.Divorce)
    trace_5_3 = pm.sample(1000, tune=1000)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (2 chains in 2 jobs)
NUTS: [sigma_interval__, bA, a]
100%|██████████| 2000/2000 [00:02<00:00, 699.95it/s]

In [11]:

varnames = ['a', 'bA', 'sigma']
pm.traceplot(trace_5_3, varnames);

In [12]:

pm.summary(trace_5_3, varnames, alpha=.11).round(3)

	mean	sd	mc_error	hpd_5.5	hpd_94.5	n_eff	Rhat
a	9.688	0.219	0.005	9.337	10.036	1820.070	1.0
bA__0	-0.122	0.289	0.007	-0.579	0.339	1506.622	1.0
bA__1	-1.120	0.288	0.007	-1.578	-0.676	1540.244	1.0
sigma	1.522	0.157	0.003	1.277	1.771	1561.718	1.0

Code 5.5

In [14]:

pm.forestplot(trace_5_3, varnames=varnames);

Code 5.6

In [15]:

with pm.Model() as model_5_4:
    a = pm.Normal('a', mu=10, sd=10)
    b = pm.Normal('b', mu=0, sd=1)
    sigma = pm.Uniform('sigma', lower=0, upper=10)
    mu = pm.Deterministic('mu', a + b * d.MedianAgeMarriage_s)
    Marriage = pm.Normal('Marriage', mu=mu, sd=sigma, observed=d.Marriage_s)
    trace_5_4 = pm.sample(1000, tune=1000)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (2 chains in 2 jobs)
NUTS: [sigma_interval__, b, a]
100%|██████████| 2000/2000 [00:01<00:00, 1320.20it/s]

In [16]:

varnames = ['a', 'b', 'sigma']
pm.traceplot(trace_5_4, varnames);

Code 5.7

In [17]:

mu_pred = trace_5_4['mu'].mean(0)
residuals = d.Marriage_s - mu_pred

Code 5.8

In [18]:

idx = np.argsort(d.MedianAgeMarriage_s)
d.plot('MedianAgeMarriage_s', 'Marriage_s', kind='scatter', xlim = (-3, 3), ylim = (-3, 3))
plt.plot(d.MedianAgeMarriage_s[idx], mu_pred[idx], 'k')
plt.vlines(d.MedianAgeMarriage_s, mu_pred, mu_pred + residuals, colors='grey');

Code 5.9

In [19]:

R_avg = np.linspace(-3, 3, 100)

mu_pred = trace_5_3['a'] + trace_5_3['bA'][:,0] * R_avg[:,None]
mu_hpd = pm.hpd(mu_pred.T)
divorce_hpd = pm.hpd(stats.norm.rvs(mu_pred, trace_5_3['sigma']).T)

plt.plot(R_avg, mu_pred.mean(1), 'C0');
plt.fill_between(R_avg, mu_hpd[:,0], mu_hpd[:,1], color='C2', alpha=0.25)
plt.fill_between(R_avg, divorce_hpd[:,0], divorce_hpd[:,1], color='C2', alpha=0.25)

plt.xlabel('Marriage.s')
plt.ylabel('Divorce')
plt.title('MedianAgeMarriage_s = 0');

Code 5.10

In [20]:

R_avg = np.linspace(-3, 3, 100)

mu_pred = trace_5_3['a'] + trace_5_3['bA'][:,1] * R_avg[:,None]
mu_hpd = pm.hpd(mu_pred.T)
divorce_hpd = pm.hpd(stats.norm.rvs(mu_pred, trace_5_3['sigma']).T)


plt.plot(R_avg, mu_pred.mean(1), 'C0');
plt.fill_between(R_avg, mu_hpd[:,0], mu_hpd[:,1], color='C2', alpha=0.25)
plt.fill_between(R_avg, divorce_hpd[:,0], divorce_hpd[:,1], color='C2', alpha=0.25)

plt.xlabel('MedianAgeMarriage.s')
plt.ylabel('Divorce')
plt.title('Marriage_s = 0');

Code 5.11

In [21]:

mu_pred = trace_5_3['mu']
mu_hpd = pm.hpd(mu_pred)

divorce_pred = pm.sample_ppc(trace_5_3, samples=1000, model=model_5_3)['Divorce']
divorce_hpd = pm.hpd(divorce_pred)

100%|██████████| 1000/1000 [00:00<00:00, 2766.11it/s]

Code 5.12

In [22]:

mu_hpd = pm.hpd(mu_pred, alpha=0.05)
plt.errorbar(d.Divorce, divorce_pred.mean(0), yerr=np.abs(divorce_pred.mean(0)-mu_hpd.T) , fmt='C0o')
plt.plot(d.Divorce, divorce_pred.mean(0), 'C0o')

plt.xlabel('Observed divorce', fontsize=14)
plt.ylabel('Predicted divorce', fontsize=14)

min_x, max_x = d.Divorce.min(), d.Divorce.max()
plt.plot([min_x, max_x], [min_x, max_x], 'k--');

Code 5.14

In [23]:

plt.figure(figsize=(10,12))
residuals = d.Divorce - mu_pred.mean(0)
idx = np.argsort(residuals)
y_label = d.Loc[idx]
y_points = np.linspace(0, 1, 50)
plt.errorbar(residuals[idx], y_points, 
             xerr=np.abs(divorce_pred.mean(0)-mu_hpd.T),
             fmt='C0o',lw=3)

plt.errorbar(residuals[idx], y_points, 
             xerr=np.abs(divorce_pred.mean(0)-divorce_hpd.T),
             fmt='C0o', lw=3, alpha=0.5)

plt.yticks(y_points, y_label);
plt.vlines(0, 0, 1, 'grey');

Code 5.15

In [24]:

N = 100
x_real = stats.norm.rvs(size=N)
x_spur = stats.norm.rvs(x_real)
y = stats.norm.rvs(x_real)
d = pd.DataFrame([y, x_real, x_spur]).T

sns.pairplot(d);

Code 5.16

In [25]:

d = pd.read_csv('Data/milk.csv', sep=';')
d.head()

	clade	species	kcal.per.g	perc.fat	perc.protein	perc.lactose	mass	neocortex.perc
0	Strepsirrhine	Eulemur fulvus	0.49	16.60	15.42	67.98	1.95	55.16
1	Strepsirrhine	E macaco	0.51	19.27	16.91	63.82	2.09	NaN
2	Strepsirrhine	E mongoz	0.46	14.11	16.85	69.04	2.51	NaN
3	Strepsirrhine	E rubriventer	0.48	14.91	13.18	71.91	1.62	NaN
4	Strepsirrhine	Lemur catta	0.60	27.28	19.50	53.22	2.19	NaN

Code 5.17 to 5.20

In [26]:

dcc = d.dropna().copy()

In [27]:

with pm.Model() as model_5_5:
    a = pm.Normal('a', mu=10, sd=100)
    bn = pm.Normal('bn', mu=0, sd=1)
    sigma = pm.Uniform('sigma', lower=0, upper=1)
    mu = pm.Deterministic('mu', a + bn * dcc['neocortex.perc'])
    kcal = pm.Normal('kcal', mu=mu, sd=sigma, observed=dcc['kcal.per.g'])
    trace_5_5 = pm.sample(1000, tune=1000)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (2 chains in 2 jobs)
NUTS: [sigma_interval__, bn, a]
100%|██████████| 2000/2000 [00:09<00:00, 217.22it/s]
There were 4 divergences after tuning. Increase `target_accept` or reparameterize.
The number of effective samples is smaller than 25% for some parameters.

In [28]:

varnames = ['a', 'bn', 'sigma']
pm.traceplot(trace_5_5, varnames);

Code 5.21

In [31]:

pm.summary(trace_5_5, varnames, alpha=.11).round(3)

	mean	sd	mc_error	hpd_5.5	hpd_94.5	n_eff	Rhat
a	0.338	0.554	0.024	-0.540	1.205	536.423	1.0
bn	0.005	0.008	0.000	-0.008	0.018	536.629	1.0
sigma	0.196	0.041	0.002	0.134	0.254	474.932	1.0

Code 5.22

In [32]:

trace_5_5['bn'].mean() * (76 - 55)

0.09955421253996859

Code 5.23

In [33]:

seq = np.linspace(55, 76, 50)
mu_pred = trace_5_5['a'] + trace_5_5['bn'] * seq[:,None]
mu_hpd = pm.hpd(mu_pred.T)

plt.plot(d['neocortex.perc'], d['kcal.per.g'], 'C0o')

plt.plot(seq, mu_pred.mean(1), 'k')
plt.plot(seq, mu_hpd[:,0], 'k--')
plt.plot(seq, mu_hpd[:,1], 'k--')

plt.xlabel('neocortex.perc', fontsize=14)
plt.ylabel('kcal.per.g', fontsize=14);

Code 5.24

In [34]:

dcc['log_mass'] = np.log(dcc['mass'])

Code 5.25

In [35]:

with pm.Model() as model_5_6:
    a = pm.Normal('a', mu=10, sd=100)
    bn = pm.Normal('bn', mu=0, sd=1)
    sigma = pm.Uniform('sigma', lower=0, upper=1)
    mu = pm.Deterministic('mu', a + bn * dcc['log_mass'])
    kcal = pm.Normal('kcal', mu=mu, sd=sigma, observed=dcc['kcal.per.g'])
    trace_5_6 = pm.sample(1000, tune=1000)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (2 chains in 2 jobs)
NUTS: [sigma_interval__, bn, a]
100%|██████████| 2000/2000 [00:01<00:00, 1062.03it/s]

In [36]:

pm.summary(trace_5_6, varnames, alpha=.11).round(3)

	mean	sd	mc_error	hpd_5.5	hpd_94.5	n_eff	Rhat
a	0.703	0.058	0.002	0.609	0.791	900.755	1.000
bn	-0.031	0.024	0.001	-0.072	0.003	914.185	0.999
sigma	0.182	0.037	0.001	0.126	0.232	1031.675	1.000

Code 5.26

In [37]:

with pm.Model() as model_5_7:
    a = pm.Normal('a', mu=10, sd=100)
    bn = pm.Normal('bn', mu=0, sd=1, shape=2)
    sigma = pm.Uniform('sigma', lower=0, upper=1)
    mu = pm.Deterministic('mu', a + bn[0] * dcc['neocortex.perc'] + bn[1] * dcc['log_mass'])
    kcal = pm.Normal('kcal', mu=mu, sd=sigma, observed=dcc['kcal.per.g'])
    trace_5_7 = pm.sample(1000, tune=1000)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (2 chains in 2 jobs)
NUTS: [sigma_interval__, bn, a]
100%|██████████| 2000/2000 [00:16<00:00, 123.40it/s]
There were 3 divergences after tuning. Increase `target_accept` or reparameterize.
There were 11 divergences after tuning. Increase `target_accept` or reparameterize.
The number of effective samples is smaller than 25% for some parameters.

In [38]:

pm.summary(trace_5_7, varnames, alpha=.11).round(3)

	mean	sd	mc_error	hpd_5.5	hpd_94.5	n_eff	Rhat
a	-1.109	0.579	0.023	-1.959	-0.097	480.448	1.000
bn__0	0.028	0.009	0.000	0.013	0.042	470.322	1.000
bn__1	-0.097	0.028	0.001	-0.140	-0.051	574.242	0.999
sigma	0.140	0.030	0.001	0.093	0.179	282.447	1.001

Code 5.27

In [39]:

seq = np.linspace(55, 76, 50)
mu_pred = trace_5_7['a'] + trace_5_7['bn'][:,0] * seq[:,None] + trace_5_7['bn'][:,1] * dcc['log_mass'].mean()
mu_hpd = pm.hpd(mu_pred.T)

plt.plot(seq, mu_pred.mean(1), 'k')
plt.plot(seq, mu_hpd[:,0], 'k--')
plt.plot(seq, mu_hpd[:,1], 'k--')

plt.xlabel('neocortex.perc')
plt.ylabel('kcal.per.g');

Code 5.28

In [40]:

N = 100
rho = 0.7
x_pos = stats.norm.rvs(size=N)
x_neg = stats.norm.rvs(rho*x_pos, (1-rho**2)**0.5)
y = stats.norm.rvs(x_pos - x_neg)
d = pd.DataFrame([y, x_real, x_spur]).T

sns.pairplot(d);

Code 5.29

In [41]:

N = 100
height = stats.norm.rvs(size=N, loc=10, scale=2)
leg_prop = stats.uniform.rvs(size=N, loc=0.4, scale=0.1) 
leg_left = leg_prop * height + stats.norm.rvs(size=N, loc=0, scale=0.02)
leg_right = leg_prop * height + stats.norm.rvs(size=N, loc=0, scale=0.02)

Code 5.30

In [42]:

with pm.Model() as m5_8:
    a = pm.Normal('a', mu=10, sd=100)
    bl = pm.Normal('bl', mu=2, sd=10)
    br = pm.Normal('br', mu=2, sd=10)
    mu = pm.Deterministic('mu', a + bl * leg_left + br * leg_right)    
    sigma = pm.Uniform('sigma', lower=0 , upper=10)
    height_p = pm.Normal('height_p', mu=mu, sd=sigma, observed=height)
    
    trace_5_8 = pm.sample(1000, tune=1000)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (2 chains in 2 jobs)
NUTS: [sigma_interval__, br, bl, a]
100%|██████████| 2000/2000 [01:49<00:00, 18.31it/s]

In [43]:

varnames=['a', 'bl', 'br', 'sigma']
pm.traceplot(trace_5_8, varnames);
pm.summary(trace_5_8, varnames, alpha=.11).round(3)

	mean	sd	mc_error	hpd_5.5	hpd_94.5	n_eff	Rhat
a	0.706	0.265	0.008	0.292	1.126	978.995	1.0
bl	3.498	1.882	0.065	0.728	6.536	773.283	1.0
br	-1.443	1.890	0.065	-4.147	1.677	774.994	1.0
sigma	0.577	0.040	0.001	0.516	0.637	1097.410	1.0

Code 5.31

In [45]:

pm.forestplot(trace_5_8, varnames=varnames);

Code 5.32

In [46]:

plt.scatter(trace_5_8['br'], trace_5_8['bl']);

Code 5.33

In [47]:

sum_blbr = trace_5_8['br'] + trace_5_8['bl']
pm.kdeplot(sum_blbr);

Code 5.34

In [48]:

with pm.Model() as m5_9:
    a = pm.Normal('a',mu = 10, sd=100)
    bl = pm.Normal('bl',mu=2, sd= 10)
    mu = pm.Deterministic('mu',a + bl * leg_left)    
    sigma = pm.Uniform('sigma', lower= 0 , upper= 10)
    height = pm.Normal('height',mu=mu, sd=sigma, observed=height)
    trace_5_9 = pm.sample(1000, tune=1000)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (2 chains in 2 jobs)
NUTS: [sigma_interval__, bl, a]
100%|██████████| 2000/2000 [00:04<00:00, 497.31it/s]

In [49]:

varnames_1 = ['a', 'bl', 'sigma']
#pm.traceplot(trace_5_9, varnames_1)
pm.summary(trace_5_9, varnames_1, alpha=.11).round(3)

	mean	sd	mc_error	hpd_5.5	hpd_94.5	n_eff	Rhat
a	0.678	0.253	0.009	0.288	1.083	598.443	1.0
bl	2.062	0.056	0.002	1.972	2.150	555.537	1.0
sigma	0.573	0.042	0.001	0.508	0.639	1022.338	1.0

Code 5.35

In [50]:

milk = pd.read_csv('Data/milk.csv', sep=';')
milk.head()

	clade	species	kcal.per.g	perc.fat	perc.protein	perc.lactose	mass	neocortex.perc
0	Strepsirrhine	Eulemur fulvus	0.49	16.60	15.42	67.98	1.95	55.16
1	Strepsirrhine	E macaco	0.51	19.27	16.91	63.82	2.09	NaN
2	Strepsirrhine	E mongoz	0.46	14.11	16.85	69.04	2.51	NaN
3	Strepsirrhine	E rubriventer	0.48	14.91	13.18	71.91	1.62	NaN
4	Strepsirrhine	Lemur catta	0.60	27.28	19.50	53.22	2.19	NaN

Code 5.36

In [51]:

with pm.Model() as m5_10:
    a = pm.Normal('a',mu = 0.6, sd=10)
    bf = pm.Normal('bf',mu=0, sd= 1)
    mu = pm.Deterministic('mu',a + bf * milk['perc.fat'])    
    sigma = pm.Uniform('sigma', lower= 0 , upper= 10)
    kcalperg = pm.Normal('kcal.per.g',mu=mu, sd=sigma, observed=milk['kcal.per.g'])
    trace_5_10 = pm.sample(1000, tune=1000)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (2 chains in 2 jobs)
NUTS: [sigma_interval__, bf, a]
100%|██████████| 2000/2000 [00:03<00:00, 584.32it/s]

In [52]:

varnames = ['a', 'bf', 'sigma']
pm.summary(trace_5_10, varnames, alpha=.11).round(3)

	mean	sd	mc_error	hpd_5.5	hpd_94.5	n_eff	Rhat
a	0.296	0.038	0.001	0.230	0.352	669.791	1.000
bf	0.010	0.001	0.000	0.009	0.012	672.732	1.000
sigma	0.079	0.012	0.000	0.061	0.097	804.753	1.003

In [53]:

with pm.Model() as m5_11:
    a = pm.Normal('a',mu = 0.6, sd=10)
    bl = pm.Normal('bl',mu=0, sd= 1)
    mu = pm.Deterministic('mu',a + bl * milk['perc.lactose'])    
    sigma = pm.Uniform('sigma', lower= 0 , upper= 10)
    kcalperg = pm.Normal('kcal.per.g',mu=mu, sd=sigma, observed=milk['kcal.per.g'])
    trace_5_11 = pm.sample(1000, tune=1000)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (2 chains in 2 jobs)
NUTS: [sigma_interval__, bl, a]
100%|██████████| 2000/2000 [00:04<00:00, 422.39it/s]

In [54]:

varnames = ['a', 'bl', 'sigma']
pm.summary(trace_5_11, varnames, alpha=.11).round(3)

	mean	sd	mc_error	hpd_5.5	hpd_94.5	n_eff	Rhat
a	1.164	0.048	0.002	1.088	1.239	641.491	1.001
bl	-0.011	0.001	0.000	-0.012	-0.009	692.192	1.001
sigma	0.067	0.009	0.000	0.052	0.081	910.639	1.002

Code 5.37

In [55]:

with pm.Model() as m5_12:
    a = pm.Normal('a',mu = 0.6, sd=10)
    bf = pm.Normal('bf',mu=0, sd= 1)
    bl = pm.Normal('bl',mu=0, sd= 1)
    mu = pm.Deterministic('mu',a + bf * milk['perc.fat'] + bl * milk['perc.lactose'])    
    sigma = pm.Uniform('sigma', lower= 0 , upper= 10)
    kcalperg = pm.Normal('kcal.per.g',mu=mu, sd=sigma, observed=milk['kcal.per.g'])
    trace_5_12 = pm.sample(1000, tune=1000)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (2 chains in 2 jobs)
NUTS: [sigma_interval__, bl, bf, a]
100%|██████████| 2000/2000 [00:14<00:00, 137.25it/s]
The acceptance probability does not match the target. It is 0.8913043977304509, but should be close to 0.8. Try to increase the number of tuning steps.
There were 10 divergences after tuning. Increase `target_accept` or reparameterize.
The acceptance probability does not match the target. It is 0.7166625089873978, but should be close to 0.8. Try to increase the number of tuning steps.
The number of effective samples is smaller than 25% for some parameters.

In [56]:

varnames = ['a', 'bf', 'bl', 'sigma']
pm.summary(trace_5_12, varnames, alpha=.11).round(3)

	mean	sd	mc_error	hpd_5.5	hpd_94.5	n_eff	Rhat
a	0.995	0.223	0.01	0.635	1.348	485.708	1.001
bf	0.002	0.003	0.00	-0.002	0.006	495.420	1.001
bl	-0.009	0.003	0.00	-0.013	-0.004	494.107	1.001
sigma	0.068	0.011	0.00	0.051	0.081	491.258	1.002

Code 5.38

In [57]:

df = milk[['kcal.per.g','perc.fat','perc.lactose']]
sns.pairplot(df);

Code 5.39

In [58]:

milk.corr()['perc.fat']['perc.lactose']

-0.9416373456839282

Code 5.40

In [59]:

def simcoll(r = 0.9):
    milk['x'] = stats.norm.rvs(size=len(milk), loc = r * milk['perc.fat'],
                       scale = np.sqrt((1 - r**2) * milk['perc.fat'].var()))
    X = np.column_stack((milk['perc.fat'], milk['x']))
    m = smf.OLS(milk['kcal.per.g'], X).fit()
    cov = m.cov_params()
    return (np.diag(cov)[1])**0.5
    
def repsimcoll(r= 0.9, N = 100):
    stddev = [simcoll(r) for _ in range(N)]
    return np.mean(stddev)

lista = []
for i in np.arange(start = 0, stop = 0.99, step = 0.01):   
    lista.append(repsimcoll (r= i, N = 100))

plt.plot(np.arange(start = 0, stop = 0.99, step = 0.01), lista)
plt.xlabel('correlation', fontsize=14)
plt.ylabel('stddev', fontsize=14);

Code 5.41

In [60]:

# number of plants
N = 100

# simulate initial heights
h0 = stats.norm.rvs(size = N, loc = 10, scale = 2)

# assign treatments and simulate fungus and growth
treatment = np.repeat([0, 1], [N/2]*2)
fungus = np.random.binomial(n=1, p=(0.5-treatment * 0.4), size=N)
h1 = h0 + stats.norm.rvs(size= N, loc= 5- 3*fungus, scale=1)

# compose a clean data frame
d = pd.DataFrame({'h0': h0,
                  'h1': h1,
                  'Treatment':treatment,
                  'Fungus': fungus})

Code 5.42

In [61]:

with pm.Model() as m5_13:
    a = pm.Normal('a',mu = 0, sd=100)
    bh = pm.Normal('bh',mu = 0, sd=10)
    bt = pm.Normal('bt',mu = 0, sd=10)
    bf = pm.Normal('bf',mu = 0, sd=10)
    mu = pm.Deterministic('mu',a + bh * h0 + bt * treatment + bf * fungus)
    sigma = pm.Uniform('sigma', lower= 0 , upper= 10)
    h1 = pm.Normal('h1', mu = mu, sd=sigma, observed = d['h1'].get_values())
    trace_5_13 = pm.sample(1000, tune=1000)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (2 chains in 2 jobs)
NUTS: [sigma_interval__, bf, bt, bh, a]
100%|██████████| 2000/2000 [00:09<00:00, 219.22it/s]

In [62]:

varnames = ['a', 'bh', 'bt', 'bf', 'sigma']
pm.summary(trace_5_13, varnames, alpha=.11).round(3)

	mean	sd	mc_error	hpd_5.5	hpd_94.5	n_eff	Rhat
a	5.152	0.522	0.015	4.346	5.958	945.669	1.000
bh	0.990	0.052	0.002	0.912	1.072	942.363	1.000
bt	-0.047	0.206	0.005	-0.362	0.287	1353.866	1.000
bf	-2.668	0.232	0.006	-3.051	-2.326	1269.197	1.000
sigma	0.985	0.071	0.002	0.876	1.094	1499.170	1.001

Code 5.43

In [63]:

with pm.Model() as m5_14:
    a = pm.Normal('a',mu = 0, sd=100)
    bh = pm.Normal('bh',mu = 0, sd=10)
    bt = pm.Normal('bt',mu = 0, sd=10)
    mu = pm.Deterministic('mu',a + bh * h0 + bt * treatment)
    sigma = pm.Uniform('sigma', lower= 0 , upper= 10)
    h1 = pm.Normal('h1', mu = mu, sd=sigma, observed =d['h1'])
    trace_5_14 = pm.sample(1000, tune=1000)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (2 chains in 2 jobs)
NUTS: [sigma_interval__, bt, bh, a]
100%|██████████| 2000/2000 [00:06<00:00, 323.02it/s]
The acceptance probability does not match the target. It is 0.9031963539075056, but should be close to 0.8. Try to increase the number of tuning steps.

In [64]:

varnames = ['a', 'bh', 'bt', 'sigma']
pm.summary(trace_5_14, varnames, alpha=.11).round(3)

	mean	sd	mc_error	hpd_5.5	hpd_94.5	n_eff	Rhat
a	4.289	0.801	0.026	2.849	5.387	808.481	0.999
bh	0.962	0.081	0.003	0.836	1.088	828.823	1.000
bt	0.770	0.294	0.008	0.283	1.216	1210.581	1.000
sigma	1.502	0.111	0.003	1.328	1.679	1320.224	1.000

Code 5.44

In [65]:

d = pd.read_csv('Data/Howell1.csv', sep=';')
d.head()

	height	weight	age	male
0	151.765	47.825606	63.0	1
1	139.700	36.485807	63.0	0
2	136.525	31.864838	65.0	0
3	156.845	53.041915	41.0	1
4	145.415	41.276872	51.0	0

Code 5.45

In [66]:

with pm.Model() as m5_15:
    a = pm.Normal('a',mu = 178, sd=100)
    bm = pm.Normal('bm',mu = 0, sd=10)
    mu = pm.Deterministic('mu',a + bm * d['male'])
    sigma = pm.Uniform('sigma', lower= 0 , upper= 50)
    height = pm.Normal('height', mu = mu, sd=sigma, observed = d['height'])
    trace_5_15 = pm.sample(1000, tune=1000)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (2 chains in 2 jobs)
NUTS: [sigma_interval__, bm, a]
100%|██████████| 2000/2000 [00:02<00:00, 838.72it/s]

In [67]:

varnames = ['a', 'bm', 'sigma']
pm.summary(trace_5_15, varnames, alpha=.11).round(3)

	mean	sd	mc_error	hpd_5.5	hpd_94.5	n_eff	Rhat
a	134.842	1.631	0.048	132.307	137.442	966.190	1.0
bm	7.222	2.354	0.069	3.571	11.139	974.984	1.0
sigma	27.405	0.798	0.020	26.216	28.720	1916.216	1.0

Code 5.46

In [68]:

mu.male = trace_5_15['a'] + trace_5_15['bm']
pm.hpd(mu.male)

array([138.85167759, 145.38548207])

Code 5.47

In [69]:

with pm.Model() as m5_15b:
    af = pm.Normal('af',mu = 178, sd=100)
    am = pm.Normal('am',mu = 178, sd=100)
    mu = pm.Deterministic('mu',af * (1 - d['male']) + am * d['male'])
    sigma = pm.Uniform('sigma', lower= 0 , upper= 50)
    height = pm.Normal('height', mu = mu, sd=sigma, observed = d['height'])
    trace_5_15b = pm.sample(1000, tune=1000)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (2 chains in 2 jobs)
NUTS: [sigma_interval__, am, af]
100%|██████████| 2000/2000 [00:01<00:00, 1144.87it/s]

Code 5.48

In [70]:

d = pd.read_csv('Data/milk.csv', sep=';')
d = d.drop_duplicates()

Code 5.49

In [71]:

d['clade.NWM'] = np.where( d['clade'] == 'New World Monkey', 1, 0)
d['clade.NWM'].get_values()

array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0])

Code 5.50

In [72]:

d['clade.OWM'] = np.where( d['clade'] == 'Old World Monkey', 1, 0)
d['clade.S'] = np.where( d['clade'] == 'Strepsirrhine', 1, 0)

Code 5.51

In [73]:

with pm.Model() as m5_16:
    a = pm.Normal('a', mu = 0.6, sd=10)
    b_NWM = pm.Normal('b_NWM',mu = 0, sd=1)
    b_OWM = pm.Normal('b_OWM',mu = 0, sd=1)
    b_S = pm.Normal('b_S',mu = 0, sd=1)
    mu = pm.Deterministic('mu', a + 
                          b_NWM * d['clade.NWM'] + 
                          b_OWM * d['clade.OWM'] +
                          b_S * d['clade.S'])
    
    # instead of adding this as a deterministic when running the model 
    # it is possible to add them, after sampling using something like this
    # trace_5_16.add_values({'mu_NWM', trace_5_16[a] + trace_5_16['b_NWM']})
    mu_ape = pm.Deterministic('mu_ape', a + 0)
    mu_NWM = pm.Deterministic('mu_NWM', a + b_NWM)
    mu_OWM = pm.Deterministic('mu_OWM', a + b_OWM)
    mu_S =  pm.Deterministic('mu_S', a + b_S)
    
    sigma = pm.Uniform('sigma', lower= 0 , upper= 10)
    kcal_per_g = pm.Normal('kcal_per_g', mu = mu, sd=sigma, observed = d['kcal.per.g'])
    trace_5_16 = pm.sample(1000, tune=1000)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (2 chains in 2 jobs)
NUTS: [sigma_interval__, b_S, b_OWM, b_NWM, a]
100%|██████████| 2000/2000 [00:02<00:00, 689.23it/s]

In [74]:

varnames = ['a', 'b_NWM', 'b_OWM', 'b_S', 'sigma']
pm.summary(trace_5_16, varnames, alpha=.11).round(3)

	mean	sd	mc_error	hpd_5.5	hpd_94.5	n_eff	Rhat
a	0.546	0.044	0.001	0.472	0.612	986.461	1.0
b_NWM	0.166	0.062	0.002	0.079	0.272	1144.439	1.0
b_OWM	0.243	0.068	0.002	0.139	0.350	1063.904	1.0
b_S	-0.036	0.073	0.002	-0.155	0.078	1195.937	1.0
sigma	0.130	0.019	0.000	0.103	0.163	1166.728	1.0

Code 5.52

In [75]:

varnames = ['mu_ape', 'mu_NWM', 'b_OWM', 'b_S']
pm.summary(trace_5_16, varnames, alpha=.11).round(3)[['mean', 'sd', 'hpd_5.5', 'hpd_94.5']]

	mean	sd	hpd_5.5	hpd_94.5
mu_ape	0.546	0.044	0.472	0.612
mu_NWM	0.712	0.044	0.643	0.780
b_OWM	0.243	0.068	0.139	0.350
b_S	-0.036	0.073	-0.155	0.078

Code 5.53

In [76]:

diff_NMW_OWM = trace_5_16['mu_NWM'] - trace_5_16['mu_OWM']
np.percentile(diff_NMW_OWM, 2.5), np.percentile(diff_NMW_OWM, 50), np.percentile(diff_NMW_OWM, 97.5)

(-0.20627036770774942, -0.07802395845314053, 0.05473488710942071)

Code 5.54

In [77]:

z = pd.Categorical(d['clade'])
d['clade_id'] = z.codes

Code 5.55

In [78]:

with pm.Model() as m5_16_alt:
    a = pm.Normal('a',mu = 0.6, sd=10)
    mu = pm.Deterministic('mu', a)
    sigma = pm.Uniform('sigma', lower= 0 , upper= 10)
    kcal_per_g = pm.Normal('kcal_per_g', mu = mu, sd=sigma, observed = d['kcal.per.g'])
    trace_5_16_alt = pm.sample(1000, tune=1000)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (2 chains in 2 jobs)
NUTS: [sigma_interval__, mu]
100%|██████████| 2000/2000 [00:01<00:00, 1755.60it/s]

In [79]:

varnames = ['mu', 'sigma']
pm.summary(trace_5_16_alt, varnames, alpha=.11).round(3)

	mean	sd	mc_error	hpd_5.5	hpd_94.5	n_eff	Rhat
mu	0.641	0.032	0.001	0.588	0.688	1877.261	1.0
sigma	0.168	0.023	0.001	0.131	0.201	1930.809	1.0

The following cells (5.56-5.61) correspond to example code for the use of R's function: lm. Therefore they have no output.

Code 5.62

In [80]:

data  = pd.read_csv('Data/cars.csv', sep=',') 
pm.GLM.from_formula('dist ~ speed', data=data)

\begin{array}{rcl} \text{Intercept} &\sim & \text{Flat}()\\\text{speed} &\sim & \text{Normal}(\mathit{mu}=0,~\mathit{sd}=1000.0)\\\text{sd} &\sim & \text{HalfCauchy}(\mathit{beta}=10)\\\text{y} &\sim & \text{Normal}(\mathit{mu}=f(f(array,~f(\text{Intercept},~\text{speed})),~f()),~\mathit{sd}=f(\text{sd})) \end{array}

In [81]:

import sys, IPython, scipy, matplotlib, platform
print("This notebook was createad on a computer %s running %s and using:\nPython %s\nIPython %s\nPyMC3 %s\nNumPy %s\nPandas %s\nSciPy %s\nMatplotlib %s\nSeaborn %s\n" % (platform.machine(), ' '.join(platform.linux_distribution()[:2]), sys.version[:5], IPython.__version__, pm.__version__, np.__version__, pd.__version__, scipy.__version__, matplotlib.__version__, sns.__version__))

This notebook was createad on a computer x86_64 running debian stretch/sid and using:
Python 3.6.3
IPython 6.2.1
PyMC3 3.3
NumPy 1.14.1
Pandas 0.22.0
SciPy 1.0.0
Matplotlib 2.1.2
Seaborn 0.8.1