Contact
CoCalc Logo Icon
StoreFeaturesDocsShareSupport News AboutSign UpSign In
| Download

📚 The CoCalc Library - books, templates and other resources

Views: 95645
License: OTHER
Kernel: Python 3
%matplotlib inline import pymc3 as pm import numpy as np import pandas as pd from scipy import stats import matplotlib.pyplot as plt import seaborn as sns import statsmodels.formula.api as smf %config InlineBackend.figure_format = 'retina' plt.style.use(['seaborn-colorblind', 'seaborn-darkgrid'])
/home/osvaldo/anaconda3/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`. from ._conv import register_converters as _register_converters

Code 5.1

# load data d = pd.read_csv('Data/WaffleDivorce.csv', sep=';') # standardize predictor d['MedianAgeMarriage_s'] = (d.MedianAgeMarriage - d.MedianAgeMarriage.mean()) / d.MedianAgeMarriage.std()
with pm.Model() as model_5_1: a = pm.Normal('a', mu=10, sd=10) bA = pm.Normal('bA', mu=0, sd=1) sigma = pm.Uniform('sigma', lower=0, upper=10) # good (default) alternatives for sigma (in this and other models) are # sigma = pm.HalfNormal('sigma', 5) # sigma = pm.HalfCauchy('sigma', 5) # some people recomed avoiding "hard" boundaries unless they have a theoretical/data-based justification, like a correlation that is restricted to be [-1, 1]. mu = pm.Deterministic('mu', a + bA * d.MedianAgeMarriage_s) Divorce = pm.Normal('Divorce', mu=mu, sd=sigma, observed=d.Divorce) trace_5_1 = pm.sample(1000, tune=1000)
Auto-assigning NUTS sampler... Initializing NUTS using jitter+adapt_diag... Multiprocess sampling (2 chains in 2 jobs) NUTS: [sigma_interval__, bA, a] 100%|██████████| 2000/2000 [00:01<00:00, 1027.91it/s]
varnames = ['a', 'bA', 'sigma'] pm.traceplot(trace_5_1, varnames);
Image in a Jupyter notebook

Code 5.2

mu_mean = trace_5_1['mu'] mu_hpd = pm.hpd(mu_mean) plt.plot(d.MedianAgeMarriage_s, d.Divorce, 'C0o') plt.plot(d.MedianAgeMarriage_s, mu_mean.mean(0), 'C2') idx = np.argsort(d.MedianAgeMarriage_s) plt.fill_between(d.MedianAgeMarriage_s[idx], mu_hpd[:,0][idx], mu_hpd[:,1][idx], color='C2', alpha=0.25) plt.xlabel('Meadian Age Marriage', fontsize=14) plt.ylabel('Divorce', fontsize=14);
Image in a Jupyter notebook
Code 5.3
d['Marriage_s'] = (d.Marriage - d.Marriage.mean()) / d.Marriage.std()
with pm.Model() as model_5_2: a = pm.Normal('a', mu=10, sd=10) bA = pm.Normal('bA', mu=0, sd=1) sigma = pm.Uniform('sigma', lower=0, upper=10) mu = pm.Deterministic('mu', a + bA * d.Marriage_s) Divorce = pm.Normal('Divorce', mu=mu, sd=sigma, observed=d.Divorce) trace_5_2 = pm.sample(1000, tune=1000)
Auto-assigning NUTS sampler... Initializing NUTS using jitter+adapt_diag... Multiprocess sampling (2 chains in 2 jobs) NUTS: [sigma_interval__, bA, a] 100%|██████████| 2000/2000 [00:01<00:00, 1166.50it/s]
pm.traceplot(trace_5_2, varnames);
Image in a Jupyter notebook
mu_mean = trace_5_2['mu'] mu_hpd = pm.hpd(mu_mean) d.plot('Marriage_s', 'Divorce', kind='scatter', xlim = (-2, 3)) plt.plot(d.Marriage_s, mu_mean.mean(0), 'C2') idx = np.argsort(d.Marriage_s) plt.fill_between(d.Marriage_s[idx], mu_hpd[:,0][idx], mu_hpd[:,1][idx], color='C2', alpha=0.25);
Image in a Jupyter notebook

Code 5.4

with pm.Model() as model_5_3: a = pm.Normal('a', mu=10, sd=10) bA = pm.Normal('bA', mu=0, sd=1, shape=2) sigma = pm.Uniform('sigma', lower=0, upper=10) mu = pm.Deterministic('mu', a + bA[0] * d.Marriage_s + bA[1] * d.MedianAgeMarriage_s) Divorce = pm.Normal('Divorce', mu=mu, sd=sigma, observed=d.Divorce) trace_5_3 = pm.sample(1000, tune=1000)
Auto-assigning NUTS sampler... Initializing NUTS using jitter+adapt_diag... Multiprocess sampling (2 chains in 2 jobs) NUTS: [sigma_interval__, bA, a] 100%|██████████| 2000/2000 [00:02<00:00, 699.95it/s]
varnames = ['a', 'bA', 'sigma'] pm.traceplot(trace_5_3, varnames);
Image in a Jupyter notebook
pm.summary(trace_5_3, varnames, alpha=.11).round(3)
mean sd mc_error hpd_5.5 hpd_94.5 n_eff Rhat
a 9.688 0.219 0.005 9.337 10.036 1820.070 1.0
bA__0 -0.122 0.289 0.007 -0.579 0.339 1506.622 1.0
bA__1 -1.120 0.288 0.007 -1.578 -0.676 1540.244 1.0
sigma 1.522 0.157 0.003 1.277 1.771 1561.718 1.0

Code 5.5

pm.forestplot(trace_5_3, varnames=varnames);
Image in a Jupyter notebook

Code 5.6

with pm.Model() as model_5_4: a = pm.Normal('a', mu=10, sd=10) b = pm.Normal('b', mu=0, sd=1) sigma = pm.Uniform('sigma', lower=0, upper=10) mu = pm.Deterministic('mu', a + b * d.MedianAgeMarriage_s) Marriage = pm.Normal('Marriage', mu=mu, sd=sigma, observed=d.Marriage_s) trace_5_4 = pm.sample(1000, tune=1000)
Auto-assigning NUTS sampler... Initializing NUTS using jitter+adapt_diag... Multiprocess sampling (2 chains in 2 jobs) NUTS: [sigma_interval__, b, a] 100%|██████████| 2000/2000 [00:01<00:00, 1320.20it/s]
varnames = ['a', 'b', 'sigma'] pm.traceplot(trace_5_4, varnames);
Image in a Jupyter notebook

Code 5.7

mu_pred = trace_5_4['mu'].mean(0) residuals = d.Marriage_s - mu_pred

Code 5.8

idx = np.argsort(d.MedianAgeMarriage_s) d.plot('MedianAgeMarriage_s', 'Marriage_s', kind='scatter', xlim = (-3, 3), ylim = (-3, 3)) plt.plot(d.MedianAgeMarriage_s[idx], mu_pred[idx], 'k') plt.vlines(d.MedianAgeMarriage_s, mu_pred, mu_pred + residuals, colors='grey');
Image in a Jupyter notebook

Code 5.9

R_avg = np.linspace(-3, 3, 100) mu_pred = trace_5_3['a'] + trace_5_3['bA'][:,0] * R_avg[:,None] mu_hpd = pm.hpd(mu_pred.T) divorce_hpd = pm.hpd(stats.norm.rvs(mu_pred, trace_5_3['sigma']).T) plt.plot(R_avg, mu_pred.mean(1), 'C0'); plt.fill_between(R_avg, mu_hpd[:,0], mu_hpd[:,1], color='C2', alpha=0.25) plt.fill_between(R_avg, divorce_hpd[:,0], divorce_hpd[:,1], color='C2', alpha=0.25) plt.xlabel('Marriage.s') plt.ylabel('Divorce') plt.title('MedianAgeMarriage_s = 0');
Image in a Jupyter notebook

Code 5.10

R_avg = np.linspace(-3, 3, 100) mu_pred = trace_5_3['a'] + trace_5_3['bA'][:,1] * R_avg[:,None] mu_hpd = pm.hpd(mu_pred.T) divorce_hpd = pm.hpd(stats.norm.rvs(mu_pred, trace_5_3['sigma']).T) plt.plot(R_avg, mu_pred.mean(1), 'C0'); plt.fill_between(R_avg, mu_hpd[:,0], mu_hpd[:,1], color='C2', alpha=0.25) plt.fill_between(R_avg, divorce_hpd[:,0], divorce_hpd[:,1], color='C2', alpha=0.25) plt.xlabel('MedianAgeMarriage.s') plt.ylabel('Divorce') plt.title('Marriage_s = 0');
Image in a Jupyter notebook

Code 5.11

mu_pred = trace_5_3['mu'] mu_hpd = pm.hpd(mu_pred) divorce_pred = pm.sample_ppc(trace_5_3, samples=1000, model=model_5_3)['Divorce'] divorce_hpd = pm.hpd(divorce_pred)
100%|██████████| 1000/1000 [00:00<00:00, 2766.11it/s]

Code 5.12

mu_hpd = pm.hpd(mu_pred, alpha=0.05) plt.errorbar(d.Divorce, divorce_pred.mean(0), yerr=np.abs(divorce_pred.mean(0)-mu_hpd.T) , fmt='C0o') plt.plot(d.Divorce, divorce_pred.mean(0), 'C0o') plt.xlabel('Observed divorce', fontsize=14) plt.ylabel('Predicted divorce', fontsize=14) min_x, max_x = d.Divorce.min(), d.Divorce.max() plt.plot([min_x, max_x], [min_x, max_x], 'k--');
Image in a Jupyter notebook

Code 5.14

plt.figure(figsize=(10,12)) residuals = d.Divorce - mu_pred.mean(0) idx = np.argsort(residuals) y_label = d.Loc[idx] y_points = np.linspace(0, 1, 50) plt.errorbar(residuals[idx], y_points, xerr=np.abs(divorce_pred.mean(0)-mu_hpd.T), fmt='C0o',lw=3) plt.errorbar(residuals[idx], y_points, xerr=np.abs(divorce_pred.mean(0)-divorce_hpd.T), fmt='C0o', lw=3, alpha=0.5) plt.yticks(y_points, y_label); plt.vlines(0, 0, 1, 'grey');
Image in a Jupyter notebook

Code 5.15

N = 100 x_real = stats.norm.rvs(size=N) x_spur = stats.norm.rvs(x_real) y = stats.norm.rvs(x_real) d = pd.DataFrame([y, x_real, x_spur]).T sns.pairplot(d);
Image in a Jupyter notebook

Code 5.16

d = pd.read_csv('Data/milk.csv', sep=';') d.head()
clade species kcal.per.g perc.fat perc.protein perc.lactose mass neocortex.perc
0 Strepsirrhine Eulemur fulvus 0.49 16.60 15.42 67.98 1.95 55.16
1 Strepsirrhine E macaco 0.51 19.27 16.91 63.82 2.09 NaN
2 Strepsirrhine E mongoz 0.46 14.11 16.85 69.04 2.51 NaN
3 Strepsirrhine E rubriventer 0.48 14.91 13.18 71.91 1.62 NaN
4 Strepsirrhine Lemur catta 0.60 27.28 19.50 53.22 2.19 NaN

Code 5.17 to 5.20

dcc = d.dropna().copy()
with pm.Model() as model_5_5: a = pm.Normal('a', mu=10, sd=100) bn = pm.Normal('bn', mu=0, sd=1) sigma = pm.Uniform('sigma', lower=0, upper=1) mu = pm.Deterministic('mu', a + bn * dcc['neocortex.perc']) kcal = pm.Normal('kcal', mu=mu, sd=sigma, observed=dcc['kcal.per.g']) trace_5_5 = pm.sample(1000, tune=1000)
Auto-assigning NUTS sampler... Initializing NUTS using jitter+adapt_diag... Multiprocess sampling (2 chains in 2 jobs) NUTS: [sigma_interval__, bn, a] 100%|██████████| 2000/2000 [00:09<00:00, 217.22it/s] There were 4 divergences after tuning. Increase `target_accept` or reparameterize. The number of effective samples is smaller than 25% for some parameters.
varnames = ['a', 'bn', 'sigma'] pm.traceplot(trace_5_5, varnames);
Image in a Jupyter notebook

Code 5.21

pm.summary(trace_5_5, varnames, alpha=.11).round(3)
mean sd mc_error hpd_5.5 hpd_94.5 n_eff Rhat
a 0.338 0.554 0.024 -0.540 1.205 536.423 1.0
bn 0.005 0.008 0.000 -0.008 0.018 536.629 1.0
sigma 0.196 0.041 0.002 0.134 0.254 474.932 1.0

Code 5.22

trace_5_5['bn'].mean() * (76 - 55)
0.09955421253996859

Code 5.23

seq = np.linspace(55, 76, 50) mu_pred = trace_5_5['a'] + trace_5_5['bn'] * seq[:,None] mu_hpd = pm.hpd(mu_pred.T) plt.plot(d['neocortex.perc'], d['kcal.per.g'], 'C0o') plt.plot(seq, mu_pred.mean(1), 'k') plt.plot(seq, mu_hpd[:,0], 'k--') plt.plot(seq, mu_hpd[:,1], 'k--') plt.xlabel('neocortex.perc', fontsize=14) plt.ylabel('kcal.per.g', fontsize=14);
Image in a Jupyter notebook

Code 5.24

dcc['log_mass'] = np.log(dcc['mass'])

Code 5.25

with pm.Model() as model_5_6: a = pm.Normal('a', mu=10, sd=100) bn = pm.Normal('bn', mu=0, sd=1) sigma = pm.Uniform('sigma', lower=0, upper=1) mu = pm.Deterministic('mu', a + bn * dcc['log_mass']) kcal = pm.Normal('kcal', mu=mu, sd=sigma, observed=dcc['kcal.per.g']) trace_5_6 = pm.sample(1000, tune=1000)
Auto-assigning NUTS sampler... Initializing NUTS using jitter+adapt_diag... Multiprocess sampling (2 chains in 2 jobs) NUTS: [sigma_interval__, bn, a] 100%|██████████| 2000/2000 [00:01<00:00, 1062.03it/s]
pm.summary(trace_5_6, varnames, alpha=.11).round(3)
mean sd mc_error hpd_5.5 hpd_94.5 n_eff Rhat
a 0.703 0.058 0.002 0.609 0.791 900.755 1.000
bn -0.031 0.024 0.001 -0.072 0.003 914.185 0.999
sigma 0.182 0.037 0.001 0.126 0.232 1031.675 1.000

Code 5.26

with pm.Model() as model_5_7: a = pm.Normal('a', mu=10, sd=100) bn = pm.Normal('bn', mu=0, sd=1, shape=2) sigma = pm.Uniform('sigma', lower=0, upper=1) mu = pm.Deterministic('mu', a + bn[0] * dcc['neocortex.perc'] + bn[1] * dcc['log_mass']) kcal = pm.Normal('kcal', mu=mu, sd=sigma, observed=dcc['kcal.per.g']) trace_5_7 = pm.sample(1000, tune=1000)
Auto-assigning NUTS sampler... Initializing NUTS using jitter+adapt_diag... Multiprocess sampling (2 chains in 2 jobs) NUTS: [sigma_interval__, bn, a] 100%|██████████| 2000/2000 [00:16<00:00, 123.40it/s] There were 3 divergences after tuning. Increase `target_accept` or reparameterize. There were 11 divergences after tuning. Increase `target_accept` or reparameterize. The number of effective samples is smaller than 25% for some parameters.
pm.summary(trace_5_7, varnames, alpha=.11).round(3)
mean sd mc_error hpd_5.5 hpd_94.5 n_eff Rhat
a -1.109 0.579 0.023 -1.959 -0.097 480.448 1.000
bn__0 0.028 0.009 0.000 0.013 0.042 470.322 1.000
bn__1 -0.097 0.028 0.001 -0.140 -0.051 574.242 0.999
sigma 0.140 0.030 0.001 0.093 0.179 282.447 1.001

Code 5.27

seq = np.linspace(55, 76, 50) mu_pred = trace_5_7['a'] + trace_5_7['bn'][:,0] * seq[:,None] + trace_5_7['bn'][:,1] * dcc['log_mass'].mean() mu_hpd = pm.hpd(mu_pred.T) plt.plot(seq, mu_pred.mean(1), 'k') plt.plot(seq, mu_hpd[:,0], 'k--') plt.plot(seq, mu_hpd[:,1], 'k--') plt.xlabel('neocortex.perc') plt.ylabel('kcal.per.g');
Image in a Jupyter notebook

Code 5.28

N = 100 rho = 0.7 x_pos = stats.norm.rvs(size=N) x_neg = stats.norm.rvs(rho*x_pos, (1-rho**2)**0.5) y = stats.norm.rvs(x_pos - x_neg) d = pd.DataFrame([y, x_real, x_spur]).T sns.pairplot(d);
Image in a Jupyter notebook

Code 5.29

N = 100 height = stats.norm.rvs(size=N, loc=10, scale=2) leg_prop = stats.uniform.rvs(size=N, loc=0.4, scale=0.1) leg_left = leg_prop * height + stats.norm.rvs(size=N, loc=0, scale=0.02) leg_right = leg_prop * height + stats.norm.rvs(size=N, loc=0, scale=0.02)

Code 5.30

with pm.Model() as m5_8: a = pm.Normal('a', mu=10, sd=100) bl = pm.Normal('bl', mu=2, sd=10) br = pm.Normal('br', mu=2, sd=10) mu = pm.Deterministic('mu', a + bl * leg_left + br * leg_right) sigma = pm.Uniform('sigma', lower=0 , upper=10) height_p = pm.Normal('height_p', mu=mu, sd=sigma, observed=height) trace_5_8 = pm.sample(1000, tune=1000)
Auto-assigning NUTS sampler... Initializing NUTS using jitter+adapt_diag... Multiprocess sampling (2 chains in 2 jobs) NUTS: [sigma_interval__, br, bl, a] 100%|██████████| 2000/2000 [01:49<00:00, 18.31it/s]
varnames=['a', 'bl', 'br', 'sigma'] pm.traceplot(trace_5_8, varnames); pm.summary(trace_5_8, varnames, alpha=.11).round(3)
mean sd mc_error hpd_5.5 hpd_94.5 n_eff Rhat
a 0.706 0.265 0.008 0.292 1.126 978.995 1.0
bl 3.498 1.882 0.065 0.728 6.536 773.283 1.0
br -1.443 1.890 0.065 -4.147 1.677 774.994 1.0
sigma 0.577 0.040 0.001 0.516 0.637 1097.410 1.0
Image in a Jupyter notebook

Code 5.31

pm.forestplot(trace_5_8, varnames=varnames);
Image in a Jupyter notebook

Code 5.32

plt.scatter(trace_5_8['br'], trace_5_8['bl']);
Image in a Jupyter notebook

Code 5.33

sum_blbr = trace_5_8['br'] + trace_5_8['bl'] pm.kdeplot(sum_blbr);
Image in a Jupyter notebook

Code 5.34

with pm.Model() as m5_9: a = pm.Normal('a',mu = 10, sd=100) bl = pm.Normal('bl',mu=2, sd= 10) mu = pm.Deterministic('mu',a + bl * leg_left) sigma = pm.Uniform('sigma', lower= 0 , upper= 10) height = pm.Normal('height',mu=mu, sd=sigma, observed=height) trace_5_9 = pm.sample(1000, tune=1000)
Auto-assigning NUTS sampler... Initializing NUTS using jitter+adapt_diag... Multiprocess sampling (2 chains in 2 jobs) NUTS: [sigma_interval__, bl, a] 100%|██████████| 2000/2000 [00:04<00:00, 497.31it/s]
varnames_1 = ['a', 'bl', 'sigma'] #pm.traceplot(trace_5_9, varnames_1) pm.summary(trace_5_9, varnames_1, alpha=.11).round(3)
mean sd mc_error hpd_5.5 hpd_94.5 n_eff Rhat
a 0.678 0.253 0.009 0.288 1.083 598.443 1.0
bl 2.062 0.056 0.002 1.972 2.150 555.537 1.0
sigma 0.573 0.042 0.001 0.508 0.639 1022.338 1.0

Code 5.35

milk = pd.read_csv('Data/milk.csv', sep=';') milk.head()
clade species kcal.per.g perc.fat perc.protein perc.lactose mass neocortex.perc
0 Strepsirrhine Eulemur fulvus 0.49 16.60 15.42 67.98 1.95 55.16
1 Strepsirrhine E macaco 0.51 19.27 16.91 63.82 2.09 NaN
2 Strepsirrhine E mongoz 0.46 14.11 16.85 69.04 2.51 NaN
3 Strepsirrhine E rubriventer 0.48 14.91 13.18 71.91 1.62 NaN
4 Strepsirrhine Lemur catta 0.60 27.28 19.50 53.22 2.19 NaN

Code 5.36

with pm.Model() as m5_10: a = pm.Normal('a',mu = 0.6, sd=10) bf = pm.Normal('bf',mu=0, sd= 1) mu = pm.Deterministic('mu',a + bf * milk['perc.fat']) sigma = pm.Uniform('sigma', lower= 0 , upper= 10) kcalperg = pm.Normal('kcal.per.g',mu=mu, sd=sigma, observed=milk['kcal.per.g']) trace_5_10 = pm.sample(1000, tune=1000)
Auto-assigning NUTS sampler... Initializing NUTS using jitter+adapt_diag... Multiprocess sampling (2 chains in 2 jobs) NUTS: [sigma_interval__, bf, a] 100%|██████████| 2000/2000 [00:03<00:00, 584.32it/s]
varnames = ['a', 'bf', 'sigma'] pm.summary(trace_5_10, varnames, alpha=.11).round(3)
mean sd mc_error hpd_5.5 hpd_94.5 n_eff Rhat
a 0.296 0.038 0.001 0.230 0.352 669.791 1.000
bf 0.010 0.001 0.000 0.009 0.012 672.732 1.000
sigma 0.079 0.012 0.000 0.061 0.097 804.753 1.003
with pm.Model() as m5_11: a = pm.Normal('a',mu = 0.6, sd=10) bl = pm.Normal('bl',mu=0, sd= 1) mu = pm.Deterministic('mu',a + bl * milk['perc.lactose']) sigma = pm.Uniform('sigma', lower= 0 , upper= 10) kcalperg = pm.Normal('kcal.per.g',mu=mu, sd=sigma, observed=milk['kcal.per.g']) trace_5_11 = pm.sample(1000, tune=1000)
Auto-assigning NUTS sampler... Initializing NUTS using jitter+adapt_diag... Multiprocess sampling (2 chains in 2 jobs) NUTS: [sigma_interval__, bl, a] 100%|██████████| 2000/2000 [00:04<00:00, 422.39it/s]
varnames = ['a', 'bl', 'sigma'] pm.summary(trace_5_11, varnames, alpha=.11).round(3)
mean sd mc_error hpd_5.5 hpd_94.5 n_eff Rhat
a 1.164 0.048 0.002 1.088 1.239 641.491 1.001
bl -0.011 0.001 0.000 -0.012 -0.009 692.192 1.001
sigma 0.067 0.009 0.000 0.052 0.081 910.639 1.002

Code 5.37

with pm.Model() as m5_12: a = pm.Normal('a',mu = 0.6, sd=10) bf = pm.Normal('bf',mu=0, sd= 1) bl = pm.Normal('bl',mu=0, sd= 1) mu = pm.Deterministic('mu',a + bf * milk['perc.fat'] + bl * milk['perc.lactose']) sigma = pm.Uniform('sigma', lower= 0 , upper= 10) kcalperg = pm.Normal('kcal.per.g',mu=mu, sd=sigma, observed=milk['kcal.per.g']) trace_5_12 = pm.sample(1000, tune=1000)
Auto-assigning NUTS sampler... Initializing NUTS using jitter+adapt_diag... Multiprocess sampling (2 chains in 2 jobs) NUTS: [sigma_interval__, bl, bf, a] 100%|██████████| 2000/2000 [00:14<00:00, 137.25it/s] The acceptance probability does not match the target. It is 0.8913043977304509, but should be close to 0.8. Try to increase the number of tuning steps. There were 10 divergences after tuning. Increase `target_accept` or reparameterize. The acceptance probability does not match the target. It is 0.7166625089873978, but should be close to 0.8. Try to increase the number of tuning steps. The number of effective samples is smaller than 25% for some parameters.
varnames = ['a', 'bf', 'bl', 'sigma'] pm.summary(trace_5_12, varnames, alpha=.11).round(3)
mean sd mc_error hpd_5.5 hpd_94.5 n_eff Rhat
a 0.995 0.223 0.01 0.635 1.348 485.708 1.001
bf 0.002 0.003 0.00 -0.002 0.006 495.420 1.001
bl -0.009 0.003 0.00 -0.013 -0.004 494.107 1.001
sigma 0.068 0.011 0.00 0.051 0.081 491.258 1.002

Code 5.38

df = milk[['kcal.per.g','perc.fat','perc.lactose']] sns.pairplot(df);
Image in a Jupyter notebook

Code 5.39

milk.corr()['perc.fat']['perc.lactose']
-0.9416373456839282

Code 5.40

def simcoll(r = 0.9): milk['x'] = stats.norm.rvs(size=len(milk), loc = r * milk['perc.fat'], scale = np.sqrt((1 - r**2) * milk['perc.fat'].var())) X = np.column_stack((milk['perc.fat'], milk['x'])) m = smf.OLS(milk['kcal.per.g'], X).fit() cov = m.cov_params() return (np.diag(cov)[1])**0.5 def repsimcoll(r= 0.9, N = 100): stddev = [simcoll(r) for _ in range(N)] return np.mean(stddev) lista = [] for i in np.arange(start = 0, stop = 0.99, step = 0.01): lista.append(repsimcoll (r= i, N = 100)) plt.plot(np.arange(start = 0, stop = 0.99, step = 0.01), lista) plt.xlabel('correlation', fontsize=14) plt.ylabel('stddev', fontsize=14);
Image in a Jupyter notebook

Code 5.41

# number of plants N = 100 # simulate initial heights h0 = stats.norm.rvs(size = N, loc = 10, scale = 2) # assign treatments and simulate fungus and growth treatment = np.repeat([0, 1], [N/2]*2) fungus = np.random.binomial(n=1, p=(0.5-treatment * 0.4), size=N) h1 = h0 + stats.norm.rvs(size= N, loc= 5- 3*fungus, scale=1) # compose a clean data frame d = pd.DataFrame({'h0': h0, 'h1': h1, 'Treatment':treatment, 'Fungus': fungus})

Code 5.42

with pm.Model() as m5_13: a = pm.Normal('a',mu = 0, sd=100) bh = pm.Normal('bh',mu = 0, sd=10) bt = pm.Normal('bt',mu = 0, sd=10) bf = pm.Normal('bf',mu = 0, sd=10) mu = pm.Deterministic('mu',a + bh * h0 + bt * treatment + bf * fungus) sigma = pm.Uniform('sigma', lower= 0 , upper= 10) h1 = pm.Normal('h1', mu = mu, sd=sigma, observed = d['h1'].get_values()) trace_5_13 = pm.sample(1000, tune=1000)
Auto-assigning NUTS sampler... Initializing NUTS using jitter+adapt_diag... Multiprocess sampling (2 chains in 2 jobs) NUTS: [sigma_interval__, bf, bt, bh, a] 100%|██████████| 2000/2000 [00:09<00:00, 219.22it/s]
varnames = ['a', 'bh', 'bt', 'bf', 'sigma'] pm.summary(trace_5_13, varnames, alpha=.11).round(3)
mean sd mc_error hpd_5.5 hpd_94.5 n_eff Rhat
a 5.152 0.522 0.015 4.346 5.958 945.669 1.000
bh 0.990 0.052 0.002 0.912 1.072 942.363 1.000
bt -0.047 0.206 0.005 -0.362 0.287 1353.866 1.000
bf -2.668 0.232 0.006 -3.051 -2.326 1269.197 1.000
sigma 0.985 0.071 0.002 0.876 1.094 1499.170 1.001

Code 5.43

with pm.Model() as m5_14: a = pm.Normal('a',mu = 0, sd=100) bh = pm.Normal('bh',mu = 0, sd=10) bt = pm.Normal('bt',mu = 0, sd=10) mu = pm.Deterministic('mu',a + bh * h0 + bt * treatment) sigma = pm.Uniform('sigma', lower= 0 , upper= 10) h1 = pm.Normal('h1', mu = mu, sd=sigma, observed =d['h1']) trace_5_14 = pm.sample(1000, tune=1000)
Auto-assigning NUTS sampler... Initializing NUTS using jitter+adapt_diag... Multiprocess sampling (2 chains in 2 jobs) NUTS: [sigma_interval__, bt, bh, a] 100%|██████████| 2000/2000 [00:06<00:00, 323.02it/s] The acceptance probability does not match the target. It is 0.9031963539075056, but should be close to 0.8. Try to increase the number of tuning steps.
varnames = ['a', 'bh', 'bt', 'sigma'] pm.summary(trace_5_14, varnames, alpha=.11).round(3)
mean sd mc_error hpd_5.5 hpd_94.5 n_eff Rhat
a 4.289 0.801 0.026 2.849 5.387 808.481 0.999
bh 0.962 0.081 0.003 0.836 1.088 828.823 1.000
bt 0.770 0.294 0.008 0.283 1.216 1210.581 1.000
sigma 1.502 0.111 0.003 1.328 1.679 1320.224 1.000

Code 5.44

d = pd.read_csv('Data/Howell1.csv', sep=';') d.head()
height weight age male
0 151.765 47.825606 63.0 1
1 139.700 36.485807 63.0 0
2 136.525 31.864838 65.0 0
3 156.845 53.041915 41.0 1
4 145.415 41.276872 51.0 0

Code 5.45

with pm.Model() as m5_15: a = pm.Normal('a',mu = 178, sd=100) bm = pm.Normal('bm',mu = 0, sd=10) mu = pm.Deterministic('mu',a + bm * d['male']) sigma = pm.Uniform('sigma', lower= 0 , upper= 50) height = pm.Normal('height', mu = mu, sd=sigma, observed = d['height']) trace_5_15 = pm.sample(1000, tune=1000)
Auto-assigning NUTS sampler... Initializing NUTS using jitter+adapt_diag... Multiprocess sampling (2 chains in 2 jobs) NUTS: [sigma_interval__, bm, a] 100%|██████████| 2000/2000 [00:02<00:00, 838.72it/s]
varnames = ['a', 'bm', 'sigma'] pm.summary(trace_5_15, varnames, alpha=.11).round(3)
mean sd mc_error hpd_5.5 hpd_94.5 n_eff Rhat
a 134.842 1.631 0.048 132.307 137.442 966.190 1.0
bm 7.222 2.354 0.069 3.571 11.139 974.984 1.0
sigma 27.405 0.798 0.020 26.216 28.720 1916.216 1.0

Code 5.46

mu.male = trace_5_15['a'] + trace_5_15['bm'] pm.hpd(mu.male)
array([138.85167759, 145.38548207])

Code 5.47

with pm.Model() as m5_15b: af = pm.Normal('af',mu = 178, sd=100) am = pm.Normal('am',mu = 178, sd=100) mu = pm.Deterministic('mu',af * (1 - d['male']) + am * d['male']) sigma = pm.Uniform('sigma', lower= 0 , upper= 50) height = pm.Normal('height', mu = mu, sd=sigma, observed = d['height']) trace_5_15b = pm.sample(1000, tune=1000)
Auto-assigning NUTS sampler... Initializing NUTS using jitter+adapt_diag... Multiprocess sampling (2 chains in 2 jobs) NUTS: [sigma_interval__, am, af] 100%|██████████| 2000/2000 [00:01<00:00, 1144.87it/s]

Code 5.48

d = pd.read_csv('Data/milk.csv', sep=';') d = d.drop_duplicates()

Code 5.49

d['clade.NWM'] = np.where( d['clade'] == 'New World Monkey', 1, 0) d['clade.NWM'].get_values()
array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

Code 5.50

d['clade.OWM'] = np.where( d['clade'] == 'Old World Monkey', 1, 0) d['clade.S'] = np.where( d['clade'] == 'Strepsirrhine', 1, 0)

Code 5.51

with pm.Model() as m5_16: a = pm.Normal('a', mu = 0.6, sd=10) b_NWM = pm.Normal('b_NWM',mu = 0, sd=1) b_OWM = pm.Normal('b_OWM',mu = 0, sd=1) b_S = pm.Normal('b_S',mu = 0, sd=1) mu = pm.Deterministic('mu', a + b_NWM * d['clade.NWM'] + b_OWM * d['clade.OWM'] + b_S * d['clade.S']) # instead of adding this as a deterministic when running the model # it is possible to add them, after sampling using something like this # trace_5_16.add_values({'mu_NWM', trace_5_16[a] + trace_5_16['b_NWM']}) mu_ape = pm.Deterministic('mu_ape', a + 0) mu_NWM = pm.Deterministic('mu_NWM', a + b_NWM) mu_OWM = pm.Deterministic('mu_OWM', a + b_OWM) mu_S = pm.Deterministic('mu_S', a + b_S) sigma = pm.Uniform('sigma', lower= 0 , upper= 10) kcal_per_g = pm.Normal('kcal_per_g', mu = mu, sd=sigma, observed = d['kcal.per.g']) trace_5_16 = pm.sample(1000, tune=1000)
Auto-assigning NUTS sampler... Initializing NUTS using jitter+adapt_diag... Multiprocess sampling (2 chains in 2 jobs) NUTS: [sigma_interval__, b_S, b_OWM, b_NWM, a] 100%|██████████| 2000/2000 [00:02<00:00, 689.23it/s]
varnames = ['a', 'b_NWM', 'b_OWM', 'b_S', 'sigma'] pm.summary(trace_5_16, varnames, alpha=.11).round(3)
mean sd mc_error hpd_5.5 hpd_94.5 n_eff Rhat
a 0.546 0.044 0.001 0.472 0.612 986.461 1.0
b_NWM 0.166 0.062 0.002 0.079 0.272 1144.439 1.0
b_OWM 0.243 0.068 0.002 0.139 0.350 1063.904 1.0
b_S -0.036 0.073 0.002 -0.155 0.078 1195.937 1.0
sigma 0.130 0.019 0.000 0.103 0.163 1166.728 1.0

Code 5.52

varnames = ['mu_ape', 'mu_NWM', 'b_OWM', 'b_S'] pm.summary(trace_5_16, varnames, alpha=.11).round(3)[['mean', 'sd', 'hpd_5.5', 'hpd_94.5']]
mean sd hpd_5.5 hpd_94.5
mu_ape 0.546 0.044 0.472 0.612
mu_NWM 0.712 0.044 0.643 0.780
b_OWM 0.243 0.068 0.139 0.350
b_S -0.036 0.073 -0.155 0.078

Code 5.53

diff_NMW_OWM = trace_5_16['mu_NWM'] - trace_5_16['mu_OWM'] np.percentile(diff_NMW_OWM, 2.5), np.percentile(diff_NMW_OWM, 50), np.percentile(diff_NMW_OWM, 97.5)
(-0.20627036770774942, -0.07802395845314053, 0.05473488710942071)

Code 5.54

z = pd.Categorical(d['clade']) d['clade_id'] = z.codes

Code 5.55

with pm.Model() as m5_16_alt: a = pm.Normal('a',mu = 0.6, sd=10) mu = pm.Deterministic('mu', a) sigma = pm.Uniform('sigma', lower= 0 , upper= 10) kcal_per_g = pm.Normal('kcal_per_g', mu = mu, sd=sigma, observed = d['kcal.per.g']) trace_5_16_alt = pm.sample(1000, tune=1000)
Auto-assigning NUTS sampler... Initializing NUTS using jitter+adapt_diag... Multiprocess sampling (2 chains in 2 jobs) NUTS: [sigma_interval__, mu] 100%|██████████| 2000/2000 [00:01<00:00, 1755.60it/s]
varnames = ['mu', 'sigma'] pm.summary(trace_5_16_alt, varnames, alpha=.11).round(3)
mean sd mc_error hpd_5.5 hpd_94.5 n_eff Rhat
mu 0.641 0.032 0.001 0.588 0.688 1877.261 1.0
sigma 0.168 0.023 0.001 0.131 0.201 1930.809 1.0

The following cells (5.56-5.61) correspond to example code for the use of R's function: lm. Therefore they have no output.

Code 5.62

data = pd.read_csv('Data/cars.csv', sep=',') pm.GLM.from_formula('dist ~ speed', data=data)
InterceptFlat()speedNormal(mu=0, sd=1000.0)sdHalfCauchy(beta=10)yNormal(mu=f(f(array, f(Intercept, speed)), f()), sd=f(sd))\begin{array}{rcl} \text{Intercept} &\sim & \text{Flat}()\\\text{speed} &\sim & \text{Normal}(\mathit{mu}=0,~\mathit{sd}=1000.0)\\\text{sd} &\sim & \text{HalfCauchy}(\mathit{beta}=10)\\\text{y} &\sim & \text{Normal}(\mathit{mu}=f(f(array,~f(\text{Intercept},~\text{speed})),~f()),~\mathit{sd}=f(\text{sd})) \end{array}
import sys, IPython, scipy, matplotlib, platform print("This notebook was createad on a computer %s running %s and using:\nPython %s\nIPython %s\nPyMC3 %s\nNumPy %s\nPandas %s\nSciPy %s\nMatplotlib %s\nSeaborn %s\n" % (platform.machine(), ' '.join(platform.linux_distribution()[:2]), sys.version[:5], IPython.__version__, pm.__version__, np.__version__, pd.__version__, scipy.__version__, matplotlib.__version__, sns.__version__))
This notebook was createad on a computer x86_64 running debian stretch/sid and using: Python 3.6.3 IPython 6.2.1 PyMC3 3.3 NumPy 1.14.1 Pandas 0.22.0 SciPy 1.0.0 Matplotlib 2.1.2 Seaborn 0.8.1