Contact
CoCalc Logo Icon
StoreFeaturesDocsShareSupport News AboutSign UpSign In
| Download

📚 The CoCalc Library - books, templates and other resources

Views: 95652
License: OTHER
Kernel: Python 3
%matplotlib inline import pymc3 as pm import numpy as np import pandas as pd from scipy import stats # R-like interface, alternatively you can import statsmodels as import statsmodels.api as sm import statsmodels.formula.api as smf import statsmodels.api as sm import matplotlib.pyplot as plt %config InlineBackend.figure_format = 'retina' plt.style.use(['seaborn-colorblind', 'seaborn-darkgrid'])
/home/agustina/anaconda3/lib/python3.5/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead. from pandas.core import datetools

Code 6.1

data = {'species' : ['afarensis', 'africanus', 'habilis', 'boisei', 'rudolfensis', 'ergaster', 'sapiens'], 'brain' : [438, 452, 612, 521, 752, 871, 1350], 'mass' : [37., 35.5, 34.5, 41.5, 55.5, 61.0, 53.5]} d = pd.DataFrame(data) d
brain mass species
0 438 37.0 afarensis
1 452 35.5 africanus
2 612 34.5 habilis
3 521 41.5 boisei
4 752 55.5 rudolfensis
5 871 61.0 ergaster
6 1350 53.5 sapiens

Code 6.2

m_6_1 = smf.ols('brain ~ mass', data=d).fit()

Code 6.3

1 - m_6_1.resid.var()/d.brain.var() # m_6_1.summary() check the value for R-squared
0.4901580479490838

Code 6.4

m_6_2 = smf.ols('brain ~ mass + I(mass**2)', data=d).fit()

Code 6.5

m_6_3 = smf.ols('brain ~ mass + I(mass**2) + I(mass**3)', data=d).fit() m_6_4 = smf.ols('brain ~ mass + I(mass**2) + I(mass**3) + I(mass**4)', data=d).fit() m_6_5 = smf.ols('brain ~ mass + I(mass**2) + I(mass**3) + I(mass**4) + I(mass**5)', data=d).fit() m_6_6 = smf.ols('brain ~ mass + I(mass**2) + I(mass**3) + I(mass**4) + I(mass**5) + I(mass**6)', data=d).fit()

Code 6.6

m_6_7 = smf.ols('brain ~ 1', data=d).fit()

Code 6.7

d_new = d.drop(d.index[-1])

Code 6.8

f, (ax1, ax2) = plt.subplots(1, 2, sharey=True, figsize=(8,3)) ax1.scatter(d.mass, d.brain, alpha=0.8) ax2.scatter(d.mass, d.brain, alpha=0.8) for i in range(len(d)): d_new = d.drop(d.index[-i]) m0 = smf.ols('brain ~ mass', d_new).fit() # need to calculate regression line # need to add intercept term explicitly x = sm.add_constant(d_new.mass) # add constant to new data frame with mass x_pred = pd.DataFrame({'mass': np.linspace(x.mass.min() - 10, x.mass.max() + 10, 50)}) # create linspace dataframe x_pred2 = sm.add_constant(x_pred) # add constant to newly created linspace dataframe y_pred = m0.predict(x_pred2) # calculate predicted values ax1.plot(x_pred, y_pred, 'gray', alpha=.5) ax1.set_ylabel('body mass (kg)', fontsize=12); ax1.set_xlabel('brain volume (cc)', fontsize=12) ax1.set_title('Underfit model') # fifth order model m1 = smf.ols('brain ~ mass + I(mass**2) + I(mass**3) + I(mass**4) + I(mass**5)', data=d_new).fit() x = sm.add_constant(d_new.mass) # add constant to new data frame with mass x_pred = pd.DataFrame({'mass': np.linspace(x.mass.min()-10, x.mass.max()+10, 200)}) # create linspace dataframe x_pred2 = sm.add_constant(x_pred) # add constant to newly created linspace dataframe y_pred = m1.predict(x_pred2) # calculate predicted values from fitted model ax2.plot(x_pred, y_pred, 'gray', alpha=.5) ax2.set_xlim(32,62) ax2.set_ylim(-250, 2200) ax2.set_ylabel('body mass (kg)', fontsize=12); ax2.set_xlabel('brain volume (cc)', fontsize=12) ax2.set_title('Overfit model') plt.show()
Image in a Jupyter notebook

Code 6.9

p = (0.3, 0.7) -sum(p * np.log(p))
0.6108643020548935

Code 6.10

# fit model m_6_1 = smf.ols('brain ~ mass', data=d).fit() #compute de deviance by cheating -2 * m_6_1.llf
94.924989685887581

Code 6.11

# standarize the mass before fitting d['mass_s'] = d['mass'] - np.mean(d['mass'] / np.std(d['mass'])) with pm.Model() as m_6_8 : a = pm.Normal('a', mu=np.mean(d['brain']), sd=10) b = pm.Normal('b', mu=0, sd=10) sigma = pm.Uniform('sigma', 0, np.std(d['brain']) * 10) mu = pm.Deterministic('mu', a + b * d['mass_s']) brain = pm.Normal('brain', mu = mu, sd = sigma, observed = d['brain']) m_6_8 = pm.sample(1000, tune=1000)
100%|██████████| 2000/2000 [00:05<00:00, 338.85it/s]
theta = pm.summary(m_6_8)['mean'][:3]
#compute deviance dev = - 2 * sum(stats.norm.logpdf(d['brain'], loc = theta[0] + theta[1] * d['mass_s'] , scale = theta[2])) dev
100.48254797793284

Code 6.12 - 14

The overthinking section corresponding to cells 6.12-14 is not ported because it requires an ad-hoc rethinking package function. Feel free to contribute code to this section.

Code 6.15

data = pd.read_csv('Data/cars.csv', sep=',')
with pm.Model() as m_6_15 : a = pm.Normal('a', mu=0, sd=100) b = pm.Normal('b', mu=0, sd=10) sigma = pm.Uniform('sigma', 0, 30) mu = pm.Deterministic('mu', a + b * data['speed']) dist = pm.Normal('dist', mu=mu, sd=sigma, observed = data['dist']) m_6_15 = pm.sample(1000, tune=1000)
100%|██████████| 2000/2000 [00:12<00:00, 161.13it/s]

Code 6.16

n_samples = 1000 n_cases = data.shape[0] ll = np.zeros((n_cases, n_samples))
for s in range(0, n_samples): mu = m_6_15['a'][s] + m_6_15['b'][s] * data['speed'] p_ = stats.norm.logpdf(data['dist'], loc=mu, scale=m_6_15['sigma'][s]) ll[:,s] = p_

Code 6.17

from scipy.misc import logsumexp n_cases = data.shape[0] lppd = np.zeros((n_cases)) for a in range(1, n_cases): lppd[a,] = logsumexp(ll[a,]) - np.log(n_samples)
/home/agustina/anaconda3/lib/python3.5/site-packages/ipykernel/__main__.py:5: DeprecationWarning: `logsumexp` is deprecated! Importing `logsumexp` from scipy.misc is deprecated in scipy 1.0.0. Use `scipy.special.logsumexp` instead.

Code 6.18

pWAIC = np.zeros((n_cases)) for i in range(1, n_cases): pWAIC[i,] = np.var(ll[i,])

Code 6.19

- 2 * (sum(lppd) - sum(pWAIC))
412.49046436448634

Code 6.20

waic_vec = - 2 * (lppd - pWAIC) np.sqrt(n_cases * np.var(waic_vec))
14.88328369492803

Code 6.21

d = pd.read_csv('Data/milk.csv', sep=';') d['neocortex'] = d['neocortex.perc'] / 100 d.dropna(inplace=True) d.shape
(17, 9)

Code 6.22

a_start = d['kcal.per.g'].mean() sigma_start = d['kcal.per.g'].std()
import theano mass_shared = theano.shared(np.log(d['mass'].values)) neocortex_shared = theano.shared(d['neocortex'].values) with pm.Model() as m6_11: alpha = pm.Normal('alpha', mu=0, sd=10, testval=a_start) mu = alpha + 0 * neocortex_shared sigma = pm.HalfCauchy('sigma',beta=10, testval=sigma_start) kcal = pm.Normal('kcal', mu=mu, sd=sigma, observed=d['kcal.per.g']) trace_m6_11 = pm.sample(1000, tune=1000) with pm.Model() as m6_12: alpha = pm.Normal('alpha', mu=0, sd=10, testval=a_start) beta = pm.Normal('beta', mu=0, sd=10) sigma = pm.HalfCauchy('sigma',beta=10, testval=sigma_start) mu = alpha + beta * neocortex_shared kcal = pm.Normal('kcal', mu=mu, sd=sigma, observed=d['kcal.per.g']) trace_m6_12 = pm.sample(1000, tune=1000) with pm.Model() as m6_13: alpha = pm.Normal('alpha', mu=0, sd=10, testval=a_start) beta = pm.Normal('beta', mu=0, sd=10) sigma = pm.HalfCauchy('sigma', beta=10, testval=sigma_start) mu = alpha + beta * mass_shared kcal = pm.Normal('kcal', mu=mu, sd=sigma, observed=d['kcal.per.g']) trace_m6_13 = pm.sample(1000, tune=1000) with pm.Model() as m6_14: alpha = pm.Normal('alpha', mu=0, sd=10, testval=a_start) beta = pm.Normal('beta', mu=0, sd=10, shape=2) sigma = pm.HalfCauchy('sigma', beta=10, testval=sigma_start) mu = alpha + beta[0] * mass_shared + beta[1] * neocortex_shared kcal = pm.Normal('kcal', mu=mu, sd=sigma, observed=d['kcal.per.g']) trace_m6_14 = pm.sample(1000, tune=1000)
100%|██████████| 2000/2000 [00:08<00:00, 243.55it/s] 87%|████████▋ | 1747/2000 [02:33<00:22, 11.39it/s]/home/agustina/anaconda3/lib/python3.5/site-packages/pymc3/step_methods/hmc/nuts.py:468: UserWarning: Chain 1 contains 1 diverging samples after tuning. If increasing `target_accept` does not help try to reparameterize. % (self._chain_id, n_diverging)) 100%|█████████▉| 1999/2000 [02:45<00:00, 12.10it/s]/home/agustina/anaconda3/lib/python3.5/site-packages/pymc3/step_methods/hmc/nuts.py:468: UserWarning: Chain 0 contains 2 diverging samples after tuning. If increasing `target_accept` does not help try to reparameterize. % (self._chain_id, n_diverging)) 100%|██████████| 2000/2000 [02:45<00:00, 12.10it/s] 100%|██████████| 2000/2000 [00:28<00:00, 69.65it/s] 100%|██████████| 2000/2000 [04:31<00:00, 7.38it/s]/home/agustina/anaconda3/lib/python3.5/site-packages/pymc3/step_methods/hmc/nuts.py:452: UserWarning: The acceptance probability in chain 0 does not match the target. It is 0.681715916019, but should be close to 0.8. Try to increase the number of tuning steps. % (self._chain_id, mean_accept, target_accept)) /home/agustina/anaconda3/lib/python3.5/site-packages/pymc3/step_methods/hmc/nuts.py:468: UserWarning: Chain 0 contains 25 diverging samples after tuning. If increasing `target_accept` does not help try to reparameterize. % (self._chain_id, n_diverging)) /home/agustina/anaconda3/lib/python3.5/site-packages/pymc3/step_methods/hmc/nuts.py:468: UserWarning: Chain 1 contains 5 diverging samples after tuning. If increasing `target_accept` does not help try to reparameterize. % (self._chain_id, n_diverging))

Code 6.23

pm.waic(trace_m6_14, m6_14)
/home/agustina/anaconda3/lib/python3.5/site-packages/pymc3/stats.py:220: UserWarning: For one or more samples the posterior variance of the log predictive densities exceeds 0.4. This could be indication of WAIC starting to fail see http://arxiv.org/abs/1507.04544 for details """)
WAIC_r(WAIC=-17.054227823698863, WAIC_se=4.8600477062566103, p_WAIC=2.9504889764816951)

Code 6.24

compare_df = pm.compare([trace_m6_11, trace_m6_12, trace_m6_13, trace_m6_14], [m6_11, m6_12, m6_13, m6_14], method='pseudo-BMA') compare_df.loc[:,'model'] = pd.Series(['m6.11', 'm6.12', 'm6.13', 'm6.14']) compare_df = compare_df.set_index('model') compare_df
WAIC pWAIC dWAIC weight SE dSE warning
model
m6.14 -17.05 2.95 0 0.96 4.86 0 1
m6.13 -9.22 1.88 7.84 0.02 3.99 3.25 1
m6.11 -8.63 1.36 8.43 0.01 3.58 4.61 1
m6.12 -7.18 1.93 9.88 0.01 3.15 4.76 1

Code 6.25

pm.compareplot(compare_df);
Image in a Jupyter notebook

Code 6.26

diff = np.random.normal(loc=6.7, scale=7.26, size=100000) sum(diff[diff<0]) / 100000
-0.69650384567058254

Code 6.27

Compare function already checks number of observations to be equal.

coeftab = pd.DataFrame({'m6_11': pm.summary(trace_m6_11)['mean'], 'm6_12': pm.summary(trace_m6_12)['mean'], 'm6_13': pm.summary(trace_m6_13)['mean'], 'm6_14': pm.summary(trace_m6_14)['mean']}) coeftab
m6_11 m6_12 m6_13 m6_14
alpha 0.657464 0.342740 0.706000 -1.074019
beta NaN 0.466021 -0.031884 NaN
beta__0 NaN NaN NaN -0.095749
beta__1 NaN NaN NaN 2.774855
sigma 0.188901 0.192944 0.183208 0.139930

Code 6.28

traces = [trace_m6_11, trace_m6_12, trace_m6_13, trace_m6_14] models = [m6_11, m6_12, m6_13, m6_14]
plt.figure(figsize=(10, 8)) pm.forestplot(traces, plot_kwargs={'fontsize':14});
Image in a Jupyter notebook

Code 6.29

kcal_per_g = np.repeat(0, 30) # empty outcome neocortex = np.linspace(0.5, 0.8, 30) # sequence of neocortex mass = np.repeat(4.5, 30) # average mass
mass_shared.set_value(np.log(mass)) neocortex_shared.set_value(neocortex) post_pred = pm.sample_ppc(trace_m6_14, samples=10000, model=m6_14)
100%|██████████| 10000/10000 [00:37<00:00, 264.13it/s]

Code 6.30

milk_ensemble = pm.sample_ppc_w(traces, 10000, models, weights=compare_df.weight.sort_index(ascending=True))
100%|██████████| 10000/10000 [00:28<00:00, 355.45it/s]
plt.figure(figsize=(8, 6)) plt.plot(neocortex, post_pred['kcal'].mean(0), ls='--', color='C2') hpd_post_pred = pm.hpd(post_pred['kcal']) plt.plot(neocortex,hpd_post_pred[:,0], ls='--', color='C2') plt.plot(neocortex,hpd_post_pred[:,], ls='--', color='C2') plt.plot(neocortex, milk_ensemble['kcal'].mean(0), color='C0') hpd_av = pm.hpd(milk_ensemble['kcal']) plt.fill_between(neocortex, hpd_av[:,0], hpd_av[:,1], alpha=0.1, color='C0') plt.scatter(d['neocortex'], d['kcal.per.g'], facecolor='None', edgecolors='C0') plt.ylim(0.3, 1) plt.xlabel('neocortex', fontsize=16) plt.ylabel('kcal.per.g', fontsize=16);
Image in a Jupyter notebook
import sys, IPython, scipy, matplotlib, platform print("This notebook was createad on a computer %s running %s and using:\nPython %s\nIPython %s\nPyMC3 %s\nNumPy %s\nPandas %s\nSciPy %s\nMatplotlib %s\n" % (platform.machine(), ' '.join(platform.linux_distribution()[:2]), sys.version[:5], IPython.__version__, pm.__version__, np.__version__, pd.__version__, scipy.__version__, matplotlib.__version__))
This notebook was createad on a computer x86_64 running debian stretch/sid and using: Python 3.5.4 IPython 4.1.2 PyMC3 3.2 NumPy 1.13.3 Pandas 0.21.0 SciPy 1.0.0 Matplotlib 2.0.2