Contact
CoCalc Logo Icon
StoreFeaturesDocsShareSupport News AboutSign UpSign In
| Download

Think Stats by Allen B. Downey Think Stats is an introduction to Probability and Statistics for Python programmers.

This is the accompanying code for this book.

Website: http://greenteapress.com/wp/think-stats-2e/

Views: 7115
License: GPL3
1
"""This file contains code used in "Think Stats",
2
by Allen B. Downey, available from greenteapress.com
3
4
Copyright 2010 Allen B. Downey
5
License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html
6
"""
7
8
from __future__ import print_function, division
9
10
import math
11
import pandas
12
import patsy
13
import random
14
import numpy as np
15
import statsmodels.api as sm
16
import statsmodels.formula.api as smf
17
import re
18
19
import chap01soln
20
import first
21
import linear
22
import thinkplot
23
import thinkstats2
24
25
26
def QuickLeastSquares(xs, ys):
27
"""Estimates linear least squares fit and returns MSE.
28
29
xs: sequence of values
30
ys: sequence of values
31
32
returns: inter, slope, mse
33
"""
34
n = float(len(xs))
35
36
meanx = xs.mean()
37
dxs = xs - meanx
38
varx = np.dot(dxs, dxs) / n
39
40
meany = ys.mean()
41
dys = ys - meany
42
43
cov = np.dot(dxs, dys) / n
44
slope = cov / varx
45
inter = meany - slope * meanx
46
47
res = ys - (inter + slope * xs)
48
mse = np.dot(res, res) / n
49
return inter, slope, mse
50
51
52
def ReadVariables():
53
"""Reads Stata dictionary files for NSFG data.
54
55
returns: DataFrame that maps variables names to descriptions
56
"""
57
vars1 = thinkstats2.ReadStataDct('2002FemPreg.dct').variables
58
vars2 = thinkstats2.ReadStataDct('2002FemResp.dct').variables
59
60
all_vars = vars1.append(vars2)
61
all_vars.index = all_vars.name
62
return all_vars
63
64
65
def JoinFemResp(df):
66
"""Reads the female respondent file and joins on caseid.
67
68
df: DataFrame
69
"""
70
resp = chap01soln.ReadFemResp()
71
resp.index = resp.caseid
72
73
join = df.join(resp, on='caseid', rsuffix='_r')
74
75
# convert from colon-separated time strings to datetimes
76
join.screentime = pandas.to_datetime(join.screentime)
77
78
return join
79
80
81
MESSAGE = """If you get this error, it's probably because
82
you are running Python 3 and the nice people who maintain
83
Patsy have not fixed this problem:
84
https://github.com/pydata/patsy/issues/34
85
86
While we wait, I suggest running this example in
87
Python 2, or skipping this example."""
88
89
90
def GoMining(df):
91
"""Searches for variables that predict birth weight.
92
93
df: DataFrame of pregnancy records
94
95
returns: list of (rsquared, variable name) pairs
96
"""
97
variables = []
98
for name in df.columns:
99
try:
100
if df[name].var() < 1e-7:
101
continue
102
103
formula = 'totalwgt_lb ~ agepreg + ' + name
104
formula = formula.encode('ascii')
105
106
model = smf.ols(formula, data=df)
107
if model.nobs < len(df)/2:
108
continue
109
110
results = model.fit()
111
except (ValueError, TypeError):
112
continue
113
except patsy.PatsyError:
114
raise ValueError(MESSAGE)
115
116
variables.append((results.rsquared, name))
117
118
return variables
119
120
121
def MiningReport(variables, n=30):
122
"""Prints variables with the highest R^2.
123
124
t: list of (R^2, variable name) pairs
125
n: number of pairs to print
126
"""
127
all_vars = ReadVariables()
128
129
variables.sort(reverse=True)
130
for mse, name in variables[:n]:
131
key = re.sub('_r$', '', name)
132
try:
133
desc = all_vars.loc[key].desc
134
if isinstance(desc, pandas.Series):
135
desc = desc[0]
136
print(name, mse, desc)
137
except KeyError:
138
print(name, mse)
139
140
141
def PredictBirthWeight(live):
142
"""Predicts birth weight of a baby at 30 weeks.
143
144
live: DataFrame of live births
145
"""
146
live = live[live.prglngth>30]
147
join = JoinFemResp(live)
148
149
t = GoMining(join)
150
MiningReport(t)
151
152
formula = ('totalwgt_lb ~ agepreg + C(race) + babysex==1 + '
153
'nbrnaliv>1 + paydu==1 + totincr')
154
results = smf.ols(formula, data=join).fit()
155
SummarizeResults(results)
156
157
158
def SummarizeResults(results):
159
"""Prints the most important parts of linear regression results:
160
161
results: RegressionResults object
162
"""
163
for name, param in results.params.items():
164
pvalue = results.pvalues[name]
165
print('%s %0.3g (%.3g)' % (name, param, pvalue))
166
167
try:
168
print('R^2 %.4g' % results.rsquared)
169
ys = results.model.endog
170
print('Std(ys) %.4g' % ys.std())
171
print('Std(res) %.4g' % results.resid.std())
172
except AttributeError:
173
print('R^2 %.4g' % results.prsquared)
174
175
176
def RunSimpleRegression(live):
177
"""Runs a simple regression and compare results to thinkstats2 functions.
178
179
live: DataFrame of live births
180
"""
181
# run the regression with thinkstats2 functions
182
live_dropna = live.dropna(subset=['agepreg', 'totalwgt_lb'])
183
ages = live_dropna.agepreg
184
weights = live_dropna.totalwgt_lb
185
inter, slope = thinkstats2.LeastSquares(ages, weights)
186
res = thinkstats2.Residuals(ages, weights, inter, slope)
187
r2 = thinkstats2.CoefDetermination(weights, res)
188
189
# run the regression with statsmodels
190
formula = 'totalwgt_lb ~ agepreg'
191
model = smf.ols(formula, data=live)
192
results = model.fit()
193
SummarizeResults(results)
194
195
def AlmostEquals(x, y, tol=1e-6):
196
return abs(x-y) < tol
197
198
assert(AlmostEquals(results.params['Intercept'], inter))
199
assert(AlmostEquals(results.params['agepreg'], slope))
200
assert(AlmostEquals(results.rsquared, r2))
201
202
203
def PivotTables(live):
204
"""Prints a pivot table comparing first babies to others.
205
206
live: DataFrame of live births
207
"""
208
table = pandas.pivot_table(live, rows='isfirst',
209
values=['totalwgt_lb', 'agepreg'])
210
print(table)
211
212
213
def FormatRow(results, columns):
214
"""Converts regression results to a string.
215
216
results: RegressionResults object
217
218
returns: string
219
"""
220
t = []
221
for col in columns:
222
coef = results.params.get(col, np.nan)
223
pval = results.pvalues.get(col, np.nan)
224
if np.isnan(coef):
225
s = '--'
226
elif pval < 0.001:
227
s = '%0.3g (*)' % (coef)
228
else:
229
s = '%0.3g (%0.2g)' % (coef, pval)
230
t.append(s)
231
232
try:
233
t.append('%.2g' % results.rsquared)
234
except AttributeError:
235
t.append('%.2g' % results.prsquared)
236
237
return t
238
239
240
def RunModels(live):
241
"""Runs regressions that predict birth weight.
242
243
live: DataFrame of pregnancy records
244
"""
245
columns = ['isfirst[T.True]', 'agepreg', 'agepreg2']
246
header = ['isfirst', 'agepreg', 'agepreg2']
247
248
rows = []
249
formula = 'totalwgt_lb ~ isfirst'
250
results = smf.ols(formula, data=live).fit()
251
rows.append(FormatRow(results, columns))
252
print(formula)
253
SummarizeResults(results)
254
255
formula = 'totalwgt_lb ~ agepreg'
256
results = smf.ols(formula, data=live).fit()
257
rows.append(FormatRow(results, columns))
258
print(formula)
259
SummarizeResults(results)
260
261
formula = 'totalwgt_lb ~ isfirst + agepreg'
262
results = smf.ols(formula, data=live).fit()
263
rows.append(FormatRow(results, columns))
264
print(formula)
265
SummarizeResults(results)
266
267
live['agepreg2'] = live.agepreg**2
268
formula = 'totalwgt_lb ~ isfirst + agepreg + agepreg2'
269
results = smf.ols(formula, data=live).fit()
270
rows.append(FormatRow(results, columns))
271
print(formula)
272
SummarizeResults(results)
273
274
PrintTabular(rows, header)
275
276
277
def PrintTabular(rows, header):
278
"""Prints results in LaTeX tabular format.
279
280
rows: list of rows
281
header: list of strings
282
"""
283
s = r'\hline ' + ' & '.join(header) + r' \\ \hline'
284
print(s)
285
286
for row in rows:
287
s = ' & '.join(row) + r' \\'
288
print(s)
289
290
print(r'\hline')
291
292
293
def LogisticRegressionExample():
294
"""Runs a simple example of logistic regression and prints results.
295
"""
296
y = np.array([0, 1, 0, 1])
297
x1 = np.array([0, 0, 0, 1])
298
x2 = np.array([0, 1, 1, 1])
299
300
beta = [-1.5, 2.8, 1.1]
301
302
log_o = beta[0] + beta[1] * x1 + beta[2] * x2
303
print(log_o)
304
305
o = np.exp(log_o)
306
print(o)
307
308
p = o / (o+1)
309
print(p)
310
311
like = y * p + (1-y) * (1-p)
312
print(like)
313
print(np.prod(like))
314
315
df = pandas.DataFrame(dict(y=y, x1=x1, x2=x2))
316
results = smf.logit('y ~ x1 + x2', data=df).fit()
317
print(results.summary())
318
319
320
321
def RunLogisticModels(live):
322
"""Runs regressions that predict sex.
323
324
live: DataFrame of pregnancy records
325
"""
326
#live = linear.ResampleRowsWeighted(live)
327
328
df = live[live.prglngth>30]
329
330
df['boy'] = (df.babysex==1).astype(int)
331
df['isyoung'] = (df.agepreg<20).astype(int)
332
df['isold'] = (df.agepreg<35).astype(int)
333
df['season'] = (((df.datend+1) % 12) / 3).astype(int)
334
335
# run the simple model
336
model = smf.logit('boy ~ agepreg', data=df)
337
results = model.fit()
338
print('nobs', results.nobs)
339
print(type(results))
340
SummarizeResults(results)
341
342
# run the complex model
343
model = smf.logit('boy ~ agepreg + hpagelb + birthord + C(race)', data=df)
344
results = model.fit()
345
print('nobs', results.nobs)
346
print(type(results))
347
SummarizeResults(results)
348
349
# make the scatter plot
350
exog = pandas.DataFrame(model.exog, columns=model.exog_names)
351
endog = pandas.DataFrame(model.endog, columns=[model.endog_names])
352
353
xs = exog['agepreg']
354
lo = results.fittedvalues
355
o = np.exp(lo)
356
p = o / (o+1)
357
358
#thinkplot.Scatter(xs, p, alpha=0.1)
359
#thinkplot.Show()
360
361
# compute accuracy
362
actual = endog['boy']
363
baseline = actual.mean()
364
365
predict = (results.predict() >= 0.5)
366
true_pos = predict * actual
367
true_neg = (1 - predict) * (1 - actual)
368
369
acc = (sum(true_pos) + sum(true_neg)) / len(actual)
370
print(acc, baseline)
371
372
columns = ['agepreg', 'hpagelb', 'birthord', 'race']
373
new = pandas.DataFrame([[35, 39, 3, 1]], columns=columns)
374
y = results.predict(new)
375
print(y)
376
377
378
def main(name, data_dir='.'):
379
thinkstats2.RandomSeed(17)
380
LogisticRegressionExample()
381
382
live, firsts, others = first.MakeFrames()
383
live['isfirst'] = (live.birthord == 1)
384
385
RunLogisticModels(live)
386
387
RunSimpleRegression(live)
388
RunModels(live)
389
390
PredictBirthWeight(live)
391
392
393
if __name__ == '__main__':
394
import sys
395
main(*sys.argv)
396
397