CoCalc -- regression.py

Think Stats by Allen B. Downey Think Stats is an introduction to Probability and Statistics for Python programmers.
This is the accompanying code for this book.
Website: http://greenteapress.com/wp/think-stats-2e/
Path: think-stats-code / regression.py
Views: ⁷¹¹⁵
License: GPL3
1
"""This file contains code used in "Think Stats",
2
by Allen B. Downey, available from greenteapress.com
3

4
Copyright 2010 Allen B. Downey
5
License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html
6
"""
7

8
from __future__ import print_function, division
9

10
import math
11
import pandas
12
import patsy
13
import random
14
import numpy as np
15
import statsmodels.api as sm
16
import statsmodels.formula.api as smf
17
import re
18

19
import chap01soln
20
import first
21
import linear
22
import thinkplot
23
import thinkstats2
24

25

26
def QuickLeastSquares(xs, ys):
27
    """Estimates linear least squares fit and returns MSE.
28

29
    xs: sequence of values
30
    ys: sequence of values
31

32
    returns: inter, slope, mse
33
    """
34
    n = float(len(xs))
35

36
    meanx = xs.mean()
37
    dxs = xs - meanx
38
    varx = np.dot(dxs, dxs) / n
39

40
    meany = ys.mean()
41
    dys = ys - meany
42

43
    cov = np.dot(dxs, dys) / n
44
    slope = cov / varx
45
    inter = meany - slope * meanx
46

47
    res = ys - (inter + slope * xs)
48
    mse = np.dot(res, res) / n
49
    return inter, slope, mse
50

51

52
def ReadVariables():
53
    """Reads Stata dictionary files for NSFG data.
54

55
    returns: DataFrame that maps variables names to descriptions
56
    """
57
    vars1 = thinkstats2.ReadStataDct('2002FemPreg.dct').variables
58
    vars2 = thinkstats2.ReadStataDct('2002FemResp.dct').variables
59

60
    all_vars = vars1.append(vars2)
61
    all_vars.index = all_vars.name
62
    return all_vars
63

64

65
def JoinFemResp(df):
66
    """Reads the female respondent file and joins on caseid.
67

68
    df: DataFrame
69
    """
70
    resp = chap01soln.ReadFemResp()
71
    resp.index = resp.caseid
72

73
    join = df.join(resp, on='caseid', rsuffix='_r')
74

75
    # convert from colon-separated time strings to datetimes
76
    join.screentime = pandas.to_datetime(join.screentime)
77

78
    return join
79

80

81
MESSAGE = """If you get this error, it's probably because 
82
you are running Python 3 and the nice people who maintain
83
Patsy have not fixed this problem:
84
https://github.com/pydata/patsy/issues/34
85

86
While we wait, I suggest running this example in
87
Python 2, or skipping this example."""
88

89

90
def GoMining(df):
91
    """Searches for variables that predict birth weight.
92

93
    df: DataFrame of pregnancy records
94

95
    returns: list of (rsquared, variable name) pairs
96
    """
97
    variables = []
98
    for name in df.columns:
99
        try:
100
            if df[name].var() < 1e-7:
101
                continue
102

103
            formula = 'totalwgt_lb ~ agepreg + ' + name
104
            formula = formula.encode('ascii')
105

106
            model = smf.ols(formula, data=df)
107
            if model.nobs < len(df)/2:
108
                continue
109

110
            results = model.fit()
111
        except (ValueError, TypeError):
112
            continue
113
        except patsy.PatsyError:
114
            raise ValueError(MESSAGE)
115

116
        variables.append((results.rsquared, name))
117

118
    return variables
119

120

121
def MiningReport(variables, n=30):
122
    """Prints variables with the highest R^2.
123

124
    t: list of (R^2, variable name) pairs
125
    n: number of pairs to print
126
    """
127
    all_vars = ReadVariables()
128

129
    variables.sort(reverse=True)
130
    for mse, name in variables[:n]:
131
        key = re.sub('_r$', '', name)
132
        try:
133
            desc = all_vars.loc[key].desc
134
            if isinstance(desc, pandas.Series):
135
                desc = desc[0]
136
            print(name, mse, desc)
137
        except KeyError:
138
            print(name, mse)
139

140

141
def PredictBirthWeight(live):
142
    """Predicts birth weight of a baby at 30 weeks.
143

144
    live: DataFrame of live births
145
    """
146
    live = live[live.prglngth>30]
147
    join = JoinFemResp(live)
148

149
    t = GoMining(join)
150
    MiningReport(t)
151

152
    formula = ('totalwgt_lb ~ agepreg + C(race) + babysex==1 + '
153
               'nbrnaliv>1 + paydu==1 + totincr')
154
    results = smf.ols(formula, data=join).fit()
155
    SummarizeResults(results)
156

157

158
def SummarizeResults(results):
159
    """Prints the most important parts of linear regression results:
160

161
    results: RegressionResults object
162
    """
163
    for name, param in results.params.items():
164
        pvalue = results.pvalues[name]
165
        print('%s   %0.3g   (%.3g)' % (name, param, pvalue))
166

167
    try:
168
        print('R^2 %.4g' % results.rsquared)
169
        ys = results.model.endog
170
        print('Std(ys) %.4g' % ys.std())
171
        print('Std(res) %.4g' % results.resid.std())
172
    except AttributeError:
173
        print('R^2 %.4g' % results.prsquared)
174

175

176
def RunSimpleRegression(live):
177
    """Runs a simple regression and compare results to thinkstats2 functions.
178

179
    live: DataFrame of live births
180
    """
181
    # run the regression with thinkstats2 functions
182
    live_dropna = live.dropna(subset=['agepreg', 'totalwgt_lb'])
183
    ages = live_dropna.agepreg
184
    weights = live_dropna.totalwgt_lb
185
    inter, slope = thinkstats2.LeastSquares(ages, weights)
186
    res = thinkstats2.Residuals(ages, weights, inter, slope)
187
    r2 = thinkstats2.CoefDetermination(weights, res)
188

189
    # run the regression with statsmodels
190
    formula = 'totalwgt_lb ~ agepreg'
191
    model = smf.ols(formula, data=live)
192
    results = model.fit()
193
    SummarizeResults(results)
194

195
    def AlmostEquals(x, y, tol=1e-6):
196
        return abs(x-y) < tol
197

198
    assert(AlmostEquals(results.params['Intercept'], inter))
199
    assert(AlmostEquals(results.params['agepreg'], slope))
200
    assert(AlmostEquals(results.rsquared, r2))
201

202

203
def PivotTables(live):
204
    """Prints a pivot table comparing first babies to others.
205

206
    live: DataFrame of live births
207
    """
208
    table = pandas.pivot_table(live, rows='isfirst',
209
                               values=['totalwgt_lb', 'agepreg'])
210
    print(table)
211

212

213
def FormatRow(results, columns):
214
    """Converts regression results to a string.
215

216
    results: RegressionResults object
217

218
    returns: string
219
    """
220
    t = []
221
    for col in columns:
222
        coef = results.params.get(col, np.nan)
223
        pval = results.pvalues.get(col, np.nan)
224
        if np.isnan(coef):
225
            s = '--'
226
        elif pval < 0.001:
227
            s = '%0.3g (*)' % (coef)
228
        else:
229
            s = '%0.3g (%0.2g)' % (coef, pval)
230
        t.append(s)
231

232
    try:
233
        t.append('%.2g' % results.rsquared)
234
    except AttributeError:
235
        t.append('%.2g' % results.prsquared)
236
        
237
    return t
238

239

240
def RunModels(live):
241
    """Runs regressions that predict birth weight.
242

243
    live: DataFrame of pregnancy records
244
    """
245
    columns = ['isfirst[T.True]', 'agepreg', 'agepreg2']
246
    header = ['isfirst', 'agepreg', 'agepreg2']
247

248
    rows = []
249
    formula = 'totalwgt_lb ~ isfirst'
250
    results = smf.ols(formula, data=live).fit()
251
    rows.append(FormatRow(results, columns))
252
    print(formula)
253
    SummarizeResults(results)
254

255
    formula = 'totalwgt_lb ~ agepreg'
256
    results = smf.ols(formula, data=live).fit()
257
    rows.append(FormatRow(results, columns))
258
    print(formula)
259
    SummarizeResults(results)
260
    
261
    formula = 'totalwgt_lb ~ isfirst + agepreg'
262
    results = smf.ols(formula, data=live).fit()
263
    rows.append(FormatRow(results, columns))
264
    print(formula)
265
    SummarizeResults(results)
266
    
267
    live['agepreg2'] = live.agepreg**2
268
    formula = 'totalwgt_lb ~ isfirst + agepreg + agepreg2'
269
    results = smf.ols(formula, data=live).fit()
270
    rows.append(FormatRow(results, columns))
271
    print(formula)
272
    SummarizeResults(results)
273
    
274
    PrintTabular(rows, header)
275

276

277
def PrintTabular(rows, header):
278
    """Prints results in LaTeX tabular format.
279

280
    rows: list of rows
281
    header: list of strings
282
    """
283
    s = r'\hline ' + ' & '.join(header) + r' \\ \hline'
284
    print(s)
285

286
    for row in rows:
287
        s = ' & '.join(row) + r' \\'
288
        print(s)
289

290
    print(r'\hline')
291

292

293
def LogisticRegressionExample():
294
    """Runs a simple example of logistic regression and prints results.
295
    """
296
    y = np.array([0, 1, 0, 1])
297
    x1 = np.array([0, 0, 0, 1])
298
    x2 = np.array([0, 1, 1, 1])
299

300
    beta = [-1.5, 2.8, 1.1]
301

302
    log_o = beta[0] + beta[1] * x1 + beta[2] * x2 
303
    print(log_o)
304

305
    o = np.exp(log_o)
306
    print(o)
307

308
    p = o / (o+1)
309
    print(p)
310

311
    like = y * p + (1-y) * (1-p)
312
    print(like)
313
    print(np.prod(like))
314

315
    df = pandas.DataFrame(dict(y=y, x1=x1, x2=x2))
316
    results = smf.logit('y ~ x1 + x2', data=df).fit()
317
    print(results.summary())
318

319
    
320

321
def RunLogisticModels(live):
322
    """Runs regressions that predict sex.
323

324
    live: DataFrame of pregnancy records
325
    """
326
    #live = linear.ResampleRowsWeighted(live)
327

328
    df = live[live.prglngth>30]
329

330
    df['boy'] = (df.babysex==1).astype(int)
331
    df['isyoung'] = (df.agepreg<20).astype(int)
332
    df['isold'] = (df.agepreg<35).astype(int)
333
    df['season'] = (((df.datend+1) % 12) / 3).astype(int)
334

335
    # run the simple model
336
    model = smf.logit('boy ~ agepreg', data=df)    
337
    results = model.fit()
338
    print('nobs', results.nobs)
339
    print(type(results))
340
    SummarizeResults(results)
341

342
    # run the complex model
343
    model = smf.logit('boy ~ agepreg + hpagelb + birthord + C(race)', data=df)
344
    results = model.fit()
345
    print('nobs', results.nobs)
346
    print(type(results))
347
    SummarizeResults(results)
348

349
    # make the scatter plot
350
    exog = pandas.DataFrame(model.exog, columns=model.exog_names)
351
    endog = pandas.DataFrame(model.endog, columns=[model.endog_names])
352
    
353
    xs = exog['agepreg']
354
    lo = results.fittedvalues
355
    o = np.exp(lo)
356
    p = o / (o+1)
357

358
    #thinkplot.Scatter(xs, p, alpha=0.1)
359
    #thinkplot.Show()
360

361
    # compute accuracy
362
    actual = endog['boy']
363
    baseline = actual.mean()
364

365
    predict = (results.predict() >= 0.5)
366
    true_pos = predict * actual
367
    true_neg = (1 - predict) * (1 - actual)
368

369
    acc = (sum(true_pos) + sum(true_neg)) / len(actual)
370
    print(acc, baseline)
371

372
    columns = ['agepreg', 'hpagelb', 'birthord', 'race']
373
    new = pandas.DataFrame([[35, 39, 3, 1]], columns=columns)
374
    y = results.predict(new)
375
    print(y)
376

377

378
def main(name, data_dir='.'):
379
    thinkstats2.RandomSeed(17)
380
    LogisticRegressionExample()
381

382
    live, firsts, others = first.MakeFrames()
383
    live['isfirst'] = (live.birthord == 1)
384

385
    RunLogisticModels(live)
386

387
    RunSimpleRegression(live)
388
    RunModels(live)
389

390
    PredictBirthWeight(live)
391

392

393
if __name__ == '__main__':
394
    import sys
395
    main(*sys.argv)
396

397