| Download
Think Stats by Allen B. Downey Think Stats is an introduction to Probability and Statistics for Python programmers.
This is the accompanying code for this book.
Project: Support and Testing
Views: 7115License: GPL3
"""This file contains code used in "Think Stats",1by Allen B. Downey, available from greenteapress.com23Copyright 2010 Allen B. Downey4License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html5"""67from __future__ import print_function, division89import math10import pandas11import patsy12import random13import numpy as np14import statsmodels.api as sm15import statsmodels.formula.api as smf16import re1718import chap01soln19import first20import linear21import thinkplot22import thinkstats2232425def QuickLeastSquares(xs, ys):26"""Estimates linear least squares fit and returns MSE.2728xs: sequence of values29ys: sequence of values3031returns: inter, slope, mse32"""33n = float(len(xs))3435meanx = xs.mean()36dxs = xs - meanx37varx = np.dot(dxs, dxs) / n3839meany = ys.mean()40dys = ys - meany4142cov = np.dot(dxs, dys) / n43slope = cov / varx44inter = meany - slope * meanx4546res = ys - (inter + slope * xs)47mse = np.dot(res, res) / n48return inter, slope, mse495051def ReadVariables():52"""Reads Stata dictionary files for NSFG data.5354returns: DataFrame that maps variables names to descriptions55"""56vars1 = thinkstats2.ReadStataDct('2002FemPreg.dct').variables57vars2 = thinkstats2.ReadStataDct('2002FemResp.dct').variables5859all_vars = vars1.append(vars2)60all_vars.index = all_vars.name61return all_vars626364def JoinFemResp(df):65"""Reads the female respondent file and joins on caseid.6667df: DataFrame68"""69resp = chap01soln.ReadFemResp()70resp.index = resp.caseid7172join = df.join(resp, on='caseid', rsuffix='_r')7374# convert from colon-separated time strings to datetimes75join.screentime = pandas.to_datetime(join.screentime)7677return join787980MESSAGE = """If you get this error, it's probably because81you are running Python 3 and the nice people who maintain82Patsy have not fixed this problem:83https://github.com/pydata/patsy/issues/348485While we wait, I suggest running this example in86Python 2, or skipping this example."""878889def GoMining(df):90"""Searches for variables that predict birth weight.9192df: DataFrame of pregnancy records9394returns: list of (rsquared, variable name) pairs95"""96variables = []97for name in df.columns:98try:99if df[name].var() < 1e-7:100continue101102formula = 'totalwgt_lb ~ agepreg + ' + name103formula = formula.encode('ascii')104105model = smf.ols(formula, data=df)106if model.nobs < len(df)/2:107continue108109results = model.fit()110except (ValueError, TypeError):111continue112except patsy.PatsyError:113raise ValueError(MESSAGE)114115variables.append((results.rsquared, name))116117return variables118119120def MiningReport(variables, n=30):121"""Prints variables with the highest R^2.122123t: list of (R^2, variable name) pairs124n: number of pairs to print125"""126all_vars = ReadVariables()127128variables.sort(reverse=True)129for mse, name in variables[:n]:130key = re.sub('_r$', '', name)131try:132desc = all_vars.loc[key].desc133if isinstance(desc, pandas.Series):134desc = desc[0]135print(name, mse, desc)136except KeyError:137print(name, mse)138139140def PredictBirthWeight(live):141"""Predicts birth weight of a baby at 30 weeks.142143live: DataFrame of live births144"""145live = live[live.prglngth>30]146join = JoinFemResp(live)147148t = GoMining(join)149MiningReport(t)150151formula = ('totalwgt_lb ~ agepreg + C(race) + babysex==1 + '152'nbrnaliv>1 + paydu==1 + totincr')153results = smf.ols(formula, data=join).fit()154SummarizeResults(results)155156157def SummarizeResults(results):158"""Prints the most important parts of linear regression results:159160results: RegressionResults object161"""162for name, param in results.params.items():163pvalue = results.pvalues[name]164print('%s %0.3g (%.3g)' % (name, param, pvalue))165166try:167print('R^2 %.4g' % results.rsquared)168ys = results.model.endog169print('Std(ys) %.4g' % ys.std())170print('Std(res) %.4g' % results.resid.std())171except AttributeError:172print('R^2 %.4g' % results.prsquared)173174175def RunSimpleRegression(live):176"""Runs a simple regression and compare results to thinkstats2 functions.177178live: DataFrame of live births179"""180# run the regression with thinkstats2 functions181live_dropna = live.dropna(subset=['agepreg', 'totalwgt_lb'])182ages = live_dropna.agepreg183weights = live_dropna.totalwgt_lb184inter, slope = thinkstats2.LeastSquares(ages, weights)185res = thinkstats2.Residuals(ages, weights, inter, slope)186r2 = thinkstats2.CoefDetermination(weights, res)187188# run the regression with statsmodels189formula = 'totalwgt_lb ~ agepreg'190model = smf.ols(formula, data=live)191results = model.fit()192SummarizeResults(results)193194def AlmostEquals(x, y, tol=1e-6):195return abs(x-y) < tol196197assert(AlmostEquals(results.params['Intercept'], inter))198assert(AlmostEquals(results.params['agepreg'], slope))199assert(AlmostEquals(results.rsquared, r2))200201202def PivotTables(live):203"""Prints a pivot table comparing first babies to others.204205live: DataFrame of live births206"""207table = pandas.pivot_table(live, rows='isfirst',208values=['totalwgt_lb', 'agepreg'])209print(table)210211212def FormatRow(results, columns):213"""Converts regression results to a string.214215results: RegressionResults object216217returns: string218"""219t = []220for col in columns:221coef = results.params.get(col, np.nan)222pval = results.pvalues.get(col, np.nan)223if np.isnan(coef):224s = '--'225elif pval < 0.001:226s = '%0.3g (*)' % (coef)227else:228s = '%0.3g (%0.2g)' % (coef, pval)229t.append(s)230231try:232t.append('%.2g' % results.rsquared)233except AttributeError:234t.append('%.2g' % results.prsquared)235236return t237238239def RunModels(live):240"""Runs regressions that predict birth weight.241242live: DataFrame of pregnancy records243"""244columns = ['isfirst[T.True]', 'agepreg', 'agepreg2']245header = ['isfirst', 'agepreg', 'agepreg2']246247rows = []248formula = 'totalwgt_lb ~ isfirst'249results = smf.ols(formula, data=live).fit()250rows.append(FormatRow(results, columns))251print(formula)252SummarizeResults(results)253254formula = 'totalwgt_lb ~ agepreg'255results = smf.ols(formula, data=live).fit()256rows.append(FormatRow(results, columns))257print(formula)258SummarizeResults(results)259260formula = 'totalwgt_lb ~ isfirst + agepreg'261results = smf.ols(formula, data=live).fit()262rows.append(FormatRow(results, columns))263print(formula)264SummarizeResults(results)265266live['agepreg2'] = live.agepreg**2267formula = 'totalwgt_lb ~ isfirst + agepreg + agepreg2'268results = smf.ols(formula, data=live).fit()269rows.append(FormatRow(results, columns))270print(formula)271SummarizeResults(results)272273PrintTabular(rows, header)274275276def PrintTabular(rows, header):277"""Prints results in LaTeX tabular format.278279rows: list of rows280header: list of strings281"""282s = r'\hline ' + ' & '.join(header) + r' \\ \hline'283print(s)284285for row in rows:286s = ' & '.join(row) + r' \\'287print(s)288289print(r'\hline')290291292def LogisticRegressionExample():293"""Runs a simple example of logistic regression and prints results.294"""295y = np.array([0, 1, 0, 1])296x1 = np.array([0, 0, 0, 1])297x2 = np.array([0, 1, 1, 1])298299beta = [-1.5, 2.8, 1.1]300301log_o = beta[0] + beta[1] * x1 + beta[2] * x2302print(log_o)303304o = np.exp(log_o)305print(o)306307p = o / (o+1)308print(p)309310like = y * p + (1-y) * (1-p)311print(like)312print(np.prod(like))313314df = pandas.DataFrame(dict(y=y, x1=x1, x2=x2))315results = smf.logit('y ~ x1 + x2', data=df).fit()316print(results.summary())317318319320def RunLogisticModels(live):321"""Runs regressions that predict sex.322323live: DataFrame of pregnancy records324"""325#live = linear.ResampleRowsWeighted(live)326327df = live[live.prglngth>30]328329df['boy'] = (df.babysex==1).astype(int)330df['isyoung'] = (df.agepreg<20).astype(int)331df['isold'] = (df.agepreg<35).astype(int)332df['season'] = (((df.datend+1) % 12) / 3).astype(int)333334# run the simple model335model = smf.logit('boy ~ agepreg', data=df)336results = model.fit()337print('nobs', results.nobs)338print(type(results))339SummarizeResults(results)340341# run the complex model342model = smf.logit('boy ~ agepreg + hpagelb + birthord + C(race)', data=df)343results = model.fit()344print('nobs', results.nobs)345print(type(results))346SummarizeResults(results)347348# make the scatter plot349exog = pandas.DataFrame(model.exog, columns=model.exog_names)350endog = pandas.DataFrame(model.endog, columns=[model.endog_names])351352xs = exog['agepreg']353lo = results.fittedvalues354o = np.exp(lo)355p = o / (o+1)356357#thinkplot.Scatter(xs, p, alpha=0.1)358#thinkplot.Show()359360# compute accuracy361actual = endog['boy']362baseline = actual.mean()363364predict = (results.predict() >= 0.5)365true_pos = predict * actual366true_neg = (1 - predict) * (1 - actual)367368acc = (sum(true_pos) + sum(true_neg)) / len(actual)369print(acc, baseline)370371columns = ['agepreg', 'hpagelb', 'birthord', 'race']372new = pandas.DataFrame([[35, 39, 3, 1]], columns=columns)373y = results.predict(new)374print(y)375376377def main(name, data_dir='.'):378thinkstats2.RandomSeed(17)379LogisticRegressionExample()380381live, firsts, others = first.MakeFrames()382live['isfirst'] = (live.birthord == 1)383384RunLogisticModels(live)385386RunSimpleRegression(live)387RunModels(live)388389PredictBirthWeight(live)390391392if __name__ == '__main__':393import sys394main(*sys.argv)395396397