| Download
Think Stats by Allen B. Downey. Think Stats is an introduction to probability and statistics for Python programmers.
This is the accompanying code for this book.
Project: Support and Testing
Views: 7119 | License: GPL3
"""This file contains code for use with "Think Stats",
by Allen B. Downey, available from greenteapress.com

Copyright 2010 Allen B. Downey
License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html
"""

from __future__ import print_function, division

import sys
import numpy as np
import thinkstats2

from collections import defaultdict


def ReadFemResp(dct_file='2002FemResp.dct',
                dat_file='2002FemResp.dat.gz',
                nrows=None):
    """Reads the NSFG respondent data.

    dct_file: string file name
    dat_file: string file name
    nrows: passed through to ReadFixedWidth (None by default)

    returns: DataFrame
    """
    dct = thinkstats2.ReadStataDct(dct_file)
    df = dct.ReadFixedWidth(dat_file, compression='gzip', nrows=nrows)
    CleanFemResp(df)
    return df


def CleanFemResp(df):
    """Recodes variables from the respondent frame.

    Currently a no-op; kept so ReadFemResp has a hook for recoding.

    df: DataFrame
    """
    pass


def ReadFemPreg(dct_file='2002FemPreg.dct',
                dat_file='2002FemPreg.dat.gz'):
    """Reads the NSFG pregnancy data.

    dct_file: string file name
    dat_file: string file name

    returns: DataFrame
    """
    dct = thinkstats2.ReadStataDct(dct_file)
    df = dct.ReadFixedWidth(dat_file, compression='gzip')
    CleanFemPreg(df)
    return df


def CleanFemPreg(df):
    """Recodes variables from the pregnancy frame.

    Modifies df in place: rescales agepreg, replaces special codes with
    NaN, adds the totalwgt_lb column and blanks out cmintvw.

    Columns are always written back with df[...] = ... rather than by
    calling replace(inplace=True) on df.<column>: the latter operates on
    an intermediate Series and is not guaranteed to update df (it does
    not under pandas Copy-on-Write).

    df: DataFrame
    """
    # mother's age is encoded in centiyears; convert to years
    df['agepreg'] = df['agepreg'] / 100.0

    # birthwgt_lb contains at least one bogus value (51 lbs)
    # replace with NaN
    df.loc[df.birthwgt_lb > 20, 'birthwgt_lb'] = np.nan

    # replace 'not ascertained', 'refused', 'don't know' with NaN
    na_vals = [97, 98, 99]
    for name in ('birthwgt_lb', 'birthwgt_oz', 'hpagelb'):
        df[name] = df[name].replace(na_vals, np.nan)

    df['babysex'] = df['babysex'].replace([7, 9], np.nan)
    df['nbrnaliv'] = df['nbrnaliv'].replace([9], np.nan)

    # birthweight is stored in two columns, lbs and oz.
    # convert to a single column in lb
    # NOTE: creating a new column requires dictionary syntax,
    # not attribute assignment (like df.totalwgt_lb)
    df['totalwgt_lb'] = df.birthwgt_lb + df.birthwgt_oz / 16.0

    # due to a bug in ReadStataDct, the last variable gets clipped;
    # so for now set it to NaN
    df['cmintvw'] = np.nan


def ValidatePregnum(resp, preg):
    """Validate pregnum in the respondent file.

    Prints the caseid, record count and pregnum of the first mismatch.

    resp: respondent DataFrame
    preg: pregnancy DataFrame

    returns: True if every respondent's pregnum equals the number of
             records with that caseid in preg, False otherwise
    """
    # make the map from caseid to list of pregnancy indices
    preg_map = MakePregMap(preg)

    # iterate through the respondent pregnum series
    # (Series.items replaces iteritems, which pandas 2.0 removed)
    for index, pregnum in resp.pregnum.items():
        caseid = resp.caseid[index]
        indices = preg_map[caseid]

        # check that pregnum from the respondent file equals
        # the number of records in the pregnancy file
        if len(indices) != pregnum:
            print(caseid, len(indices), pregnum)
            return False

    return True


def MakePregMap(df):
    """Make a map from caseid to list of preg indices.

    df: DataFrame

    returns: dict that maps from caseid to list of indices into `preg`
    """
    d = defaultdict(list)
    # Series.items replaces iteritems, which pandas 2.0 removed
    for index, caseid in df.caseid.items():
        d[caseid].append(index)
    return d


def main():
    """Tests the functions in this module.

    Reads both data files from their default locations and checks them
    against hard-coded expected counts; an AssertionError is raised on
    the first mismatch.
    """
    # read and validate the respondent file
    resp = ReadFemResp()

    assert len(resp) == 7643
    assert resp.pregnum.value_counts()[1] == 1267

    # read and validate the pregnancy file
    preg = ReadFemPreg()
    print(preg.shape)

    assert len(preg) == 13593
    assert preg.caseid[13592] == 12571
    assert preg.pregordr.value_counts()[1] == 5033
    assert preg.nbrnaliv.value_counts()[1] == 8981
    assert preg.babysex.value_counts()[1] == 4641
    assert preg.birthwgt_lb.value_counts()[7] == 3049
    assert preg.birthwgt_oz.value_counts()[0] == 1037
    assert preg.prglngth.value_counts()[39] == 4744
    assert preg.outcome.value_counts()[1] == 9148
    assert preg.birthord.value_counts()[1] == 4413
    assert preg.agepreg.value_counts()[22.75] == 100
    assert preg.totalwgt_lb.value_counts()[7.5] == 302

    # count of the largest finalwgt value; reuse the counts computed once
    weights = preg.finalwgt.value_counts()
    key = max(weights.keys())
    assert weights[key] == 6

    # validate that the pregnum column in `resp` matches the number
    # of entries in `preg`
    assert ValidatePregnum(resp, preg)

    print('All tests passed.')


if __name__ == '__main__':
    main()