"""This file contains code for use with "Think Stats",
by Allen B. Downey, available from greenteapress.com
Copyright 2010 Allen B. Downey
License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html
"""
from __future__ import print_function, division
import sys
import numpy as np
import thinkstats2
from collections import defaultdict
def ReadFemResp(dct_file='2002FemResp.dct',
                dat_file='2002FemResp.dat.gz',
                nrows=None):
    """Loads the NSFG respondent file into a DataFrame.

    The Stata dictionary supplies the column layout used to parse the
    gzip-compressed fixed-width data file; the resulting frame is passed
    through CleanFemResp before it is returned.

    dct_file: string file name of the Stata dictionary
    dat_file: string file name of the gzip-compressed data file
    nrows: forwarded unchanged to ReadFixedWidth (presumably limits how
           many rows are read -- confirm against thinkstats2)

    returns: DataFrame
    """
    read_options = dict(compression='gzip', nrows=nrows)
    respondents = thinkstats2.ReadStataDct(dct_file).ReadFixedWidth(
        dat_file, **read_options)
    CleanFemResp(respondents)
    return respondents
def CleanFemResp(df):
    """Placeholder hook for recoding variables in the respondent frame.

    No recoding is currently performed: `df` is left exactly as it was
    passed in and nothing is returned.

    df: DataFrame
    """
    return None
def ReadFemPreg(dct_file='2002FemPreg.dct',
                dat_file='2002FemPreg.dat.gz',
                nrows=None):
    """Reads the NSFG pregnancy data.

    The Stata dictionary supplies the column layout used to parse the
    gzip-compressed fixed-width data file; the resulting frame is cleaned
    with CleanFemPreg before it is returned.

    dct_file: string file name of the Stata dictionary
    dat_file: string file name of the gzip-compressed data file
    nrows: forwarded unchanged to ReadFixedWidth, mirroring ReadFemResp
           (presumably limits how many rows are read -- confirm against
           thinkstats2); the default None matches the previous behavior

    returns: DataFrame
    """
    dct = thinkstats2.ReadStataDct(dct_file)
    df = dct.ReadFixedWidth(dat_file, compression='gzip', nrows=nrows)
    CleanFemPreg(df)
    return df
def CleanFemPreg(df):
    """Recodes variables from the pregnancy frame, modifying `df` in place.

    Every change is written back with an explicit column assignment.
    Calling an in-place method on a column pulled out of the frame
    (`df.col.replace(..., inplace=True)`) only alters a temporary Series
    when pandas copy-on-write is active, so the frame would stay dirty.

    df: DataFrame with columns agepreg, birthwgt_lb, birthwgt_oz,
        hpagelb, babysex and nbrnaliv; totalwgt_lb is added and cmintvw
        is overwritten (or created if absent)
    """
    # NOTE(review): the division suggests agepreg is stored in hundredths
    # of a year -- confirm against the codebook.
    df['agepreg'] = df['agepreg'] / 100.0

    # Pound values above 20 are discarded as implausible.
    df['birthwgt_lb'] = df['birthwgt_lb'].mask(df['birthwgt_lb'] > 20)

    # The codes 97, 98 and 99 are treated as missing in these columns.
    na_vals = [97, 98, 99]
    for column in ('birthwgt_lb', 'birthwgt_oz', 'hpagelb'):
        df[column] = df[column].replace(na_vals, np.nan)

    df['babysex'] = df['babysex'].replace([7, 9], np.nan)
    df['nbrnaliv'] = df['nbrnaliv'].replace([9], np.nan)

    # Combine the pounds and ounces columns into a single weight in pounds.
    df['totalwgt_lb'] = df['birthwgt_lb'] + df['birthwgt_oz'] / 16.0

    # cmintvw is blanked out entirely.  Item assignment is used because
    # attribute assignment cannot create a column that is missing.
    # NOTE(review): the reason for discarding cmintvw is not visible here.
    df['cmintvw'] = np.nan
def ValidatePregnum(resp, preg):
    """Validate pregnum in the respondent file.

    For each respondent, compares `pregnum` with the number of rows in
    `preg` that carry the same caseid.  The first mismatch is printed as
    `caseid, rows found, pregnum` and ends the check.

    resp: respondent DataFrame with `caseid` and `pregnum` columns
    preg: pregnancy DataFrame with a `caseid` column

    returns: True if every respondent matches, False at the first mismatch
    """
    preg_map = MakePregMap(preg)

    # Walk the two columns in lockstep instead of looking caseid up by
    # index label, so a non-unique index on `resp` cannot break the check.
    for caseid, pregnum in zip(resp.caseid, resp.pregnum):
        # Respondents with no rows in `preg` count as zero pregnancies;
        # .get avoids growing the map with empty entries.
        found = len(preg_map.get(caseid, ()))
        if found != pregnum:
            print(caseid, found, pregnum)
            return False

    return True


def MakePregMap(df):
    """Make a map from caseid to list of preg indices.

    df: DataFrame with a `caseid` column

    returns: defaultdict(list) that maps from caseid to the list of index
             labels of the rows in `df` having that caseid
    """
    d = defaultdict(list)
    # Series.items() is used because Series.iteritems() was removed in
    # pandas 2.0.
    for index, caseid in df.caseid.items():
        d[caseid].append(index)
    return d
def main():
    """Runs the module's self-checks against the NSFG data files.

    Loads both frames with the default file names, asserts a series of
    expected row counts and value frequencies, cross-checks pregnum with
    ValidatePregnum, and prints a confirmation once every check passed.
    """
    respondents = ReadFemResp()
    assert len(respondents) == 7643
    assert respondents.pregnum.value_counts()[1] == 1267

    pregnancies = ReadFemPreg()
    print(pregnancies.shape)

    assert len(pregnancies) == 13593
    assert pregnancies.caseid[13592] == 12571

    # Each entry is (column, value, expected number of rows with that value).
    expected_frequencies = [
        ('pregordr', 1, 5033),
        ('nbrnaliv', 1, 8981),
        ('babysex', 1, 4641),
        ('birthwgt_lb', 7, 3049),
        ('birthwgt_oz', 0, 1037),
        ('prglngth', 39, 4744),
        ('outcome', 1, 9148),
        ('birthord', 1, 4413),
        ('agepreg', 22.75, 100),
        ('totalwgt_lb', 7.5, 302),
    ]
    for column, value, expected in expected_frequencies:
        assert pregnancies[column].value_counts()[value] == expected

    # The largest finalwgt value is expected to occur exactly six times.
    weight_counts = pregnancies.finalwgt.value_counts()
    largest_weight = max(weight_counts.keys())
    assert weight_counts[largest_weight] == 6

    assert ValidatePregnum(respondents, pregnancies)

    print('All tests passed.')
# Run the self-checks only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()