CoCalc -- nsfg.py

Think Stats, by Allen B. Downey. Think Stats is an introduction to probability and statistics for Python programmers.
This is the accompanying code for this book.
Website: http://greenteapress.com/wp/think-stats-2e/
Views: 7119
License: GPL3
1
"""This file contains code for use with "Think Stats",
2
by Allen B. Downey, available from greenteapress.com
3

4
Copyright 2010 Allen B. Downey
5
License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html
6
"""
7

8
from __future__ import print_function, division
9

10
import sys
11
import numpy as np
12
import thinkstats2
13

14
from collections import defaultdict
15

16

17
def ReadFemResp(dct_file='2002FemResp.dct',
                dat_file='2002FemResp.dat.gz',
                nrows=None):
    """Loads the NSFG respondent file into a DataFrame.

    The Stata dictionary is parsed with thinkstats2.ReadStataDct and the
    resulting object reads the gzip-compressed fixed-width data file.
    The frame is passed through CleanFemResp before it is returned.

    dct_file: string name of the Stata dictionary file
    dat_file: string name of the gzip-compressed data file
    nrows: forwarded unchanged to ReadFixedWidth (presumably a cap on the
           number of rows read, with None meaning all rows -- confirm
           against thinkstats2)

    returns: DataFrame
    """
    dictionary = thinkstats2.ReadStataDct(dct_file)
    respondents = dictionary.ReadFixedWidth(
        dat_file,
        compression='gzip',
        nrows=nrows,
    )
    CleanFemResp(respondents)
    return respondents
31

32

33
def CleanFemResp(df):
    """Placeholder for recoding variables in the respondent frame.

    No recoding is performed at present; the frame is left untouched.

    df: DataFrame

    returns: None
    """
    return None
39

40

41
def ReadFemPreg(dct_file='2002FemPreg.dct',
                dat_file='2002FemPreg.dat.gz',
                nrows=None):
    """Reads the NSFG pregnancy data.

    The Stata dictionary is parsed with thinkstats2.ReadStataDct and the
    resulting object reads the gzip-compressed fixed-width data file.
    The frame is recoded in place by CleanFemPreg before it is returned.

    dct_file: string file name
    dat_file: string file name
    nrows: forwarded unchanged to ReadFixedWidth, mirroring ReadFemResp
           (presumably a cap on the number of rows read, with None meaning
           all rows -- confirm against thinkstats2)

    returns: DataFrame
    """
    dct = thinkstats2.ReadStataDct(dct_file)
    df = dct.ReadFixedWidth(dat_file, compression='gzip', nrows=nrows)
    CleanFemPreg(df)
    return df
54

55

56
def CleanFemPreg(df):
    """Recodes variables from the pregnancy frame.

    The frame is modified in place: columns are reassigned on `df` itself
    and a new `totalwgt_lb` column is added.

    Every column is written back with explicit `df[name] = ...` assignment.
    Calling `replace(..., inplace=True)` on a column accessor is chained
    in-place mutation, which newer pandas versions warn about and which
    does not update the frame when Copy-on-Write is enabled.

    df: DataFrame with columns agepreg, birthwgt_lb, birthwgt_oz, hpagelb,
        babysex and nbrnaliv

    returns: None
    """
    # mother's age is encoded in centiyears; convert to years
    df['agepreg'] = df['agepreg'] / 100.0

    # birthwgt_lb contains at least one bogus value (51 lbs);
    # replace anything above 20 lb with NaN
    df['birthwgt_lb'] = df['birthwgt_lb'].mask(df['birthwgt_lb'] > 20)

    # replace 'not ascertained', 'refused', 'don't know' with NaN
    na_vals = [97, 98, 99]
    for column in ('birthwgt_lb', 'birthwgt_oz', 'hpagelb'):
        df[column] = df[column].replace(na_vals, np.nan)

    df['babysex'] = df['babysex'].replace([7, 9], np.nan)
    df['nbrnaliv'] = df['nbrnaliv'].replace([9], np.nan)

    # birthweight is stored in two columns, lbs and oz.
    # convert to a single column in lb
    # NOTE: creating a new column requires dictionary syntax,
    # not attribute assignment (like df.totalwgt_lb)
    df['totalwgt_lb'] = df['birthwgt_lb'] + df['birthwgt_oz'] / 16.0

    # due to a bug in ReadStataDct, the last variable gets clipped;
    # so for now set it to NaN
    df['cmintvw'] = np.nan
86

87

88
def ValidatePregnum(resp, preg):
    """Validate pregnum in the respondent file.

    For every respondent, checks that `pregnum` equals the number of
    records with the same caseid in the pregnancy frame. On the first
    mismatch the caseid, the record count and pregnum are printed and
    the check stops.

    resp: respondent DataFrame with caseid and pregnum columns
    preg: pregnancy DataFrame with a caseid column

    returns: True if every respondent matches, False otherwise
    """
    # make the map from caseid to list of pregnancy indices
    preg_map = MakePregMap(preg)

    # walk the two respondent columns in lockstep; this avoids both the
    # removed Series.iteritems() and a label lookup per row
    for caseid, pregnum in zip(resp.caseid, resp.pregnum):
        n_records = len(preg_map[caseid])

        # check that pregnum from the respondent file equals
        # the number of records in the pregnancy file
        if n_records != pregnum:
            print(caseid, n_records, pregnum)
            return False

    return True
109

110

111
def MakePregMap(df):
    """Make a map from caseid to list of preg indices.

    df: DataFrame with a caseid column

    returns: defaultdict(list) that maps from caseid to the list of index
             labels of the rows in `df` with that caseid; looking up a
             caseid that is not present yields an empty list
    """
    d = defaultdict(list)
    # Series.items() replaces iteritems(), which was removed in pandas 2.0
    for index, caseid in df.caseid.items():
        d[caseid].append(index)
    return d
122

123

124
def main():
    """Tests the functions in this module.

    Reads the respondent and pregnancy files using the default file
    names, checks a set of expected counts, and validates that pregnum
    in the respondent frame matches the pregnancy records.

    Raises AssertionError if any check fails; prints a confirmation
    message otherwise.
    """
    # read and validate the respondent file
    resp = ReadFemResp()

    assert len(resp) == 7643
    assert resp.pregnum.value_counts()[1] == 1267

    # read and validate the pregnancy file
    preg = ReadFemPreg()
    print(preg.shape)

    assert len(preg) == 13593
    assert preg.caseid[13592] == 12571
    assert preg.pregordr.value_counts()[1] == 5033
    assert preg.nbrnaliv.value_counts()[1] == 8981
    assert preg.babysex.value_counts()[1] == 4641
    assert preg.birthwgt_lb.value_counts()[7] == 3049
    assert preg.birthwgt_oz.value_counts()[0] == 1037
    assert preg.prglngth.value_counts()[39] == 4744
    assert preg.outcome.value_counts()[1] == 9148
    assert preg.birthord.value_counts()[1] == 4413
    assert preg.agepreg.value_counts()[22.75] == 100
    assert preg.totalwgt_lb.value_counts()[7.5] == 302

    # the largest sampling weight should be shared by six records
    weights = preg.finalwgt.value_counts()
    key = max(weights.keys())
    assert weights[key] == 6

    # validate that the pregnum column in `resp` matches the number
    # of entries in `preg`
    assert ValidatePregnum(resp, preg)

    print('All tests passed.')
162

163

164
# Run the self-checks only when this file is executed as a script.
if __name__ == "__main__":
    main()
166

167