CoCalc -- nsfg.py

📚 The CoCalc Library - books, templates and other resources
Project: 📚 The Library - Shared Public Version
Path: cocalc-examples / think-stats-2ed / workshop / nsfg.py
Views: ⁹⁶¹⁴⁴
License: OTHER
1
"""This file contains code for use with "Think Stats",
by Allen B. Downey, available from greenteapress.com

Copyright 2010 Allen B. Downey
License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html
"""
7

8
from __future__ import print_function
9

10
from collections import defaultdict
11
import numpy as np
12
import sys
13

14
import thinkstats2
15

16

17
def ReadFemPreg(dct_file='2002FemPreg.dct',
                dat_file='2002FemPreg.dat.gz'):
    """Loads the NSFG pregnancy data and returns it as a cleaned frame.

    dct_file: string name of the file handed to thinkstats2.ReadStataDct
    dat_file: string name of the gzip-compressed fixed-width data file

    returns: DataFrame, after CleanFemPreg has recoded it in place
    """
    # the object built from the dictionary file does the fixed-width parsing
    layout = thinkstats2.ReadStataDct(dct_file)
    preg = layout.ReadFixedWidth(dat_file, compression='gzip')

    # CleanFemPreg modifies the frame it is given; it returns nothing
    CleanFemPreg(preg)
    return preg
30

31

32
def CleanFemPreg(df):
    """Recodes variables from the pregnancy frame.

    The frame is modified in place; nothing is returned.

    Every column is written back with an explicit `df[name] = ...`
    assignment.  Calling `replace(..., inplace=True)` on a Series pulled
    out of the frame (`df.name.replace(...)`) is a chained in-place
    update: it only changes the frame when the Series happens to share
    memory with it, and under pandas Copy-on-Write it leaves the frame
    untouched.

    df: DataFrame with columns agepreg, birthwgt_lb, birthwgt_oz,
        hpagelb, babysex, nbrnaliv and cmintvw
    """
    # mother's age is encoded in centiyears; convert to years
    df['agepreg'] = df['agepreg'] / 100.0

    # birthwgt_lb contains at least one bogus value (51 lbs)
    # replace with NaN
    df.loc[df.birthwgt_lb > 20, 'birthwgt_lb'] = np.nan

    # replace 'not ascertained', 'refused', 'don't know' with NaN
    na_vals = [97, 98, 99]
    for name in ['birthwgt_lb', 'birthwgt_oz', 'hpagelb']:
        df[name] = df[name].replace(na_vals, np.nan)

    # these two columns use their own special codes
    df['babysex'] = df['babysex'].replace([7, 9], np.nan)
    df['nbrnaliv'] = df['nbrnaliv'].replace([9], np.nan)

    # birthweight is stored in two columns, lbs and oz.
    # convert to a single column in lb
    # NOTE: creating a new column requires dictionary syntax,
    # not attribute assignment (like df.totalwgt_lb)
    df['totalwgt_lb'] = df.birthwgt_lb + df.birthwgt_oz / 16.0

    # due to a bug in ReadStataDct, the last variable gets clipped;
    # so for now set it to NaN
    df['cmintvw'] = np.nan
62

63

64
def MakePregMap(df):
    """Make a map from caseid to list of preg indices.

    Index labels are appended in the order the rows appear in df, so a
    caseid that occurs several times maps to all of its rows.

    df: DataFrame with a caseid column

    returns: defaultdict(list) that maps from caseid to list of indices
             into preg df
    """
    d = defaultdict(list)
    # Series.items() yields (index label, value) pairs; the older
    # iteritems() spelling was removed in pandas 2.0
    for index, caseid in df.caseid.items():
        d[caseid].append(index)
    return d
75

76

77
def main(script):
    """Checks the loaded pregnancy frame against known reference values.

    Reads the data with ReadFemPreg's default file names, prints the
    frame's shape, and asserts a series of expected row and value counts.

    script: string script name
    """
    df = ReadFemPreg()
    print(df.shape)

    assert len(df) == 13593
    assert df.caseid[13592] == 12571

    # (column, value, number of rows expected to hold that value),
    # verified in the order listed
    expected_counts = [
        ('pregordr', 1, 5033),
        ('nbrnaliv', 1, 8981),
        ('babysex', 1, 4641),
        ('birthwgt_lb', 7, 3049),
        ('birthwgt_oz', 0, 1037),
        ('prglngth', 39, 4744),
        ('outcome', 1, 9148),
        ('birthord', 1, 4413),
        ('agepreg', 22.75, 100),
        ('totalwgt_lb', 7.5, 302),
    ]
    for column, value, count in expected_counts:
        assert df[column].value_counts()[value] == count

    # the largest finalwgt value should occur exactly six times
    weights = df.finalwgt.value_counts()
    assert weights[max(weights.keys())] == 6

    print('%s: All tests passed.' % script)
104

105
if __name__ == '__main__':
    # sys.argv[0] is the script name and fills main's single `script`
    # parameter; any extra command-line argument is unpacked as well and
    # makes the call raise TypeError.
    main(*sys.argv)
107

108
Product

Resources

Company