# -*- coding: utf-8 -*-
# CoCalc -- survey.py
#
# 📚 The CoCalc Library - books, templates and other resources
# Project: 📚 The Library - Shared Public Version
# Path: cocalc-examples / think-bayes / code / survey.py
# Views: 96160
# License: OTHER
"""This file contains code for use with "Think Stats",
by Allen B. Downey, available from greenteapress.com

Copyright 2010 Allen B. Downey
License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html
"""
7

8
import sys
9
import gzip
10
import os
11

12
class Record(object):
    """Represents a record.

    A plain attribute container: Table.MakeRecord attaches one attribute
    per extracted field with setattr().
    """
14

15
class Respondent(Record):
    """Represents a respondent."""
17

18
class Pregnancy(Record):
    """Represents a pregnancy."""
20

21
class Table(object):
    """Represents a table as a list of objects.

    Attributes:
        records: list of record objects, in the order they were added.
    """

    def __init__(self):
        self.records = []

    def __len__(self):
        """Returns the number of records in the table."""
        return len(self.records)

    def ReadFile(self, data_dir, filename, fields, constructor, n=None):
        """Reads a data file and builds one object per line.

        Files whose name ends with 'gz' are read through gzip; any other
        file is opened with the builtin open().

        Args:
            data_dir: string directory name
            filename: string name of the file to read

            fields: sequence of (name, start, end, cast) tuples specifying
            the fields to extract

            constructor: what kind of object to create

            n: maximum number of lines to read; None (the default) reads
            the whole file
        """
        filename = os.path.join(data_dir, filename)

        if filename.endswith('gz'):
            fp = gzip.open(filename)
        else:
            fp = open(filename)

        # Close the handle even when building a record raises.
        try:
            for i, line in enumerate(fp):
                if i == n:
                    break
                record = self.MakeRecord(line, fields, constructor)
                self.AddRecord(record)
        finally:
            fp.close()

    def MakeRecord(self, line, fields, constructor):
        """Scans a line and returns an object with the appropriate fields.

        A field whose cast raises ValueError is stored as the string 'NA'.

        Args:
            line: string line from a data file (bytes lines, as produced
            by gzip.open() on Python 3, are decoded first)

            fields: sequence of (name, start, end, cast) tuples specifying
            the fields to extract

            constructor: callable that makes an object for the record.

        Returns:
            Record with appropriate fields.
        """
        # gzip.open() yields bytes on Python 3 while open() yields str;
        # decode so every cast receives text however the file was opened.
        # On Python 2 bytes is str, so the line is left untouched.
        # Undecodable bytes become U+FFFD, which numeric casts reject with
        # ValueError just as they rejected the raw byte.
        if isinstance(line, bytes) and not isinstance(line, str):
            line = line.decode('ascii', 'replace')

        obj = constructor()
        for (field, start, end, cast) in fields:
            try:
                # start and end are 1-based and inclusive.
                s = line[start-1:end]
                val = cast(s)
            except ValueError:
                val = 'NA'
            setattr(obj, field, val)
        return obj

    def AddRecord(self, record):
        """Adds a record to this table.

        Args:
            record: an object of one of the record types.
        """
        self.records.append(record)

    def ExtendRecords(self, records):
        """Adds records to this table.

        Args:
            records: a sequence of record objects
        """
        self.records.extend(records)

    def Recode(self):
        """Child classes can override this to recode values."""
        pass
101

102

103
class Respondents(Table):
    """Represents the respondent table."""

    def ReadRecords(self, data_dir='.', n=None):
        """Reads the respondent file and then recodes the records.

        Args:
            data_dir: string directory that holds the data file
            n: maximum number of lines to read; None reads them all
        """
        filename = self.GetFilename()
        self.ReadFile(data_dir, filename, self.GetFields(), Respondent, n)
        self.Recode()

    def GetFilename(self):
        """Returns the name of the respondent data file."""
        return '2002FemResp.dat.gz'

    def GetFields(self):
        """Returns a list specifying the fields to extract.

        The elements of the list are (field, start, end, cast) tuples.

                field is the name of the variable
                start and end are the indices as specified in the NSFG docs
                cast is a callable that converts the result to int, float, etc.
        """
        return [
            ('caseid', 1, 12, int),
            ]
126

127
class Pregnancies(Table):
    """Contains survey data about a Pregnancy."""

    def ReadRecords(self, data_dir='.', n=None):
        """Reads the pregnancy file and then recodes the records.

        Args:
            data_dir: string directory that holds the data file
            n: maximum number of lines to read; None reads them all
        """
        filename = self.GetFilename()
        self.ReadFile(data_dir, filename, self.GetFields(), Pregnancy, n)
        self.Recode()

    def GetFilename(self):
        """Returns the name of the pregnancy data file."""
        return '2002FemPreg.dat.gz'

    def GetFields(self):
        """Gets information about the fields to extract from the survey data.

        Documentation of the fields for Cycle 6 is at
        http://nsfg.icpsr.umich.edu/cocoon/WebDocs/NSFG/public/index.htm

        Returns:
            sequence of (name, start, end, type) tuples
        """
        return [
            ('caseid', 1, 12, int),
            ('nbrnaliv', 22, 22, int),
            ('babysex', 56, 56, int),
            ('birthwgt_lb', 57, 58, int),
            ('birthwgt_oz', 59, 60, int),
            ('prglength', 275, 276, int),
            ('outcome', 277, 277, int),
            ('birthord', 278, 279, int),
            ('agepreg', 284, 287, int),
            ('finalwgt', 423, 440, float),
            ]

    def Recode(self):
        """Rescales agepreg and derives totalwgt_oz for every record.

        Records that lack the attributes involved are skipped silently;
        values stored as 'NA' are left as 'NA'.
        """
        for rec in self.records:

            # divide mother's age by 100
            try:
                if rec.agepreg != 'NA':
                    rec.agepreg /= 100.0
            except AttributeError:
                # record has no agepreg attribute; nothing to rescale
                pass

            # convert weight at birth from lbs/oz to total ounces
            # note: there are some very low birthweights
            # that are almost certainly errors, but for now I am not
            # filtering
            # NOTE(review): the ounce bound is <= 16, so a value of 16 is
            # accepted -- confirm whether that is intended.
            try:
                if (rec.birthwgt_lb != 'NA' and rec.birthwgt_lb < 20 and
                    rec.birthwgt_oz != 'NA' and rec.birthwgt_oz <= 16):
                    rec.totalwgt_oz = rec.birthwgt_lb * 16 + rec.birthwgt_oz
                else:
                    rec.totalwgt_oz = 'NA'
            except AttributeError:
                # record has no birth weight attributes; leave it as is
                pass
182

183

184
def main(name, data_dir='.'):
    """Reads both survey tables and prints how many records each holds.

    Args:
        name: unused; receives the script name when invoked as
        main(*sys.argv)
        data_dir: string directory that holds the data files
    """
    resp = Respondents()
    resp.ReadRecords(data_dir)
    # %-formatting inside print(...) emits the same text on Python 2 and 3.
    print('Number of respondents %d' % len(resp.records))

    preg = Pregnancies()
    preg.ReadRecords(data_dir)
    print('Number of pregnancies %d' % len(preg.records))
192

193
    
194
# Run as a script: forward the command line (script name, then an optional
# data directory) to main().
if __name__ == '__main__':
    main(*sys.argv)
196

197
# Product

# Resources

# Company