Contact
CoCalc Logo Icon
Store | Features | Docs | Share | Support | News | About | Sign Up | Sign In
| Download

Think Stats, by Allen B. Downey. Think Stats is an introduction to probability and statistics for Python programmers.

This is the accompanying code for this book.

Website: http://greenteapress.com/wp/think-stats-2e/

Views: 7119
License: GPL3
1
"""This file contains code for use with "Think Stats",
2
by Allen B. Downey, available from greenteapress.com
3
4
Copyright 2010 Allen B. Downey
5
License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html
6
"""
7
8
from __future__ import print_function, division
9
10
import sys
11
import numpy as np
12
import thinkstats2
13
14
from collections import defaultdict
15
16
17
def ReadFemResp(dct_file='2002FemResp.dct',
                dat_file='2002FemResp.dat.gz',
                nrows=None):
    """Loads the NSFG respondent file into a DataFrame.

    dct_file: string file name of the dictionary passed to ReadStataDct
    dat_file: string file name of the gzip-compressed data file
    nrows: forwarded unchanged to ReadFixedWidth (None by default)

    returns: DataFrame, after CleanFemResp has been applied to it
    """
    respondents = thinkstats2.ReadStataDct(dct_file).ReadFixedWidth(
        dat_file, compression='gzip', nrows=nrows)
    CleanFemResp(respondents)
    return respondents
31
32
33
def CleanFemResp(df):
    """Placeholder for recoding variables in the respondent frame.

    Currently a no-op: the frame is left untouched.

    df: DataFrame
    """
    return None
39
40
41
def ReadFemPreg(dct_file='2002FemPreg.dct',
                dat_file='2002FemPreg.dat.gz',
                nrows=None):
    """Reads the NSFG pregnancy data.

    dct_file: string file name
    dat_file: string file name
    nrows: forwarded to ReadFixedWidth, mirroring ReadFemResp; the
           default None matches what ReadFemResp passes by default

    returns: DataFrame, after CleanFemPreg has been applied to it
    """
    dct = thinkstats2.ReadStataDct(dct_file)
    df = dct.ReadFixedWidth(dat_file, compression='gzip', nrows=nrows)
    CleanFemPreg(df)
    return df
54
55
56
def CleanFemPreg(df):
    """Recodes variables from the pregnancy frame.

    The frame is modified in place and nothing is returned.  Every
    change is written back with df[name] = ... rather than through an
    in-place method on a column accessor (df.col.replace(...,
    inplace=True)): the latter operates on an intermediate object and
    is not guaranteed to update the frame in recent pandas versions.

    df: DataFrame with columns agepreg, birthwgt_lb, birthwgt_oz,
        hpagelb, babysex, nbrnaliv and cmintvw
    """
    # mother's age is encoded in centiyears; convert to years
    df['agepreg'] = df.agepreg / 100.0

    # birthwgt_lb contains at least one bogus value (51 lbs)
    # replace with NaN
    df['birthwgt_lb'] = df.birthwgt_lb.mask(df.birthwgt_lb > 20)

    # replace 'not ascertained', 'refused', 'don't know' with NaN
    na_vals = [97, 98, 99]
    for name in ('birthwgt_lb', 'birthwgt_oz', 'hpagelb'):
        df[name] = df[name].replace(na_vals, np.nan)

    df['babysex'] = df.babysex.replace([7, 9], np.nan)
    df['nbrnaliv'] = df.nbrnaliv.replace([9], np.nan)

    # birthweight is stored in two columns, lbs and oz.
    # convert to a single column in lb
    # NOTE: creating a new column requires dictionary syntax,
    # not attribute assignment (like df.totalwgt_lb)
    df['totalwgt_lb'] = df.birthwgt_lb + df.birthwgt_oz / 16.0

    # due to a bug in ReadStataDct, the last variable gets clipped;
    # so for now set it to NaN
    df['cmintvw'] = np.nan
86
87
88
def ValidatePregnum(resp, preg):
    """Validate pregnum in the respondent file.

    For each respondent, checks that pregnum equals the number of
    records with the same caseid in the pregnancy file.  On the first
    mismatch, prints caseid, the record count and pregnum, and stops.

    resp: respondent DataFrame (uses columns caseid and pregnum)
    preg: pregnancy DataFrame (uses column caseid)

    returns: True if every respondent matches, False otherwise
    """
    # make the map from caseid to list of pregnancy indices
    preg_map = MakePregMap(preg)

    # walk caseid and pregnum in lockstep; this avoids
    # Series.iteritems(), which pandas 2.0 removed, and a label
    # lookup per row
    for caseid, pregnum in zip(resp.caseid, resp.pregnum):
        # respondents with no pregnancy records count as zero; .get
        # avoids adding an entry to the map for each of them
        indices = preg_map.get(caseid, [])

        # check that pregnum from the respondent file equals
        # the number of records in the pregnancy file
        if len(indices) != pregnum:
            print(caseid, len(indices), pregnum)
            return False

    return True
109
110
111
def MakePregMap(df):
    """Make a map from caseid to list of preg indices.

    df: DataFrame with a caseid column

    returns: defaultdict(list) that maps from caseid to the list of
             index labels of the rows in `df` with that caseid
    """
    d = defaultdict(list)
    # Series.items() replaces iteritems(), which pandas 2.0 removed
    for index, caseid in df.caseid.items():
        d[caseid].append(index)
    return d
122
123
124
def main():
    """Runs sanity checks against the NSFG data files.

    Reads the respondent and pregnancy files using their default file
    names, asserts a set of expected row counts and value frequencies,
    and prints a confirmation once every check has passed.
    """
    def tally(frame, column, value):
        # number of rows of frame[column] that hold `value`
        return frame[column].value_counts()[value]

    # respondent file
    resp = ReadFemResp()

    assert len(resp) == 7643
    assert tally(resp, 'pregnum', 1) == 1267

    # pregnancy file
    preg = ReadFemPreg()
    print(preg.shape)

    assert len(preg) == 13593
    assert preg.caseid[13592] == 12571
    assert tally(preg, 'pregordr', 1) == 5033
    assert tally(preg, 'nbrnaliv', 1) == 8981
    assert tally(preg, 'babysex', 1) == 4641
    assert tally(preg, 'birthwgt_lb', 7) == 3049
    assert tally(preg, 'birthwgt_oz', 0) == 1037
    assert tally(preg, 'prglngth', 39) == 4744
    assert tally(preg, 'outcome', 1) == 9148
    assert tally(preg, 'birthord', 1) == 4413
    assert tally(preg, 'agepreg', 22.75) == 100
    assert tally(preg, 'totalwgt_lb', 7.5) == 302

    # the largest finalwgt value is expected to occur six times
    weights = preg.finalwgt.value_counts()
    key = max(weights.keys())
    assert weights[key] == 6

    # the pregnum column in `resp` must match the number of
    # entries in `preg` for every respondent
    assert ValidatePregnum(resp, preg)

    print('All tests passed.')
162
163
164
# run the checks in main() only when this file is executed as a script,
# not when it is imported as a module
if __name__ == '__main__':
    main()
166
167