Contact
CoCalc Logo Icon
Store | Features | Docs | Share | Support | News | About | Sign Up | Sign In
| Download

Think Stats, by Allen B. Downey. Think Stats is an introduction to probability and statistics for Python programmers.

This is the accompanying code for this book.

Website: http://greenteapress.com/wp/think-stats-2e/

Views: 7119
License: GPL3
1
"""This file contains code used in "Think Stats",
2
by Allen B. Downey, available from greenteapress.com
3
4
Copyright 2014 Allen B. Downey
5
License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html
6
"""
7
8
from __future__ import print_function
9
10
import numpy as np
11
12
import thinkstats2
13
14
def MakeFrames():
    """Load the pregnancy records and split the live births by birth order.

    The record counts asserted below act as sanity checks on the data
    returned by ReadFemPreg with its default file names.

    returns: tuple of DataFrames (all live births, first babies, others)
    """
    pregnancies = ReadFemPreg()

    # rows with outcome code 1 are the ones kept as live births
    live_births = pregnancies.loc[pregnancies['outcome'] == 1]

    # birth order 1 identifies first babies; every other live birth
    # (including rows where birthord is missing) lands in the second group
    first_babies = live_births.loc[live_births['birthord'] == 1]
    other_babies = live_births.loc[live_births['birthord'] != 1]

    assert len(live_births) == 14292
    assert len(first_babies) == 6683
    assert len(other_babies) == 7609

    return live_births, first_babies, other_babies
30
31
32
def ReadFemPreg(dct_file='2006_2010_FemPregSetup.dct',
                dat_file='2006_2010_FemPreg.dat.gz'):
    """Load the NSFG 2006-2010 pregnancy file into a cleaned DataFrame.

    dct_file: string file name of the Stata dictionary
    dat_file: string file name of the gzip-compressed fixed-width data

    returns: DataFrame, after CleanFemPreg has recoded it in place
    """
    stata_dict = thinkstats2.ReadStataDct(dct_file, encoding='iso-8859-1')
    pregnancies = stata_dict.ReadFixedWidth(dat_file, compression='gzip')

    # CleanFemPreg modifies the frame it is given and returns nothing
    CleanFemPreg(pregnancies)
    return pregnancies
45
46
47
def CleanFemPreg(df):
    """Recodes variables from the pregnancy frame, modifying it in place.

    Every column is written back with df[...] = ... rather than through a
    chained in-place call such as df.col.replace(..., inplace=True); the
    chained form operates on a temporary Series and does not update the
    frame when pandas Copy-on-Write is active.

    df: DataFrame with columns agepreg, birthwgt_lb1 and birthwgt_oz1;
        gains a totalwgt_lb column and has phase set to NaN

    returns: None
    """
    # mother's age is encoded in centiyears; convert to years
    df['agepreg'] = df['agepreg'] / 100.0

    # replace 'not ascertained', 'refused', 'don't know' with NaN
    na_vals = [97, 98, 99]

    # birthwgt_lb1 contains at least one bogus value (51 lbs);
    # replace anything over 20 lbs with NaN.  mask() upcasts an integer
    # column to float instead of writing NaN into it element-wise.
    pounds = df['birthwgt_lb1']
    df['birthwgt_lb1'] = pounds.mask(pounds > 20).replace(na_vals, np.nan)
    df['birthwgt_oz1'] = df['birthwgt_oz1'].replace(na_vals, np.nan)

    # birthweight is stored in two columns, lbs and oz.
    # convert to a single column in lb
    # NOTE: creating a new column requires dictionary syntax,
    # not attribute assignment (like df.totalwgt_lb)
    df['totalwgt_lb'] = df['birthwgt_lb1'] + df['birthwgt_oz1'] / 16.0

    # due to a bug in ReadStataDct, the last variable gets clipped;
    # so for now set it to NaN
    df['phase'] = np.nan
73
74
75
def main():
    """Build the three birth DataFrames; the asserts in MakeFrames do the checking."""
    frames = MakeFrames()

    # unpacking confirms that exactly three frames came back
    live, firsts, others = frames
77
78
79
# Call main() only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
81
82
83
84