| Download
Think Stats by Allen B. Downey. Think Stats is an introduction to probability and statistics for Python programmers.
This is the accompanying code for this book.
Project: Support and Testing
Views: 7119 | License: GPL3
"""This file contains code used in "Think Stats",
by Allen B. Downey, available from greenteapress.com

Copyright 2014 Allen B. Downey
License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html
"""

from __future__ import print_function

import numpy as np

import thinkstats2


def MakeFrames():
    """Reads pregnancy data and partitions first babies and others.

    The data is read with the default file names of ReadFemPreg; the
    asserted row counts below are sanity checks for that data set.

    returns: DataFrames (all live births, first babies, others)
    """
    preg = ReadFemPreg()

    # outcome == 1 selects live births; birthord == 1 selects first babies
    live = preg[preg.outcome == 1]
    firsts = live[live.birthord == 1]
    others = live[live.birthord != 1]

    assert len(live) == 14292
    assert len(firsts) == 6683
    assert len(others) == 7609

    return live, firsts, others


def ReadFemPreg(dct_file='2006_2010_FemPregSetup.dct',
                dat_file='2006_2010_FemPreg.dat.gz'):
    """Reads the NSFG 2006-2010 pregnancy data.

    The Stata dictionary describes the fixed-width layout of the data
    file; the resulting frame is recoded by CleanFemPreg before it is
    returned.

    dct_file: string file name
    dat_file: string file name

    returns: DataFrame
    """
    dct = thinkstats2.ReadStataDct(dct_file, encoding='iso-8859-1')
    df = dct.ReadFixedWidth(dat_file, compression='gzip')
    CleanFemPreg(df)
    return df


def CleanFemPreg(df):
    """Recodes variables from the pregnancy frame.

    Modifies df in place: rescales agepreg, replaces bogus and
    missing-value codes in the birth weight columns with NaN, adds the
    totalwgt_lb column, and sets phase to NaN.

    df: DataFrame

    returns: None
    """
    # mother's age is encoded in centiyears; convert to years
    df['agepreg'] = df['agepreg'] / 100.0

    # birthwgt_lb1 contains at least one bogus value (51 lbs)
    # replace with NaN
    df.loc[df.birthwgt_lb1 > 20, 'birthwgt_lb1'] = np.nan

    # replace 'not ascertained', 'refused', 'don't know' with NaN
    # NOTE: assign the result back to the column instead of calling
    # replace(inplace=True) on df.<column>; the in-place call acts on an
    # intermediate Series and does not update df under pandas
    # Copy-on-Write
    na_vals = [97, 98, 99]
    df['birthwgt_lb1'] = df['birthwgt_lb1'].replace(na_vals, np.nan)
    df['birthwgt_oz1'] = df['birthwgt_oz1'].replace(na_vals, np.nan)

    # birthweight is stored in two columns, lbs and oz.
    # convert to a single column in lb
    # NOTE: creating a new column requires dictionary syntax,
    # not attribute assignment (like df.totalwgt_lb)
    df['totalwgt_lb'] = df['birthwgt_lb1'] + df['birthwgt_oz1'] / 16.0

    # due to a bug in ReadStataDct, the last variable gets clipped;
    # so for now set it to NaN (dictionary syntax, so the column is
    # written rather than a plain attribute on the frame)
    df['phase'] = np.nan


def main():
    """Reads the data and runs the sanity checks in MakeFrames."""
    MakeFrames()


if __name__ == '__main__':
    main()