Contact
CoCalc Logo Icon
StoreFeaturesDocsShareSupport News AboutSign UpSign In
| Download

Think Stats by Allen B. Downey. Think Stats is an introduction to Probability and Statistics for Python programmers.

This is the accompanying code for this book.

Website: http://greenteapress.com/wp/think-stats-2e/

Views: 7115
License: GPL3
1
"""This file contains code for use with "Think Stats",
2
by Allen B. Downey, available from greenteapress.com
3
4
Copyright 2014 Allen B. Downey
5
License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html
6
"""
7
8
from __future__ import print_function
9
10
import pandas
11
import numpy as np
12
13
import thinkplot
14
import thinkstats2
15
import survival
16
17
18
def CleanData(resp):
    """Cleans respondent data in place.

    Replaces the values 9998 and 9999 in `cmdivorcx` with NaN, then adds
    the columns `notdivorced`, `duration`, `durationsofar` and `decade`.

    resp: DataFrame with columns cmdivorcx, cmmarrhx, cmintvw and cmbirth
    """
    # Assign the result back rather than calling replace(inplace=True) on
    # the attribute-accessed column: the chained in-place form operates on
    # a temporary under pandas Copy-on-Write and leaves `resp` unchanged.
    resp['cmdivorcx'] = resp['cmdivorcx'].replace([9998, 9999], np.nan)

    # 1 where no divorce date is recorded, 0 otherwise
    resp['notdivorced'] = resp.cmdivorcx.isnull().astype(int)
    # month differences divided by 12 give durations in years
    resp['duration'] = (resp.cmdivorcx - resp.cmmarrhx) / 12.0
    resp['durationsofar'] = (resp.cmintvw - resp.cmmarrhx) / 12.0

    # cmbirth is used as a month offset from mid-December 1899
    month0 = pandas.to_datetime('1899-12-15')
    dates = [month0 + pandas.DateOffset(months=cm)
             for cm in resp.cmbirth]
    # decade of birth counted from 1900 (e.g. 1969 -> 6)
    resp['decade'] = (pandas.DatetimeIndex(dates).year - 1900) // 10
33
34
35
def ResampleDivorceCurve(resps):
    """Plots divorce curves based on resampled data.

    Runs 41 resampling iterations. Each one resamples every DataFrame in
    `resps` with thinkstats2.ResampleRowsWeighted, concatenates the
    results, groups them by `decade` and plots one survival curve per
    group. The figure is then displayed with thinkplot.Show.

    resps: list of respondent DataFrames
    """
    for _ in range(41):
        samples = [thinkstats2.ResampleRowsWeighted(resp)
                   for resp in resps]
        sample = pandas.concat(samples, ignore_index=True)

        # Plot through EstimateSurvivalByDecade, the by-decade plotting
        # helper this module defines; no PlotDivorceCurveByDecade exists
        # here, so calling that name raised NameError.
        groups = sample.groupby('decade')
        EstimateSurvivalByDecade(groups, color='#225EA8', alpha=0.1)

    thinkplot.Show(xlabel='years',
                   axis=[0, 28, 0, 1])
48
49
50
def ResampleDivorceCurveByDecade(resps):
    """Draws resampled divorce curves per birth cohort and saves the figure.

    Performs 41 resampling passes over the respondent data; every pass
    plots one curve per `decade` group.

    resps: list of respondent DataFrames
    """
    save_options = dict(root='survival7',
                        xlabel='years',
                        axis=[0, 28, 0, 1])

    for iteration in range(41):
        resampled = pandas.concat(
            [thinkstats2.ResampleRowsWeighted(df) for df in resps],
            ignore_index=True)
        by_decade = resampled.groupby('decade')

        # labels are added once, on the first pass only
        if iteration == 0:
            survival.AddLabelsByDecade(by_decade, alpha=0.7)

        EstimateSurvivalByDecade(by_decade, alpha=0.1)

    thinkplot.Save(**save_options)
68
69
70
def EstimateSurvivalByDecade(groups, **options):
    """Plots one survival curve for each group of respondents.

    Prints each group's key and size as it goes.

    groups: GroupBy object
    options: keyword arguments forwarded to thinkplot.Plot
    """
    thinkplot.PrePlot(len(groups))
    for decade, cohort in groups:
        print(decade, len(cohort))
        # EstimateSurvival returns (hazard, survival); only the latter is drawn
        survival_curve = EstimateSurvival(cohort)[1]
        thinkplot.Plot(survival_curve, **options)
80
81
82
def EstimateSurvival(resp):
    """Computes the hazard and survival functions for a set of respondents.

    Rows with notdivorced == 0 contribute their `duration` as complete
    observations; rows with notdivorced == 1 contribute `durationsofar`
    as ongoing observations.

    resp: DataFrame of respondents

    returns: pair of HazardFunction, SurvivalFunction
    """
    divorced_rows = resp[resp.notdivorced == 0]
    still_married_rows = resp[resp.notdivorced == 1]

    hazard = survival.EstimateHazardFunction(
        divorced_rows.duration,
        still_married_rows.durationsofar)

    return hazard, hazard.MakeSurvival()
96
97
98
def main():
    """Reads both respondent files, cleans them and plots divorce curves.

    Only respondents with evrmarry == 1 are passed on to
    ResampleDivorceCurveByDecade.
    """
    readers = (survival.ReadFemResp2002, survival.ReadFemResp2010)

    married = []
    for read_resp in readers:
        resp = read_resp()
        CleanData(resp)
        married.append(resp[resp.evrmarry == 1])

    ResampleDivorceCurveByDecade(married)
108
109
110
# Run the analysis only when this file is executed as a script.
if __name__ == '__main__':
    main()
112
113