Contact
CoCalc Logo Icon
Store Features Docs Share Support News About Sign Up Sign In
| Download

Think Stats by Allen B. Downey. Think Stats is an introduction to Probability and Statistics for Python programmers.

This is the accompanying code for this book.

Website: http://greenteapress.com/wp/think-stats-2e/

Views: 7119
License: GPL3
1
"""This file contains code for use with "Think Stats",
2
by Allen B. Downey, available from greenteapress.com
3
4
Copyright 2010 Allen B. Downey
5
License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html
6
"""
7
8
from __future__ import print_function
9
10
import sys
11
import numpy as np
12
import math
13
14
import brfss
15
import thinkplot
16
import thinkstats2
17
18
19
def GetHeightWeight(df, hjitter=0.0, wjitter=0.0):
    """Get sequences of height and weight.

    Jitter is applied with thinkstats2.Jitter only when the corresponding
    magnitude is truthy; with the default of 0.0 the column is returned
    unchanged.

    df: DataFrame with htm3 and wtkg2
    hjitter: float magnitude of random noise added to heights
    wjitter: float magnitude of random noise added to weights

    returns: tuple of sequences (heights, weights)
    """
    heights = df.htm3
    if hjitter:
        heights = thinkstats2.Jitter(heights, hjitter)

    weights = df.wtkg2
    if wjitter:
        weights = thinkstats2.Jitter(weights, wjitter)

    return heights, weights
37
38
39
def ScatterPlot(heights, weights, alpha=1.0):
    """Make a scatter plot of weight versus height.

    Draws the points with thinkplot.Scatter, then labels the axes, sets
    the axis limits and turns the legend off with thinkplot.Config.
    The figure is not saved here; the caller is expected to do that
    (for example with thinkplot.Save).

    heights: sequence of float
    weights: sequence of float
    alpha: float passed through to thinkplot.Scatter
    """
    thinkplot.Scatter(heights, weights, alpha=alpha)
    # axis is presumably [xmin, xmax, ymin, ymax] -- confirm against
    # thinkplot.Config
    thinkplot.Config(xlabel='height (cm)',
                     ylabel='weight (kg)',
                     axis=[140, 210, 20, 200],
                     legend=False)
51
52
53
def HexBin(heights, weights, bins=None):
    """Make a hexbin plot of weight versus height.

    Draws the plot with thinkplot.HexBin, then labels the axes, sets the
    axis limits and turns the legend off with thinkplot.Config.
    The figure is not saved here; the caller is expected to do that
    (for example with thinkplot.Save).

    heights: sequence of float
    weights: sequence of float
    bins: 'log' or None for linear
    """
    thinkplot.HexBin(heights, weights, bins=bins)
    # axis is presumably [xmin, xmax, ymin, ymax] -- confirm against
    # thinkplot.Config
    thinkplot.Config(xlabel='height (cm)',
                     ylabel='weight (kg)',
                     axis=[140, 210, 20, 200],
                     legend=False)
65
66
67
def MakeFigures(df):
    """Make scatter plots of weight versus height and save them.

    Two two-panel figures are saved with thinkplot.Save:

    scatter1: plain scatter plot next to a jittered scatter plot
    scatter2: jittered scatter plot with transparency next to a hexbin plot

    The scatter plots use a 5000-row sample of df; the hexbin plot uses
    every row of df.

    df: DataFrame with htm3 and wtkg2
    """
    sample = thinkstats2.SampleRows(df, 5000)

    # simple scatter plot
    thinkplot.PrePlot(cols=2)
    heights, weights = GetHeightWeight(sample)
    ScatterPlot(heights, weights)

    # scatter plot with jitter
    thinkplot.SubPlot(2)
    heights, weights = GetHeightWeight(sample, hjitter=1.3, wjitter=0.5)
    ScatterPlot(heights, weights)

    thinkplot.Save(root='scatter1')

    # with jitter and transparency; reuses the jittered sample from above
    thinkplot.PrePlot(cols=2)
    ScatterPlot(heights, weights, alpha=0.1)

    # hexbin plot of the full (jittered) dataset rather than the sample
    thinkplot.SubPlot(2)
    heights, weights = GetHeightWeight(df, hjitter=1.3, wjitter=0.5)
    HexBin(heights, weights)
    thinkplot.Save(root='scatter2')
93
94
95
def BinnedPercentiles(df):
    """Bin the data by height and plot percentiles of weight for each bin.

    Prints the fraction of heights between 140 and 200 cm, then plots the
    75th, 50th and 25th percentiles of weight against the mean height of
    each bin and saves the figure as 'scatter3'.

    df: DataFrame with htm3 and wtkg2
    """
    height_cdf = thinkstats2.Cdf(df.htm3)
    print('Fraction between 140 and 200 cm',
          height_cdf[200] - height_cdf[140])

    # 5 cm wide height bins with edges at 135, 140, ..., 205
    bins = np.arange(135, 210, 5)
    indices = np.digitize(df.htm3, bins)

    # Walk the groups once and drop the first and last of them
    # (presumably the sparse tails below and above the binned range --
    # confirm against the data).
    groups = [group for _, group in df.groupby(indices)][1:-1]

    heights = [group.htm3.mean() for group in groups]
    weight_cdfs = [thinkstats2.Cdf(group.wtkg2) for group in groups]

    thinkplot.PrePlot(3)
    for percent in [75, 50, 25]:
        weights = [weight_cdf.Percentile(percent)
                   for weight_cdf in weight_cdfs]
        label = '%dth' % percent
        thinkplot.Plot(heights, weights, label=label)

    thinkplot.Save(root='scatter3',
                   xlabel='height (cm)',
                   ylabel='weight (kg)')
119
120
121
def Correlations(df):
    """Print covariance and correlation between height and weight.

    Prints the pandas result next to the thinkstats2 result for
    covariance, Pearson correlation and Spearman correlation, followed by
    the thinkstats2 Spearman and Pearson correlations of height with
    log(weight).

    df: DataFrame with htm3 and wtkg2
    """
    heights = df.htm3
    weights = df.wtkg2
    log_weights = np.log(weights)

    print('pandas cov', heights.cov(weights))
    print('thinkstats2 Cov', thinkstats2.Cov(heights, weights))
    print()

    print('pandas corr', heights.corr(weights))
    print('thinkstats2 Corr', thinkstats2.Corr(heights, weights))
    print()

    print('pandas corr spearman', heights.corr(weights, method='spearman'))
    print('thinkstats2 SpearmanCorr',
          thinkstats2.SpearmanCorr(heights, weights))
    # the log is taken of wtkg2, so the label names that column
    print('thinkstats2 SpearmanCorr log wtkg2',
          thinkstats2.SpearmanCorr(heights, log_weights))
    print()

    print('thinkstats2 Corr log wtkg2',
          thinkstats2.Corr(heights, log_weights))
    print()
142
143
144
def main(script):
    """Print the correlations, then make and save all the figures.

    Seeds the random number generator, reads the BRFSS data, drops rows
    with missing height or weight, and runs Correlations, MakeFigures and
    BinnedPercentiles on the result.

    script: string script name (not used)
    """
    thinkstats2.RandomSeed(17)

    df = brfss.ReadBrfss(nrows=None)
    df = df.dropna(subset=['htm3', 'wtkg2'])

    Correlations(df)

    # A stray early return used to sit here, which left the two calls
    # below unreachable.
    MakeFigures(df)
    BinnedPercentiles(df)
154
155
156
# Run the analysis only when executed as a script; the command-line
# arguments (starting with the script name) are passed to main.
if __name__ == '__main__':
    main(*sys.argv)
158
159