CoCalc -- scatter.py

Think Stats, by Allen B. Downey. Think Stats is an introduction to Probability and Statistics for Python programmers.
This is the accompanying code for this book.
Website: http://greenteapress.com/wp/think-stats-2e/
Views: 7119
License: GPL3
1
"""This file contains code for use with "Think Stats",
2
by Allen B. Downey, available from greenteapress.com
3

4
Copyright 2010 Allen B. Downey
5
License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html
6
"""
7

8
from __future__ import print_function
9

10
import sys
11
import numpy as np
12
import math
13

14
import brfss
15
import thinkplot
16
import thinkstats2
17

18

19
def GetHeightWeight(df, hjitter=0.0, wjitter=0.0):
    """Extract the height and weight columns, optionally with jitter.

    df: DataFrame with htm3 and wtkg2
    hjitter: float magnitude of random noise added to heights;
             a falsy value leaves the heights untouched
    wjitter: float magnitude of random noise added to weights;
             a falsy value leaves the weights untouched

    returns: tuple of sequences (heights, weights)
    """
    # Heights are processed before weights so that any random draws made by
    # Jitter always happen in the same order.
    heights = thinkstats2.Jitter(df.htm3, hjitter) if hjitter else df.htm3
    weights = thinkstats2.Jitter(df.wtkg2, wjitter) if wjitter else df.wtkg2
    return heights, weights
37

38

39
def ScatterPlot(heights, weights, alpha=1.0):
    """Make a scatter plot of weights against heights and configure it.

    The plot is drawn and labelled here; this function does not call
    thinkplot.Save itself.

    heights: sequence of float
    weights: sequence of float
    alpha: float, forwarded to thinkplot.Scatter
    """
    thinkplot.Scatter(heights, weights, alpha=alpha)

    # axis labels, fixed axis limits, and no legend
    options = dict(xlabel='height (cm)',
                   ylabel='weight (kg)',
                   axis=[140, 210, 20, 200],
                   legend=False)
    thinkplot.Config(**options)
51

52

53
def HexBin(heights, weights, bins=None):
    """Make a hexbin plot of weights against heights and configure it.

    The plot is drawn and labelled here; this function does not call
    thinkplot.Save itself.

    heights: sequence of float
    weights: sequence of float
    bins: 'log' or None for linear, forwarded to thinkplot.HexBin
    """
    thinkplot.HexBin(heights, weights, bins=bins)

    # axis labels, fixed axis limits, and no legend
    options = dict(xlabel='height (cm)',
                   ylabel='weight (kg)',
                   axis=[140, 210, 20, 200],
                   legend=False)
    thinkplot.Config(**options)
65

66

67
def MakeFigures(df):
    """Make the scatter and hexbin figures.

    Two two-panel figures are saved, with roots 'scatter1' and 'scatter2'.

    df: DataFrame with htm3 and wtkg2
    """
    # the scatter plots work from the rows returned by SampleRows(df, 5000)
    subset = thinkstats2.SampleRows(df, 5000)

    # scatter1, first panel: the sampled values as they are
    thinkplot.PrePlot(cols=2)
    ScatterPlot(*GetHeightWeight(subset))

    # scatter1, second panel: the same rows with jitter added
    thinkplot.SubPlot(2)
    jittered = GetHeightWeight(subset, hjitter=1.3, wjitter=0.5)
    ScatterPlot(*jittered)

    thinkplot.Save(root='scatter1')

    # scatter2, first panel: the jittered values again, now with transparency
    thinkplot.PrePlot(cols=2)
    ScatterPlot(*jittered, alpha=0.1)

    # scatter2, second panel: hexbin built from the full DataFrame, not the
    # sample
    thinkplot.SubPlot(2)
    HexBin(*GetHeightWeight(df, hjitter=1.3, wjitter=0.5))
    thinkplot.Save(root='scatter2')
93

94

95
def BinnedPercentiles(df):
    """Bin the data by height and plot percentiles of weight for each bin.

    Prints the fraction of heights between 140 and 200 cm, then plots the
    75th, 50th and 25th percentile of weight against the mean height of
    each 5 cm bin and saves the figure with root 'scatter3'.

    df: DataFrame with htm3 and wtkg2
    """
    cdf = thinkstats2.Cdf(df.htm3)
    print('Fraction between 140 and 200 cm', cdf[200] - cdf[140])

    bins = np.arange(135, 210, 5)
    indices = np.digitize(df.htm3, bins)

    # np.digitize labels values below bins[0] with 0 and values at or above
    # bins[-1] with len(bins).  Those two open-ended groups are excluded by
    # label rather than by position, so an interior bin is never discarded
    # when the data happen to contain no values on one of the two tails.
    heights = []
    weight_cdfs = []
    for index, group in df.groupby(indices):
        if 0 < index < len(bins):
            heights.append(group.htm3.mean())
            weight_cdfs.append(thinkstats2.Cdf(group.wtkg2))

    thinkplot.PrePlot(3)
    for percent in [75, 50, 25]:
        weights = [weight_cdf.Percentile(percent)
                   for weight_cdf in weight_cdfs]
        label = '%dth' % percent
        thinkplot.Plot(heights, weights, label=label)

    thinkplot.Save(root='scatter3',
                   xlabel='height (cm)',
                   ylabel='weight (kg)')
119

120

121
def Correlations(df):
    """Print covariance and correlation between height and weight.

    Results from pandas and from thinkstats2 are printed next to each
    other: covariance, Pearson correlation, Spearman correlation, and the
    correlations of height with the log of weight.

    df: DataFrame with htm3 and wtkg2
    """
    heights = df.htm3
    weights = df.wtkg2
    # computed once and reused by both log-weight correlations below
    log_weights = np.log(weights)

    print('pandas cov', heights.cov(weights))
    print('thinkstats2 Cov', thinkstats2.Cov(heights, weights))
    print()

    print('pandas corr', heights.corr(weights))
    print('thinkstats2 Corr', thinkstats2.Corr(heights, weights))
    print()

    print('pandas corr spearman', heights.corr(weights, method='spearman'))
    print('thinkstats2 SpearmanCorr',
          thinkstats2.SpearmanCorr(heights, weights))
    # the labels name wtkg2 because that is the column actually used
    print('thinkstats2 SpearmanCorr log wtkg2',
          thinkstats2.SpearmanCorr(heights, log_weights))
    print()

    print('thinkstats2 Corr log wtkg2',
          thinkstats2.Corr(heights, log_weights))
    print()
142

143

144
def main(script):
    """Print the correlations and generate all of the figures.

    script: string, the script name from the command line (not used)
    """
    thinkstats2.RandomSeed(17)

    df = brfss.ReadBrfss(nrows=None)
    # keep only the rows where both htm3 and wtkg2 are present
    df = df.dropna(subset=['htm3', 'wtkg2'])

    # A stray early return used to follow Correlations, which left the two
    # figure-producing calls below unreachable.
    Correlations(df)
    MakeFigures(df)
    BinnedPercentiles(df)
154
    
155

156
if __name__ == "__main__":
    # Executed as a script: pass the command-line words (script name first)
    # straight through to main.
    main(*sys.argv)
158

159