Contact
CoCalc Logo Icon
StoreFeaturesDocsShareSupport News AboutSign UpSign In
| Download

Think Stats by Allen B. Downey Think Stats is an introduction to Probability and Statistics for Python programmers.

This is the accompanying code for this book.

Website: http://greenteapress.com/wp/think-stats-2e/

Views: 7115
License: GPL3
1
"""This file contains code for use with "Think Stats",
2
by Allen B. Downey, available from greenteapress.com
3
4
Copyright 2014 Allen B. Downey
5
License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html
6
"""
7
8
from __future__ import print_function
9
10
import sys
11
import numpy as np
12
import math
13
14
import first
15
import thinkplot
16
import thinkstats2
17
18
19
"""This file contains a solution to an exercise in Think Stats:
20
21
Using data from the NSFG, make a scatter plot of birth weight
22
versus mother's age. Plot percentiles of birth weight
23
versus mother's age. Compute Pearson's and Spearman's correlations.
24
How would you characterize the relationship
25
between these variables?
26
27
My conclusions:
28
29
1) The scatterplot shows a weak relationship between the variables.
30
31
2) The correlations support this. Pearson's is around 0.07, Spearman's
32
is around 0.09. The difference between them suggests some influence
33
of outliers or a non-linear relationship.
34
35
3) Plotting percentiles of weight versus age suggests that the
36
relationship is non-linear. Birth weight increases more quickly
37
in the range of mother's age from 15 to 25. After that, the effect
38
is weaker.
39
40
"""
41
42
def ScatterPlot(ages, weights, alpha=1.0):
    """Draw a scatter plot of weight against age and configure its axes.

    This function only draws and configures the plot; it does not write
    the figure to disk, so the caller is responsible for saving it.

    ages: sequence of float
    weights: sequence of float
    alpha: float, forwarded to thinkplot.Scatter
    """
    thinkplot.Scatter(ages, weights, alpha=alpha)

    # Axis labels and fixed axis ranges: 10 to 45 on x, 0 to 15 on y.
    options = dict(xlabel='age (years)',
                   ylabel='weight (lbs)',
                   xlim=[10, 45],
                   ylim=[0, 15],
                   legend=False)
    thinkplot.Config(**options)
55
56
57
def HexBin(ages, weights, bins=None):
    """Draw a hexbin plot of weight against age and label its axes.

    This function only draws and configures the plot; it does not write
    the figure to disk.

    ages: sequence of float
    weights: sequence of float
    bins: 'log' or None for linear; forwarded to thinkplot.HexBin
    """
    thinkplot.HexBin(ages, weights, bins=bins)

    axis_labels = {'xlabel': 'age (years)', 'ylabel': 'weight (lbs)'}
    thinkplot.Config(legend=False, **axis_labels)
68
69
70
def BinnedPercentiles(df):
    """Plot percentiles of weight within bins of age and save the figure.

    Rows are grouped by which age bin `agepreg` falls into; for each
    group the mean age and the Cdf of `totalwgt_lb` are computed, and the
    75th, 50th and 25th percentiles are plotted against the mean ages.
    The figure is saved under the root name 'chap07scatter3' as a jpg.

    df: DataFrame with columns agepreg and totalwgt_lb
    """
    # Bin edges at 10, 13, ..., 46.
    edges = np.arange(10, 48, 3)
    bin_index = np.digitize(df.agepreg, edges)

    # One pass over the groups, collecting the x value (mean age) and the
    # weight distribution for each bin.
    mean_ages = []
    weight_cdfs = []
    for _, members in df.groupby(bin_index):
        mean_ages.append(members.agepreg.mean())
        weight_cdfs.append(thinkstats2.Cdf(members.totalwgt_lb))

    # The first and last groups (by position) are left out of the plot.
    mean_ages = mean_ages[1:-1]
    weight_cdfs = weight_cdfs[1:-1]

    thinkplot.PrePlot(3)
    for percent in [75, 50, 25]:
        ys = [cdf.Percentile(percent) for cdf in weight_cdfs]
        thinkplot.Plot(mean_ages, ys, label='%dth' % percent)

    save_options = dict(root='chap07scatter3',
                        formats=['jpg'],
                        xlabel="mother's age (years)",
                        ylabel='birth weight (lbs)')
    thinkplot.Save(**save_options)
92
93
94
95
def main(script):
    """Run the exercise: plot binned percentiles, print correlations,
    and save a scatter plot of weight against age.

    script: string, first command-line argument (not used in the body)
    """
    thinkstats2.RandomSeed(17)

    # Only the first frame is used; the other two are unpacked and ignored.
    live, _firsts, _others = first.MakeFrames()

    # Keep only rows where both age and weight are present.
    live = live.dropna(subset=['agepreg', 'totalwgt_lb'])
    BinnedPercentiles(live)

    ages, weights = live.agepreg, live.totalwgt_lb

    pearson = thinkstats2.Corr(ages, weights)
    print('thinkstats2 Corr', pearson)

    spearman = thinkstats2.SpearmanCorr(ages, weights)
    print('thinkstats2 SpearmanCorr', spearman)

    # ScatterPlot only draws; the figure is written out here.
    ScatterPlot(ages, weights, alpha=0.1)
    thinkplot.Save(root='chap07scatter1', legend=False, formats=['jpg'])
112
113
114
if __name__ == '__main__':
    # Run only when executed as a script; sys.argv is unpacked into
    # main's positional parameters.
    main(*sys.argv)
116
117