Contact
CoCalc Logo Icon
Store | Features | Docs | Share | Support | News | About | Sign Up | Sign In
| Download

Think Stats, by Allen B. Downey. Think Stats is an introduction to probability and statistics for Python programmers.

This is the accompanying code for this book.

Website: http://greenteapress.com/wp/think-stats-2e/

Views: 7119
License: GPL3
1
"""This file contains code used in "Think Stats",
2
by Allen B. Downey, available from greenteapress.com
3
4
Copyright 2014 Allen B. Downey
5
License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html
6
"""
7
8
from __future__ import print_function
9
10
import numpy as np
11
12
import hinc
13
import thinkplot
14
import thinkstats2
15
16
17
def InterpolateSample(df, log_upper=6.0):
    """Makes a sample of log10 household income.

    Assumes that log10 income is uniform in each range.

    Side effect: adds (or overwrites) the columns log_upper and
    log_lower on df.

    df: DataFrame with columns income and freq; income is the upper
        bound of each range and freq is the number of values to
        generate in that range
    log_upper: log10 of the assumed upper bound for the highest range

    returns: NumPy array of log10 household income (empty if df has
             no rows)
    """
    # nothing to interpolate; np.concatenate would fail on an empty list
    if len(df) == 0:
        return np.array([], dtype=float)

    # compute the log10 of the upper bound for each range
    uppers = np.log10(df['income'].to_numpy(dtype=float))

    # get the lower bounds by shifting the upper bounds by one range;
    # the first range has no predecessor, so it gets a lower bound of
    # 3.0 (in log10 units)
    lowers = np.empty_like(uppers)
    lowers[0] = 3.0
    lowers[1:] = uppers[:-1]

    # plug in a value for the unknown upper bound of the highest range;
    # done by position so it works for any number of rows and any index
    uppers[-1] = log_upper

    # store the bounds on df with whole-column assignment, which avoids
    # the chained indexing (df.col[i] = x) that pandas may silently ignore
    df['log_upper'] = uppers
    df['log_lower'] = lowers

    # use the freq column to generate the right number of values in
    # each range; np.linspace needs an integer count, so cast explicitly
    arrays = [np.linspace(lower, upper, int(freq))
              for lower, upper, freq in zip(lowers, uppers, df['freq'])]

    # collect the arrays into a single sample
    return np.concatenate(arrays)
48
49
50
def main():
    """Builds the log10 income sample and shows its CDF.

    Reads the income table with hinc.ReadData, interpolates a sample
    with InterpolateSample, and passes the resulting Cdf to thinkplot.
    """
    df = hinc.ReadData()
    log_sample = InterpolateSample(df, log_upper=6.0)

    log_cdf = thinkstats2.Cdf(log_sample)
    thinkplot.Cdf(log_cdf)
    # the sample is in log10 units, so label the axis accordingly
    thinkplot.Show(xlabel='log10 household income',
                   ylabel='CDF')
58
59
60
# run the demo only when executed as a script, not when imported
if __name__ == "__main__":
    main()
62
63