Contact
CoCalc Logo Icon
Store | Features | Docs | Share | Support | News | About | Sign Up | Sign In
| Download

Think Stats, by Allen B. Downey. Think Stats is an introduction to Probability and Statistics for Python programmers.

This is the accompanying code for this book.

Website: http://greenteapress.com/wp/think-stats-2e/

Views: 7118
License: GPL3
1
"""This file contains code used in "Think Stats",
by Allen B. Downey, available from greenteapress.com

Copyright 2014 Allen B. Downey
License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html
"""
7
8
from __future__ import print_function, division
9
10
import numpy as np
11
import pandas
12
13
import thinkplot
14
import thinkstats2
15
16
17
def Clean(s):
    """Converts a dollar-amount token to a number.

    s: string token, e.g. '$5,000', 'Under' or 'over'

    returns: int when the token parses as a dollar amount,
             0 for 'Under', np.inf for 'over',
             None for any other token
    """
    try:
        # drop the leading dollar sign and the thousands separators
        return int(s.lstrip('$').replace(',', ''))
    except ValueError:
        # not a number: only the two open-ended range markers are recognized
        if s == 'Under':
            return 0
        elif s == 'over':
            return np.inf
        return None
27
28
29
def ReadData(filename='hinc06.csv'):
    """Reads filename and returns populations in thousands

    The first nine lines of the file are skipped; column 0 is read as the
    income-range label and column 1 as the frequency (a string that may
    contain thousands separators).

    filename: string

    returns: pandas DataFrame with columns
             'income' (upper bound of each range),
             'freq' (frequency of each range),
             'cumsum' (running total of freq),
             'ps' (cumsum divided by the grand total)
    """
    data = pandas.read_csv(filename, header=None, skiprows=9)
    cols = data[[0, 1]]

    res = []
    for _, row in cols.iterrows():
        label, freq = row.values
        freq = int(freq.replace(',', ''))

        # the last token of the label is the upper bound of the range
        high = Clean(label.split()[-1])

        res.append((high, freq))

    df = pandas.DataFrame(res)
    # correct the first range
    # NOTE(review): presumably the first label is an exclusive bound
    # ("Under $X"), so it is lowered by one to match the other rows -- confirm
    df.loc[0, 0] -= 1
    # compute the cumulative sum of the freqs
    df[2] = df[1].cumsum()
    # normalize the cumulative freqs; the grand total is the last cumulative
    # value, whatever the number of rows in the file
    total = df[2].iloc[-1]
    df[3] = df[2] / total
    # add column names
    df.columns = ['income', 'freq', 'cumsum', 'ps']
    return df
60
61
62
def main():
    """Reads the income data from the default file and prints the table."""
    df = ReadData()
    print(df)
65
66
67
# run main() only when the file is executed as a script, not when imported
if __name__ == "__main__":
    main()
69
70