Contact
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutSign UpSign In
| Download

📚 The CoCalc Library - books, templates and other resources

Views: 96141
License: OTHER
1
"""This file contains code for use with "Think Stats",
2
by Allen B. Downey, available from greenteapress.com
3
4
Copyright 2010 Allen B. Downey
5
License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html
6
"""
7
8
"""
9
10
Results: on a log-log scale the tail of the CCDF is a straight line,
11
which suggests that the Pareto distribution is a good model for this data,
12
at least for people with taxable income above the median.
13
14
"""
15
16
import csv
17
import sys
18
19
import myplot
20
import Pmf
21
import Cdf
22
23
24
def ReadIncomeFile(filename='08in11si.csv'):
25
"""Reads a data file from the IRS and returns the first two columns.
26
27
Skips the header and returns only the first table (non-cumulative).
28
29
Args:
30
filename: string data file
31
32
Returns:
33
list of string pairs
34
"""
35
reader = csv.reader(open(filename))
36
for line in reader:
37
if line[0] == 'All returns':
38
break
39
40
t = []
41
for line in reader:
42
if line[0].startswith('Accumulated'):
43
break
44
t.append(line[0:2])
45
46
return t
47
48
49
def MakeIncomeDist(data):
50
"""Converts the strings from the IRS file to a Hist, Pmf and Cdf.
51
52
Args:
53
data: list of (dollar range, number of returns) string pairs
54
55
Returns:
56
tuple of (Hist, Pmf, Cdf) representing the number of returns in each bin
57
"""
58
def clean(s):
59
"""Converts dollar amounts to integers."""
60
try:
61
return int(s.lstrip('$'))
62
except ValueError:
63
if s in ['No', 'income']:
64
return 0
65
if s == 'more':
66
return -1
67
return None
68
69
def midpoint(low, high):
70
"""Finds the midpoint of a range."""
71
if high == -1:
72
return low * 3 / 2
73
else:
74
return (low + high) / 2
75
76
hist = Pmf.Hist()
77
78
for column, number in data:
79
# convert the number of returns
80
number = number.replace(',', '')
81
number = int(number)
82
83
# convert the income range
84
column = column.replace(',', '')
85
t = column.split()
86
low, high = t[0], t[-1]
87
low, high = clean(low), clean(high)
88
89
# add to the histogram
90
x = midpoint(low, high)
91
hist.Incr(x, number)
92
print x, number
93
94
pmf = Pmf.MakePmfFromHist(hist)
95
cdf = Cdf.MakeCdfFromDict(pmf.GetDict())
96
return hist, pmf, cdf
97
98
99
def main(script, *args):
100
data = ReadIncomeFile()
101
hist, pmf, cdf = MakeIncomeDist(data)
102
103
# plot the CDF on a log-x scale
104
myplot.Clf()
105
myplot.Cdf(cdf)
106
myplot.Save(root='income_logx',
107
xscale='log',
108
xlabel='income',
109
ylabel='CDF')
110
111
# plot the complementary CDF on a log-log scale
112
myplot.Clf()
113
myplot.Cdf(cdf, complement=True)
114
myplot.Save(root='income_loglog',
115
complement=True,
116
xscale='log',
117
yscale='log',
118
xlabel='income',
119
ylabel='complementary CDF')
120
121
122
if __name__ == "__main__":
123
main(*sys.argv)
124
125