CoCalc -- irs.py

📚 The CoCalc Library - books, templates and other resources
Project: 📚 The Library - Shared Public Version
Path: cocalc-examples / think-stats-2ed / solutions / irs.py
Views: ⁹⁶¹⁴¹
License: OTHER
1
"""This file contains code for use with "Think Stats",
2
by Allen B. Downey, available from greenteapress.com
3

4
Copyright 2010 Allen B. Downey
5
License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html
6
"""
7

8
"""
9

10
Results: on a log-log scale the tail of the CCDF is a straight line,
11
which suggests that the Pareto distribution is a good model for this data,
12
at least for people with taxable income above the median.
13

14
"""
15

16
import csv
17
import sys
18

19
import myplot
20
import Pmf
21
import Cdf
22

23

24
def ReadIncomeFile(filename='08in11si.csv'):
25
    """Reads a data file from the IRS and returns the first two columns.
26

27
    Skips the header and returns only the first table (non-cumulative).
28

29
    Args:
30
      filename: string data file
31

32
    Returns:
33
      list of string pairs
34
    """
35
    reader = csv.reader(open(filename))
36
    for line in reader:
37
        if line[0] == 'All returns':
38
            break
39

40
    t = []
41
    for line in reader:
42
        if line[0].startswith('Accumulated'):
43
            break
44
        t.append(line[0:2])
45

46
    return t
47

48

49
def MakeIncomeDist(data):
50
    """Converts the strings from the IRS file to a Hist, Pmf and Cdf.
51

52
    Args:
53
      data: list of (dollar range, number of returns) string pairs
54

55
    Returns:
56
      tuple of (Hist, Pmf, Cdf) representing the number of returns in each bin
57
    """
58
    def clean(s):
59
        """Converts dollar amounts to integers."""
60
        try:
61
            return int(s.lstrip('$'))
62
        except ValueError:
63
            if s in ['No', 'income']:
64
                return 0
65
            if s == 'more':
66
                return -1
67
            return None
68

69
    def midpoint(low, high):
70
        """Finds the midpoint of a range."""
71
        if high == -1:
72
            return low * 3 / 2
73
        else:
74
            return (low + high) / 2
75

76
    hist = Pmf.Hist()
77

78
    for column, number in data:
79
        # convert the number of returns
80
        number = number.replace(',', '')
81
        number = int(number)
82

83
        # convert the income range
84
        column = column.replace(',', '')
85
        t = column.split()
86
        low, high = t[0], t[-1]
87
        low, high = clean(low), clean(high)
88

89
        # add to the histogram
90
        x = midpoint(low, high)
91
        hist.Incr(x, number)
92
        print x, number
93

94
    pmf = Pmf.MakePmfFromHist(hist)
95
    cdf = Cdf.MakeCdfFromDict(pmf.GetDict())
96
    return hist, pmf, cdf
97

98

99
def main(script, *args):
100
    data = ReadIncomeFile()
101
    hist, pmf, cdf = MakeIncomeDist(data)
102

103
    # plot the CDF on a log-x scale
104
    myplot.Clf()
105
    myplot.Cdf(cdf)
106
    myplot.Save(root='income_logx',
107
                xscale='log',
108
                xlabel='income',
109
                ylabel='CDF')
110

111
    # plot the complementary CDF on a log-log scale
112
    myplot.Clf()
113
    myplot.Cdf(cdf, complement=True)
114
    myplot.Save(root='income_loglog',
115
                complement=True,
116
                xscale='log',
117
                yscale='log',
118
                xlabel='income',
119
                ylabel='complementary CDF')
120

121

122
if __name__ == "__main__":
123
    main(*sys.argv)
124

125
Product

Resources

Company