"""This file contains code used in "Think Stats",
by Allen B. Downey, available from greenteapress.com
Copyright 2014 Allen B. Downey
License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html
"""
from __future__ import print_function
import numpy as np
import hinc
import thinkplot
import thinkstats2
def InterpolateSample(df, log_upper=6.0, log_lower=3.0):
    """Makes a sample of log10 household income.

    Assumes that log10 income is uniform in each range.

    As a side effect, adds (or overwrites) the columns `log_upper` and
    `log_lower` in df.

    df: DataFrame with columns income and freq, one row per income range,
        ordered by increasing income
    log_upper: log10 of the assumed upper bound for the highest range
    log_lower: log10 of the assumed lower bound for the lowest range

    returns: NumPy array of log10 household income
    """
    # Work on plain arrays so the boundary fix-ups below are positional
    # and do not rely on chained assignment into the DataFrame (which is
    # unreliable and is a no-op under pandas copy-on-write).
    uppers = np.log10(np.asarray(df['income'], dtype=float))

    if len(uppers) == 0:
        # Nothing to interpolate; still add the columns for consistency.
        df['log_upper'] = uppers
        df['log_lower'] = uppers.copy()
        return np.array([])

    # Each range starts where the previous one ends; the first range
    # starts at the assumed lower bound.
    lowers = np.empty_like(uppers)
    lowers[0] = log_lower
    lowers[1:] = uppers[:-1]

    # The top range is open-ended in the data, so cap it at the assumed
    # upper bound.  Use the last row rather than a hard-coded label.
    uppers[-1] = log_upper

    df['log_upper'] = uppers
    df['log_lower'] = lowers

    # np.linspace requires an integer count, so convert freq explicitly.
    arrays = [np.linspace(low, high, int(freq))
              for low, high, freq in zip(lowers, uppers, df['freq'])]

    return np.concatenate(arrays)
def main():
    """Reads the income table and plots the CDF of the interpolated sample.

    Loads the data with hinc.ReadData, builds a log10 income sample with
    InterpolateSample (upper bound 10**6), and shows its CDF.
    """
    income_table = hinc.ReadData()
    cdf = thinkstats2.Cdf(InterpolateSample(income_table, log_upper=6.0))

    # NOTE(review): the plotted values are log10 of income, while the
    # x label says 'household income' -- confirm the label is intended.
    thinkplot.Cdf(cdf)
    thinkplot.Show(xlabel='household income', ylabel='CDF')


# Run the analysis only when executed as a script, not on import.
if __name__ == "__main__":
    main()