CoCalc Public Files2016-05-27-182935.htmlOpen with one click!
Authors: Tim Clemans SMC, Mark Silverberg
Views : 46
Description: Jupyter html version of 2016-05-27-182935.ipynb
Compute Environment: Ubuntu 18.04 (Deprecated)
2016-05-27-182935

Comparing FCC & CFPB Complaint Data

by Mark Silverberg (@Skram)

Objective

Tinker with Sage Math Cloud-powered Jupyter Python notebooks and Socrata-powered federal open data from two different organizations: Federal Communications Commission (FCC) and the Consumer Financial Protection Bureau (CFPB)

The puzzle pieces

Please improve upon this

Depending on how you are viewing this, you can download the python notebook and expand upon this work! Please feel free to raise feature requests and suggestions on Github at https://github.com/marks/open-complaints-data/


Setup

Dependencies and variables

In [42]:
# for top secret credentials
import json
with open('secrets.json') as data_file:    
    secrets = json.load(data_file)

# for data analysis
import pandas as pd
import urllib

# for plotly plotting
import plotly.plotly as py
py.sign_in(secrets["PLOTLY"]["USER"], secrets["PLOTLY"]["API_KEY"])

# for Census querying
from census import Census
from us import states
census = Census(secrets["CENSUS"]["API_KEY"]) # dummy API account

# for iPython displaying
from IPython.display import display, Image, HTML

# variables
variables = {
    "date_received":{
        "fcc":"ticket_created",
        "cfpb":"date_received"
    },
    "state":{
        "fcc":"state",
        "cfpb":"state"
    },
    "endpoint":{
        "fcc":"https://opendata.fcc.gov/resource/sr6c-syda.json",
        "cfpb":"https://data.consumerfinance.gov/resource/jhzv-w97w.json"
    }
}
provider_keys = variables['endpoint'].keys()

query_templates = { # always make sure there is a LIMIT statement with a {n} variable
    'count_by_state':{
        "template":"SELECT {0} as state, count(*) as complaints GROUP BY {0} LIMIT {1}",
        "variables":["state"]
    },
#     'count_by_month_and_state':{
#         "template":"SELECT {0} as state, date_trunc_ym({1}) as date_received, count(*) GROUP BY {0}, {1} ORDER BY date_received LIMIT {2}",
#         "variables":["state","date_received"]
#     }
}
query_template_keys = query_templates.keys()

a_big_number = 10000000 # for SODA query limits

Helper functions

In [43]:
# function to marry a query template and provider's normalized variables into an API query string
def form_query_url(template,provider):
    base_url = variables['endpoint'][provider]
    query_template = query_templates[template]
    
    # form array of variables for the query template
    format_variables = []
    for variable in query_template['variables']:
        format_variables.append(variables[variable][provider])
    format_variables.append(a_big_number) 

    # put template and format_variables together
    query_str = query_template['template'].format(*format_variables)
    return "{0}?$query={1}".format(base_url, urllib.quote(query_str))

# function to create a plotly-powered US-state choropleth
def us_state_choropleth(df,col_to_color_by,units,title,zmin=None,zmax=None):
    plotly_data = [ dict(
        type='choropleth',
        autocolorscale = True,
        locations = df['state'],
        z = df[col_to_color_by],
        locationmode = 'USA-states',
        marker = dict(
            line = dict (
                color = 'rgb(255,255,255)',
                width = 2
            )
        ),
        colorbar = dict(
            title = units
        )
    ) ]

    if zmin != None:
        plotly_data[0]['zmin'] = zmin
    if zmax != None: 
        plotly_data[0]['zmax'] = zmax
        
    plotly_layout = dict(
            title = title,
            geo = dict(
                scope='usa',
                projection=dict( type='albers usa' ),
            ),
        )
    fig = dict( data=plotly_data, layout=plotly_layout )
    return fig

# function used with df.apply to normalize count per 100,000 popualation
def normalize_for_pop(row):
    pop = state_population_lookup.get(row['state'], None)
    if pop != None:
        rate = (int(row['complaints']) / float(pop))*100000
    else:
#         print "** state={0} isnt in state_population_lookup and wont have a rate calculated for it".format(row['state'])
        rate = None
    return rate

# function powered by TextBlob to return a dictionary of sentiment attributes
def sentiment_to_dict(str):
    blob = TextBlob(str)
    blob_sentiment = blob.sentiment
    return {'polarity':blob_sentiment.subjectivity, 'subjectivity': blob_sentiment.polarity}

Fetch the data (for each query template and provider)

In [44]:
# fetch the data 

data = dict()

## fetch data for all query templates and providers
for template in query_template_keys:
    data[template] = {}
    for provider in provider_keys:
        url = form_query_url(template,provider)
        print "Fetching {0} for {1} at {2}".format(template,provider,url)
        data[template][provider] = pd.read_json(url)
Fetching count_by_state for cfpb at https://data.consumerfinance.gov/resource/jhzv-w97w.json?$query=SELECT%20state%20as%20state%2C%20count%28%2A%29%20as%20complaints%20GROUP%20BY%20state%20LIMIT%2010000000
Fetching count_by_state for fcc at https://opendata.fcc.gov/resource/sr6c-syda.json?$query=SELECT%20state%20as%20state%2C%20count%28%2A%29%20as%20complaints%20GROUP%20BY%20state%20LIMIT%2010000000
In [45]:
images_to_display = []

print "Some summary data and graphics:"

template = 'count_by_state'
for provider in provider_keys:
    d = data[template][provider]
    print " - {0} for {1} has {2} rows for a total of {3} complaints".format(template, provider, len(d), d['complaints'].sum())

    fig = us_state_choropleth(d,'complaints','Number of Complaints','{0} Complaints by State'.format(provider.upper()))
    image_name = 'pyimage_{0}-{1}.png'.format(template, provider)
    py.image.save_as(fig, 'images/'+image_name)
    images_to_display.append(image_name)

counts_by_state_html = "<h2 style='text-align: center;'>Total # of Complaints per Data Source</h2><table><tr>"
for image_path in images_to_display:
    counts_by_state_html += '<td><img src="images/{0}" alt="{0}" /></td>'.format(image_path)
counts_by_state_html += "</tr></table>"

display(HTML(counts_by_state_html))
Some summary data and graphics:
 - count_by_state for cfpb has 63 rows for a total of 573023 complaints
 - count_by_state for fcc has 57 rows for a total of 488576 complaints

Total # of Complaints per Data Source

pyimage_count_by_state-cfpb.pngpyimage_count_by_state-fcc.png

Let's try normalizing complaint counts per 100k population

First, get state populations from the US Census

In [46]:
## consistent with Census Fact Finder 'TOTAL POPULATION / Universe: Total population / 2009-2013 American Community Survey 5-Year Estimates'
pop_est_table_id = 'B01003_001E'
state_populations = census.acs.state(('NAME',pop_est_table_id), Census.ALL, year=2013) # http://1.usa.gov/1RAhxWq

## create lookup dictionary of population
state_population_lookup = dict()
for row in state_populations:
    state = states.lookup(row['state']).abbr
    pop = row[pop_est_table_id]
    state_population_lookup[state] = pop
    
print "Got population data for {0} 'states'".format(len(state_population_lookup))
Got population data for 52 'states'

Plot normalized counts

In [47]:
# Normalized complaint counts per 100k population

images_to_display = []

template = 'count_by_state'
for provider in provider_keys:
    d = data[template][provider]
    d['complaint_rate_per_100k_pop'] = d.apply(normalize_for_pop, axis=1)

    fig = us_state_choropleth(d,'complaint_rate_per_100k_pop','Complaints Per 100k Pop.','{0} Complaints by State (per 100k population)'.format(provider.upper()))
    # py.iplot( fig, filename='d3-cloropleth-map' )
    image_name = 'pyimage_{0}-{1}-per100k.png'.format(template, provider)
    py.image.save_as(fig, 'images/'+image_name)
    images_to_display.append(image_name)

counts_by_state_html = "<h2 style='text-align: center;'>Normalized # of Complaints per 100k population per Data Source</h2><table><tr>"
for image_path in images_to_display:
    counts_by_state_html += '<td><img src="images/{0}" alt="{0}" /></td>'.format(image_path)
counts_by_state_html += "</tr></table>"

display(HTML(counts_by_state_html))

Normalized # of Complaints per 100k population per Data Source

pyimage_count_by_state-cfpb-per100k.pngpyimage_count_by_state-fcc-per100k.png

Check out the top states by normalized complaint count

In [48]:
data['count_by_state']['fcc'].sort_values('complaint_rate_per_100k_pop', ascending=False).head(5)
Out[48]:
complaints state complaint_rate_per_100k_pop
8 2314 DC 373.604835
22 17324 MD 296.933702
6 11872 CO 231.905392
10 39859 FL 208.782538
52 14029 WA 205.716511
In [49]:
data['count_by_state']['cfpb'].sort_values('complaint_rate_per_100k_pop', ascending=False).head(5)
Out[49]:
complaints state complaint_rate_per_100k_pop
11 3332 DC 537.965129
12 3006 DE 330.894737
26 18256 MD 312.908200
13 55374 FL 290.050534
39 23139 NJ 261.978446

Beginnings of playing with sentiment analysis

In [55]:
from textblob import TextBlob # read http://planspace.org/20150607-textblob_sentiment/ for a good overview

# fetch all CFPB complaints and their associated state
url = variables['endpoint']['cfpb']+"?$query="+urllib.quote("SELECT state, complaint_what_happened WHERE state IS NOT NULL AND complaint_what_happened IS NOT NULL LIMIT {0}".format(a_big_number))
all_cfpb = pd.read_json(url)
print "Got {0} complaints to CFPB with a non-null state and complaint narrative ".format(len(all_cfpb))
Got 74810 complaints to CFPB with a non-null state and complaint narrative 
In [51]:
# collect sentiment (polarity and subjectivity) and merge into dataframe
sentiment_series = all_cfpb.complaint_what_happened.apply(lambda s: pd.Series(sentiment_to_dict(s)))
all_cfpb['polarity'] = sentiment_series['polarity']
all_cfpb['subjectivity'] = sentiment_series['subjectivity']
In [52]:
# get average by state
cfpb_state_avgs = all_cfpb.groupby(['state']).mean()

# hacks because of Mark's pandas newbness
from StringIO import StringIO
csv = cfpb_state_avgs.to_csv()
CSV_DATA=StringIO(csv)
cfpb_state_avgs = pd.read_csv(CSV_DATA)
In [59]:
fig = us_state_choropleth(cfpb_state_avgs,'polarity','Polarity','Average Polarity of CFPB Complaints by State<br />(-1=negative to 1=positive)')
# py.iplot( fig, filename='d3-cloropleth-map' )
image_name = 'pyimage_count_by_state-cfpb-sentiment_polarity.png'
py.image.save_as(fig, 'images/'+image_name)
Image('images/'+image_name)
Out[59]:
In [54]:
fig = us_state_choropleth(cfpb_state_avgs,'subjectivity','Subjectivity','Average Subjectivity of CFPB Complaints by State<br />(0=less subjective to 1=most subjective)',0,1)
# py.iplot( fig, filename='d3-cloropleth-map' )
image_name = 'pyimage_count_by_state-cfpb-sentiment_subjectivity.png'
py.image.save_as(fig, 'images/'+image_name)
Image('images/'+image_name)
Out[54]:
In [ ]: