{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"from datetime import timedelta\n",
"\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.linear_model import LogisticRegression\n",
"\n",
"from sklearn.metrics import confusion_matrix"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
],
"source": [
"df_all_weather = pd.read_csv('2008-2018 Weather Data.csv')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" STATION | \n",
" NAME | \n",
" LATITUDE | \n",
" LONGITUDE | \n",
" ELEVATION | \n",
" DATE | \n",
" AWND | \n",
" DAPR | \n",
" FMTM | \n",
" MDPR | \n",
" ... | \n",
" WT11 | \n",
" WT13 | \n",
" WT14 | \n",
" WT15 | \n",
" WT16 | \n",
" WT17 | \n",
" WT18 | \n",
" WT19 | \n",
" WT21 | \n",
" WT22 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" US1NYKN0003 | \n",
" BROOKLYN 2.4 SW, NY US | \n",
" 40.6194 | \n",
" -73.9859 | \n",
" 7.9 | \n",
" 11/18/2008 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" ... | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 1 | \n",
" US1NYKN0003 | \n",
" BROOKLYN 2.4 SW, NY US | \n",
" 40.6194 | \n",
" -73.9859 | \n",
" 7.9 | \n",
" 11/19/2008 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" ... | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 2 | \n",
" US1NYKN0003 | \n",
" BROOKLYN 2.4 SW, NY US | \n",
" 40.6194 | \n",
" -73.9859 | \n",
" 7.9 | \n",
" 11/20/2008 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" ... | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 3 | \n",
" US1NYKN0003 | \n",
" BROOKLYN 2.4 SW, NY US | \n",
" 40.6194 | \n",
" -73.9859 | \n",
" 7.9 | \n",
" 11/21/2008 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" ... | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 4 | \n",
" US1NYKN0003 | \n",
" BROOKLYN 2.4 SW, NY US | \n",
" 40.6194 | \n",
" -73.9859 | \n",
" 7.9 | \n",
" 11/22/2008 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" ... | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
"
\n",
"
5 rows × 45 columns
\n",
"
"
]
},
"execution_count": 3,
"metadata": {
},
"output_type": "execute_result"
}
],
"source": [
"df_all_weather.head()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
],
"source": [
"df_beaches = pd.read_csv('DOHMH_Beach_Water_Quality_Data.csv')"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Sample ID | \n",
" Sample Date | \n",
" Beach Name | \n",
" Sample Location | \n",
" Enterococci Results | \n",
" Units or Notes | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 050514CP13 | \n",
" 05/05/2014 | \n",
" MIDLAND BEACH | \n",
" Center | \n",
" 20.0 | \n",
" MPN/100 ml | \n",
"
\n",
" \n",
" 1 | \n",
" 062011GR04 | \n",
" 06/20/2011 | \n",
" MANHATTAN BEACH | \n",
" Left | \n",
" NaN | \n",
" Result below detection limit | \n",
"
\n",
" \n",
" 2 | \n",
" 072808BH09 | \n",
" 07/28/2008 | \n",
" MIDLAND BEACH | \n",
" Right | \n",
" 28.0 | \n",
" MPN/100 ml | \n",
"
\n",
" \n",
" 3 | \n",
" 051214CP36 | \n",
" 05/12/2014 | \n",
" SOUTH BEACH | \n",
" Right | \n",
" 4.0 | \n",
" MPN/100 ml | \n",
"
\n",
" \n",
" 4 | \n",
" 081511KB07 | \n",
" 08/15/2011 | \n",
" CEDAR GROVE | \n",
" Left | \n",
" 360.0 | \n",
" MPN/100 ml | \n",
"
\n",
" \n",
"
\n",
"
"
]
},
"execution_count": 5,
"metadata": {
},
"output_type": "execute_result"
}
],
"source": [
"df_beaches.head()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
],
"source": [
"# Zero-fill only the measurement column: rows noted as 'Result below detection limit'\n",
"# carry no numeric value. A blanket fillna(0) would also write 0 into the date and\n",
"# text columns, which then reaches the date parser below.\n",
"df_beaches['Enterococci Results'] = df_beaches['Enterococci Results'].fillna(0)\n",
"df_beaches['Sample Date'] = pd.to_datetime(df_beaches['Sample Date'])"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Sample ID | \n",
" Sample Date | \n",
" Beach Name | \n",
" Sample Location | \n",
" Enterococci Results | \n",
" Units or Notes | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 050514CP13 | \n",
" 2014-05-05 | \n",
" MIDLAND BEACH | \n",
" Center | \n",
" 20.0 | \n",
" MPN/100 ml | \n",
"
\n",
" \n",
" 1 | \n",
" 062011GR04 | \n",
" 2011-06-20 | \n",
" MANHATTAN BEACH | \n",
" Left | \n",
" 0.0 | \n",
" Result below detection limit | \n",
"
\n",
" \n",
" 2 | \n",
" 072808BH09 | \n",
" 2008-07-28 | \n",
" MIDLAND BEACH | \n",
" Right | \n",
" 28.0 | \n",
" MPN/100 ml | \n",
"
\n",
" \n",
" 3 | \n",
" 051214CP36 | \n",
" 2014-05-12 | \n",
" SOUTH BEACH | \n",
" Right | \n",
" 4.0 | \n",
" MPN/100 ml | \n",
"
\n",
" \n",
" 4 | \n",
" 081511KB07 | \n",
" 2011-08-15 | \n",
" CEDAR GROVE | \n",
" Left | \n",
" 360.0 | \n",
" MPN/100 ml | \n",
"
\n",
" \n",
"
\n",
"
"
]
},
"execution_count": 7,
"metadata": {
},
"output_type": "execute_result"
}
],
"source": [
"df_beaches.head()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [
],
"source": [
"# Keep only the samples taken at Manhattan Beach.\n",
"is_manhattan = df_beaches['Beach Name'] == 'MANHATTAN BEACH'\n",
"df_man = df_beaches[is_manhattan]"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Sample ID | \n",
" Sample Date | \n",
" Beach Name | \n",
" Sample Location | \n",
" Enterococci Results | \n",
" Units or Notes | \n",
"
\n",
" \n",
" \n",
" \n",
" 1 | \n",
" 062011GR04 | \n",
" 2011-06-20 | \n",
" MANHATTAN BEACH | \n",
" Left | \n",
" 0.0 | \n",
" Result below detection limit | \n",
"
\n",
" \n",
" 5 | \n",
" 062909KB01 | \n",
" 2009-06-29 | \n",
" MANHATTAN BEACH | \n",
" Left | \n",
" 8.0 | \n",
" MPN/100 ml | \n",
"
\n",
" \n",
" 7 | \n",
" 072015GR06 | \n",
" 2015-07-20 | \n",
" MANHATTAN BEACH | \n",
" Right | \n",
" 0.0 | \n",
" Result below detection limit | \n",
"
\n",
" \n",
" 71 | \n",
" 082112GR05 | \n",
" 2012-08-21 | \n",
" MANHATTAN BEACH | \n",
" Center | \n",
" 4.0 | \n",
" MPN/100 ml | \n",
"
\n",
" \n",
" 76 | \n",
" 060914GR06 | \n",
" 2014-06-09 | \n",
" MANHATTAN BEACH | \n",
" Right | \n",
" 8.0 | \n",
" MPN/100 ml | \n",
"
\n",
" \n",
"
\n",
"
"
]
},
"execution_count": 9,
"metadata": {
},
"output_type": "execute_result"
}
],
"source": [
"df_man.head()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [
],
"source": [
"df_man = df_man[['Sample Date','Enterococci Results']]"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Sample Date | \n",
" Enterococci Results | \n",
"
\n",
" \n",
" \n",
" \n",
" 1 | \n",
" 2011-06-20 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 5 | \n",
" 2009-06-29 | \n",
" 8.0 | \n",
"
\n",
" \n",
" 7 | \n",
" 2015-07-20 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 71 | \n",
" 2012-08-21 | \n",
" 4.0 | \n",
"
\n",
" \n",
" 76 | \n",
" 2014-06-09 | \n",
" 8.0 | \n",
"
\n",
" \n",
"
\n",
"
"
]
},
"execution_count": 11,
"metadata": {
},
"output_type": "execute_result"
}
],
"source": [
"df_man.head()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [
],
"source": [
"df_man.columns = ['DATE','Enterococci']"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" DATE | \n",
" Enterococci | \n",
"
\n",
" \n",
" \n",
" \n",
" 1 | \n",
" 2011-06-20 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 5 | \n",
" 2009-06-29 | \n",
" 8.0 | \n",
"
\n",
" \n",
" 7 | \n",
" 2015-07-20 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 71 | \n",
" 2012-08-21 | \n",
" 4.0 | \n",
"
\n",
" \n",
" 76 | \n",
" 2014-06-09 | \n",
" 8.0 | \n",
"
\n",
" \n",
"
\n",
"
"
]
},
"execution_count": 13,
"metadata": {
},
"output_type": "execute_result"
}
],
"source": [
"df_man.head()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false
},
"outputs": [
],
"source": [
"df_man['DATE'] = pd.to_datetime(df_man['DATE'])"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": false
},
"outputs": [
],
"source": [
"# Order the samples chronologically (rebinding instead of sorting in place).\n",
"df_man = df_man.sort_values(by='DATE')"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" DATE | \n",
" Enterococci | \n",
"
\n",
" \n",
" \n",
" \n",
" 10187 | \n",
" 2005-05-02 | \n",
" 10.0 | \n",
"
\n",
" \n",
" 5811 | \n",
" 2005-05-02 | \n",
" 9.9 | \n",
"
\n",
" \n",
" 12516 | \n",
" 2005-05-02 | \n",
" 20.0 | \n",
"
\n",
" \n",
" 4561 | \n",
" 2005-05-09 | \n",
" 9.9 | \n",
"
\n",
" \n",
" 14220 | \n",
" 2005-05-09 | \n",
" 10.0 | \n",
"
\n",
" \n",
"
\n",
"
"
]
},
"execution_count": 16,
"metadata": {
},
"output_type": "execute_result"
}
],
"source": [
"df_man.head()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": false
},
"outputs": [
],
"source": [
"# Replace the shuffled row labels left by the sort with a fresh 0..n-1 index.\n",
"df_man = df_man.reset_index(drop=True)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" DATE | \n",
" Enterococci | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 2005-05-02 | \n",
" 10.0 | \n",
"
\n",
" \n",
" 1 | \n",
" 2005-05-02 | \n",
" 9.9 | \n",
"
\n",
" \n",
" 2 | \n",
" 2005-05-02 | \n",
" 20.0 | \n",
"
\n",
" \n",
" 3 | \n",
" 2005-05-09 | \n",
" 9.9 | \n",
"
\n",
" \n",
" 4 | \n",
" 2005-05-09 | \n",
" 10.0 | \n",
"
\n",
" \n",
"
\n",
"
"
]
},
"execution_count": 18,
"metadata": {
},
"output_type": "execute_result"
}
],
"source": [
"df_man.head()"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": false
},
"outputs": [
],
"source": [
"# Keep only the observations recorded at the JFK airport station.\n",
"is_jfk = df_all_weather['NAME'] == 'JFK INTERNATIONAL AIRPORT, NY US'\n",
"df_jfk = df_all_weather[is_jfk]"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" STATION | \n",
" NAME | \n",
" LATITUDE | \n",
" LONGITUDE | \n",
" ELEVATION | \n",
" DATE | \n",
" AWND | \n",
" DAPR | \n",
" FMTM | \n",
" MDPR | \n",
" ... | \n",
" WT11 | \n",
" WT13 | \n",
" WT14 | \n",
" WT15 | \n",
" WT16 | \n",
" WT17 | \n",
" WT18 | \n",
" WT19 | \n",
" WT21 | \n",
" WT22 | \n",
"
\n",
" \n",
" \n",
" \n",
" 49261 | \n",
" USW00094789 | \n",
" JFK INTERNATIONAL AIRPORT, NY US | \n",
" 40.6386 | \n",
" -73.7622 | \n",
" 3.4 | \n",
" 1/1/2008 | \n",
" 15.88 | \n",
" NaN | \n",
" 827.0 | \n",
" NaN | \n",
" ... | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" 1.0 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 49262 | \n",
" USW00094789 | \n",
" JFK INTERNATIONAL AIRPORT, NY US | \n",
" 40.6386 | \n",
" -73.7622 | \n",
" 3.4 | \n",
" 1/2/2008 | \n",
" 21.25 | \n",
" NaN | \n",
" 1750.0 | \n",
" NaN | \n",
" ... | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" 1.0 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 49263 | \n",
" USW00094789 | \n",
" JFK INTERNATIONAL AIRPORT, NY US | \n",
" 40.6386 | \n",
" -73.7622 | \n",
" 3.4 | \n",
" 1/3/2008 | \n",
" 16.78 | \n",
" NaN | \n",
" 445.0 | \n",
" NaN | \n",
" ... | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 49264 | \n",
" USW00094789 | \n",
" JFK INTERNATIONAL AIRPORT, NY US | \n",
" 40.6386 | \n",
" -73.7622 | \n",
" 3.4 | \n",
" 1/4/2008 | \n",
" 12.97 | \n",
" NaN | \n",
" 1146.0 | \n",
" NaN | \n",
" ... | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 49265 | \n",
" USW00094789 | \n",
" JFK INTERNATIONAL AIRPORT, NY US | \n",
" 40.6386 | \n",
" -73.7622 | \n",
" 3.4 | \n",
" 1/5/2008 | \n",
" 6.93 | \n",
" NaN | \n",
" 2203.0 | \n",
" NaN | \n",
" ... | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" 1.0 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
"
\n",
"
5 rows × 45 columns
\n",
"
"
]
},
"execution_count": 20,
"metadata": {
},
"output_type": "execute_result"
}
],
"source": [
"df_jfk.head()"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Index(['STATION', 'NAME', 'LATITUDE', 'LONGITUDE', 'ELEVATION', 'DATE', 'AWND',\n",
" 'DAPR', 'FMTM', 'MDPR', 'MDSF', 'PGTM', 'PRCP', 'SNOW', 'SNWD', 'TAVG',\n",
" 'TMAX', 'TMIN', 'TOBS', 'TSUN', 'WDF2', 'WDF5', 'WESD', 'WESF', 'WSF2',\n",
" 'WSF5', 'WT01', 'WT02', 'WT03', 'WT04', 'WT05', 'WT06', 'WT07', 'WT08',\n",
" 'WT09', 'WT11', 'WT13', 'WT14', 'WT15', 'WT16', 'WT17', 'WT18', 'WT19',\n",
" 'WT21', 'WT22'],\n",
" dtype='object')"
]
},
"execution_count": 21,
"metadata": {
},
"output_type": "execute_result"
}
],
"source": [
"df_jfk.columns"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"collapsed": false
},
"outputs": [
],
"source": [
"df_jfk = df_jfk[['DATE','PRCP']]"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"collapsed": false
},
"outputs": [
],
"source": [
"df_jfk['DATE'] = pd.to_datetime(df_jfk['DATE'])"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"collapsed": false
},
"outputs": [
],
"source": [
"# Order the weather observations chronologically (rebinding instead of sorting in place).\n",
"df_jfk = df_jfk.sort_values(by='DATE')"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"collapsed": false
},
"outputs": [
],
"source": [
"# Attach the same-day JFK precipitation to every beach sample; dates missing on either side are dropped.\n",
"df_merged = df_man.merge(df_jfk, on='DATE', how='inner')"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" DATE | \n",
" Enterococci | \n",
" PRCP | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 2008-04-21 | \n",
" 4.0 | \n",
" 0.00 | \n",
"
\n",
" \n",
" 1 | \n",
" 2008-04-21 | \n",
" 0.0 | \n",
" 0.00 | \n",
"
\n",
" \n",
" 2 | \n",
" 2008-04-21 | \n",
" 0.0 | \n",
" 0.00 | \n",
"
\n",
" \n",
" 3 | \n",
" 2008-04-28 | \n",
" 120.0 | \n",
" 1.05 | \n",
"
\n",
" \n",
" 4 | \n",
" 2008-04-28 | \n",
" 28.0 | \n",
" 1.05 | \n",
"
\n",
" \n",
"
\n",
"
"
]
},
"execution_count": 26,
"metadata": {
},
"output_type": "execute_result"
}
],
"source": [
"df_merged.head()"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" DATE | \n",
" Enterococci | \n",
" PRCP | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 2008-04-21 | \n",
" 4.0 | \n",
" 0.00 | \n",
"
\n",
" \n",
" 1 | \n",
" 2008-04-21 | \n",
" 0.0 | \n",
" 0.00 | \n",
"
\n",
" \n",
" 2 | \n",
" 2008-04-21 | \n",
" 0.0 | \n",
" 0.00 | \n",
"
\n",
" \n",
" 3 | \n",
" 2008-04-28 | \n",
" 120.0 | \n",
" 1.05 | \n",
"
\n",
" \n",
" 4 | \n",
" 2008-04-28 | \n",
" 28.0 | \n",
" 1.05 | \n",
"
\n",
" \n",
"
\n",
"
"
]
},
"execution_count": 27,
"metadata": {
},
"output_type": "execute_result"
}
],
"source": [
"df_merged.head()"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {
"collapsed": false
},
"outputs": [
],
"source": [
"def eWarn(m, threshold=104):\n",
"    \"\"\"Return 1 if a reading is strictly above the warning threshold, else 0.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    m : float\n",
"        Enterococci reading to classify.\n",
"    threshold : float, optional\n",
"        Cut-off in the same units as ``m``; defaults to 104, the value\n",
"        previously hard-coded here.\n",
"\n",
"    Returns\n",
"    -------\n",
"    int\n",
"        1 when ``m > threshold``, otherwise 0. A NaN reading compares false\n",
"        and therefore yields 0.\n",
"    \"\"\"\n",
"    return 1 if m > threshold else 0"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"collapsed": false
},
"outputs": [
],
"source": [
"# Binary target: 1 where the reading exceeds the warning threshold applied by eWarn, else 0.\n",
"df_merged['Warning'] = df_merged['Enterococci'].map(eWarn)"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" DATE | \n",
" Enterococci | \n",
" PRCP | \n",
" Warning | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 2008-04-21 | \n",
" 4.0 | \n",
" 0.00 | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" 2008-04-21 | \n",
" 0.0 | \n",
" 0.00 | \n",
" 0 | \n",
"
\n",
" \n",
" 2 | \n",
" 2008-04-21 | \n",
" 0.0 | \n",
" 0.00 | \n",
" 0 | \n",
"
\n",
" \n",
" 3 | \n",
" 2008-04-28 | \n",
" 120.0 | \n",
" 1.05 | \n",
" 1 | \n",
"
\n",
" \n",
" 4 | \n",
" 2008-04-28 | \n",
" 28.0 | \n",
" 1.05 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
]
},
"execution_count": 30,
"metadata": {
},
"output_type": "execute_result"
}
],
"source": [
"df_merged.head()"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0 649\n",
"1 38\n",
"Name: Warning, dtype: int64"
]
},
"execution_count": 31,
"metadata": {
},
"output_type": "execute_result"
}
],
"source": [
"df_merged.Warning.value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Text(0.5,1,'Warning vs. PRCP')"
]
},
"execution_count": 32,
"metadata": {
},
"output_type": "execute_result"
},
{
"data": {
"image/png": "f6a137001a3be7c8d0dcaf9d20d3e763fbf1c799"
},
"metadata": {
"image/png": {
"height": 277,
"width": 388
},
"needs_background": "light"
}
}
],
"source": [
"# Scatter of the binary warning flag against same-day precipitation.\n",
"ax = df_merged.plot.scatter(x='PRCP', y='Warning', color='black')\n",
"ax.set_title('Warning vs. PRCP')"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {
"collapsed": false
},
"outputs": [
],
"source": [
"X = np.array(df_merged.PRCP)\n",
"y = df_merged.Warning"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {
"collapsed": false
},
"outputs": [
],
"source": [
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=101)"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {
"collapsed": false
},
"outputs": [
],
"source": [
"logmodel = LogisticRegression(solver='liblinear')"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n",
" intercept_scaling=1, max_iter=100, multi_class='warn',\n",
" n_jobs=None, penalty='l2', random_state=None, solver='liblinear',\n",
" tol=0.0001, verbose=0, warm_start=False)"
]
},
"execution_count": 36,
"metadata": {
},
"output_type": "execute_result"
}
],
"source": [
"logmodel.fit(X_train.reshape(-1,1),y_train)"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {
"collapsed": false
},
"outputs": [
],
"source": [
"b0 = logmodel.intercept_\n",
"b1 = logmodel.coef_"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Text(0.5,1,'Warning vs. PRCP')"
]
},
"execution_count": 38,
"metadata": {
},
"output_type": "execute_result"
},
{
"data": {
"image/png": "7d66b157f6a40497688382e2ebfbeb60dbdd53a5"
},
"metadata": {
"image/png": {
"height": 277,
"width": 388
},
"needs_background": "light"
}
}
],
"source": [
"# Overlay the fitted logistic curve on the scatter, evaluated on the\n",
"# precipitation values in ascending order so the line is drawn left to right.\n",
"X2 = np.sort(X)\n",
"intercept, slope = b0[0], b1[0, 0]\n",
"curve = 1 / (1 + np.exp(-(intercept + slope * X2)))\n",
"df_merged.plot.scatter(x='PRCP',y='Warning',color='black')\n",
"plt.plot(X2, curve, 'r')\n",
"plt.title('Warning vs. PRCP')"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {
"collapsed": false
},
"outputs": [
],
"source": [
"predictions = logmodel.predict(X_test.reshape(-1,1))"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[213 0]\n",
" [ 14 0]]\n"
]
}
],
"source": [
"# Rows are true classes, columns are predicted classes; reuse the test-set\n",
"# predictions already stored in `predictions` instead of predicting again.\n",
"print(confusion_matrix(y_test, predictions))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (Anaconda 5)",
"language": "python",
"name": "anaconda5"
}
},
"nbformat": 4,
"nbformat_minor": 0
}