{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [ ], "source": [ "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "from datetime import timedelta\n", "\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.linear_model import LogisticRegression\n", "\n", "from sklearn.metrics import confusion_matrix" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ ], "source": [ "df_all_weather = pd.read_csv('2008-2018 Weather Data.csv')" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
STATIONNAMELATITUDELONGITUDEELEVATIONDATEAWNDDAPRFMTMMDPR...WT11WT13WT14WT15WT16WT17WT18WT19WT21WT22
0US1NYKN0003BROOKLYN 2.4 SW, NY US40.6194-73.98597.911/18/2008NaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
1US1NYKN0003BROOKLYN 2.4 SW, NY US40.6194-73.98597.911/19/2008NaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
2US1NYKN0003BROOKLYN 2.4 SW, NY US40.6194-73.98597.911/20/2008NaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
3US1NYKN0003BROOKLYN 2.4 SW, NY US40.6194-73.98597.911/21/2008NaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4US1NYKN0003BROOKLYN 2.4 SW, NY US40.6194-73.98597.911/22/2008NaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", "

5 rows × 45 columns

\n", "
" ] }, "execution_count": 3, "metadata": { }, "output_type": "execute_result" } ], "source": [ "df_all_weather.head()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ ], "source": [ "df_beaches = pd.read_csv('DOHMH_Beach_Water_Quality_Data.csv')" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Sample IDSample DateBeach NameSample LocationEnterococci ResultsUnits or Notes
0050514CP1305/05/2014MIDLAND BEACHCenter20.0MPN/100 ml
1062011GR0406/20/2011MANHATTAN BEACHLeftNaNResult below detection limit
2072808BH0907/28/2008MIDLAND BEACHRight28.0MPN/100 ml
3051214CP3605/12/2014SOUTH BEACHRight4.0MPN/100 ml
4081511KB0708/15/2011CEDAR GROVELeft360.0MPN/100 ml
\n", "
" ] }, "execution_count": 5, "metadata": { }, "output_type": "execute_result" } ], "source": [ "df_beaches.head()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ ], "source": [ "df_beaches.fillna(0,inplace=True)\n", "df_beaches['Sample Date'] = pd.to_datetime(df_beaches['Sample Date'])" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Sample IDSample DateBeach NameSample LocationEnterococci ResultsUnits or Notes
0050514CP132014-05-05MIDLAND BEACHCenter20.0MPN/100 ml
1062011GR042011-06-20MANHATTAN BEACHLeft0.0Result below detection limit
2072808BH092008-07-28MIDLAND BEACHRight28.0MPN/100 ml
3051214CP362014-05-12SOUTH BEACHRight4.0MPN/100 ml
4081511KB072011-08-15CEDAR GROVELeft360.0MPN/100 ml
\n", "
" ] }, "execution_count": 7, "metadata": { }, "output_type": "execute_result" } ], "source": [ "df_beaches.head()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ ], "source": [ "df_man = df_beaches[df_beaches['Beach Name'] == 'MANHATTAN BEACH']" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Sample IDSample DateBeach NameSample LocationEnterococci ResultsUnits or Notes
1062011GR042011-06-20MANHATTAN BEACHLeft0.0Result below detection limit
5062909KB012009-06-29MANHATTAN BEACHLeft8.0MPN/100 ml
7072015GR062015-07-20MANHATTAN BEACHRight0.0Result below detection limit
71082112GR052012-08-21MANHATTAN BEACHCenter4.0MPN/100 ml
76060914GR062014-06-09MANHATTAN BEACHRight8.0MPN/100 ml
\n", "
" ] }, "execution_count": 9, "metadata": { }, "output_type": "execute_result" } ], "source": [ "df_man.head()" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ ], "source": [ "df_man = df_man[['Sample Date','Enterococci Results']]" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Sample DateEnterococci Results
12011-06-200.0
52009-06-298.0
72015-07-200.0
712012-08-214.0
762014-06-098.0
\n", "
" ] }, "execution_count": 11, "metadata": { }, "output_type": "execute_result" } ], "source": [ "df_man.head()" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [ ], "source": [ "df_man.columns = ['DATE','Enterococci']" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
DATEEnterococci
12011-06-200.0
52009-06-298.0
72015-07-200.0
712012-08-214.0
762014-06-098.0
\n", "
" ] }, "execution_count": 13, "metadata": { }, "output_type": "execute_result" } ], "source": [ "df_man.head()" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [ ], "source": [ "df_man['DATE'] = pd.to_datetime(df_man['DATE'])" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [ ], "source": [ "df_man.sort_values(by='DATE',inplace=True)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
DATEEnterococci
101872005-05-0210.0
58112005-05-029.9
125162005-05-0220.0
45612005-05-099.9
142202005-05-0910.0
\n", "
" ] }, "execution_count": 16, "metadata": { }, "output_type": "execute_result" } ], "source": [ "df_man.head()" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false }, "outputs": [ ], "source": [ "df_man.index = range(len(df_man.index))" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
DATEEnterococci
02005-05-0210.0
12005-05-029.9
22005-05-0220.0
32005-05-099.9
42005-05-0910.0
\n", "
" ] }, "execution_count": 18, "metadata": { }, "output_type": "execute_result" } ], "source": [ "df_man.head()" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": false }, "outputs": [ ], "source": [ "df_jfk = df_all_weather[df_all_weather.NAME == 'JFK INTERNATIONAL AIRPORT, NY US']" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
STATIONNAMELATITUDELONGITUDEELEVATIONDATEAWNDDAPRFMTMMDPR...WT11WT13WT14WT15WT16WT17WT18WT19WT21WT22
49261USW00094789JFK INTERNATIONAL AIRPORT, NY US40.6386-73.76223.41/1/200815.88NaN827.0NaN...NaNNaNNaNNaN1.0NaNNaNNaNNaNNaN
49262USW00094789JFK INTERNATIONAL AIRPORT, NY US40.6386-73.76223.41/2/200821.25NaN1750.0NaN...NaNNaNNaNNaNNaNNaN1.0NaNNaNNaN
49263USW00094789JFK INTERNATIONAL AIRPORT, NY US40.6386-73.76223.41/3/200816.78NaN445.0NaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
49264USW00094789JFK INTERNATIONAL AIRPORT, NY US40.6386-73.76223.41/4/200812.97NaN1146.0NaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
49265USW00094789JFK INTERNATIONAL AIRPORT, NY US40.6386-73.76223.41/5/20086.93NaN2203.0NaN...NaNNaNNaNNaN1.0NaNNaNNaNNaNNaN
\n", "

5 rows × 45 columns

\n", "
" ] }, "execution_count": 20, "metadata": { }, "output_type": "execute_result" } ], "source": [ "df_jfk.head()" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "Index(['STATION', 'NAME', 'LATITUDE', 'LONGITUDE', 'ELEVATION', 'DATE', 'AWND',\n", " 'DAPR', 'FMTM', 'MDPR', 'MDSF', 'PGTM', 'PRCP', 'SNOW', 'SNWD', 'TAVG',\n", " 'TMAX', 'TMIN', 'TOBS', 'TSUN', 'WDF2', 'WDF5', 'WESD', 'WESF', 'WSF2',\n", " 'WSF5', 'WT01', 'WT02', 'WT03', 'WT04', 'WT05', 'WT06', 'WT07', 'WT08',\n", " 'WT09', 'WT11', 'WT13', 'WT14', 'WT15', 'WT16', 'WT17', 'WT18', 'WT19',\n", " 'WT21', 'WT22'],\n", " dtype='object')" ] }, "execution_count": 21, "metadata": { }, "output_type": "execute_result" } ], "source": [ "df_jfk.columns" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "collapsed": false }, "outputs": [ ], "source": [ "df_jfk = df_jfk[['DATE','PRCP']]" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "collapsed": false }, "outputs": [ ], "source": [ "df_jfk['DATE'] = pd.to_datetime(df_jfk['DATE'])" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "collapsed": false }, "outputs": [ ], "source": [ "df_jfk.sort_values(by='DATE',inplace=True)" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "collapsed": false }, "outputs": [ ], "source": [ "df_merged = pd.merge(df_man, df_jfk, how='inner', on='DATE')" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
DATEEnterococciPRCP
02008-04-214.00.00
12008-04-210.00.00
22008-04-210.00.00
32008-04-28120.01.05
42008-04-2828.01.05
\n", "
" ] }, "execution_count": 26, "metadata": { }, "output_type": "execute_result" } ], "source": [ "df_merged.head()" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
DATEEnterococciPRCP
02008-04-214.00.00
12008-04-210.00.00
22008-04-210.00.00
32008-04-28120.01.05
42008-04-2828.01.05
\n", "
" ] }, "execution_count": 27, "metadata": { }, "output_type": "execute_result" } ], "source": [ "df_merged.head()" ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "collapsed": false }, "outputs": [ ], "source": [ "def eWarn(m):\n", " if m > 104:\n", " return 1\n", " else:\n", " return 0" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "collapsed": false }, "outputs": [ ], "source": [ "df_merged['Warning'] = df_merged.Enterococci.apply(eWarn)" ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
DATEEnterococciPRCPWarning
02008-04-214.00.000
12008-04-210.00.000
22008-04-210.00.000
32008-04-28120.01.051
42008-04-2828.01.050
\n", "
" ] }, "execution_count": 30, "metadata": { }, "output_type": "execute_result" } ], "source": [ "df_merged.head()" ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0 649\n", "1 38\n", "Name: Warning, dtype: int64" ] }, "execution_count": 31, "metadata": { }, "output_type": "execute_result" } ], "source": [ "df_merged.Warning.value_counts()" ] }, { "cell_type": "code", "execution_count": 32, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "Text(0.5,1,'Warning vs. PRCP')" ] }, "execution_count": 32, "metadata": { }, "output_type": "execute_result" }, { "data": { "image/png": "f6a137001a3be7c8d0dcaf9d20d3e763fbf1c799" }, "metadata": { "image/png": { "height": 277, "width": 388 }, "needs_background": "light" } } ], "source": [ "df_merged.plot.scatter(x='PRCP',y='Warning',color='black')\n", "plt.title('Warning vs. PRCP')" ] }, { "cell_type": "code", "execution_count": 33, "metadata": { "collapsed": false }, "outputs": [ ], "source": [ "X = np.array(df_merged.PRCP)\n", "y = df_merged.Warning" ] }, { "cell_type": "code", "execution_count": 34, "metadata": { "collapsed": false }, "outputs": [ ], "source": [ "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=101)" ] }, { "cell_type": "code", "execution_count": 35, "metadata": { "collapsed": false }, "outputs": [ ], "source": [ "logmodel = LogisticRegression(solver='liblinear')" ] }, { "cell_type": "code", "execution_count": 36, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n", " intercept_scaling=1, max_iter=100, multi_class='warn',\n", " n_jobs=None, penalty='l2', random_state=None, solver='liblinear',\n", " tol=0.0001, verbose=0, warm_start=False)" ] }, "execution_count": 36, "metadata": { }, "output_type": "execute_result" } ], "source": [ "logmodel.fit(X_train.reshape(-1,1),y_train)" ] }, { "cell_type": "code", "execution_count": 37, "metadata": { "collapsed": false }, "outputs": [ ], "source": [ "b0 = logmodel.intercept_\n", "b1 = logmodel.coef_" ] }, { "cell_type": "code", "execution_count": 38, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "Text(0.5,1,'Warning vs. PRCP')" ] }, "execution_count": 38, "metadata": { }, "output_type": "execute_result" }, { "data": { "image/png": "7d66b157f6a40497688382e2ebfbeb60dbdd53a5" }, "metadata": { "image/png": { "height": 277, "width": 388 }, "needs_background": "light" } } ], "source": [ "X2 = sorted(X)\n", "df_merged.plot.scatter(x='PRCP',y='Warning',color='black')\n", "plt.plot(X2,1/(1+np.exp(-b0-b1*X2)).reshape(-1,1),'r')\n", "plt.title('Warning vs. PRCP')" ] }, { "cell_type": "code", "execution_count": 39, "metadata": { "collapsed": false }, "outputs": [ ], "source": [ "predictions = logmodel.predict(X_test.reshape(-1,1))" ] }, { "cell_type": "code", "execution_count": 40, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[[213 0]\n", " [ 14 0]]\n" ] } ], "source": [ "print(confusion_matrix(y_test,logmodel.predict(X_test.reshape(-1, 1))))" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (Anaconda 5)", "language": "python", "name": "anaconda5" } }, "nbformat": 4, "nbformat_minor": 0 }