Kernel: Python 3 (Anaconda 5)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import timedelta
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
df_all_weather = pd.read_csv('2008-2018 Weather Data.csv')
df_all_weather.head()
STATION NAME LATITUDE LONGITUDE ELEVATION DATE AWND DAPR FMTM MDPR ... WT11 WT13 WT14 WT15 WT16 WT17 WT18 WT19 WT21 WT22
0 US1NYKN0003 BROOKLYN 2.4 SW, NY US 40.6194 -73.9859 7.9 11/18/2008 NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 US1NYKN0003 BROOKLYN 2.4 SW, NY US 40.6194 -73.9859 7.9 11/19/2008 NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 US1NYKN0003 BROOKLYN 2.4 SW, NY US 40.6194 -73.9859 7.9 11/20/2008 NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 US1NYKN0003 BROOKLYN 2.4 SW, NY US 40.6194 -73.9859 7.9 11/21/2008 NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
4 US1NYKN0003 BROOKLYN 2.4 SW, NY US 40.6194 -73.9859 7.9 11/22/2008 NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

5 rows × 45 columns
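The weather file has 45 columns, most of them sparsely populated. As a rough check (not part of the original notebook, and assuming the CSV loaded as shown above), the share of missing values per column can be listed before deciding which ones are usable:

# Fraction of missing values in each weather column, smallest first
df_all_weather.isna().mean().sort_values()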

df_beaches = pd.read_csv('DOHMH_Beach_Water_Quality_Data.csv')
df_beaches.head()
Sample ID Sample Date Beach Name Sample Location Enterococci Results Units or Notes
0 050514CP13 05/05/2014 MIDLAND BEACH Center 20.0 MPN/100 ml
1 062011GR04 06/20/2011 MANHATTAN BEACH Left NaN Result below detection limit
2 072808BH09 07/28/2008 MIDLAND BEACH Right 28.0 MPN/100 ml
3 051214CP36 05/12/2014 SOUTH BEACH Right 4.0 MPN/100 ml
4 081511KB07 08/15/2011 CEDAR GROVE Left 360.0 MPN/100 ml
# NaN in 'Enterococci Results' marks samples below the detection limit; treat those as 0
df_beaches.fillna(0, inplace=True)
df_beaches['Sample Date'] = pd.to_datetime(df_beaches['Sample Date'])
df_beaches.head()
Sample ID Sample Date Beach Name Sample Location Enterococci Results Units or Notes
0 050514CP13 2014-05-05 MIDLAND BEACH Center 20.0 MPN/100 ml
1 062011GR04 2011-06-20 MANHATTAN BEACH Left 0.0 Result below detection limit
2 072808BH09 2008-07-28 MIDLAND BEACH Right 28.0 MPN/100 ml
3 051214CP36 2014-05-12 SOUTH BEACH Right 4.0 MPN/100 ml
4 081511KB07 2011-08-15 CEDAR GROVE Left 360.0 MPN/100 ml
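Filling with 0 folds the below-detection-limit samples into the zero bucket. A small check, added here and assuming the note text is exactly as shown in the table above, of how many rows that affects:

# Count samples reported as below the detection limit
(df_beaches['Units or Notes'] == 'Result below detection limit').sum()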
df_man = df_beaches[df_beaches['Beach Name'] == 'MANHATTAN BEACH']
df_man.head()
Sample ID Sample Date Beach Name Sample Location Enterococci Results Units or Notes
1 062011GR04 2011-06-20 MANHATTAN BEACH Left 0.0 Result below detection limit
5 062909KB01 2009-06-29 MANHATTAN BEACH Left 8.0 MPN/100 ml
7 072015GR06 2015-07-20 MANHATTAN BEACH Right 0.0 Result below detection limit
71 082112GR05 2012-08-21 MANHATTAN BEACH Center 4.0 MPN/100 ml
76 060914GR06 2014-06-09 MANHATTAN BEACH Right 8.0 MPN/100 ml
df_man = df_man[['Sample Date','Enterococci Results']]
df_man.head()
Sample Date Enterococci Results
1 2011-06-20 0.0
5 2009-06-29 8.0
7 2015-07-20 0.0
71 2012-08-21 4.0
76 2014-06-09 8.0
df_man.columns = ['DATE','Enterococci']
df_man.head()
DATE Enterococci
1 2011-06-20 0.0
5 2009-06-29 8.0
7 2015-07-20 0.0
71 2012-08-21 4.0
76 2014-06-09 8.0
df_man['DATE'] = pd.to_datetime(df_man['DATE'])
df_man.sort_values(by='DATE',inplace=True)
df_man.head()
DATE Enterococci
10187 2005-05-02 10.0
5811 2005-05-02 9.9
12516 2005-05-02 20.0
4561 2005-05-09 9.9
14220 2005-05-09 10.0
df_man.index = range(len(df_man.index))
df_man.head()
DATE Enterococci
0 2005-05-02 10.0
1 2005-05-02 9.9
2 2005-05-02 20.0
3 2005-05-09 9.9
4 2005-05-09 10.0
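The sorted frame shows several samples per date (e.g., three rows for 2005-05-02), presumably one per sampling location. An optional way to collapse these to a single daily value, not used in the steps below, would be:

# Hypothetical daily aggregate: mean Enterococci reading per sample date
df_man_daily = df_man.groupby('DATE', as_index=False)['Enterococci'].mean()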
df_jfk = df_all_weather[df_all_weather.NAME == 'JFK INTERNATIONAL AIRPORT, NY US']
df_jfk.head()
STATION NAME LATITUDE LONGITUDE ELEVATION DATE AWND DAPR FMTM MDPR ... WT11 WT13 WT14 WT15 WT16 WT17 WT18 WT19 WT21 WT22
49261 USW00094789 JFK INTERNATIONAL AIRPORT, NY US 40.6386 -73.7622 3.4 1/1/2008 15.88 NaN 827.0 NaN ... NaN NaN NaN NaN 1.0 NaN NaN NaN NaN NaN
49262 USW00094789 JFK INTERNATIONAL AIRPORT, NY US 40.6386 -73.7622 3.4 1/2/2008 21.25 NaN 1750.0 NaN ... NaN NaN NaN NaN NaN NaN 1.0 NaN NaN NaN
49263 USW00094789 JFK INTERNATIONAL AIRPORT, NY US 40.6386 -73.7622 3.4 1/3/2008 16.78 NaN 445.0 NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
49264 USW00094789 JFK INTERNATIONAL AIRPORT, NY US 40.6386 -73.7622 3.4 1/4/2008 12.97 NaN 1146.0 NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
49265 USW00094789 JFK INTERNATIONAL AIRPORT, NY US 40.6386 -73.7622 3.4 1/5/2008 6.93 NaN 2203.0 NaN ... NaN NaN NaN NaN 1.0 NaN NaN NaN NaN NaN

5 rows × 45 columns

df_jfk.columns
Index(['STATION', 'NAME', 'LATITUDE', 'LONGITUDE', 'ELEVATION', 'DATE', 'AWND', 'DAPR', 'FMTM', 'MDPR', 'MDSF', 'PGTM', 'PRCP', 'SNOW', 'SNWD', 'TAVG', 'TMAX', 'TMIN', 'TOBS', 'TSUN', 'WDF2', 'WDF5', 'WESD', 'WESF', 'WSF2', 'WSF5', 'WT01', 'WT02', 'WT03', 'WT04', 'WT05', 'WT06', 'WT07', 'WT08', 'WT09', 'WT11', 'WT13', 'WT14', 'WT15', 'WT16', 'WT17', 'WT18', 'WT19', 'WT21', 'WT22'], dtype='object')
df_jfk = df_jfk[['DATE','PRCP']]
df_jfk['DATE'] = pd.to_datetime(df_jfk['DATE'])
df_jfk.sort_values(by='DATE',inplace=True)
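Only same-day precipitation is carried into the merge below. If runoff from earlier storms mattered, a lagged rainfall feature could be sketched as follows (purely illustrative, kept in a separate frame so it does not change the merge; PRCP_2DAY is a hypothetical column name):

# Illustrative two-day rainfall total for each date
df_jfk_lagged = df_jfk.copy()
df_jfk_lagged['PRCP_2DAY'] = df_jfk_lagged['PRCP'].rolling(2, min_periods=1).sum()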
df_merged = pd.merge(df_man, df_jfk, how='inner', on='DATE')
df_merged.head()
DATE Enterococci PRCP
0 2008-04-21 4.0 0.00
1 2008-04-21 0.0 0.00
2 2008-04-21 0.0 0.00
3 2008-04-28 120.0 1.05
4 2008-04-28 28.0 1.05
# Flag samples above the 104 MPN/100 ml threshold used here as a warning
def eWarn(m):
    if m > 104:
        return 1
    else:
        return 0
df_merged['Warning'] = df_merged.Enterococci.apply(eWarn)
df_merged.head()
DATE Enterococci PRCP Warning
0 2008-04-21 4.0 0.00 0
1 2008-04-21 0.0 0.00 0
2 2008-04-21 0.0 0.00 0
3 2008-04-28 120.0 1.05 1
4 2008-04-28 28.0 1.05 0
df_merged.Warning.value_counts()
0    649
1     38
Name: Warning, dtype: int64
df_merged.plot.scatter(x='PRCP', y='Warning', color='black')
plt.title('Warning vs. PRCP')
Text(0.5,1,'Warning vs. PRCP')
Image in a Jupyter notebook
X = np.array(df_merged.PRCP)
y = df_merged.Warning
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=101)
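Warnings are rare (38 of 687 samples), so a plain random split can leave the test set with very few positives. A stratified variant of the split above, not used in what follows, would preserve the warning rate in both halves:

# Stratified alternative to the split above (an aside, not used below)
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X, y, test_size=0.33, random_state=101, stratify=y)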
logmodel = LogisticRegression(solver='liblinear')
logmodel.fit(X_train.reshape(-1,1),y_train)
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, max_iter=100, multi_class='warn', n_jobs=None, penalty='l2', random_state=None, solver='liblinear', tol=0.0001, verbose=0, warm_start=False)
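With the model fit, the predicted warning probability for a given rainfall amount can be read off directly. The value 1.0 below is arbitrary, chosen only for illustration, in the same units as PRCP:

# Probabilities of (no warning, warning) after an illustrative rainfall of 1.0
logmodel.predict_proba(np.array([[1.0]]))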
# Fitted intercept and slope of the logistic model
b0 = logmodel.intercept_
b1 = logmodel.coef_
# Overlay the fitted logistic curve on the scatter of warnings vs. precipitation
X2 = sorted(X)
df_merged.plot.scatter(x='PRCP', y='Warning', color='black')
plt.plot(X2, 1/(1+np.exp(-b0-b1*X2)).reshape(-1,1), 'r')
plt.title('Warning vs. PRCP')
Text(0.5,1,'Warning vs. PRCP')
Image in a Jupyter notebook
predictions = logmodel.predict(X_test.reshape(-1,1))
print(confusion_matrix(y_test, predictions))
[[213   0]
 [ 14   0]]
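The fitted model never predicts a warning (the second column of the confusion matrix is all zeros), which is unsurprising given the class imbalance. One possible follow-up, not part of the original notebook, is to reweight the rare class and compare the resulting confusion matrix:

# Refit with class weighting so misclassified warnings cost more
logmodel_bal = LogisticRegression(solver='liblinear', class_weight='balanced')
logmodel_bal.fit(X_train.reshape(-1, 1), y_train)
print(confusion_matrix(y_test, logmodel_bal.predict(X_test.reshape(-1, 1))))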