{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "In this notebook, we will use a multi-layer perceptron to develop time series forecasting models.\n",
    "The dataset used for the examples of this notebook is on air pollution measured by concentration of\n",
    "particulate matter (PM) of diameter less than or equal to 2.5 micrometers. There are other variables\n",
    "such as air pressure, air temparature, dewpoint and so on.\n",
    "Two time series models are developed - one on air pressure and the other on pm2.5.\n",
    "The dataset has been downloaded from UCI Machine Learning Repository.\n",
    "https://archive.ics.uci.edu/ml/datasets/Beijing+PM2.5+Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "from __future__ import print_function\n",
    "import os\n",
    "import sys\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "%matplotlib inline\n",
    "from matplotlib import pyplot as plt\n",
    "import seaborn as sns\n",
    "import datetime"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "#set current working directory\n",
    "path=\"/home/user/time.series\"\n",
    "os.chdir(path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Read the dataset into a pandas.DataFrame\n",
    "df = pd.read_csv('datasets/PRSA_data_2010.1.1-2014.12.31.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Shape of the dataframe: (43824, 13)\n"
     ]
    }
   ],
   "source": [
    "print('Shape of the dataframe:', df.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>No</th>\n",
       "      <th>year</th>\n",
       "      <th>month</th>\n",
       "      <th>day</th>\n",
       "      <th>hour</th>\n",
       "      <th>pm2.5</th>\n",
       "      <th>DEWP</th>\n",
       "      <th>TEMP</th>\n",
       "      <th>PRES</th>\n",
       "      <th>cbwd</th>\n",
       "      <th>Iws</th>\n",
       "      <th>Is</th>\n",
       "      <th>Ir</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>2010</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>-21</td>\n",
       "      <td>-11.0</td>\n",
       "      <td>1021.0</td>\n",
       "      <td>NW</td>\n",
       "      <td>1.79</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>2010</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>-21</td>\n",
       "      <td>-12.0</td>\n",
       "      <td>1020.0</td>\n",
       "      <td>NW</td>\n",
       "      <td>4.92</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>2010</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>NaN</td>\n",
       "      <td>-21</td>\n",
       "      <td>-11.0</td>\n",
       "      <td>1019.0</td>\n",
       "      <td>NW</td>\n",
       "      <td>6.71</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>2010</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>NaN</td>\n",
       "      <td>-21</td>\n",
       "      <td>-14.0</td>\n",
       "      <td>1019.0</td>\n",
       "      <td>NW</td>\n",
       "      <td>9.84</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>2010</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>NaN</td>\n",
       "      <td>-20</td>\n",
       "      <td>-12.0</td>\n",
       "      <td>1018.0</td>\n",
       "      <td>NW</td>\n",
       "      <td>12.97</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#Let's see the first five rows of the DataFrame\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "\"\"\"\n",
    "\n",
    "Rows having NaN values in column pm2.5 are dropped.\n",
    "\"\"\"\n",
    "df.dropna(subset=['pm2.5'], axis=0, inplace=True)\n",
    "df.reset_index(drop=True, inplace=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "To make sure that the rows are in the right order of date and time of observations,\n",
    "a new column datetime is created from the date and time related columns of the DataFrame.\n",
    "The new column consists of Python's datetime.datetime objects. The DataFrame is sorted in ascending order\n",
    "over this column."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['datetime'] = df[['year', 'month', 'day', 'hour']].apply(lambda row: datetime.datetime(year=row['year'], month=row['month'], day=row['day'],\n",
    "                                                                                          hour=row['hour']), axis=1)\n",
    "df.sort_values('datetime', ascending=True, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Text(0.5,1,'Box plot of PM2.5')"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAUgAAAFoCAYAAAA1uGIFAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAFflJREFUeJzt3X+UX3V95/HnO5kR0VQIQSkEZORErWxL0bIt2VqLFG0S/NFz1m5rd5vQrdBfG1jYc1yF0BBM6bpWKqZdEVkraNfaUvsDSMIBKS1EoZuchpSWHxl0qATBMBERIZCZfPrHvd/pnW/mPfOdZL6ZzOT5OOd75t7P/dzP/Xzunbxy7/3e73eilIIkaV9zprsDknSoMiAlKWFASlLCgJSkhAEpSQkDUpISBqSmTETcFREfOEjb+o2IeCoinouIBQdjmzr8GJCzQEQMRMQLdVh8JyJujYiTprtfmYjoi4gSET37uX4vcDXwzlLKvFLKYNL+c/VrICI+1FheIuLbze1HRG9dVhplvxcR2yPiexHxUEQsH6dPZ0XE3sY2n4uIFfszPh06DMjZ492llHnA8cBTwLpp7k83HQe8HPinCeodXe+T9wO/HRFLGsu+AyxtzC+ty5q+D7wbOApYAVwTEf9hnO09UQd263VDB2PRIcyAnGVKKbuBm4BTW2URcVRE3BgROyPisYhYFRFz6mWfiog/b9T9aER8JSKive2IOC8iNkXEH0TEd+uzqp8Zqx8RMafezmP1mdmNEXFUvfjv6p/P1Gdai8dY/4iI+EREPFG/PlGXvQF4uLH+nR3sk69RhekPN4o/DzTPCJcDN7att7qU8lApZW8p5T7gbmCfvmr2MiBnmYh4BfALwL2N4nVUZ0GnAD9NFQa/Ui/7H8CP1OH3U8CvAitK/hnUnwAeBY4FVgNfjohjxqh3Xv16e73decAf1MveVv88uj7T+toY618GnAmcDvwo8OPAqlLKI8C/a6x/dtJPAKLyk/U6/9BY9JfA2yLi6IiYD/wU8FfjtHMk8O8Z/6z1NfV90W9ExO9HxCvH65tmgFKKrxn+AgaA54BngD3AE8CP1MvmAi8Bpzbq/xpwV2P+J4BdwGPA+8fZznl129Eo+3vgl+vpu4AP1NNfAX6zUe+Ndd96gD6gAD3jbOtRYFlj/meBgXp63PUby5+humx+ELiwsbwAi4Dr633x68Bn6rKStHkDsLE59rblP0h11j4HeB3VWfKnp/t3w9eBvfbrJrkOST9XSrkjIuYC7wX+NiJOpQqDXqrwa3kMWNiaKaXcFxFfB14D/OkE29lR6kRotHXCGPVOGGObPVT3Dzsx1vpjbWc8x5ZShsZZfiPwu0AA/zOrFBEfo7o8f3vb2EeUUp4EnqxnvxERHwRuoQpgzVBeYs8ypZThUsqXgWHgrcDTVGduJzeqvRbY0ZqJiN8CjqA6O/zgBJtY2HZ/8rX1eu2eGGObQ1RvIHXyFVJjrT/Wdg7E3VRvah0H3DNWhYhYQ/UGzjtLKc9Oou2C/75mPA/gLFPfc3svMB94sJQyTHVW+DsR8QMRcTJwCfCFuv4bgLXAfwF+GfhgRJw+ziZeA1xYPxbz88CbgPVj1PsicHFEvC4i5gFXAV+qz+h2Anup7k1mvgisiohXR8SxwG+3+jxV6rPBdwPvGevMMCI+DPwScE5pe5RojLpvj4iT6/1/EvC/GOeepmYGL7Fnj5sjYpjqzOUxqjdaWm8orKR6o+brwG6q+22frZ8D/ALw0VLK/QARcSnw+Yg4o5Ty4hjbuQ94PdWZ6VPA+5Lw+CzVJfHfUT2Sc1vdD0opz0fE7wCb6mcal5RS7m1bfy3wKmBbPf9nddmUauyjsVxFdf+2v3HSfFUp5SqAiHgOWFpKuRt4M9W+nA8MAn9B9UaTZrBIbqlI+4iI86jehHnrdPdFOhi8xJakhAEpSQkvsSUp4RmkJCUMSElKTOoxn2OPPbb09fV1qSuSdHBs2bLl6VLKqyeqN6mA7OvrY/PmzfvfK0k6BETEYxPX8hJbklIGpCQlDEhJShiQkpQwICUpYUBKUsKAlKSEASlJCQNSkhIGpCQlDEhJShiQkpQwICUpYUBKUsKAlKSEASlJCQNSkhIGpCQlDEhJShiQkpQwICUpYUBKUsKAlKSEASlJCQNSkhIGpCQlDEhJSvRMdwfWrVtHf39/x/V37NgBwMKFCw9424sWLWLlypUH3I6k2WnaA7K/v5+tDzzI8CuO6aj+3Oe/C8CTLx5Y1+c+v+uA1pc0+017QAIMv+IYXvihZR3VPfKh9QAd15+oHUnKeA9SkhIGpCQlDEhJShiQkpQwICUpYUBKUsKAlKSEASlJCQNSkhIGpCQlDEhJShiQkpQwICUpYUBKUsKAlKSEASlJCQNSkhIGpCQlDEhJShiQkpQwICUpYUBKUsKAlKSEASlJCQNSkhIGpCQlDEhJShiQkpQwICUpYUBKUsKAlKSEASlJCQNSkhIGpCQlDEhJShiQkpQwICUpYUBKUsKAlKSEASlJCQNSkhIGpCQlDEhJShiQkpQwICUpYUBKUsKAlKSEASlJCQNSkhIGpCQlDEhJShiQkpQwICUpYUBKUsKAlKSEASlJCQNSkhJdD8h169axbt26bm/msOH+lA6enm5voL+/v9ubOKy4P6WDx0tsSUoYkJKUMCAlKWFASlLCgJSkhAEpSQkDUpISBqQkJQxISUoYkJKUMCAlKWFASlLCgJSkhAEpSQkDUpISBqQkJQxISUoYkJKUMCAlKWFASlLCgJSkhAEpSQkDUpISBqQkJQxISUoYkJKUMCAlKWFASlLCgJSkhAEpSQkDUpISBqQkJQxISUoYkJKUMCAlKWFASlLCgJSkhAEpSQkDUpISBqQkJQxISUoYkJKUMCAlKWFASlLCgJSkhAEpSQkDUpISBqQkJQxISUoYkJKUMCAlKWFASlLCgJSkhAEpSYme6e6AJuf+++8H4KyzzprejhxkEUEpZZ/ynp4ehoeH6e3tpa+vj3e9611cffXVo5aVUrjgggu4/vrrOf/88/nMZz7Dxz72Mfr6+rj88st56aWXmDNnzqj2e3t7ueSSS/jkJz/J6tWrWbBgwajt3nnnnVx55ZWsXr2a0047jTVr1rB69WoALr/8ckoprF27FmBk2YIFCxgcHBw139Isb67Tvn573fY2s/qrVq1ieHiYuXPnsnbt2o76cuGFF/Lxj3+ciOAjH/nIPvtgrL5MtfHG1u1tA8y94oorOq583XXXXXHBBRdMagMbN24EYOnSpenyb33n+wwd+/qO2ut9ejtAx/XHa+eE+fPSfh2qPve5z013Fw4pe/fuBWB4eJjBwUHuvffefZYBbNmyhVLKyM+vfvWrPPvss9xzzz3s2rWLwcFBdu3aNfLauXMn27Zt45FHHmH37t0sXrx41HbPP/989u7dy6ZNm3j++ee5++672b17N1u3buWee+7h6aef5sUXX2Tr1q0jyxYvXsy11147ar6lWd5cp3399rrtbWb1N23axODg4Ei/OunLtm3b6O/vZ+fOnSPrtMvamCrjje1Atr1mzZpvXXHFFddNVM9L7BnkcDtr3B9jnWWO5bnnnuPWW28dt87AwAClFDZu3Mjg4OBI+Z133snQ0BAAQ0ND3HrrrZRS2LBhAxs2bBipt379ejZs2DDSRn9/Pxs3btynzcHBwZHyVhvt0636zbrtbWb1m30C2LBhQ0d9GRgYGLVOcx+0123fR1Mh2y/j7cup1vVL7B07dvDCCy9w0UUXjbm8v7+fOS919ks9lebsfpb+/u+l/dLsNzw83HG9G2+8kYsvvhiAq666asx29uzZMyqg9+zZQ0SM1Fm7du2oM95WmzfccMNI+Z49e0at396HUsqoNpptZvVbYd5st5O+tK/T3AfAqLrt+2gqZPtlvH051SY8g4yICyJic0Rs3rlz55R3QDrUDQ0Ncfvtt4+aH8tYZ6+tsqGhIQYGBkadebbavOOOO0bKSykj6zSnW/WbddvbzOq396t1djhRX9rXae6D9rrt+2gqZPtlvH051SY8gyylXAdcB3DGGWdM+lRv4cKFAFxzzTVjLr/ooovY8vWnJtvsAdv78lex6JTj0n4dirzEnh49PT284x3vGDU/VoiM9UZSq6ynp4cTTzyRxx9/nKGhoVFtnnPOOaxfv56hoaGRM85SyqjpVv1Sykjd9jaz+jfffPOofkUEJ5988oR9aR9Hcx+0123fR1Mh2y/j7cup5j1IHbbmzp3bcb3ly5ePzF966aVjttPb20tvb+9IeW9vLz09PSN1Vq1axZw5c/Zpc8WKFSPlzXXa11++fPmouu1tZvVbZc1+ddKX9nWa+6C9bvs+mgrZfhlvX041A3IGueuuu6a7C4e81pnGRObNm8e55547bp2+vj4igiVLlox6jOTss88e+cfa09PDueeeS0SwdOnSUU9FLFu2jKVLl460sWjRIpYsWbJPmwsWLBgpb7XRPt2q36zb3mZWv/1JjaVLl3bUl76+vlHrtD9K096XqX7UJtsv4+3LqeZzkJoRpvo5yDVr1tDX10d/f/+Ez0GOdXZy6aWXcuWVV3LZZZdx2mmnMTAwMFKvv7+fUsrIfHPZihUrRs23tJdn02PVHW/d1vLt27ePPAfZaV+az0FmZ2hZG1NlvLF1e9sA0eljEVDdg9y8efOkNtB6l3iie5Av/NCyjto78qH1AB3XH6+dH5th9yBh4v0paWIRsaWUcsZE9bzElqSEASlJCQNSkhIGpCQlDEhJShiQkpQwICUpYUBKUsKAlKSEASlJCQNSkhIGpCQlDEhJShiQkpQwICUpYUBKUsKAlKSEASlJCQNSkhIGpCQlDEhJShiQkpQwICUpYUBKUsKAlKSEASlJCQNSkhIGpCQlDEhJShiQkpQwICUpYUBKUsKAlKSEASlJCQNSkhIGpCQlDEhJShiQkpQwICUpYUBKUsKAlKSEASlJCQNSkhIGpCQlDEhJShiQkpQwICUpYUBKUsKAlKSEASlJCQNSkhIGpCQlDEhJShiQkpTo6fYGFi1a1O1NHFbcn9LB0/WAXLlyZbc3cVhxf0oHj5fYkpQwICUpYUBKUsKAlKSEASlJCQNSkhIGpCQlDEhJShiQkpQwICUpYUBKUsKAlKSEASlJCQNSkhIGpCQlDEhJShiQkpQwICUpYUBKUsKAlKSEASlJCQNSkhIGpCQlDEhJShiQkpQwICUpYUBKUsKAlKSEASlJCQNSkhIGpCQlDEhJShiQkpQwICUpYUBKUsKAlKSEASlJCQNSkhIGpCQlDEhJShiQkpQwICUpYUBKUsKAlKSEASlJCQNSkhIGpCQlDEhJShiQkpQwICUpYUBKUsKAlKSEASlJCQNSkhIGpCQlDEhJShiQkpTome4OAMx9fhdHPrS+w7qDAB3XH2+bcNwBtSFpdpv2gFy0aNGk6u/YMQTAwoUHGm7HTXrbkg4v0x6QK1eunO4uSNKYvAcpSQkDUpISBqQkJQxISUoYkJKUMCAlKWFASlLCgJSkhAEpSQkDUpISBqQkJQxISUoYkJKUMCAlKWFASlLCgJSkhAEpSQkDUpISBqQkJQxISUoYkJKUMCAlKWFASlLCgJSkhAEpSQkDUpISBqQkJQxISUpEKaXzyhE7gcf2YzvHAk/vx3ozheOb2RzfzLY/4zu5lPLqiSpNKiD3V0RsLqWc0fUNTRPHN7M5vpmtm+PzEluSEgakJCUOVkBed5C2M10c38zm+Ga2ro3voNyDlKSZyEtsSUp0NSAjYklEPBwR/RHxoW5uq1si4qSI+JuI+OeI+KeIuKguPyYibo+I7fXP+XV5RMQn6zFvi4i3TO8IOhMRcyPiHyLilnr+dRFxXz2OL0XEy+ryI+r5/np533T2uxMRcXRE3BQRD0XEgxGxeDYdv4i4uP7dfCAivhgRL5/pxy8iPhsR346IBxplkz5mEbGirr89IlZMuiOllK68gLnAo8ApwMuA+4FTu7W9Lo7jeOAt9fQPAI8ApwL/G/hQXf4h4KP19DJgAxDAmcB90z2GDsd5CfD/gFvq+T8FfrGevhb4jXr6N4Fr6+lfBL403X3vYGw3AB+op18GHD1bjh+wEPgGcGTjuJ03048f8DbgLcADjbJJHTPgGODr9c/59fT8SfWjiwNcDNzWmP8w8OHp3vFTMK6/At4BPAwcX5cdDzxcT38aeH+j/ki9Q/UFnAh8BTgbuKX+RXsa6Gk/lsBtwOJ6uqeuF9M9hnHGdlQdINFWPiuOXx2Q36xDoKc+fj87G44f0NcWkJM6ZsD7gU83ykfV6+TVzUvs1oFrebwum7Hqy5E3A/cBx5VSvlUvehI4rp6eieP+BPBBYG89vwB4ppQyVM83xzAyvnr5d+v6h6rXATuBP6pvIVwfEa9klhy/UsoO4PeAfwG+RXU8tjB7jl/TZI/ZAR9L36TpUETMA/4c+O+llGeby0r139OMfBwgIt4FfLuUsmW6+9IlPVSXap8qpbwZ+D7V5dmIGX785gPvpfqP4ATglcCSae3UQXCwjlk3A3IHcFJj/sS6bMaJiF6qcPzjUsqX6+KnIuL4evnxwLfr8pk27p8E3hMRA8CfUF1mXwMcHRE9dZ3mGEbGVy8/Chg8mB2epMeBx0sp99XzN1EF5mw5fucA3yil7Cyl7AG+THVMZ8vxa5rsMTvgY9nNgPz/wOvrd9NeRnVD+K+7uL2uiIgA/i/wYCnl6saivwZa74qtoLo32SpfXr+zdibw3cZlwSGnlPLhUsqJpZQ+qmN0ZynlPwN/A7yvrtY+vta431fXP2TPvkopTwLfjIg31kU/A/wzs+T4UV1anxkRr6h/V1vjmxXHr81kj9ltwDsjYn59pv3OuqxzXb7JuozqXd9Hgcum+6bvfo7hrVSn8tuArfVrGdV9m68A24E7gGPq+gH8YT3mfwTOmO4xTGKsZ/Fv72KfAvw90A/8GXBEXf7yer6/Xn7KdPe7g3GdDmyuj+FfUr2jOWuOH7AGeAh4APg8cMRMP37AF6nuqe6hugr41f05ZsB/rcfaD/zKZPvhJ2kkKeGbNJKUMCAlKWFASlLCgJSkhAEpSQkDUoeUiDg9Ir5WfzvNtoj4haTeeRGxMyK21q8PHOy+avbzMR8dUiLiDVSfJNseESdQfa74TaWUZ9rqnUf1vNt/m4Zu6jDhGaSmXET01d+9+Mf19y/eVH/SYyAifrc+49scEW+JiNsi4tGI+HWAUsojpZTt9fQTVB8nm/DPc0rdYECqW94I/J9SypuAZ6m+hxDgX0oppwN3A5+j+rjbmVSfBhklIn6c6vsbH0228R/ry/CbIuKkpI603wxIdcs3Symb6ukvUH1kE/7t8/j/SPXFpt8rpewEXoyIo1sr119G8Hmqj4e1voat6Wagr5RyGnA71ZfiSlPKgFS3tN/cbs2/WP/c25huzfcARMSrgFupPr9/75iNlzJYSmmtfz3wY1PRaanJgFS3vDYiFtfTvwTc08lK9Tc//QVwYynlpnHqHd+YfQ/w4P52VMoYkOqWh4HfiogHqb4951MdrvefqP4eyXmNR3hOB4iIKyPiPXW9C+tHge4HLqT6OyzSlPIxH025+k9T3FJK+eFp7op0QDyDlKSEZ5CSlPAMUpISBqQkJQxISUoYkJKUMCAlKWFASlLiXwGFDXfWe6fDWgAAAABJRU5ErkJggg=="
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#Let us draw a box plot to visualize the central tendency and dispersion of PRES\n",
    "plt.figure(figsize=(5.5, 5.5))\n",
    "g = sns.boxplot(df['pm2.5'])\n",
    "g.set_title('Box plot of PM2.5')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.5/dist-packages/seaborn/timeseries.py:183: UserWarning: The tsplot function is deprecated and will be removed or replaced (in a substantially altered version) in a future release.\n",
      "  warnings.warn(msg, UserWarning)\n"
     ]
    }
   ],
   "source": [
    "plt.figure(figsize=(5.5, 5.5))\n",
    "g = sns.tsplot(df['pm2.5'])\n",
    "g.set_title('Time series of pm2.5')\n",
    "g.set_xlabel('Index')\n",
    "g.set_ylabel('pm2.5 readings')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Let's plot the series for six months to check if any pattern apparently exists.\n",
    "plt.figure(figsize=(5.5, 5.5))\n",
    "g = sns.tsplot(df['pm2.5'].loc[df['datetime']<=datetime.datetime(year=2010,month=6,day=30)], color='g')\n",
    "g.set_title('pm2.5 during 2010')\n",
    "g.set_xlabel('Index')\n",
    "g.set_ylabel('pm2.5 readings')\n",
    "\n",
    "#Let's zoom in on one month.\n",
    "plt.figure(figsize=(5.5, 5.5))\n",
    "g = sns.tsplot(df['pm2.5'].loc[df['datetime']<=datetime.datetime(year=2010,month=1,day=31)], color='g')\n",
    "g.set_title('pm2.5 during Jan 2010')\n",
    "g.set_xlabel('Index')\n",
    "g.set_ylabel('pm2.5 readings')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Gradient descent algorithms perform better (for example converge faster) if the variables are wihtin range [-1, 1]. Many sources relax the boundary to even [-3, 3]. The pm2.5 variable is mixmax scaled to bound the tranformed variable within [0,1]."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import MinMaxScaler\n",
    "scaler = MinMaxScaler(feature_range=(0, 1))\n",
    "df['scaled_pm2.5'] = scaler.fit_transform(np.array(df['pm2.5']).reshape(-1, 1))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "Before training the model, the dataset is split in two parts - train set and validation set.\n",
    "The neural network is trained on the train set. This means computation of the loss function, back propagation\n",
    "and weights updated by a gradient descent algorithm is done on the train set. The validation set is\n",
    "used to evaluate the model and to determine the number of epochs in model training. Increasing the number of \n",
    "epochs will further decrease the loss function on the train set but might not neccesarily have the same effect\n",
    "for the validation set due to overfitting on the train set.Hence, the number of epochs is controlled by keeping\n",
    "a tap on the loss function computed for the validation set. We use Keras with Tensorflow backend to define and train\n",
    "the model. All the steps involved in model training and validation is done by calling appropriate functions\n",
    "of the Keras API."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Shape of train: (33096, 15)\n",
      "Shape of test: (8661, 15)\n"
     ]
    }
   ],
   "source": [
    "\"\"\"\n",
    "Let's start by splitting the dataset into train and validation. The dataset's time period if from\n",
    "Jan 1st, 2010 to Dec 31st, 2014. The first fours years - 2010 to 2013 is used as train and\n",
    "2014 is kept for validation.\n",
    "\"\"\"\n",
    "split_date = datetime.datetime(year=2014, month=1, day=1, hour=0)\n",
    "df_train = df.loc[df['datetime']<split_date]\n",
    "df_val = df.loc[df['datetime']>=split_date]\n",
    "print('Shape of train:', df_train.shape)\n",
    "print('Shape of test:', df_val.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>No</th>\n",
       "      <th>year</th>\n",
       "      <th>month</th>\n",
       "      <th>day</th>\n",
       "      <th>hour</th>\n",
       "      <th>pm2.5</th>\n",
       "      <th>DEWP</th>\n",
       "      <th>TEMP</th>\n",
       "      <th>PRES</th>\n",
       "      <th>cbwd</th>\n",
       "      <th>Iws</th>\n",
       "      <th>Is</th>\n",
       "      <th>Ir</th>\n",
       "      <th>datetime</th>\n",
       "      <th>scaled_pm2.5</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>25</td>\n",
       "      <td>2010</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>129.0</td>\n",
       "      <td>-16</td>\n",
       "      <td>-4.0</td>\n",
       "      <td>1020.0</td>\n",
       "      <td>SE</td>\n",
       "      <td>1.79</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2010-01-02 00:00:00</td>\n",
       "      <td>0.129779</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>26</td>\n",
       "      <td>2010</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>148.0</td>\n",
       "      <td>-15</td>\n",
       "      <td>-4.0</td>\n",
       "      <td>1020.0</td>\n",
       "      <td>SE</td>\n",
       "      <td>2.68</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2010-01-02 01:00:00</td>\n",
       "      <td>0.148893</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>27</td>\n",
       "      <td>2010</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>159.0</td>\n",
       "      <td>-11</td>\n",
       "      <td>-5.0</td>\n",
       "      <td>1021.0</td>\n",
       "      <td>SE</td>\n",
       "      <td>3.57</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2010-01-02 02:00:00</td>\n",
       "      <td>0.159960</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>28</td>\n",
       "      <td>2010</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>181.0</td>\n",
       "      <td>-7</td>\n",
       "      <td>-5.0</td>\n",
       "      <td>1022.0</td>\n",
       "      <td>SE</td>\n",
       "      <td>5.36</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>2010-01-02 03:00:00</td>\n",
       "      <td>0.182093</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>29</td>\n",
       "      <td>2010</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>4</td>\n",
       "      <td>138.0</td>\n",
       "      <td>-7</td>\n",
       "      <td>-5.0</td>\n",
       "      <td>1022.0</td>\n",
       "      <td>SE</td>\n",
       "      <td>6.25</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>2010-01-02 04:00:00</td>\n",
       "      <td>0.138833</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   No  year  month  day  hour  pm2.5  DEWP  TEMP    PRES cbwd   Iws  Is  Ir  \\\n",
       "0  25  2010      1    2     0  129.0   -16  -4.0  1020.0   SE  1.79   0   0   \n",
       "1  26  2010      1    2     1  148.0   -15  -4.0  1020.0   SE  2.68   0   0   \n",
       "2  27  2010      1    2     2  159.0   -11  -5.0  1021.0   SE  3.57   0   0   \n",
       "3  28  2010      1    2     3  181.0    -7  -5.0  1022.0   SE  5.36   1   0   \n",
       "4  29  2010      1    2     4  138.0    -7  -5.0  1022.0   SE  6.25   2   0   \n",
       "\n",
       "             datetime  scaled_pm2.5  \n",
       "0 2010-01-02 00:00:00      0.129779  \n",
       "1 2010-01-02 01:00:00      0.148893  \n",
       "2 2010-01-02 02:00:00      0.159960  \n",
       "3 2010-01-02 03:00:00      0.182093  \n",
       "4 2010-01-02 04:00:00      0.138833  "
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#First five rows of train\n",
    "df_train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>No</th>\n",
       "      <th>year</th>\n",
       "      <th>month</th>\n",
       "      <th>day</th>\n",
       "      <th>hour</th>\n",
       "      <th>pm2.5</th>\n",
       "      <th>DEWP</th>\n",
       "      <th>TEMP</th>\n",
       "      <th>PRES</th>\n",
       "      <th>cbwd</th>\n",
       "      <th>Iws</th>\n",
       "      <th>Is</th>\n",
       "      <th>Ir</th>\n",
       "      <th>datetime</th>\n",
       "      <th>scaled_pm2.5</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>33096</th>\n",
       "      <td>35065</td>\n",
       "      <td>2014</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>24.0</td>\n",
       "      <td>-20</td>\n",
       "      <td>7.0</td>\n",
       "      <td>1014.0</td>\n",
       "      <td>NW</td>\n",
       "      <td>143.48</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2014-01-01 00:00:00</td>\n",
       "      <td>0.024145</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>33097</th>\n",
       "      <td>35066</td>\n",
       "      <td>2014</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>53.0</td>\n",
       "      <td>-20</td>\n",
       "      <td>7.0</td>\n",
       "      <td>1013.0</td>\n",
       "      <td>NW</td>\n",
       "      <td>147.50</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2014-01-01 01:00:00</td>\n",
       "      <td>0.053320</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>33098</th>\n",
       "      <td>35067</td>\n",
       "      <td>2014</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>65.0</td>\n",
       "      <td>-20</td>\n",
       "      <td>6.0</td>\n",
       "      <td>1013.0</td>\n",
       "      <td>NW</td>\n",
       "      <td>151.52</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2014-01-01 02:00:00</td>\n",
       "      <td>0.065392</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>33099</th>\n",
       "      <td>35068</td>\n",
       "      <td>2014</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>70.0</td>\n",
       "      <td>-20</td>\n",
       "      <td>6.0</td>\n",
       "      <td>1013.0</td>\n",
       "      <td>NW</td>\n",
       "      <td>153.31</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2014-01-01 03:00:00</td>\n",
       "      <td>0.070423</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>33100</th>\n",
       "      <td>35069</td>\n",
       "      <td>2014</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>79.0</td>\n",
       "      <td>-18</td>\n",
       "      <td>3.0</td>\n",
       "      <td>1012.0</td>\n",
       "      <td>cv</td>\n",
       "      <td>0.89</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2014-01-01 04:00:00</td>\n",
       "      <td>0.079477</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          No  year  month  day  hour  pm2.5  DEWP  TEMP    PRES cbwd     Iws  \\\n",
       "33096  35065  2014      1    1     0   24.0   -20   7.0  1014.0   NW  143.48   \n",
       "33097  35066  2014      1    1     1   53.0   -20   7.0  1013.0   NW  147.50   \n",
       "33098  35067  2014      1    1     2   65.0   -20   6.0  1013.0   NW  151.52   \n",
       "33099  35068  2014      1    1     3   70.0   -20   6.0  1013.0   NW  153.31   \n",
       "33100  35069  2014      1    1     4   79.0   -18   3.0  1012.0   cv    0.89   \n",
       "\n",
       "       Is  Ir            datetime  scaled_pm2.5  \n",
       "33096   0   0 2014-01-01 00:00:00      0.024145  \n",
       "33097   0   0 2014-01-01 01:00:00      0.053320  \n",
       "33098   0   0 2014-01-01 02:00:00      0.065392  \n",
       "33099   0   0 2014-01-01 03:00:00      0.070423  \n",
       "33100   0   0 2014-01-01 04:00:00      0.079477  "
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#First five rows of validation\n",
    "df_val.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Reset the indices of the validation set\n",
    "df_val.reset_index(drop=True, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.5/dist-packages/seaborn/timeseries.py:183: UserWarning: The tsplot function is deprecated and will be removed or replaced (in a substantially altered version) in a future release.\n",
      "  warnings.warn(msg, UserWarning)\n"
     ]
    }
   ],
   "source": [
    "\"\"\"\n",
    "The train and validation time series of scaled pm2.5 is also plotted.\n",
    "\"\"\"\n",
    "\n",
    "plt.figure(figsize=(5.5, 5.5))\n",
    "g = sns.tsplot(df_train['scaled_pm2.5'], color='b')\n",
    "g.set_title('Time series of scaled pm2.5 in train set')\n",
    "g.set_xlabel('Index')\n",
    "g.set_ylabel('Scaled pm2.5 readings')\n",
    "\n",
    "plt.figure(figsize=(5.5, 5.5))\n",
    "g = sns.tsplot(df_val['scaled_pm2.5'], color='r')\n",
    "g.set_title('Time series of scaled pm2.5 in validation set')\n",
    "g.set_xlabel('Index')\n",
    "g.set_ylabel('Scaled pm2.5 readings')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now we need to generate regressors (X) and target variable (y) for train and validation. 2-D array of regressor and 1-D array of target is created from the original 1-D array of columm standardized_pm2.5 in the DataFrames. For the time series forecasting model, Past seven days of observations are used to predict for the next day. This is equivalent to a AR(7) model. We define a function which takes the original time series and the number of timesteps in regressors as input to generate the arrays of X and y."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "def makeXy(ts, nb_timesteps):\n",
    "    \"\"\"\n",
    "    Input: \n",
    "           ts: original time series\n",
    "           nb_timesteps: number of time steps in the regressors\n",
    "    Output: \n",
    "           X: 2-D array of regressors\n",
    "           y: 1-D array of target \n",
    "    \"\"\"\n",
    "    X = []\n",
    "    y = []\n",
    "    for i in range(nb_timesteps, ts.shape[0]):\n",
    "        X.append(list(ts.loc[i-nb_timesteps:i-1]))\n",
    "        y.append(ts.loc[i])\n",
    "    X, y = np.array(X), np.array(y)\n",
    "    return X, y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Shape of train arrays: (33089, 7) (33089,)\n"
     ]
    }
   ],
   "source": [
    "X_train, y_train = makeXy(df_train['scaled_pm2.5'], 7)\n",
    "print('Shape of train arrays:', X_train.shape, y_train.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Shape of validation arrays: (8654, 7) (8654,)\n"
     ]
    }
   ],
   "source": [
    "X_val, y_val = makeXy(df_val['scaled_pm2.5'], 7)\n",
    "print('Shape of validation arrays:', X_val.shape, y_val.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The input to convolution layers must be of shape (number of samples, number of timesteps, number of features per timestep). In this case we are modeling only pm2.5 hence number of features per timestep is one. Number of timesteps is seven and number of samples is same as the number of samples in X_train and X_val, which are reshaped to 3D arrays."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Shape of arrays after reshaping: (33089, 7, 1) (8654, 7, 1)\n"
     ]
    }
   ],
   "source": [
    "#X_train and X_val are reshaped to 3D arrays\n",
    "X_train, X_val = X_train.reshape((X_train.shape[0], X_train.shape[1], 1)),\\\n",
    "                 X_val.reshape((X_val.shape[0], X_val.shape[1], 1))\n",
    "print('Shape of arrays after reshaping:', X_train.shape, X_val.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now we define the MLP using the Keras Functional API. In this approach a layer can be declared as the input of the following layer at the time of defining the next layer. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.5/dist-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
      "  from ._conv import register_converters as _register_converters\n",
      "Using TensorFlow backend.\n"
     ]
    }
   ],
   "source": [
    "from keras.layers import Dense\n",
    "from keras.layers import Input\n",
    "from keras.layers import Dropout\n",
    "from keras.layers import Flatten\n",
    "from keras.layers.convolutional import ZeroPadding1D\n",
    "from keras.layers.convolutional import Conv1D\n",
    "from keras.layers.pooling import AveragePooling1D\n",
    "from keras.optimizers import SGD\n",
    "from keras.models import Model\n",
    "from keras.models import load_model\n",
    "from keras.callbacks import ModelCheckpoint"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Define input layer which has shape (None, 7) and of type float32. None indicates the number of instances\n",
    "input_layer = Input(shape=(7,1), dtype='float32')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "ZeroPadding1D layer is added next to add zeros at the begining and end of each series. Zeropadding ensure that the downstream convolution layer does not reduce the dimension of the output sequences. Pooling layer, added after the convolution layer is used to downsampling the input."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Add zero padding\n",
    "zeropadding_layer = ZeroPadding1D(padding=1)(input_layer)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The first argument of Conv1D is the number of filters, which determine the number of features in the output. Second argument indicates length of the 1D convolution window. The third argument is strides and represent the number of places to shift the convolution window. Lastly, setting use_bias as True, add a bias value during computation of an output feature. Here, the 1D convolution can be thought of as generating local AR models over rolling window of three time units."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:From /usr/local/lib/python3.5/dist-packages/tensorflow/python/util/deprecation.py:497: calling conv1d (from tensorflow.python.ops.nn_ops) with data_format=NHWC is deprecated and will be removed in a future version.\n",
      "Instructions for updating:\n",
      "`NHWC` for data_format is deprecated, use `NWC` instead\n"
     ]
    }
   ],
   "source": [
    "#Add 1D convolution layers\n",
    "conv1D_layer1 = Conv1D(64, 3, strides=1, use_bias=True)(zeropadding_layer)\n",
    "conv1D_layer2 = Conv1D(32, 3, strides=1, use_bias=True)(conv1D_layer1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "AveragePooling1D is added next to downsample the input by taking average over pool size of three with stride of one timesteps. The average pooling in this case can be thought of as taking moving averages over a rolling window of three time units. We have used average pooling instead of max pooling to generate the moving averages."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Add AveragePooling1D layer\n",
    "avgpooling_layer = AveragePooling1D(pool_size=3, strides=1)(conv1D_layer2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The preceeding pooling layer returns 3D output. Hence before passing to the output layer, a Flatten layer is added. The Flatten layer reshapes the input to (number of samples, number of timesteps*number of features per timestep), which is then fed to the output layer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Add Flatten layer\n",
    "flatten_layer = Flatten()(avgpooling_layer)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "#A couple of Dense layers are also added\n",
    "dense_layer1 = Dense(32)(avgpooling_layer)\n",
    "dense_layer2 = Dense(16)(dense_layer1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "dropout_layer = Dropout(0.2)(flatten_layer)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Finally the output layer gives prediction for the next day's air pressure.\n",
    "output_layer = Dense(1, activation='linear')(dropout_layer)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The input, dense and output layers will now be packed inside a Model, which is wrapper class for training and making\n",
    "predictions. The box plot of pm2.5 shows the presence of outliers. Hence, mean absolute error (MAE) is used as absolute deviations suffer less fluctuations compared to squared deviations.\n",
    "\n",
    "The network's weights are optimized by the Adam algorithm. Adam stands for adaptive moment estimation\n",
    "and has been a popular choice for training deep neural networks. Unlike, stochastic gradient descent, adam uses\n",
    "different learning rates for each weight and separately updates the same as the training progresses. The learning rate of a weight is updated based on exponentially weighted moving averages of the weight's gradients and the squared gradients."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "_________________________________________________________________\n",
      "Layer (type)                 Output Shape              Param #   \n",
      "=================================================================\n",
      "input_1 (InputLayer)         (None, 7, 1)              0         \n",
      "_________________________________________________________________\n",
      "zero_padding1d_1 (ZeroPaddin (None, 9, 1)              0         \n",
      "_________________________________________________________________\n",
      "conv1d_1 (Conv1D)            (None, 7, 64)             256       \n",
      "_________________________________________________________________\n",
      "conv1d_2 (Conv1D)            (None, 5, 32)             6176      \n",
      "_________________________________________________________________\n",
      "average_pooling1d_1 (Average (None, 3, 32)             0         \n",
      "_________________________________________________________________\n",
      "flatten_1 (Flatten)          (None, 96)                0         \n",
      "_________________________________________________________________\n",
      "dropout_1 (Dropout)          (None, 96)                0         \n",
      "_________________________________________________________________\n",
      "dense_3 (Dense)              (None, 1)                 97        \n",
      "=================================================================\n",
      "Total params: 6,529\n",
      "Trainable params: 6,529\n",
      "Non-trainable params: 0\n",
      "_________________________________________________________________\n"
     ]
    }
   ],
   "source": [
    "ts_model = Model(inputs=input_layer, outputs=output_layer)\n",
    "ts_model.compile(loss='mean_absolute_error', optimizer='adam')#SGD(lr=0.001, decay=1e-5))\n",
    "ts_model.summary()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The model is trained by calling the fit function on the model object and passing the X_train and y_train. The training \n",
    "is done for a predefined number of epochs. Additionally, batch_size defines the number of samples of train set to be\n",
    "used for a instance of back propagation.The validation dataset is also passed to evaluate the model after every epoch\n",
    "completes. A ModelCheckpoint object tracks the loss function on the validation set and saves the model for the epoch,\n",
    "at which the loss function has been minimum."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train on 33089 samples, validate on 8654 samples\n",
      "Epoch 1/20\n",
      "33089/33089 [==============================] - 9s 271us/step - loss: 0.0185 - val_loss: 0.0138\n",
      "Epoch 2/20\n",
      "33089/33089 [==============================] - 10s 289us/step - loss: 0.0168 - val_loss: 0.0139\n",
      "Epoch 3/20\n",
      "33089/33089 [==============================] - 10s 295us/step - loss: 0.0165 - val_loss: 0.0138\n",
      "Epoch 4/20\n",
      "33089/33089 [==============================] - 10s 293us/step - loss: 0.0165 - val_loss: 0.0142\n",
      "Epoch 5/20\n",
      "33089/33089 [==============================] - 9s 284us/step - loss: 0.0163 - val_loss: 0.0133\n",
      "Epoch 6/20\n",
      "33089/33089 [==============================] - 10s 294us/step - loss: 0.0162 - val_loss: 0.0151\n",
      "Epoch 7/20\n",
      "33089/33089 [==============================] - 10s 298us/step - loss: 0.0162 - val_loss: 0.0134\n",
      "Epoch 8/20\n",
      "33089/33089 [==============================] - 10s 292us/step - loss: 0.0163 - val_loss: 0.0149\n",
      "Epoch 9/20\n",
      "33089/33089 [==============================] - 10s 288us/step - loss: 0.0161 - val_loss: 0.0133\n",
      "Epoch 10/20\n",
      "33089/33089 [==============================] - 10s 295us/step - loss: 0.0160 - val_loss: 0.0131\n",
      "Epoch 11/20\n",
      "33089/33089 [==============================] - 10s 301us/step - loss: 0.0161 - val_loss: 0.0147\n",
      "Epoch 12/20\n",
      "33089/33089 [==============================] - 9s 287us/step - loss: 0.0160 - val_loss: 0.0138\n",
      "Epoch 13/20\n",
      "33089/33089 [==============================] - 10s 296us/step - loss: 0.0159 - val_loss: 0.0140\n",
      "Epoch 14/20\n",
      "33089/33089 [==============================] - 10s 295us/step - loss: 0.0160 - val_loss: 0.0134\n",
      "Epoch 15/20\n",
      "33089/33089 [==============================] - 10s 292us/step - loss: 0.0160 - val_loss: 0.0130\n",
      "Epoch 16/20\n",
      "33089/33089 [==============================] - 10s 288us/step - loss: 0.0159 - val_loss: 0.0138\n",
      "Epoch 17/20\n",
      "33089/33089 [==============================] - 10s 300us/step - loss: 0.0161 - val_loss: 0.0132\n",
      "Epoch 18/20\n",
      "33089/33089 [==============================] - 10s 302us/step - loss: 0.0160 - val_loss: 0.0137\n",
      "Epoch 19/20\n",
      "33089/33089 [==============================] - 10s 314us/step - loss: 0.0159 - val_loss: 0.0146\n",
      "Epoch 20/20\n",
      "33089/33089 [==============================] - 10s 294us/step - loss: 0.0160 - val_loss: 0.0130\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<keras.callbacks.History at 0x7f7068639be0>"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "save_weights_at = os.path.join('keras_models', 'PRSA_data_PM2.5_1DConv_weights.{epoch:02d}-{val_loss:.4f}.hdf5')\n",
    "save_best = ModelCheckpoint(save_weights_at, monitor='val_loss', verbose=0,\n",
    "                            save_best_only=True, save_weights_only=False, mode='min',\n",
    "                            period=1)\n",
    "ts_model.fit(x=X_train, y=y_train, batch_size=16, epochs=20,\n",
    "             verbose=1, callbacks=[save_best], validation_data=(X_val, y_val),\n",
    "             shuffle=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Prediction are made for the pm2.5 from the best saved model. The model's predictions, which are on the standardized  pm2.5, are inverse transformed to get predictions of original pm2.5."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "ename": "NameError",
     "evalue": "name 'os' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-23-9a710f268333>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mbest_model\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mload_model\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'keras_models'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'PRSA_data_PM2.5_1DConv_weights.18-0.0128.hdf5'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      2\u001b[0m \u001b[0mpreds\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mbest_model\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpredict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_val\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      3\u001b[0m \u001b[0mpred_pm25\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mscaler\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0minverse_transform\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpreds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      4\u001b[0m \u001b[0mpred_pm25\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msqueeze\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpred_pm25\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mNameError\u001b[0m: name 'os' is not defined"
     ]
    }
   ],
   "source": [
    "best_model = load_model(os.path.join('keras_models', 'PRSA_data_PM2.5_1DConv_weights.18-0.0128.hdf5'))\n",
    "preds = best_model.predict(X_val)\n",
    "pred_pm25 = scaler.inverse_transform(preds)\n",
    "pred_pm25 = np.squeeze(pred_pm25)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.metrics import mean_absolute_error"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "ename": "NameError",
     "evalue": "name 'df_val' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-25-4135d558bcb1>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mmae\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmean_absolute_error\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf_val\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'pm2.5'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mloc\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m7\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpred_pm25\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      2\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'MAE for the validation set:'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mround\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmae\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m4\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mNameError\u001b[0m: name 'df_val' is not defined"
     ]
    }
   ],
   "source": [
    "mae = mean_absolute_error(df_val['pm2.5'].loc[7:], pred_pm25)\n",
    "print('MAE for the validation set:', round(mae, 4))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "ename": "NameError",
     "evalue": "name 'plt' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-26-5582b33dc36c>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0;31m#Let's plot the first 50 actual and predicted values of pm2.5.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mplt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfigure\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfigsize\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m5.5\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m5.5\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      3\u001b[0m \u001b[0mplt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mplot\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m50\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdf_val\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'pm2.5'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mloc\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m7\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;36m56\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlinestyle\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'-'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmarker\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'*'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcolor\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'r'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      4\u001b[0m \u001b[0mplt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mplot\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m50\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpred_pm25\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;36m50\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlinestyle\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'-'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmarker\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'.'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcolor\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'b'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      5\u001b[0m \u001b[0mplt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlegend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'Actual'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'Predicted'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mloc\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mNameError\u001b[0m: name 'plt' is not defined"
     ]
    }
   ],
   "source": [
    "#Let's plot the first 50 actual and predicted values of pm2.5.\n",
    "plt.figure(figsize=(5.5, 5.5))\n",
    "plt.plot(range(50), df_val['pm2.5'].loc[7:56], linestyle='-', marker='*', color='r')\n",
    "plt.plot(range(50), pred_pm25[:50], linestyle='-', marker='.', color='b')\n",
    "plt.legend(['Actual','Predicted'], loc=2)\n",
    "plt.title('Actual vs Predicted pm2.5')\n",
    "plt.ylabel('pm2.5')\n",
    "plt.xlabel('Index')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (Ubuntu Linux)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}