{ "cells": [ { "cell_type": "markdown", "metadata": { "collapsed": false }, "source": [ "# SIT307 Assignment 2\n", "\n", "Students:\n", "\n", "Mitchell Razga - 218232709\n", "\n", "Madushi Menahari Jayasundara - 217206634\n", "\n", "Mario Silva - 217425643\n" ] }, { "cell_type": "markdown", "metadata": { "collapsed": false }, "source": [ "## Load Modules and Packages" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [ ], "source": [ "# Import Modules and Packages\n", "\n", "# Data handling Modules\n", "import numpy as np\n", "import pandas as pd\n", "import itertools as iter\n", "\n", "# Graphing and Visulisation Modules\n", "import matplotlib.pyplot as plt\n", "from IPython.display import Image, display, HTML\n", "import pydotplus\n", "import seaborn as sns\n", "\n", "# Graphing default settings\n", "%matplotlib inline\n", "sns.set(font_scale=1.25)\n", "\n", "# Sklearn Modules\n", "import sklearn\n", "from sklearn import tree\n", "from sklearn import preprocessing\n", "from sklearn.metrics import accuracy_score, classification_report\n", "from sklearn.model_selection import train_test_split\n", "# Classifiers\n", "from sklearn.tree import DecisionTreeClassifier\n", "from sklearn.neighbors import KNeighborsClassifier\n", "from sklearn.ensemble import RandomForestClassifier\n", "from sklearn.svm import SVC" ] }, { "cell_type": "markdown", "metadata": { "collapsed": false }, "source": [ "## Import Data" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Reading data from CSV....\n" ] } ], "source": [ "# Read Dataset from CSV and create a pandas DataFrame\n", "print(\"Reading data from CSV....\")\n", "data = pd.read_csv('data/PPD.csv', delimiter=',')" ] }, { "cell_type": "markdown", "metadata": { "collapsed": false }, "source": [ "## Encode Data" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
YearSemesterHands RaisedResources VisitedAnnouncements ViewedDiscussions Participated InGradeGender_FGender_MNationality_Egypt...Subject_ScienceSubject_SpanishParent Responsible_FatherParent Responsible_MotherParent Survey Completed_NoParent Survey Completed_YesParent School Satisfaction_BadParent School Satisfaction_GoodAbsence Days_Above-7Absence Days_Under-7
08130903335Middle-Level100...0010101001
18135805070High-Level100...0010010101
22198886031High-Level100...0001100101
32110202297Low-Level100...0010101010
42111202098Low-Level100...0010101010
52189924028High-Level100...0001010101
68225153253Middle-Level100...0010101001
78280715251Middle-Level100...0010010101
88285661223Middle-Level100...0010101001
98245585243High-Level100...0001010101
108222514240Middle-Level100...0010011001
118272514224High-Level100...0001011010
122275815134High-Level100...0001100101
1322591998Low-Level100...0010101010
142210122993Low-Level100...0010101010
152279934923High-Level100...0001010101
168125151233Low-Level010...0010101010
172120883128Middle-Level010...0010010110
182190984138High-Level010...0010010101
192180952128High-Level010...0010010101
202110187138Middle-Level010...0010010110
212110175021Middle-Level010...0010101001
222110104051Low-Level010...0010101010
232120905061Middle-Level010...0001011010
242110305091Low-Level010...0010011010
252169822028High-Level010...0001010101
262115902197Middle-Level010...0001010101
2721410117Low-Level010...0001100110
288285756253High-Level010...0001011001
298210353013Low-Level010...0001101010
..................................................................
4504232143229Middle-Level010...1010100110
451422234159Low-Level010...1010101010
4524272645989High-Level010...1001010101
4534282847979Middle-Level010...1001010110
4544242342939Middle-Level010...1010100110
4558287884010Middle-Level100...0110010101
45611210514040Low-Level010...0110101010
4578217214214Middle-Level010...0110010101
4588227414914Middle-Level010...0101101001
4598270813984Middle-Level010...0110101001
4608227908214High-Level010...0101010101
4618217614214Middle-Level010...0110101001
4628287814219High-Level010...0101010101
463827612214Low-Level010...0110101010
46482175024Low-Level010...0110101010
465825214214Low-Level010...0110100110
4668227413261Middle-Level010...0101011010
4678296614294High-Level010...0101011001
4688257514634Middle-Level010...0101010101
4698277694113Middle-Level010...0110010101
4708280514024Middle-Level010...0110100101
4718262618240Middle-Level010...0110011001
4728272831290High-Level010...0101010101
4738287812270High-Level010...0101011001
4748272901230Middle-Level010...0101101001
475822116230Low-Level010...0110101001
4768253210Low-Level010...0110010101
477825172110Low-Level010...0110101010
4788251421229Middle-Level010...0101101010
47982972120Low-Level010...0110010110
\n", "

480 rows × 63 columns

\n", "
" ] }, "execution_count": 11, "metadata": { }, "output_type": "execute_result" } ], "source": [ "# Create new pandas DataFrame\n", "data_encoded = pd.DataFrame\n", "\n", "# Encode existing DataFrame and add to new dataframe\n", "encode_columns = ['Gender', 'Nationality', 'Birthplace', 'Education Level', 'Classroom', 'Subject', 'Parent Responsible', 'Parent Survey Completed', 'Parent School Satisfaction', 'Absence Days']\n", "data_encoded = pd.get_dummies(data, columns = encode_columns)\n", "data_encoded" ] }, { "cell_type": "markdown", "metadata": { "collapsed": false }, "source": [ "## Configure data for classifier" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ ], "source": [ "# Remove 'target' column to allow for sklearn DecisionTreeClassifier handling\n", "test_data = data_encoded.drop(columns='Grade')\n", "target_data = data_encoded['Grade'].values" ] }, { "cell_type": "markdown", "metadata": { "collapsed": false }, "source": [ "## Generate Test Data - 50/50 Split" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Training Data size: 240 240 \n", "Test Data size: 240 240\n" ] } ], "source": [ "# Split data\n", "# CHANGE to predicted/estimated\n", "X_train, X_test, y_train, y_test = train_test_split(test_data, target_data, test_size = .5)\n", "print(\"Training Data size:\", len(X_train),len(y_train), \"\\nTest Data size: \",len(X_test),len(y_test))" ] }, { "cell_type": "markdown", "metadata": { "collapsed": false }, "source": [ "## Generate Decision Tree - 50/50 Split" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accurracy: 0.6875\n", "\n" ] } ], "source": [ "# Generate CART Decision Tree\n", "dtree = DecisionTreeClassifier(random_state=2000)\n", "dtree.fit(X_train, y_train)\n", "predictions = dtree.predict(X_test)\n", "\n", "# Check accurracy\n", "print(\"Accurracy: \", accuracy_score(y_test, predictions))\n", "print(dtree.get_params)\n", "#print(dtree.feature_importances_)" ] }, { "cell_type": "markdown", "metadata": { "collapsed": false }, "source": [ "## Visualise" ] }, { "cell_type": "raw", "metadata": { "collapsed": false }, "source": [ "# Visualise Decision Tree\n", "# Create DOT data\n", "dot_data = tree.export_graphviz(dtree, out_file=None,\n", " feature_names=test_data.columns,\n", " class_names=['High-Level', 'Middle-Level', 'Low-Level'])\n", "\n", "# Draw graph\n", "graph = pydotplus.graph_from_dot_data(dot_data)\n", "\n", "# Show graph\n", "Image(graph.create_png())" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [ ], "source": [ "# Procedure for testing accurracy\n", "def DecisionTreeParameterTest(parameter_variables, parameter_values):\n", " score = []\n", " parameters = {}\n", " for pvalue in parameter_values:\n", " print(parameters)\n", " parameters[parameter_variables]=pvalue\n", " dtree = DecisionTreeClassifier(random_state=2000, **parameters)\n", " dtree_predictions = dtree.fit(X_train, y_train).predict(X_test)\n", " dtree_accuracy = accuracy_score(y_test, dtree_predictions)\n", " score.append(dtree_accuracy)\n", " plt.subplots(figsize=(10,10))\n", " plot = sns.pointplot(x=parameter_values, y=score)\n", " plot.set(xlabel='Parameter Values', ylabel='Accuracy', title=\"Decision Tree Parameter Test: \" + parameter_variables)\n", " plt.savefig(\"Decision Tree Parameter Test: \" + parameter_variables + \".png\")" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false }, "outputs": [ { "ename": "IndentationError", "evalue": "expected an indented block (, line 38)", "output_type": "error", "traceback": [ "\u001b[0;36m File \u001b[0;32m\"\"\u001b[0;36m, line \u001b[0;32m38\u001b[0m\n\u001b[0;31m def PlotDecisionTreeAccuracy(x, y):\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mIndentationError\u001b[0m\u001b[0;31m:\u001b[0m expected an indented block\n" ] } ], "source": [ "# Functions for testing accurracy\n", "### Tidy up\n", "def DecisionTreeAccuracyTest(parameters):\n", " # Initialise dictionary to hold the results\n", " results = {}\n", " # Generate a decision with each parameter combination\n", " TestAllParameterCombinations(parameters, results)\n", " # Display Results\n", " PlotDecisionTreeAccuracy(list(results.keys()), list(results.values()))\n", " DisplayResults(results)\n", "\n", "def TestAllParameterCombinations(parameters, results):\n", " # Combine parameters and values into a list\n", " parameters, values = zip(*parameters.items())\n", " # Get every possible combination\n", " for value in iter.product(*values):\n", " # Determine parameters\n", " current_parameters = dict(zip(parameters, value))\n", " # Generate Decision Tree with these parameters\n", " GenerateDecisionTree(current_parameters, results)\n", " return results\n", "\n", "def GenerateDecisionTree(current_parameters, results):\n", " # Generate Decision Tree using specified parameters\n", " dtree = DecisionTreeClassifier(random_state=2000, **current_parameters)\n", " # Determine Accuracy of the Decision Tree\n", " dtree_predictions = dtree.fit(X_train, y_train).predict(X_test)\n", " dtree_accuracy = accuracy_score(y_test, dtree_predictions)\n", " # Convert parameter text into readable format\n", " formatted = \" \\n \".join((\"{} = {}\".format(*i) for i in current_parameters.items()))\n", " # Add test results to dictionary\n", " results.update({formatted: dtree_accuracy})\n", " return results\n", "\n", "def FindBestParameterVariables():\n", " # Get P\n", "\n", "def PlotDecisionTreeAccuracy(x, y):\n", " # Create Plot\n", " plt.subplots(figsize=(15,15))\n", " plot = sns.pointplot(x=x, y=y)\n", " # Set Plot visuals\n", " plot.set(xlabel='Parameter Values', ylabel='Accuracy', title=\"Decision Tree Parameter Test\")\n", " plot.set_xticklabels(plot.get_xticklabels(), rotation=90)\n", " # Save and Show Plot\n", " plt.savefig(\"Decision Tree Parameter Test.png\")\n", " plt.show()\n", "\n", "def DisplayResults(results):\n", " # Sort Results highest to lowest and display top 5\n", " sorted_results = {}\n", " for key in sorted(results, key=results.get, reverse=True)[:5]:\n", " sorted_results.update({key: results[key]})\n", " # Generate Pandas DataFrame from dictionary\n", " df = pd.DataFrame(list(sorted_results.items()), columns=['Parameters', 'Accuracy'])\n", " # Pretty print DataFrame\n", " display(HTML(df.to_html().replace(\"\\\\n\",\"
\")))" ] }, { "cell_type": "code", "execution_count": 0, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ ], "source": [ "# Best default decision tree\n", "DecisionTreeAccuracyTest(parameters={'max_depth': [1, 5, 10, 25], 'min_samples_split': [2, 4, 8, 16]})" ] }, { "cell_type": "code", "execution_count": 0, "metadata": { "collapsed": false }, "outputs": [ ], "source": [ "# Best default min samples split\n", "DecisionTreeAccuracyTest(parameters={'min_samples_split': [2, 4, 8, 16, 32, 64, 128, 256, 512]})" ] }, { "cell_type": "code", "execution_count": 0, "metadata": { "collapsed": false }, "outputs": [ ], "source": [ "# Best default min samples leaf\n", "DecisionTreeAccuracyTest(parameters={'min_samples_leaf': [1, 5, 10, 25, 50, 100, 200, 300, 400, 500, 600, 700]})" ] }, { "cell_type": "code", "execution_count": 0, "metadata": { "collapsed": false }, "outputs": [ ], "source": [ "# Best default min weight fraction leaf\n", "DecisionTreeAccuracyTest(parameters={'min_weight_fraction_leaf': [0.1, 0.2, 0.3, 0.4, 0.5]})" ] }, { "cell_type": "code", "execution_count": 0, "metadata": { "collapsed": false }, "outputs": [ ], "source": [ "## Best default max features\n", "DecisionTreeAccuracyTest(parameters={'max_features': [1, 5, 10, 50, 60, 62]})" ] }, { "cell_type": "code", "execution_count": 0, "metadata": { "collapsed": false }, "outputs": [ ], "source": [ "## Best default max features\n", "DecisionTreeAccuracyTest(parameters={'max_leaf_nodes': [2, 4, 8, 16, 32, 64, 128, 256]})" ] }, { "cell_type": "code", "execution_count": 0, "metadata": { "collapsed": false }, "outputs": [ ], "source": [ "## Best default max features\n", "DecisionTreeAccuracyTest(parameters={'min_impurity_decrease': [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]})" ] }, { "cell_type": "code", "execution_count": 0, "metadata": { "collapsed": false }, "outputs": [ ], "source": [ ] }, { "cell_type": "markdown", "metadata": { "collapsed": false }, "source": [ "## Random Forests\n" ] }, { "cell_type": "code", "execution_count": 0, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ ], "source": [ "# Generate Random Forest\n", "rf = RandomForestClassifier(random_state=2000)\n", "\n", "# Train the model on training data\n", "rf.fit(X_train, y_train);\n", "predictions = rf.predict(X_test)\n", "\n", "# Check accurracy\n", "print(\"Train Accuracy: \", accuracy_score(y_train, rf.predict(X_train)))\n", "print(\"Test Accuracy: \", accuracy_score(y_test, predictions))\n", "print(\"Classification Report: \\n\", classification_report(y_test, predictions))" ] }, { "cell_type": "code", "execution_count": 0, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ ], "source": [ ] }, { "cell_type": "markdown", "metadata": { "collapsed": false }, "source": [ "# Improving Accuracy" ] }, { "cell_type": "raw", "metadata": { "collapsed": false }, "source": [ "Below are methods we used to improve accuracy" ] }, { "cell_type": "markdown", "metadata": { "collapsed": false }, "source": [ "## Remove Outliers" ] }, { "cell_type": "code", "execution_count": 0, "metadata": { "collapsed": false }, "outputs": [ ], "source": [ ] }, { "cell_type": "markdown", "metadata": { "collapsed": false }, "source": [ "## Remove less important columns" ] }, { "cell_type": "code", "execution_count": 0, "metadata": { "collapsed": false }, "outputs": [ ], "source": [ "data2 = data.drop(columns='Grade')" ] }, { "cell_type": "markdown", "metadata": { "collapsed": false }, "source": [ "## Show Correlation" ] }, { "cell_type": "code", "execution_count": 0, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ ], "source": [ "corr = data_encoded.corr()\n", "corr" ] }, { "cell_type": "code", "execution_count": 0, "metadata": { "collapsed": false }, "outputs": [ ], "source": [ ] }, { "cell_type": "markdown", "metadata": { "collapsed": false }, "source": [ "## Remove Columns" ] }, { "cell_type": "code", "execution_count": 0, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ ], "source": [ "data.drop('Grade',axis=1)\n", "data.drop('Birthplace',axis=1)\n", "data.drop('Nationality',axis=1)\n", "data.drop('Parent Responsible',axis=1)\n", "data.drop('Parent School Satisfaction',axis=1)\n", "data_removed = data_encoded.drop(columns='Grade','Birthplace','Nationality','Parent Responsible','Parent School Satisfaction')\n", "data_removed" ] }, { "cell_type": "markdown", "metadata": { "collapsed": false }, "source": [ "## Increase Split" ] }, { "cell_type": "raw", "metadata": { "collapsed": false }, "source": [ "# Split data 50/50\n", "X_train, X_test, y_train, y_test = train_test_split(test_data, target_data, test_size = .5)\n", "print(\"Training Data size:\", len(X_train),len(y_train), \"\\nTest Data size: \",len(X_test),len(y_test))" ] }, { "cell_type": "code", "execution_count": 0, "metadata": { "collapsed": false }, "outputs": [ ], "source": [ "# Split data 80/20\n", "X_train, X_test, y_train, y_test = train_test_split(test_data, target_data, test_size = .2)\n", "print(\"Training Data size:\", len(X_train),len(y_train), \"\\nTest Data size: \",len(X_test),len(y_test))" ] }, { "cell_type": "code", "execution_count": 0, "metadata": { "collapsed": false }, "outputs": [ ], "source": [ "# Generate Random Forest\n", "rf = RandomForestClassifier(random_state=2000)\n", "\n", "# Train the model on training data\n", "rf.fit(X_train, y_train);\n", "predictions = rf.predict(X_test)\n", "\n", "# Check accurracy\n", "print(\"Train Accuracy: \", accuracy_score(y_train, rf.predict(X_train)))\n", "print(\"Test Accuracy: \", accuracy_score(y_test, predictions))\n", "print(\"Classification Report: \\n\", classification_report(y_test, predictions))" ] }, { "cell_type": "markdown", "metadata": { "collapsed": false }, "source": [ "## Change parameters" ] }, { "cell_type": "code", "execution_count": 0, "metadata": { "collapsed": false }, "outputs": [ ], "source": [ ] }, { "cell_type": "code", "execution_count": 0, "metadata": { "collapsed": false }, "outputs": [ ], "source": [ "# Exploring the number of estimators in the random forest\n", "score = []\n", "est = []\n", "estimators = [1, 5, 10, 25, 50, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]\n", "for e in estimators:\n", " rf = RandomForestClassifier(n_estimators=e, random_state=2000)\n", " rf_predictions = rf.fit(X_train, y_train).predict(X_test)\n", " rf_accuracy = accuracy_score(y_test, rf_predictions)\n", " score.append(rf_accuracy)\n", " est.append(e)\n", "plt.subplots(figsize=(10,10))\n", "plot = sns.pointplot(x=est, y=score)\n", "plot.set(xlabel='Number of estimators', ylabel='Accuracy',\n", " title='Accuracy score of Random Forests by number of estimators',)\n", "plt.savefig(\"Estimator Accurracy.png\")" ] }, { "cell_type": "code", "execution_count": 0, "metadata": { "collapsed": false }, "outputs": [ ], "source": [ "# Exploring minimum leaf samples\n", "score = []\n", "leaf = []\n", "leaf_options = [1, 5, 10, 50, 100, 200]\n", "for l in leaf_options:\n", " rf = RandomForestClassifier(n_estimators=30, random_state=2000, min_samples_leaf=l)\n", " rf_predictions = rf.fit(X_train, y_train).predict(X_test)\n", " rf_accuracy = accuracy_score(y_test, rf_predictions)\n", " score.append(rf_accuracy)\n", " leaf.append(l)\n", "plt.subplots(figsize=(10,10))\n", "plot = sns.pointplot(x=leaf, y=score)\n", "plot.set(xlabel='Number of minimum leaf samples', ylabel='Accuracy', \n", " title='Accuracy score of Random Forests by number of minimum leaf samples')\n", "plt.savefig(\"Leaf Accurracy.png\")\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": 0, "metadata": { "collapsed": false }, "outputs": [ ], "source": [ ] }, { "cell_type": "code", "execution_count": 0, "metadata": { "collapsed": false }, "outputs": [ ], "source": [ "# Generate Random Forest\n", "rf = RandomForestClassifier(min_samples_leaf=5,random_state=2000)\n", "\n", "# Train the model on training data\n", "rf.fit(X_train, y_train);\n", "predictions = rf.predict(X_test)\n", "\n", "# Check accurracy\n", "print(\"Train Accuracy: \", accuracy_score(y_train, rf.predict(X_train)))\n", "print(\"Test Accuracy: \", accuracy_score(y_test, predictions))\n", "print(\"Classification Report: \\n\", classification_report(y_test, predictions))" ] }, { "cell_type": "code", "execution_count": 0, "metadata": { "collapsed": false }, "outputs": [ ], "source": [ "# Visualise Decision Tree\n", "# Create DOT data\n", "dot_data = tree.export_graphviz(rf.estimators_[0], out_file=None, \n", " feature_names=test_data.columns, \n", " class_names=['High-Level', 'Middle-Level', 'Low-Level'])\n", "\n", "# Draw graph\n", "graph = pydotplus.graph_from_dot_data(dot_data) \n", "\n", "# Show graph\n", "Image(graph.create_png())" ] }, { "cell_type": "code", "execution_count": 0, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ ], "source": [ "# Confusion matrix\n", "# classification report\n", "# Min, Max, Avg accuracy" ] }, { "cell_type": "code", "execution_count": 0, "metadata": { "collapsed": false }, "outputs": [ ], "source": [ "rf.feature_importances_" ] }, { "cell_type": "code", "execution_count": 0, "metadata": { "collapsed": false }, "outputs": [ ], "source": [ "Features = data.drop('Grade',axis=1)\n", "Target = data['Grade']\n", "label = preprocessing.LabelEncoder()\n", "Cat_Colums = Features.dtypes.pipe(lambda Features: Features[Features=='object']).index\n", "for col in Cat_Colums:\n", " Features[col] = label.fit_transform(Features[col])" ] }, { "cell_type": "code", "execution_count": 0, "metadata": { "collapsed": false }, "outputs": [ ], "source": [ ] }, { "cell_type": "code", "execution_count": 0, "metadata": { "collapsed": false }, "outputs": [ ], "source": [ "for index, f in enumerate(rf.feature_importances_):\n", " if f > 0.05:\n", " print(encoded_data.columns[index], f)" ] }, { "cell_type": "code", "execution_count": 0, "metadata": { "collapsed": false }, "outputs": [ ], "source": [ "indices = np.argsort(importances)[::-1]\n", "for f in range(X_train.shape[1]):\n", " print(\"%2d) %-*s %f\" % (f + 1, 30,feat_labels[indices[f]],importances[indices[f]]))\n", "h = sns.barplot(importances[indices],feat_labels[indices])" ] }, { "cell_type": "code", "execution_count": 0, "metadata": { "collapsed": false }, "outputs": [ ], "source": [ "- When n_estimaters reaches its plateau" ] }, { "cell_type": "markdown", "metadata": { "collapsed": false }, "source": [ "## Compared to other methods" ] }, { "cell_type": "raw", "metadata": { "collapsed": false }, "source": [ "svm = SVC()\n", "svm.fit(X_train,y_train)\n", "print(\"Score of SVM\",svm.score(X_test,y_test))" ] }, { "cell_type": "raw", "metadata": { "collapsed": false }, "source": [ "from sklearn.neural_network import MLPClassifier\n", "mlp = MLPClassifier(solver='lbfgs', alpha=1e-5,\n", " hidden_layer_sizes=(5, 2), random_state=2000 )\n", "mlp.fit(X_tr_std,y_tr)\n", "y_pred_mlp= mlp.predict(X_test_std)\n", "print('Misclassified samples: %d' % (y_test != y_pred_mlp).sum())\n", "print('Accuracy: %.2f' % accuracy_score(y_test, y_pred_mlp))" ] }, { "cell_type": "raw", "metadata": { "collapsed": false }, "source": [ "from sklearn.ensemble import RandomForestClassifier\n", "rf=RandomForestClassifier()\n", "\n", "rf.fit(X_tr_std, y_tr)\n", "y_pred_rf = rf.predict(X_test_std)\n", "print('Misclassified samples: %d' % (y_test != y_pred_rf).sum())\n", "print('Accuracy: %.2f' % accuracy_score(y_test, y_pred_rf))" ] }, { "cell_type": "raw", "metadata": { "collapsed": false }, "source": [ "from sklearn.svm import SVC\n", "from sklearn.model_selection import cross_val_score\n", "from sklearn.metrics import accuracy_score\n", "svm = SVC(kernel='linear', C=2.0, random_state=2000)\n", "svm.fit(X_tr_std, y_tr)\n", "y_pred = svm.predict(X_test_std)\n", "print('Misclassified samples: %d' % (y_test != y_pred).sum())\n", "print('Accuracy: %.2f' % accuracy_score(y_test, y_pred))" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (Anaconda 5)", "language": "python", "name": "anaconda5" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.5" } }, "nbformat": 4, "nbformat_minor": 0 }