| Download

Assignment

Project: Assignment

Path: Assignment.ipynb

Views: ²³

Kernel: Python 3 (Anaconda 5)

SIT307 Assignment 2

Students:

Mitchell Razga - 218232709

Madushi Menahari Jayasundara - 217206634

Mario Silva - 217425643

Load Modules and Packages

In [1]:

# Import Modules and Packages

# Data handling Modules
import numpy as np
import pandas as pd
import itertools as iter

# Graphing and Visulisation Modules
import matplotlib.pyplot as plt
from IPython.display import Image, display, HTML
import pydotplus
import seaborn as sns

# Graphing default settings
%matplotlib inline
sns.set(font_scale=1.25)

# Sklearn Modules
import sklearn
from sklearn import tree
from sklearn import preprocessing
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
# Classifiers
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

Import Data

In [2]:

# Read Dataset from CSV and create a pandas DataFrame
print("Reading data from CSV....")
data = pd.read_csv('data/PPD.csv', delimiter=',')

Reading data from CSV....

Encode Data

In [11]:

# Create new pandas DataFrame
data_encoded = pd.DataFrame

# Encode existing DataFrame and add to new dataframe
encode_columns = ['Gender', 'Nationality', 'Birthplace', 'Education Level', 'Classroom', 'Subject', 'Parent Responsible', 'Parent Survey Completed', 'Parent School Satisfaction', 'Absence Days']
data_encoded = pd.get_dummies(data, columns = encode_columns)
data_encoded

	Year	Semester	Hands Raised	Resources Visited	Announcements Viewed	Discussions Participated In	Grade	Gender_F	Gender_M	Nationality_Egypt	...	Subject_Science	Subject_Spanish	Parent Responsible_Father	Parent Responsible_Mother	Parent Survey Completed_No	Parent Survey Completed_Yes	Parent School Satisfaction_Bad	Parent School Satisfaction_Good	Absence Days_Above-7	Absence Days_Under-7
0	8	1	30	90	33	35	Middle-Level	1	0	0	...	0	0	1	0	1	0	1	0	0	1
1	8	1	35	80	50	70	High-Level	1	0	0	...	0	0	1	0	0	1	0	1	0	1
2	2	1	98	88	60	31	High-Level	1	0	0	...	0	0	0	1	1	0	0	1	0	1
3	2	1	10	20	22	97	Low-Level	1	0	0	...	0	0	1	0	1	0	1	0	1	0
4	2	1	11	20	20	98	Low-Level	1	0	0	...	0	0	1	0	1	0	1	0	1	0
5	2	1	89	92	40	28	High-Level	1	0	0	...	0	0	0	1	0	1	0	1	0	1
6	8	2	25	15	32	53	Middle-Level	1	0	0	...	0	0	1	0	1	0	1	0	0	1
7	8	2	80	71	52	51	Middle-Level	1	0	0	...	0	0	1	0	0	1	0	1	0	1
8	8	2	85	66	12	23	Middle-Level	1	0	0	...	0	0	1	0	1	0	1	0	0	1
9	8	2	45	58	52	43	High-Level	1	0	0	...	0	0	0	1	0	1	0	1	0	1
10	8	2	22	51	42	40	Middle-Level	1	0	0	...	0	0	1	0	0	1	1	0	0	1
11	8	2	72	51	42	24	High-Level	1	0	0	...	0	0	0	1	0	1	1	0	1	0
12	2	2	75	81	51	34	High-Level	1	0	0	...	0	0	0	1	1	0	0	1	0	1
13	2	2	5	9	19	98	Low-Level	1	0	0	...	0	0	1	0	1	0	1	0	1	0
14	2	2	10	12	29	93	Low-Level	1	0	0	...	0	0	1	0	1	0	1	0	1	0
15	2	2	79	93	49	23	High-Level	1	0	0	...	0	0	0	1	0	1	0	1	0	1
16	8	1	25	15	12	33	Low-Level	0	1	0	...	0	0	1	0	1	0	1	0	1	0
17	2	1	20	88	31	28	Middle-Level	0	1	0	...	0	0	1	0	0	1	0	1	1	0
18	2	1	90	98	41	38	High-Level	0	1	0	...	0	0	1	0	0	1	0	1	0	1
19	2	1	80	95	21	28	High-Level	0	1	0	...	0	0	1	0	0	1	0	1	0	1
20	2	1	10	18	71	38	Middle-Level	0	1	0	...	0	0	1	0	0	1	0	1	1	0
21	2	1	10	17	50	21	Middle-Level	0	1	0	...	0	0	1	0	1	0	1	0	0	1
22	2	1	10	10	40	51	Low-Level	0	1	0	...	0	0	1	0	1	0	1	0	1	0
23	2	1	20	90	50	61	Middle-Level	0	1	0	...	0	0	0	1	0	1	1	0	1	0
24	2	1	10	30	50	91	Low-Level	0	1	0	...	0	0	1	0	0	1	1	0	1	0
25	2	1	69	82	20	28	High-Level	0	1	0	...	0	0	0	1	0	1	0	1	0	1
26	2	1	15	90	21	97	Middle-Level	0	1	0	...	0	0	0	1	0	1	0	1	0	1
27	2	1	4	10	11	7	Low-Level	0	1	0	...	0	0	0	1	1	0	0	1	1	0
28	8	2	85	75	62	53	High-Level	0	1	0	...	0	0	0	1	0	1	1	0	0	1
29	8	2	10	35	30	13	Low-Level	0	1	0	...	0	0	0	1	1	0	1	0	1	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
450	4	2	32	14	32	29	Middle-Level	0	1	0	...	1	0	1	0	1	0	0	1	1	0
451	4	2	22	34	15	9	Low-Level	0	1	0	...	1	0	1	0	1	0	1	0	1	0
452	4	2	72	64	59	89	High-Level	0	1	0	...	1	0	0	1	0	1	0	1	0	1
453	4	2	82	84	79	79	Middle-Level	0	1	0	...	1	0	0	1	0	1	0	1	1	0
454	4	2	42	34	29	39	Middle-Level	0	1	0	...	1	0	1	0	1	0	0	1	1	0
455	8	2	87	88	40	10	Middle-Level	1	0	0	...	0	1	1	0	0	1	0	1	0	1
456	11	2	10	51	40	40	Low-Level	0	1	0	...	0	1	1	0	1	0	1	0	1	0
457	8	2	17	21	42	14	Middle-Level	0	1	0	...	0	1	1	0	0	1	0	1	0	1
458	8	2	27	41	49	14	Middle-Level	0	1	0	...	0	1	0	1	1	0	1	0	0	1
459	8	2	70	81	39	84	Middle-Level	0	1	0	...	0	1	1	0	1	0	1	0	0	1
460	8	2	27	90	82	14	High-Level	0	1	0	...	0	1	0	1	0	1	0	1	0	1
461	8	2	17	61	42	14	Middle-Level	0	1	0	...	0	1	1	0	1	0	1	0	0	1
462	8	2	87	81	42	19	High-Level	0	1	0	...	0	1	0	1	0	1	0	1	0	1
463	8	2	7	61	22	14	Low-Level	0	1	0	...	0	1	1	0	1	0	1	0	1	0
464	8	2	17	50	2	4	Low-Level	0	1	0	...	0	1	1	0	1	0	1	0	1	0
465	8	2	5	21	42	14	Low-Level	0	1	0	...	0	1	1	0	1	0	0	1	1	0
466	8	2	27	41	32	61	Middle-Level	0	1	0	...	0	1	0	1	0	1	1	0	1	0
467	8	2	96	61	42	94	High-Level	0	1	0	...	0	1	0	1	0	1	1	0	0	1
468	8	2	57	51	46	34	Middle-Level	0	1	0	...	0	1	0	1	0	1	0	1	0	1
469	8	2	77	69	41	13	Middle-Level	0	1	0	...	0	1	1	0	0	1	0	1	0	1
470	8	2	80	51	40	24	Middle-Level	0	1	0	...	0	1	1	0	1	0	0	1	0	1
471	8	2	62	61	82	40	Middle-Level	0	1	0	...	0	1	1	0	0	1	1	0	0	1
472	8	2	72	83	12	90	High-Level	0	1	0	...	0	1	0	1	0	1	0	1	0	1
473	8	2	87	81	22	70	High-Level	0	1	0	...	0	1	0	1	0	1	1	0	0	1
474	8	2	72	90	12	30	Middle-Level	0	1	0	...	0	1	0	1	1	0	1	0	0	1
475	8	2	2	11	62	30	Low-Level	0	1	0	...	0	1	1	0	1	0	1	0	0	1
476	8	2	5	3	2	10	Low-Level	0	1	0	...	0	1	1	0	0	1	0	1	0	1
477	8	2	5	17	21	10	Low-Level	0	1	0	...	0	1	1	0	1	0	1	0	1	0
478	8	2	51	42	12	29	Middle-Level	0	1	0	...	0	1	0	1	1	0	1	0	1	0
479	8	2	9	7	21	20	Low-Level	0	1	0	...	0	1	1	0	0	1	0	1	1	0

480 rows × 63 columns

Configure data for classifier

In [12]:

# Remove 'target' column to allow for sklearn DecisionTreeClassifier handling
test_data = data_encoded.drop(columns='Grade')
target_data = data_encoded['Grade'].values

Generate Test Data - 50/50 Split

In [13]:

# Split data
# CHANGE to predicted/estimated
X_train, X_test, y_train, y_test = train_test_split(test_data, target_data, test_size = .5)
print("Training Data size:", len(X_train),len(y_train), "\nTest Data size: ",len(X_test),len(y_test))

Training Data size: 240 240 
Test Data size:  240 240

Generate Decision Tree - 50/50 Split

In [14]:

# Generate CART Decision Tree
dtree = DecisionTreeClassifier(random_state=2000)
dtree.fit(X_train, y_train)
predictions = dtree.predict(X_test)

# Check accurracy
print("Accurracy: ", accuracy_score(y_test, predictions))
print(dtree.get_params)
#print(dtree.feature_importances_)

Accurracy:  0.6875
<bound method BaseEstimator.get_params of DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=2000,
            splitter='best')>

Visualise

# Visualise Decision Tree
# Create DOT data
dot_data = tree.export_graphviz(dtree, out_file=None,
                                feature_names=test_data.columns,
                                class_names=['High-Level', 'Middle-Level', 'Low-Level'])

# Draw graph
graph = pydotplus.graph_from_dot_data(dot_data)

# Show graph
Image(graph.create_png())

In [15]:

# Procedure for testing accurracy
def DecisionTreeParameterTest(parameter_variables, parameter_values):
    score = []
    parameters = {}
    for pvalue in parameter_values:
        print(parameters)
        parameters[parameter_variables]=pvalue
        dtree = DecisionTreeClassifier(random_state=2000, **parameters)
        dtree_predictions = dtree.fit(X_train, y_train).predict(X_test)
        dtree_accuracy = accuracy_score(y_test, dtree_predictions)
        score.append(dtree_accuracy)
    plt.subplots(figsize=(10,10))
    plot = sns.pointplot(x=parameter_values, y=score)
    plot.set(xlabel='Parameter Values', ylabel='Accuracy', title="Decision Tree Parameter Test: " + parameter_variables)
    plt.savefig("Decision Tree Parameter Test: " + parameter_variables + ".png")

In [16]:

# Functions for testing accurracy
### Tidy up
def DecisionTreeAccuracyTest(parameters):
    # Initialise dictionary to hold the results
    results = {}
    # Generate a decision with each parameter combination
    TestAllParameterCombinations(parameters, results)
    # Display Results
    PlotDecisionTreeAccuracy(list(results.keys()), list(results.values()))
    DisplayResults(results)

def TestAllParameterCombinations(parameters, results):
    # Combine parameters and values into a list
    parameters, values = zip(*parameters.items())
    # Get every possible combination
    for value in iter.product(*values):
        # Determine parameters
        current_parameters = dict(zip(parameters, value))
        # Generate Decision Tree with these parameters
        GenerateDecisionTree(current_parameters, results)
    return results

def GenerateDecisionTree(current_parameters, results):
    # Generate Decision Tree using specified parameters
    dtree = DecisionTreeClassifier(random_state=2000, **current_parameters)
    # Determine Accuracy of the Decision Tree
    dtree_predictions = dtree.fit(X_train, y_train).predict(X_test)
    dtree_accuracy = accuracy_score(y_test, dtree_predictions)
    # Convert parameter text into readable format
    formatted = " \n ".join(("{} = {}".format(*i) for i in current_parameters.items()))
    # Add test results to dictionary
    results.update({formatted: dtree_accuracy})
    return results

def FindBestParameterVariables():
    # Get P

def PlotDecisionTreeAccuracy(x, y):
    # Create Plot
    plt.subplots(figsize=(15,15))
    plot = sns.pointplot(x=x, y=y)
    # Set Plot visuals
    plot.set(xlabel='Parameter Values', ylabel='Accuracy', title="Decision Tree Parameter Test")
    plot.set_xticklabels(plot.get_xticklabels(), rotation=90)
    # Save and Show Plot
    plt.savefig("Decision Tree Parameter Test.png")
    plt.show()

def DisplayResults(results):
    # Sort Results highest to lowest and display top 5
    sorted_results = {}
    for key in sorted(results, key=results.get, reverse=True)[:5]:
        sorted_results.update({key: results[key]})
    # Generate Pandas DataFrame from dictionary
    df = pd.DataFrame(list(sorted_results.items()), columns=['Parameters', 'Accuracy'])
    # Pretty print DataFrame
    display(HTML(df.to_html().replace("\\n","<br>")))

  File "<ipython-input-16-297303b31446>", line 38
    def PlotDecisionTreeAccuracy(x, y):
      ^
IndentationError: expected an indented block

In [0]:

# Best default decision tree
DecisionTreeAccuracyTest(parameters={'max_depth': [1, 5, 10, 25], 'min_samples_split': [2, 4, 8, 16]})

In [0]:

# Best default min samples split
DecisionTreeAccuracyTest(parameters={'min_samples_split': [2, 4, 8, 16, 32, 64, 128, 256, 512]})

In [0]:

# Best default min samples leaf
DecisionTreeAccuracyTest(parameters={'min_samples_leaf': [1, 5, 10, 25, 50, 100, 200, 300, 400, 500, 600, 700]})

In [0]:

# Best default min weight fraction leaf
DecisionTreeAccuracyTest(parameters={'min_weight_fraction_leaf': [0.1, 0.2, 0.3, 0.4, 0.5]})

In [0]:

## Best default max features
DecisionTreeAccuracyTest(parameters={'max_features': [1, 5, 10, 50, 60, 62]})

In [0]:

## Best default max features
DecisionTreeAccuracyTest(parameters={'max_leaf_nodes': [2, 4, 8, 16, 32, 64, 128, 256]})

In [0]:

## Best default max features
DecisionTreeAccuracyTest(parameters={'min_impurity_decrease': [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]})

In [0]:

Random Forests

In [0]:

# Generate Random Forest
rf = RandomForestClassifier(random_state=2000)

# Train the model on training data
rf.fit(X_train, y_train);
predictions = rf.predict(X_test)

# Check accurracy
print("Train Accuracy: ", accuracy_score(y_train, rf.predict(X_train)))
print("Test Accuracy: ", accuracy_score(y_test, predictions))
print("Classification Report: \n", classification_report(y_test, predictions))

In [0]:

Improving Accuracy

Below are methods we used to improve accuracy

Remove Outliers

In [0]:

Remove less important columns

In [0]:

data2 = data.drop(columns='Grade')

Show Correlation

In [0]:

corr = data_encoded.corr()
corr

In [0]:

Remove Columns

In [0]:

data.drop('Grade',axis=1)
data.drop('Birthplace',axis=1)
data.drop('Nationality',axis=1)
data.drop('Parent Responsible',axis=1)
data.drop('Parent School Satisfaction',axis=1)
data_removed = data_encoded.drop(columns='Grade','Birthplace','Nationality','Parent Responsible','Parent School Satisfaction')
data_removed

Increase Split

# Split data 50/50
X_train, X_test, y_train, y_test = train_test_split(test_data, target_data, test_size = .5)
print("Training Data size:", len(X_train),len(y_train), "\nTest Data size: ",len(X_test),len(y_test))

In [0]:

# Split data 80/20
X_train, X_test, y_train, y_test = train_test_split(test_data, target_data, test_size = .2)
print("Training Data size:", len(X_train),len(y_train), "\nTest Data size: ",len(X_test),len(y_test))

In [0]:

# Generate Random Forest
rf = RandomForestClassifier(random_state=2000)

# Train the model on training data
rf.fit(X_train, y_train);
predictions = rf.predict(X_test)

# Check accurracy
print("Train Accuracy: ", accuracy_score(y_train, rf.predict(X_train)))
print("Test Accuracy: ", accuracy_score(y_test, predictions))
print("Classification Report: \n", classification_report(y_test, predictions))

Change parameters

In [0]:

In [0]:

# Exploring the number of estimators in the random forest
score = []
est = []
estimators = [1, 5, 10, 25, 50, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]
for e in estimators:
    rf = RandomForestClassifier(n_estimators=e, random_state=2000)
    rf_predictions = rf.fit(X_train, y_train).predict(X_test)
    rf_accuracy = accuracy_score(y_test, rf_predictions)
    score.append(rf_accuracy)
    est.append(e)
plt.subplots(figsize=(10,10))
plot = sns.pointplot(x=est, y=score)
plot.set(xlabel='Number of estimators', ylabel='Accuracy',
         title='Accuracy score of Random Forests by number of estimators',)
plt.savefig("Estimator Accurracy.png")

In [0]:

# Exploring minimum leaf samples
score = []
leaf = []
leaf_options = [1, 5, 10, 50, 100, 200]
for l in leaf_options:
    rf = RandomForestClassifier(n_estimators=30, random_state=2000, min_samples_leaf=l)
    rf_predictions = rf.fit(X_train, y_train).predict(X_test)
    rf_accuracy = accuracy_score(y_test, rf_predictions)
    score.append(rf_accuracy)
    leaf.append(l)
plt.subplots(figsize=(10,10))
plot = sns.pointplot(x=leaf, y=score)
plot.set(xlabel='Number of minimum leaf samples', ylabel='Accuracy', 
         title='Accuracy score of Random Forests by number of minimum leaf samples')
plt.savefig("Leaf Accurracy.png")
plt.show()

In [0]:

In [0]:

# Generate Random Forest
rf = RandomForestClassifier(min_samples_leaf=5,random_state=2000)

# Train the model on training data
rf.fit(X_train, y_train);
predictions = rf.predict(X_test)

# Check accurracy
print("Train Accuracy: ", accuracy_score(y_train, rf.predict(X_train)))
print("Test Accuracy: ", accuracy_score(y_test, predictions))
print("Classification Report: \n", classification_report(y_test, predictions))

In [0]:

# Visualise Decision Tree
# Create DOT data
dot_data = tree.export_graphviz(rf.estimators_[0], out_file=None, 
                                feature_names=test_data.columns,  
                                class_names=['High-Level', 'Middle-Level', 'Low-Level'])

# Draw graph
graph = pydotplus.graph_from_dot_data(dot_data)  

# Show graph
Image(graph.create_png())

In [0]:

# Confusion matrix
# classification report
# Min, Max, Avg accuracy

In [0]:

rf.feature_importances_

In [0]:

Features = data.drop('Grade',axis=1)
Target = data['Grade']
label = preprocessing.LabelEncoder()
Cat_Colums = Features.dtypes.pipe(lambda Features: Features[Features=='object']).index
for col in Cat_Colums:
    Features[col] = label.fit_transform(Features[col])

In [0]:

In [0]:

for index, f in enumerate(rf.feature_importances_):
    if f > 0.05:
        print(encoded_data.columns[index], f)

In [0]:

indices = np.argsort(importances)[::-1]
for f in range(X_train.shape[1]):
    print("%2d) %-*s %f" % (f + 1, 30,feat_labels[indices[f]],importances[indices[f]]))
h = sns.barplot(importances[indices],feat_labels[indices])

In [0]:

- When n_estimaters reaches its plateau

Compared to other methods

svm = SVC()
svm.fit(X_train,y_train)
print("Score of SVM",svm.score(X_test,y_test))

from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(solver='lbfgs', alpha=1e-5,
                     hidden_layer_sizes=(5, 2), random_state=2000 )
mlp.fit(X_tr_std,y_tr)
y_pred_mlp= mlp.predict(X_test_std)
print('Misclassified samples: %d' % (y_test != y_pred_mlp).sum())
print('Accuracy: %.2f' % accuracy_score(y_test, y_pred_mlp))

from sklearn.ensemble import RandomForestClassifier
rf=RandomForestClassifier()

rf.fit(X_tr_std, y_tr)
y_pred_rf = rf.predict(X_test_std)
print('Misclassified samples: %d' % (y_test != y_pred_rf).sum())
print('Accuracy: %.2f' % accuracy_score(y_test, y_pred_rf))

from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
svm = SVC(kernel='linear', C=2.0, random_state=2000)
svm.fit(X_tr_std, y_tr)
y_pred = svm.predict(X_test_std)
print('Misclassified samples: %d' % (y_test != y_pred).sum())
print('Accuracy: %.2f' % accuracy_score(y_test, y_pred))