Contact
CoCalc Logo Icon
StoreFeaturesDocsShareSupport News AboutSign UpSign In
| Download

Assignment

Project: Assignment
Views: 23
Kernel: Python 3 (Anaconda 5)

SIT307 Assignment 2

Students:

Mitchell Razga - 218232709

Madushi Menahari Jayasundara - 217206634

Mario Silva - 217425643

Load Modules and Packages

# Import Modules and Packages # Data handling Modules import numpy as np import pandas as pd import itertools as iter # Graphing and Visulisation Modules import matplotlib.pyplot as plt from IPython.display import Image, display, HTML import pydotplus import seaborn as sns # Graphing default settings %matplotlib inline sns.set(font_scale=1.25) # Sklearn Modules import sklearn from sklearn import tree from sklearn import preprocessing from sklearn.metrics import accuracy_score, classification_report from sklearn.model_selection import train_test_split # Classifiers from sklearn.tree import DecisionTreeClassifier from sklearn.neighbors import KNeighborsClassifier from sklearn.ensemble import RandomForestClassifier from sklearn.svm import SVC

Import Data

# Load the student performance dataset from disk into a pandas DataFrame
print("Reading data from CSV....")
data = pd.read_csv('data/PPD.csv', sep=',')
Reading data from CSV....

Encode Data

# One-hot encode the categorical feature columns.
# The former `data_encoded = pd.DataFrame` line was dropped: it bound the
# DataFrame *class* (not an instance) and was overwritten immediately.
# 'Grade' is not listed here, so it stays as a single text column.
encode_columns = [
    'Gender',
    'Nationality',
    'Birthplace',
    'Education Level',
    'Classroom',
    'Subject',
    'Parent Responsible',
    'Parent Survey Completed',
    'Parent School Satisfaction',
    'Absence Days',
]
data_encoded = pd.get_dummies(data, columns=encode_columns)

# Display the encoded frame as the cell output
data_encoded
Year Semester Hands Raised Resources Visited Announcements Viewed Discussions Participated In Grade Gender_F Gender_M Nationality_Egypt ... Subject_Science Subject_Spanish Parent Responsible_Father Parent Responsible_Mother Parent Survey Completed_No Parent Survey Completed_Yes Parent School Satisfaction_Bad Parent School Satisfaction_Good Absence Days_Above-7 Absence Days_Under-7
0 8 1 30 90 33 35 Middle-Level 1 0 0 ... 0 0 1 0 1 0 1 0 0 1
1 8 1 35 80 50 70 High-Level 1 0 0 ... 0 0 1 0 0 1 0 1 0 1
2 2 1 98 88 60 31 High-Level 1 0 0 ... 0 0 0 1 1 0 0 1 0 1
3 2 1 10 20 22 97 Low-Level 1 0 0 ... 0 0 1 0 1 0 1 0 1 0
4 2 1 11 20 20 98 Low-Level 1 0 0 ... 0 0 1 0 1 0 1 0 1 0
5 2 1 89 92 40 28 High-Level 1 0 0 ... 0 0 0 1 0 1 0 1 0 1
6 8 2 25 15 32 53 Middle-Level 1 0 0 ... 0 0 1 0 1 0 1 0 0 1
7 8 2 80 71 52 51 Middle-Level 1 0 0 ... 0 0 1 0 0 1 0 1 0 1
8 8 2 85 66 12 23 Middle-Level 1 0 0 ... 0 0 1 0 1 0 1 0 0 1
9 8 2 45 58 52 43 High-Level 1 0 0 ... 0 0 0 1 0 1 0 1 0 1
10 8 2 22 51 42 40 Middle-Level 1 0 0 ... 0 0 1 0 0 1 1 0 0 1
11 8 2 72 51 42 24 High-Level 1 0 0 ... 0 0 0 1 0 1 1 0 1 0
12 2 2 75 81 51 34 High-Level 1 0 0 ... 0 0 0 1 1 0 0 1 0 1
13 2 2 5 9 19 98 Low-Level 1 0 0 ... 0 0 1 0 1 0 1 0 1 0
14 2 2 10 12 29 93 Low-Level 1 0 0 ... 0 0 1 0 1 0 1 0 1 0
15 2 2 79 93 49 23 High-Level 1 0 0 ... 0 0 0 1 0 1 0 1 0 1
16 8 1 25 15 12 33 Low-Level 0 1 0 ... 0 0 1 0 1 0 1 0 1 0
17 2 1 20 88 31 28 Middle-Level 0 1 0 ... 0 0 1 0 0 1 0 1 1 0
18 2 1 90 98 41 38 High-Level 0 1 0 ... 0 0 1 0 0 1 0 1 0 1
19 2 1 80 95 21 28 High-Level 0 1 0 ... 0 0 1 0 0 1 0 1 0 1
20 2 1 10 18 71 38 Middle-Level 0 1 0 ... 0 0 1 0 0 1 0 1 1 0
21 2 1 10 17 50 21 Middle-Level 0 1 0 ... 0 0 1 0 1 0 1 0 0 1
22 2 1 10 10 40 51 Low-Level 0 1 0 ... 0 0 1 0 1 0 1 0 1 0
23 2 1 20 90 50 61 Middle-Level 0 1 0 ... 0 0 0 1 0 1 1 0 1 0
24 2 1 10 30 50 91 Low-Level 0 1 0 ... 0 0 1 0 0 1 1 0 1 0
25 2 1 69 82 20 28 High-Level 0 1 0 ... 0 0 0 1 0 1 0 1 0 1
26 2 1 15 90 21 97 Middle-Level 0 1 0 ... 0 0 0 1 0 1 0 1 0 1
27 2 1 4 10 11 7 Low-Level 0 1 0 ... 0 0 0 1 1 0 0 1 1 0
28 8 2 85 75 62 53 High-Level 0 1 0 ... 0 0 0 1 0 1 1 0 0 1
29 8 2 10 35 30 13 Low-Level 0 1 0 ... 0 0 0 1 1 0 1 0 1 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
450 4 2 32 14 32 29 Middle-Level 0 1 0 ... 1 0 1 0 1 0 0 1 1 0
451 4 2 22 34 15 9 Low-Level 0 1 0 ... 1 0 1 0 1 0 1 0 1 0
452 4 2 72 64 59 89 High-Level 0 1 0 ... 1 0 0 1 0 1 0 1 0 1
453 4 2 82 84 79 79 Middle-Level 0 1 0 ... 1 0 0 1 0 1 0 1 1 0
454 4 2 42 34 29 39 Middle-Level 0 1 0 ... 1 0 1 0 1 0 0 1 1 0
455 8 2 87 88 40 10 Middle-Level 1 0 0 ... 0 1 1 0 0 1 0 1 0 1
456 11 2 10 51 40 40 Low-Level 0 1 0 ... 0 1 1 0 1 0 1 0 1 0
457 8 2 17 21 42 14 Middle-Level 0 1 0 ... 0 1 1 0 0 1 0 1 0 1
458 8 2 27 41 49 14 Middle-Level 0 1 0 ... 0 1 0 1 1 0 1 0 0 1
459 8 2 70 81 39 84 Middle-Level 0 1 0 ... 0 1 1 0 1 0 1 0 0 1
460 8 2 27 90 82 14 High-Level 0 1 0 ... 0 1 0 1 0 1 0 1 0 1
461 8 2 17 61 42 14 Middle-Level 0 1 0 ... 0 1 1 0 1 0 1 0 0 1
462 8 2 87 81 42 19 High-Level 0 1 0 ... 0 1 0 1 0 1 0 1 0 1
463 8 2 7 61 22 14 Low-Level 0 1 0 ... 0 1 1 0 1 0 1 0 1 0
464 8 2 17 50 2 4 Low-Level 0 1 0 ... 0 1 1 0 1 0 1 0 1 0
465 8 2 5 21 42 14 Low-Level 0 1 0 ... 0 1 1 0 1 0 0 1 1 0
466 8 2 27 41 32 61 Middle-Level 0 1 0 ... 0 1 0 1 0 1 1 0 1 0
467 8 2 96 61 42 94 High-Level 0 1 0 ... 0 1 0 1 0 1 1 0 0 1
468 8 2 57 51 46 34 Middle-Level 0 1 0 ... 0 1 0 1 0 1 0 1 0 1
469 8 2 77 69 41 13 Middle-Level 0 1 0 ... 0 1 1 0 0 1 0 1 0 1
470 8 2 80 51 40 24 Middle-Level 0 1 0 ... 0 1 1 0 1 0 0 1 0 1
471 8 2 62 61 82 40 Middle-Level 0 1 0 ... 0 1 1 0 0 1 1 0 0 1
472 8 2 72 83 12 90 High-Level 0 1 0 ... 0 1 0 1 0 1 0 1 0 1
473 8 2 87 81 22 70 High-Level 0 1 0 ... 0 1 0 1 0 1 1 0 0 1
474 8 2 72 90 12 30 Middle-Level 0 1 0 ... 0 1 0 1 1 0 1 0 0 1
475 8 2 2 11 62 30 Low-Level 0 1 0 ... 0 1 1 0 1 0 1 0 0 1
476 8 2 5 3 2 10 Low-Level 0 1 0 ... 0 1 1 0 0 1 0 1 0 1
477 8 2 5 17 21 10 Low-Level 0 1 0 ... 0 1 1 0 1 0 1 0 1 0
478 8 2 51 42 12 29 Middle-Level 0 1 0 ... 0 1 0 1 1 0 1 0 1 0
479 8 2 9 7 21 20 Low-Level 0 1 0 ... 0 1 1 0 0 1 0 1 1 0

480 rows × 63 columns

Configure data for classifier

# Separate the class labels from the features so sklearn estimators can use them.
# Despite its name, `test_data` is the full feature matrix (every column but 'Grade').
target_data = data_encoded['Grade'].values
test_data = data_encoded.drop('Grade', axis=1)

Generate Test Data - 50/50 Split

# Split data 50/50 into training and test sets.
# The split is seeded like every classifier in this notebook (random_state=2000)
# so that accuracy figures are reproducible between runs.
# TODO: CHANGE to predicted/estimated
X_train, X_test, y_train, y_test = train_test_split(
    test_data, target_data, test_size=.5, random_state=2000
)
print("Training Data size:", len(X_train),len(y_train), "\nTest Data size: ",len(X_test),len(y_test))
Training Data size: 240 240 Test Data size: 240 240

Generate Decision Tree - 50/50 Split

# Generate CART Decision Tree with sklearn's default hyper-parameters
dtree = DecisionTreeClassifier(random_state=2000)
dtree.fit(X_train, y_train)
predictions = dtree.predict(X_test)

# Check accuracy on the held-out test split
print("Accurracy: ", accuracy_score(y_test, predictions))

# get_params has to be *called*; printing the bare attribute only shows the
# bound-method repr instead of the parameter dictionary.
print(dtree.get_params())
#print(dtree.feature_importances_)
Accurracy: 0.6875 <bound method BaseEstimator.get_params of DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None, max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, presort=False, random_state=2000, splitter='best')>

Visualise

# Visualise Decision Tree
# Create DOT data. class_names must follow the estimator's own class order
# (dtree.classes_, sorted ascending); a hand-written High/Middle/Low list
# swaps the 'Low-Level' and 'Middle-Level' labels in the rendered nodes.
dot_data = tree.export_graphviz(
    dtree,
    out_file=None,
    feature_names=test_data.columns,
    class_names=[str(class_name) for class_name in dtree.classes_],
)

# Draw graph
graph = pydotplus.graph_from_dot_data(dot_data)

# Show graph
Image(graph.create_png())
# Procedure for testing accuracy
def DecisionTreeParameterTest(parameter_variables, parameter_values):
    """Plot decision-tree test accuracy while sweeping one hyper-parameter.

    A DecisionTreeClassifier (random_state=2000) is fitted on the global
    X_train / y_train once per value and scored on the global X_test / y_test.
    The accuracies are drawn as a seaborn point plot and saved as a PNG in
    the working directory.

    Args:
        parameter_variables: Name of the DecisionTreeClassifier keyword
            argument to vary (a single string, e.g. 'max_depth').
        parameter_values: Iterable of values to try for that argument.
    """
    # Materialise once so generators can be both iterated and plotted
    parameter_values = list(parameter_values)

    score = []
    for pvalue in parameter_values:
        # The stale debug print of the parameter dict was removed: it ran
        # before the dict was updated and so showed the previous value.
        dtree = DecisionTreeClassifier(random_state=2000, **{parameter_variables: pvalue})
        dtree_predictions = dtree.fit(X_train, y_train).predict(X_test)
        score.append(accuracy_score(y_test, dtree_predictions))

    plt.subplots(figsize=(10,10))
    plot = sns.pointplot(x=parameter_values, y=score)
    plot.set(xlabel='Parameter Values', ylabel='Accuracy', title="Decision Tree Parameter Test: " + parameter_variables)
    # NOTE(review): the ':' in this file name is not a valid character on Windows
    plt.savefig("Decision Tree Parameter Test: " + parameter_variables + ".png")
# Functions for testing accuracy over combinations of hyper-parameters
def DecisionTreeAccuracyTest(parameters):
    """Evaluate a decision tree for every combination in a parameter grid.

    Args:
        parameters: Dict mapping a DecisionTreeClassifier keyword-argument
            name to the list of values to try for it.

    The accuracy of each combination is plotted and the five best
    combinations are displayed as an HTML table.
    """
    # Initialise dictionary to hold the results
    results = {}

    # Generate a decision tree with each parameter combination
    TestAllParameterCombinations(parameters, results)

    # Display Results
    PlotDecisionTreeAccuracy(list(results.keys()), list(results.values()))
    DisplayResults(results)


def TestAllParameterCombinations(parameters, results):
    """Fit one decision tree per element of the cartesian product of values.

    Args:
        parameters: Dict of keyword-argument name -> list of candidate values.
        results: Dict that receives one entry per combination (mutated in place).

    Returns:
        The same `results` dict.
    """
    # Imported locally so the function does not depend on the notebook's
    # `import itertools as iter` alias, which shadows the builtin iter().
    from itertools import product

    names = list(parameters)

    # Get every possible combination. An empty grid yields a single empty
    # combination, i.e. one tree with default parameters, instead of
    # failing while unpacking zip(*{}.items()).
    for combination in product(*parameters.values()):
        current_parameters = dict(zip(names, combination))
        GenerateDecisionTree(current_parameters, results)

    return results


def GenerateDecisionTree(current_parameters, results):
    """Fit and score one decision tree, recording its accuracy in `results`.

    Args:
        current_parameters: Keyword arguments for DecisionTreeClassifier.
        results: Dict updated with {readable parameter text: test accuracy}.

    Returns:
        The same `results` dict.
    """
    # Generate Decision Tree using specified parameters
    dtree = DecisionTreeClassifier(random_state=2000, **current_parameters)

    # Determine accuracy of the Decision Tree on the global test split
    dtree_predictions = dtree.fit(X_train, y_train).predict(X_test)
    dtree_accuracy = accuracy_score(y_test, dtree_predictions)

    # Convert parameters into readable "name = value" text, one per line
    formatted = " \n ".join("{} = {}".format(*item) for item in current_parameters.items())

    # Add test result to dictionary
    results[formatted] = dtree_accuracy

    return results


def FindBestParameterVariables():
    """Placeholder: the original cell stopped after the comment '# Get P'.

    The missing body made the whole cell fail with an IndentationError, so
    none of the functions in it were ever defined. Nothing calls this yet.
    """
    raise NotImplementedError("FindBestParameterVariables has not been written yet")


def PlotDecisionTreeAccuracy(x, y):
    """Draw, save and show a point plot of accuracy per parameter combination.

    Args:
        x: Labels of the parameter combinations (x axis).
        y: Accuracy for each label (y axis).
    """
    # Create Plot
    plt.subplots(figsize=(15,15))
    plot = sns.pointplot(x=x, y=y)

    # Set Plot visuals; labels are rotated because they span several lines
    plot.set(xlabel='Parameter Values', ylabel='Accuracy', title="Decision Tree Parameter Test")
    plot.set_xticklabels(plot.get_xticklabels(), rotation=90)

    # Save and Show Plot
    plt.savefig("Decision Tree Parameter Test.png")
    plt.show()


def DisplayResults(results):
    """Display the five most accurate parameter combinations as an HTML table.

    Args:
        results: Dict of {readable parameter text: accuracy}.
    """
    # Sort results highest to lowest and keep the top 5
    top_five = sorted(results.items(), key=lambda item: item[1], reverse=True)[:5]

    # Generate Pandas DataFrame from the ranked pairs
    df = pd.DataFrame(top_five, columns=['Parameters', 'Accuracy'])

    # Pretty print DataFrame: to_html emits the line breaks inside the
    # parameter text as a literal "\n", which is swapped for an HTML break
    display(HTML(df.to_html().replace("\\n","<br>")))
File "<ipython-input-16-297303b31446>", line 38 def PlotDecisionTreeAccuracy(x, y): ^ IndentationError: expected an indented block
# Hyper-parameter grids to evaluate, one accuracy test per grid
parameter_grids = [
    # Tree depth crossed with the minimum samples needed to split a node
    {'max_depth': [1, 5, 10, 25], 'min_samples_split': [2, 4, 8, 16]},
    # Minimum samples needed to split a node
    {'min_samples_split': [2, 4, 8, 16, 32, 64, 128, 256, 512]},
    # Minimum samples required in a leaf
    {'min_samples_leaf': [1, 5, 10, 25, 50, 100, 200, 300, 400, 500, 600, 700]},
    # Minimum weighted fraction of samples required in a leaf
    {'min_weight_fraction_leaf': [0.1, 0.2, 0.3, 0.4, 0.5]},
    # Number of features considered at each split
    {'max_features': [1, 5, 10, 50, 60, 62]},
    # Maximum number of leaf nodes
    {'max_leaf_nodes': [2, 4, 8, 16, 32, 64, 128, 256]},
    # Minimum impurity decrease required to split
    {'min_impurity_decrease': [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]},
]

for grid in parameter_grids:
    DecisionTreeAccuracyTest(parameters=grid)

Random Forests

# Build a random forest with default settings and fit it on the training split
rf = RandomForestClassifier(random_state=2000).fit(X_train, y_train)

# Predict the held-out test split
predictions = rf.predict(X_test)

# Report accuracy on both splits plus the per-class metrics
print("Train Accuracy: ", accuracy_score(y_train, rf.predict(X_train)))
print("Test Accuracy: ", accuracy_score(y_test, predictions))
print("Classification Report: \n", classification_report(y_test, predictions))

Improving Accuracy

Below are methods we used to improve accuracy

Remove Outliers

Remove less important columns

# Copy of the raw (un-encoded) data without the 'Grade' target column
data2 = data.drop('Grade', axis=1)

Show Correlation

# Pairwise correlation between the numeric / one-hot feature columns.
# 'Grade' holds text labels, so it is dropped explicitly: recent pandas
# versions raise on non-numeric columns in corr() instead of skipping them.
corr = data_encoded.drop(columns='Grade').corr()

# Display the correlation matrix as the cell output
corr

Remove Columns

# Features to leave out of the model input
removed_features = ['Birthplace', 'Nationality', 'Parent Responsible', 'Parent School Satisfaction']

# After one-hot encoding these features only exist as '<feature>_<value>'
# dummy columns, so they are matched by prefix.
dummy_prefixes = tuple(feature + '_' for feature in removed_features)
columns_to_drop = ['Grade'] + [
    column for column in data_encoded.columns if column.startswith(dummy_prefixes)
]

# The earlier cell discarded the result of each data.drop(...) call and then
# passed several positional names after `columns=`, which is a SyntaxError.
# All columns are now removed in a single drop whose result is kept.
data_removed = data_encoded.drop(columns=columns_to_drop)

# Display the reduced frame as the cell output
data_removed

Increase Split

# Both splits are seeded like the classifiers (random_state=2000) so that the
# accuracy figures below can be reproduced between runs.

# Split data 50/50 (shown for comparison; replaced by the 80/20 split below)
X_train, X_test, y_train, y_test = train_test_split(
    test_data, target_data, test_size=.5, random_state=2000
)
print("Training Data size:", len(X_train),len(y_train), "\nTest Data size: ",len(X_test),len(y_test))

# Split data 80/20 - this is the split the following cells train on
X_train, X_test, y_train, y_test = train_test_split(
    test_data, target_data, test_size=.2, random_state=2000
)
print("Training Data size:", len(X_train),len(y_train), "\nTest Data size: ",len(X_test),len(y_test))
# Refit the default random forest on the current training split
rf = RandomForestClassifier(random_state=2000).fit(X_train, y_train)

# Predict the held-out test split
predictions = rf.predict(X_test)

# Report accuracy on both splits plus the per-class metrics
print("Train Accuracy: ", accuracy_score(y_train, rf.predict(X_train)))
print("Test Accuracy: ", accuracy_score(y_test, predictions))
print("Classification Report: \n", classification_report(y_test, predictions))

Change parameters

# Sweep the number of trees in the forest and record the test accuracy of each
estimators = [1, 5, 10, 25, 50, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]
est = list(estimators)
score = []
for n_trees in estimators:
    rf = RandomForestClassifier(n_estimators=n_trees, random_state=2000)
    rf.fit(X_train, y_train)
    score.append(accuracy_score(y_test, rf.predict(X_test)))

# Plot accuracy against forest size and save the figure
plt.subplots(figsize=(10,10))
plot = sns.pointplot(x=est, y=score)
plot.set(
    xlabel='Number of estimators',
    ylabel='Accuracy',
    title='Accuracy score of Random Forests by number of estimators',
)
plt.savefig("Estimator Accurracy.png")
# Sweep min_samples_leaf on a 30-tree forest and record the test accuracy of each
leaf_options = [1, 5, 10, 50, 100, 200]
leaf = list(leaf_options)
score = []
for min_leaf in leaf_options:
    rf = RandomForestClassifier(n_estimators=30, random_state=2000, min_samples_leaf=min_leaf)
    rf.fit(X_train, y_train)
    score.append(accuracy_score(y_test, rf.predict(X_test)))

# Plot accuracy against the leaf-size setting, then save and show the figure
plt.subplots(figsize=(10,10))
plot = sns.pointplot(x=leaf, y=score)
plot.set(
    xlabel='Number of minimum leaf samples',
    ylabel='Accuracy',
    title='Accuracy score of Random Forests by number of minimum leaf samples',
)
plt.savefig("Leaf Accurracy.png")
plt.show()
# Fit a random forest that requires at least 5 samples in every leaf
rf = RandomForestClassifier(min_samples_leaf=5,random_state=2000).fit(X_train, y_train)

# Predict the held-out test split
predictions = rf.predict(X_test)

# Report accuracy on both splits plus the per-class metrics
print("Train Accuracy: ", accuracy_score(y_train, rf.predict(X_train)))
print("Test Accuracy: ", accuracy_score(y_test, predictions))
print("Classification Report: \n", classification_report(y_test, predictions))
# Visualise the first tree of the random forest
# Create DOT data. class_names must follow the forest's own class order
# (rf.classes_, sorted ascending); a hand-written High/Middle/Low list
# swaps the 'Low-Level' and 'Middle-Level' labels in the rendered nodes.
dot_data = tree.export_graphviz(
    rf.estimators_[0],
    out_file=None,
    feature_names=test_data.columns,
    class_names=[str(class_name) for class_name in rf.classes_],
)

# Draw graph
graph = pydotplus.graph_from_dot_data(dot_data)

# Show graph
Image(graph.create_png())
# Confusion matrix # classification report # Min, Max, Avg accuracy
# Show the per-feature importance scores of the fitted random forest `rf`
rf.feature_importances_
# Split the raw data into feature columns and the 'Grade' target
Target = data['Grade']
Features = data.drop(columns='Grade')

# Replace every text (object dtype) column with integer codes, refitting the
# same LabelEncoder for each column in turn
label = preprocessing.LabelEncoder()
Cat_Colums = Features.columns[(Features.dtypes == 'object').values]
for col in Cat_Colums:
    Features[col] = label.fit_transform(Features[col])
# Print every feature whose importance in the random forest exceeds 5%.
# Names come from test_data (the frame the forest was trained on). The
# `encoded_data` name used before was never defined, and data_encoded still
# contains 'Grade', which would shift the later names by one.
for feature_name, importance in zip(test_data.columns, rf.feature_importances_):
    if importance > 0.05:
        print(feature_name, importance)
# `importances` and `feat_labels` were never defined in the notebook; take
# them from the fitted forest and the frame it was trained on.
importances = rf.feature_importances_
feat_labels = test_data.columns

# Feature positions ordered from most to least important
indices = np.argsort(importances)[::-1]

# Print the ranking: position, feature name padded to 30 characters, score
for rank, feature_index in enumerate(indices, start=1):
    print("%2d) %-*s %f" % (rank, 30, feat_labels[feature_index], importances[feature_index]))

# Horizontal bar chart of the ranking; x/y are passed by keyword because
# recent seaborn releases no longer accept them positionally
h = sns.barplot(x=importances[indices], y=feat_labels[indices])
- When n_estimators reaches its plateau

Compared to other methods

# Fit a support vector classifier with default settings and report test accuracy
svm = SVC().fit(X_train, y_train)
print("Score of SVM",svm.score(X_test,y_test))
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

# X_tr_std, X_test_std and y_tr were used below but never defined anywhere in
# the notebook. Judging by the names they are presumably the standardised
# train/test features and the training labels - TODO confirm.
# The scaler is fitted on the training split only, then applied to both splits.
scaler = preprocessing.StandardScaler().fit(X_train)
X_tr_std = scaler.transform(X_train)
X_test_std = scaler.transform(X_test)
y_tr = y_train

# Multi-layer perceptron
mlp = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=2000)
mlp.fit(X_tr_std, y_tr)
y_pred_mlp = mlp.predict(X_test_std)
print('Misclassified samples: %d' % (y_test != y_pred_mlp).sum())
print('Accuracy: %.2f' % accuracy_score(y_test, y_pred_mlp))

# Random forest, seeded like the other models in this notebook
rf = RandomForestClassifier(random_state=2000)
rf.fit(X_tr_std, y_tr)
y_pred_rf = rf.predict(X_test_std)
print('Misclassified samples: %d' % (y_test != y_pred_rf).sum())
print('Accuracy: %.2f' % accuracy_score(y_test, y_pred_rf))

# Linear-kernel support vector machine
svm = SVC(kernel='linear', C=2.0, random_state=2000)
svm.fit(X_tr_std, y_tr)
y_pred = svm.predict(X_test_std)
print('Misclassified samples: %d' % (y_test != y_pred).sum())
print('Accuracy: %.2f' % accuracy_score(y_test, y_pred))