Contact
CoCalc Logo Icon
Store | Features | Docs | Share | Support | News | About | Sign Up | Sign In
| Download
Views: 39
Kernel: R (R-Project)

Lecture 22: Nearest Neighbor Classifier, continued

Today:

  1. Nearest Neighbor Classifier

  2. Assessing a Classifier

Setup and Data Upload

# Setup: load the required packages (one statement per line so each call
# parses and runs).
library("dplyr")
library("ggplot2")

# ignore the next two lines; these are just to make plots a bit smaller
library("repr")
options(repr.plot.width = 3, repr.plot.height = 3)
# dataset
# Read the data from "breast-cancer.csv" (path is relative to the working
# directory) into a data frame.
cancerdata <- read.csv("breast-cancer.csv")

# Show the dimensions (rows, columns) and the first three rows.
dim(cancerdata)
head(cancerdata, 3)
  1. 683
  2. 11
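| ID | Clump.Thickness | Uniformity.of.Cell.Size | Uniformity.of.Cell.Shape | Marginal.Adhesion | Single.Epithelial.Cell.Size | Bare.Nuclei | Bland.Chromatin | Normal.Nucleoli | Mitoses | Class |
|---|---|---|---|---|---|---|---|---|---|---|
| 1000025 | 5 | 1 | 1 | 1 | 2 | 1 | 3 | 1 | 1 | 0 |
| 1002945 | 5 | 4 | 4 | 5 | 7 | 10 | 3 | 2 | 1 | 0 |
| 1015425 | 3 | 1 | 1 | 1 | 2 | 2 | 3 | 1 | 1 | 0 |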
# Pick 400 rows to "train" the classifier; use the remaining 283 to
# test/assess the classifier
trainingdata <- cancerdata[1:400, 1:11]
testdata <- cancerdata[401:683, 1:11]

# Confirm the sizes of the two subsets.
dim(trainingdata)
dim(testdata)
  1. 400
  2. 11
  1. 283
  2. 11

k-Nearest Neighbor Classifier: Building the pieces

# Step 1: Measure distance between new data point and each training data point
# Given: trainingdata and testdata[1, ]
# Goal: find distance between testdata[1, ] and each row of trainingdata,
#       in terms of specified features
featurecolumns <- 2:10

# One row per training point: its class label and its distance to the
# test point. The row count is taken from trainingdata, not hard-coded.
df <- data.frame(
  Class = trainingdata$Class,
  distance = double(nrow(trainingdata))
)

# Euclidean distance to every training row at once: subtract the test point
# from each row of the feature matrix, square, sum across features, and take
# the square root. This replaces the row-by-row while loop.
train_features <- as.matrix(trainingdata[, featurecolumns, drop = FALSE])
test_point <- unlist(testdata[1, featurecolumns])
df$distance <- unname(sqrt(rowSums(sweep(train_features, 2, test_point)^2)))
# clicker
# Small example of how order() is used to pick out the smallest values.
friends <- data.frame(
  name = c("Ada", "Ben", "Chandra", "Dante"),
  age = c(23, 18, 20, 25)
)

# order() returns the row indices that sort age in increasing order;
# the first two are the indices of the two youngest friends.
youngest_indices <- order(friends$age)[1:2]
friends$name[youngest_indices]
  1. Ben
  2. Chandra
Levels:
  1. 'Ada'
  2. 'Ben'
  3. 'Chandra'
  4. 'Dante'
# Step 2: Find the k nearest neighbors
k <- 6

# order() sorts the distances in increasing order and returns row indices;
# the first k indices are the k closest training points.
indices_of_knn <- order(df$distance)[1:k]

# Class labels of those k training points.
labels_of_knn <- df$Class[indices_of_knn]
labels_of_knn
  1. 1
  2. 1
  3. 1
  4. 0
  5. 0
  6. 0
# Steps 3 & 4: Take a majority vote to predict label of new data point
# Count how many of the k nearest neighbors carry each label.
num_of_ones <- sum(labels_of_knn == 1)
num_of_zeros <- sum(labels_of_knn == 0)
num_of_ones
num_of_zeros

# The label with more votes wins.
if (num_of_ones > num_of_zeros) {
  predicted_label <- 1
} else if (num_of_ones < num_of_zeros) {
  predicted_label <- 0
} else {
  predicted_label <- sample(0:1, 1) # if there is a tie, toss a coin
}
predicted_label
3
3
1

k-Nearest Neighbor Classifier: Predicting the labels of each test data

# Combine and modify the above four steps to predict the labels of the test
# data, using the kNN classifier

# number of test data
num_test_rows <- dim(testdata)[1]

# set up an empty data frame, to be filled with predicted labels
prediction <- data.frame(pred_label = double(num_test_rows))

# number of training data
num_training <- dim(trainingdata)[1]

# set up an empty data frame, to be filled with distance between test data
# point to each training data point
df <- data.frame(class = trainingdata$Class, distance = double(num_training))

# specify which feature columns to be included
featurecolumns <- c(2:10)

# As an example, let's say we look at the 5 nearest neighbors.
# k does not change between test points, so it is set once before the loop.
k <- 5
# Asking for more neighbors than there are training points would produce NA
# indices below, so fail early instead.
stopifnot(k <= num_training)

# Extract the feature columns as numeric matrices once, so the distance to
# all training points can be computed in a single vectorized step per test
# point instead of one data-frame row at a time.
train_features <- as.matrix(trainingdata[, featurecolumns, drop = FALSE])
test_features <- as.matrix(testdata[, featurecolumns, drop = FALSE])

# Loop over each test data point
count_test <- 1
while (count_test <= num_test_rows) {
  # Step 1: Measure distance between current test data point and each
  # training data point, in terms of the specified features.
  # Subtract the test point from every training row, square, sum across
  # features, and take the square root (Euclidean distance).
  differences <- sweep(train_features, 2, test_features[count_test, ])
  df$distance <- unname(sqrt(rowSums(differences^2)))

  # Step 2: Find the k nearest neighbors
  indices_of_knn <- order(df$distance)[seq_len(k)]
  labels_of_knn <- df$class[indices_of_knn]

  # Steps 3 & 4: Take a majority vote to predict label of new data point.
  # Note: with >=, a tie is resolved in favor of label 1.
  num_ones <- sum(labels_of_knn == 1)
  num_zeros <- sum(labels_of_knn == 0)
  if (num_ones >= num_zeros) {
    predictedlabel <- 1
  } else {
    predictedlabel <- 0
  }

  # Store the predicted label in the prediction data frame
  prediction$pred_label[count_test] <- predictedlabel

  # go to the next test data point
  count_test <- count_test + 1
}