Lecture 22: Nearest Neighbor Classifier, continued

Today:

Nearest Neighbor Classifier
Assessing a Classifier

Setup and Data Upload

In [2]:

library("dplyr")
library("ggplot2")

# Optional display tweak (safe to ignore): shrink the plots drawn in the notebook
library("repr")
options(repr.plot.width = 3, repr.plot.height = 3)

In [3]:

# load the breast cancer dataset from a CSV file in the working directory
cancerdata <- read.csv(file = "breast-cancer.csv")

In [4]:

# size of the dataset (rows, columns), followed by a peek at its first 3 rows
dim(cancerdata)
head(cancerdata, n = 3)

ID	Clump.Thickness	Uniformity.of.Cell.Size	Uniformity.of.Cell.Shape	Marginal.Adhesion	Single.Epithelial.Cell.Size	Bare.Nuclei	Bland.Chromatin	Normal.Nucleoli	Mitoses
1000025	5	1	1	1	2	1	3	1	1
1002945	5	4	4	5	7	10	3	2	1
1015425	3	1	1	1	2	2	3	1	1

In [5]:

# Pick the first 400 rows to "train" the classifier; use all remaining rows
# (283 of them when the file has 683 rows) to test/assess the classifier.
# Negative indexing selects "every row that is not a training row", so the
# total number of rows in the file does not have to be hard-coded.
train_rows <- 1:400
trainingdata <- cancerdata[train_rows, 1:11]
testdata <- cancerdata[-train_rows, 1:11]

In [6]:

# Sanity-check the split: each call displays (number of rows, number of columns)
dim(trainingdata)
dim(testdata)

k-Nearest Neighbor Classifier: Building the pieces

In [7]:

# Step 1: Measure distance between new data point and each training data point

# Given: trainingdata and testdata[1,]
# Goal: find the Euclidean distance between testdata[1,] and each row of trainingdata,
#       in terms of the specified features
# Result: df has one row per training data point, holding its label (Class)
#         and its distance to the new data point


# columns of the data frames that hold the features used to measure distance
featurecolumns <- 2:10

# features of the training data as a numeric matrix (one row per training point),
# and features of the new data point as a plain numeric vector
training_features <- as.matrix(trainingdata[, featurecolumns, drop = FALSE])
new_point <- unlist(testdata[1, featurecolumns])

# sweep() subtracts new_point from every row of the matrix at once;
# squaring, summing across each row and taking the square root then gives
# all the distances in one go (no loop and no hard-coded number of rows)
differences <- sweep(training_features, 2, new_point)

df <- data.frame( Class = trainingdata$Class,
                  distance = unname(sqrt(rowSums(differences^2)))  )

In [8]:

# clicker question: what does this display?
friends <- data.frame(
    name = c("Ada", "Ben", "Chandra", "Dante"),
    age = c(23, 18, 20, 25)
)

# order() lists row positions from smallest age to largest; keep the first two
youngest_indices <- head(order(friends$age), 2)
friends$name[youngest_indices]

Ben
Chandra

Levels:

'Ada'
'Ben'
'Chandra'
'Dante'

In [9]:

# Step 2: Find the k nearest neighbors

# number of neighbors to consult
k <- 6

# row positions of the training data points, ordered from nearest to farthest
nearest_first <- order( df$distance )

# keep the k closest ones and look up their labels
indices_of_knn <- nearest_first[ seq_len(k) ]
labels_of_knn <- df$Class[ indices_of_knn ]
labels_of_knn

In [10]:

# Steps 3 & 4: Count the labels among the k nearest neighbors, then let the majority decide

num_of_ones <-  sum(  labels_of_knn == 1 )
num_of_zeros <- sum(  labels_of_knn == 0 )

num_of_ones
num_of_zeros

# positive margin: label 1 wins; negative margin: label 0 wins; zero: tie
vote_margin <- num_of_ones - num_of_zeros

if( vote_margin == 0 ){
    predicted_label <- sample( 0:1, 1) # if there is a tie, toss a coin
}else if( vote_margin > 0 ){
    predicted_label <- 1
}else{
    predicted_label <- 0
}

predicted_label

k-Nearest Neighbor Classifier: Predicting the label of each test data point

In [11]:

# Combine and modify the above four steps to predict the labels of the test data, using the kNN classifier
#
# Result: prediction$pred_label[i] holds the predicted label (0 or 1) for testdata[i, ]

# number of test data
num_test_rows <- nrow(testdata)

# set up an empty data frame, to be filled with predicted labels
prediction <- data.frame( pred_label = double(num_test_rows) )

# number of training data
num_training <- nrow(trainingdata)
# set up a data frame, to be filled with the distance from the current test data point to each training data point
df <- data.frame( class = trainingdata$Class, distance = double(num_training) )

# specify which feature columns to be included
featurecolumns <- 2:10

# number of nearest neighbors that get a vote
# (as an example, let's say we look at the 5 nearest neighbors);
# it is the same for every test data point, so it is set once, before the loop
k <- 5

# Copy the feature columns into numeric matrices (one row per data point).
# Matrix arithmetic gives all the distances for one test data point at once,
# which is much faster than looping over the training data row by row.
training_features <- as.matrix(trainingdata[, featurecolumns, drop = FALSE])
test_features <- as.matrix(testdata[, featurecolumns, drop = FALSE])


# Loop over each test data point
for( count_test in seq_len(num_test_rows) ){

    # Step 1: Measure distance between current test data point and each training data point
    # Given: trainingdata and testdata[count_test,]
    # Goal: find distance between testdata[count_test,] and each row of trainingdata, in terms of specified features

    # subtract the current test data point from every row of the training features,
    # then square, sum across each row, and take the square root
    differences <- sweep(training_features, 2, test_features[count_test, ])
    df$distance <- unname(sqrt(rowSums(differences^2)))

    # Step 2: Find the k nearest neighbors

    indices_of_knn <- order( df$distance )[1:k]
    labels_of_knn <- df$class[indices_of_knn]

    # Steps 3 & 4: Take a majority vote to predict label of new data point

    num_ones <- sum( labels_of_knn == 1)
    num_zeros <- sum( labels_of_knn == 0)

    if(num_ones > num_zeros){
        predictedlabel <- 1
    }else if(num_ones < num_zeros){
        predictedlabel <- 0
    }else{
        # a tie can only happen when k is even; toss a coin rather than always favoring label 1
        predictedlabel <- sample( 0:1, 1)
    }

    # Store the predicted label in the prediction data frame
    prediction$pred_label[count_test] <- predictedlabel
}

In [0]:

In [0]: