k-Nearest Neighbours

k-NN is a simple non-parametric classification algorithm that makes no assumptions on the distribution of the data. It simply looks at he k-nearest neighbours of a given point and assigns the majority class to that point.

library(class) # for knn
## Warning: package 'class' was built under R version 4.4.2
library(MASS) # for generating data from the MVN distribution

We will generate some synthetic data from a multivariate normal distribution and then classify the points using k-NN.

simA <- mvrnorm(1000, mu = c(0, 0), Sigma = matrix(c(10, 3, 3, 2), nrow = 2))
simB <- mvrnorm(1000, mu = c(5, 3), Sigma = matrix(c(12, 2, 2, 15), nrow = 2))
simT <- rbind(simA, simB)
dim(simT)
## [1] 2000    2

Add labels to the data, 1 for class A and 2 for class B.

class_ind <- rep(c(1, 2), each = 1000)
simT <- cbind(simT, class_ind)
plot(simT[, 1], simT[, 2], col = (simT[, 3]))

Note: as.factor is used to distinguish that the class membership is a categorical variable (not numerical), although it isn’t strictly necessary to use it here.

Splitting the data

We will split the data into training, testing and validation sets with proportions 0.5, 0.25 and 0.25 respectively.

# note that the first 900 rows are class A and the next 900 are class B
train_ind <- c(1:500, 1001:1500)
test_ind <- c(501:750, 1501:1750)
val_ind <- c(751:1000, 1751:2000)

train <- simT[train_ind, -3]
test <- simT[test_ind, -3]
val <- simT[val_ind, -3]

Performing k-NN

The knn function from the class package is used to perform k-NN classification. It takes in the training data train, its class labels cl and k as arguments. The function returns the predicted class labels for the test data.

knn_pred <- knn(train = train, test, cl = simT[train_ind, 3], k = 5)

Let’s look agt the missclassification rate for k=5.

table <- table(knn_pred, simT[test_ind, 3])
print(table)
##         
## knn_pred   1   2
##        1 223  51
##        2  27 199
missclassification_rate <- 1 - sum(diag(table)) / sum(table)
print(paste("Missclassification rate: ", missclassification_rate))
## [1] "Missclassification rate:  0.156"

Choosing the best k

Testing the missclassification rate for different values of k.

kmax <- 50
k <- 1:kmax
p <- rep(0, kmax)
ntest <- nrow(test)

k_summary <- cbind(k, p)
colnames(k_summary) <- c("k", "% misclassified")

for (i in 1:kmax) {
  result <- knn(train, test, cl = simT[train_ind, 3], k = i)
  table <- table(result, simT[test_ind, 3])
  sum_agree <- sum(diag(table))
  k_summary[i, 2] <- (ntest - sum_agree) / ntest
}

k_summary[1:10, ]
##        k % misclassified
##  [1,]  1           0.164
##  [2,]  2           0.178
##  [3,]  3           0.164
##  [4,]  4           0.178
##  [5,]  5           0.156
##  [6,]  6           0.160
##  [7,]  7           0.164
##  [8,]  8           0.164
##  [9,]  9           0.158
## [10,] 10           0.154

We can plot the the results:

plot(k_summary[, 1], k_summary[, 2], type = "l", xlab = "k", ylab = "% misclassified")

Pick the lowest value of k that gives the lowest misclassification rate. We can use which.min which returns the index of the first occurrence of the minimum value in a vector.

min_k <- which.min(k_summary[, 2])
print(paste("Best k: ", min_k))
## [1] "Best k:  22"

Validation

Now we can use the best \(k\) to classify the validation set to estimate the classification error.

val_pred <- knn(train, val, cl = simT[train_ind, 3], k = min_k)
table <- table(val_pred, simT[val_ind, 3])
error <- 1 - sum(diag(table)) / sum(table)
print(paste("Validation error: ", error))
## [1] "Validation error:  0.15"

We can plot how the model performs on the validation set.

par(mfrow = c(1, 2))
plot(val[, 1], val[, 2], col = val_pred, main = "Predicted")
plot(val[, 1], val[, 2], col = simT[val_ind, 3], main = "Actual")

Choosing the best k using cross-validation

We can use cross-validation to choose the best value of \(k\). Let’s use 10-fold cross-validation.

# split the data into 10 folds
folds <- cut(1:1000, breaks = 10, labels = FALSE)
folds <- rep(folds, 2)

kmax <- 50
k <- 1:kmax
p <- rep(0, kmax)

k_summary <- cbind(k, p)
colnames(k_summary) <- c("k", "% misclassified")

for (i in 1:kmax) {
  proportion_classified <- 0
  for (j in 1:10) {
    train_ind <- which(folds != j) # train on all folds except the j-th fold
    test_ind <- which(folds == j) # test on the j-th fold

    result <- knn(train = simT[train_ind, -3], simT[test_ind, -3], cl = simT[train_ind, 3], k = i)
    table <- table(result, simT[test_ind, 3])
    proportion_classified <- proportion_classified + sum(diag(table)) / sum(table)
  }
  k_summary[i, 2] <- 1 - proportion_classified / 10
}

k_summary[1:10, ]
##        k % misclassified
##  [1,]  1          0.1795
##  [2,]  2          0.1895
##  [3,]  3          0.1695
##  [4,]  4          0.1635
##  [5,]  5          0.1640
##  [6,]  6          0.1695
##  [7,]  7          0.1605
##  [8,]  8          0.1550
##  [9,]  9          0.1580
## [10,] 10          0.1530
plot(k_summary[, 1], k_summary[, 2], type = "l", xlab = "k", ylab = "% misclassified")

min_k <- which.min(k_summary[, 2])
print(paste("Best k: ", min_k))
## [1] "Best k:  34"