k-NN is a simple non-parametric classification algorithm that makes no assumptions about the distribution of the data. It simply looks at the k nearest neighbours of a given point and assigns the majority class among them to that point.
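To make the idea concrete, here is a minimal from-scratch sketch of the algorithm for a single query point (for intuition only; below we use the knn function from the class package, and the name knn_classify is illustrative, not from any library).
# Classify one query point x given training points X (a matrix) and labels y:
# compute the distances, take the k nearest, and return the majority label
knn_classify <- function(X, y, x, k = 5) {
  d <- sqrt(rowSums((X - matrix(x, nrow(X), ncol(X), byrow = TRUE))^2)) # Euclidean distances
  nearest <- order(d)[1:k] # indices of the k closest training points
  names(which.max(table(y[nearest]))) # majority class among the neighbours
}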
library(class) # for knn
library(MASS) # for generating data from the MVN distribution
We will generate some synthetic data from a multivariate normal distribution and then classify the points using k-NN.
simA <- mvrnorm(1000, mu = c(0, 0), Sigma = matrix(c(10, 3, 3, 2), nrow = 2))
simB <- mvrnorm(1000, mu = c(5, 3), Sigma = matrix(c(12, 2, 2, 15), nrow = 2))
simT <- rbind(simA, simB)
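As a quick sanity check, the sample moments should be close to the parameters we specified (the exact values will vary between runs, since no random seed is set).
round(colMeans(simA), 2) # approximately c(0, 0)
round(cov(simA), 2) # approximately matrix(c(10, 3, 3, 2), nrow = 2)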
dim(simT)
## [1] 2000    2
Add labels to the data: 1 for class A and 2 for class B.
class_ind <- rep(c(1, 2), each = 1000)
simT <- cbind(simT, class_ind)
plot(simT[, 1], simT[, 2], col = as.factor(simT[, 3]))
Note: as.factor marks the class membership as a categorical variable (not a numerical one), although it isn't strictly necessary here, since col also accepts integer colour codes.
We will split the data into training, testing and validation sets with proportions 0.5, 0.25 and 0.25 respectively.
# note that the first 1000 rows are class A and the next 1000 are class B
train_ind <- c(1:500, 1001:1500)
test_ind <- c(501:750, 1501:1750)
val_ind <- c(751:1000, 1751:2000)
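The fixed index ranges work because the rows are ordered by class. For data in arbitrary order you would draw the indices at random instead; a sketch (running it would replace the fixed split above):
shuffled <- sample(nrow(simT)) # random permutation of the row indices
train_ind <- shuffled[1:1000]
test_ind <- shuffled[1001:1500]
val_ind <- shuffled[1501:2000]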
train <- simT[train_ind, -3]
test <- simT[test_ind, -3]
val <- simT[val_ind, -3]
The knn function from the class package performs k-NN classification. It takes the training data train, the test data, the training class labels cl and the number of neighbours k as arguments, and returns the predicted class labels for the test data.
knn_pred <- knn(train = train, test, cl = simT[train_ind, 3], k = 5)
Let's look at the misclassification rate for k = 5.
table <- table(knn_pred, simT[test_ind, 3])
print(table)
##         
## knn_pred   1   2
##        1 223  51
##        2  27 199
misclassification_rate <- 1 - sum(diag(table)) / sum(table)
print(paste("Misclassification rate: ", misclassification_rate))
## [1] "Misclassification rate:  0.156"
Now let's compute the misclassification rate for a range of values of k.
kmax <- 50
k <- 1:kmax
p <- rep(0, kmax)
ntest <- nrow(test)
k_summary <- cbind(k, p)
colnames(k_summary) <- c("k", "misclassification rate")
for (i in 1:kmax) {
  result <- knn(train, test, cl = simT[train_ind, 3], k = i)
  table <- table(result, simT[test_ind, 3])
  sum_agree <- sum(diag(table)) # number of correctly classified test points
  k_summary[i, 2] <- (ntest - sum_agree) / ntest
}
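The same sweep can be written more compactly using the one-line error computation from earlier; a sketch with vapply, equivalent to the loop above:
# One error estimate per candidate k
err_for_k <- function(i) {
  pred <- knn(train, test, cl = simT[train_ind, 3], k = i)
  mean(pred != simT[test_ind, 3])
}
k_summary[, 2] <- vapply(1:kmax, err_for_k, numeric(1))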
k_summary[1:10, ]
##        k misclassification rate
##  [1,]  1                  0.164
##  [2,]  2                  0.178
##  [3,]  3                  0.164
##  [4,]  4                  0.178
##  [5,]  5                  0.156
##  [6,]  6                  0.160
##  [7,]  7                  0.164
##  [8,]  8                  0.164
##  [9,]  9                  0.158
## [10,] 10                  0.154
We can plot the results:
plot(k_summary[, 1], k_summary[, 2], type = "l", xlab = "k", ylab = "misclassification rate")
We pick the smallest k that achieves the minimum misclassification rate. We can use which.min, which returns the index of the first occurrence of the minimum value in a vector.
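A quick illustration of that behaviour:
which.min(c(0.3, 0.1, 0.4, 0.1)) # returns 2: the minimum 0.1 first occurs at position 2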
min_k <- which.min(k_summary[, 2])
print(paste("Best k: ", min_k))## [1] "Best k:  22"Now we can use the best \(k\) to classify the validation set to estimate the classification error.
val_pred <- knn(train, val, cl = simT[train_ind, 3], k = min_k)
table <- table(val_pred, simT[val_ind, 3])
error <- 1 - sum(diag(table)) / sum(table)
print(paste("Validation error: ", error))## [1] "Validation error:  0.15"We can plot how the model performs on the validation set.
par(mfrow = c(1, 2))
plot(val[, 1], val[, 2], col = val_pred, main = "Predicted")
plot(val[, 1], val[, 2], col = simT[val_ind, 3], main = "Actual")
We can use cross-validation to choose the best value of \(k\). Let's use 10-fold cross-validation.
# split the data into 10 folds
folds <- cut(1:1000, breaks = 10, labels = FALSE) # fold labels 1-10, one per block of 100 consecutive rows
folds <- rep(folds, 2) # repeat for the second class, so every fold contains both classes equally
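Contiguous folds are fine here because the simulated rows within each class are exchangeable. With real, possibly ordered data you would usually randomise the fold assignment instead; a sketch (running it would replace the folds above):
folds <- sample(rep(1:10, length.out = nrow(simT))) # random, roughly equal-sized folds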
kmax <- 50
k <- 1:kmax
p <- rep(0, kmax)
k_summary <- cbind(k, p)
colnames(k_summary) <- c("k", "misclassification rate")
for (i in 1:kmax) {
  proportion_classified <- 0 # running sum of the per-fold accuracies
  for (j in 1:10) {
    train_ind <- which(folds != j) # train on all folds except the j-th fold
    test_ind <- which(folds == j) # test on the j-th fold
    result <- knn(train = simT[train_ind, -3], simT[test_ind, -3], cl = simT[train_ind, 3], k = i)
    table <- table(result, simT[test_ind, 3])
    proportion_classified <- proportion_classified + sum(diag(table)) / sum(table)
  }
  k_summary[i, 2] <- 1 - proportion_classified / 10 # mean accuracy over the 10 folds, as an error rate
}
k_summary[1:10, ]
##        k misclassification rate
##  [1,]  1                 0.1795
##  [2,]  2                 0.1895
##  [3,]  3                 0.1695
##  [4,]  4                 0.1635
##  [5,]  5                 0.1640
##  [6,]  6                 0.1695
##  [7,]  7                 0.1605
##  [8,]  8                 0.1550
##  [9,]  9                 0.1580
## [10,] 10                 0.1530
plot(k_summary[, 1], k_summary[, 2], type = "l", xlab = "k", ylab = "misclassification rate")
min_k <- which.min(k_summary[, 2])
print(paste("Best k: ", min_k))## [1] "Best k:  34"