k-NN is a simple non-parametric classification algorithm that makes no assumptions about the distribution of the data. It simply looks at the k nearest neighbours of a given point and assigns the majority class among them to that point.
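As a toy illustration of that rule (the tiny data set below is made up purely for illustration; the actual analysis uses the knn function from the class package):
x_train <- matrix(c(1, 1, 2, 2, 8, 9), ncol = 2, byrow = TRUE) # three training points
y_train <- c("A", "A", "B") # their class labels
query <- c(1.5, 1.5) # the point we want to classify
d <- sqrt(rowSums(sweep(x_train, 2, query)^2)) # Euclidean distance to each training point
nearest <- order(d)[1:2] # indices of the k = 2 closest training points
names(which.max(table(y_train[nearest]))) # majority class among them, here "A"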
library(class) # for knn
library(MASS) # for generating data from the MVN distribution
We will generate some synthetic data from a multivariate normal distribution and then classify the points using k-NN.
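Since mvrnorm draws random samples (and knn breaks ties at random), the exact numbers shown below will vary between runs; fixing the seed first makes them reproducible (the seed value is arbitrary).
set.seed(123) # arbitrary seed, only for reproducibility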
simA <- mvrnorm(1000, mu = c(0, 0), Sigma = matrix(c(10, 3, 3, 2), nrow = 2))
simB <- mvrnorm(1000, mu = c(5, 3), Sigma = matrix(c(12, 2, 2, 15), nrow = 2))
simT <- rbind(simA, simB)
dim(simT)
## [1] 2000 2
Add labels to the data, 1 for class A and 2 for class B.
class_ind <- rep(c(1, 2), each = 1000)
simT <- cbind(simT, class_ind)
plot(simT[, 1], simT[, 2], col = (simT[, 3]))
Note: as.factor can be used to make explicit that class membership is a categorical variable (not numerical), although it isn’t strictly necessary here since the numeric labels 1 and 2 already map to distinct plotting colours.
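For example, the same plot with the labels coerced to a factor:
plot(simT[, 1], simT[, 2], col = as.factor(simT[, 3]))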
We will split the data into training, testing and validation sets with proportions 0.5, 0.25 and 0.25 respectively.
# note that the first 1000 rows are class A and the next 1000 are class B
train_ind <- c(1:500, 1001:1500)
test_ind <- c(501:750, 1501:1750)
val_ind <- c(751:1000, 1751:2000)
train <- simT[train_ind, -3]
test <- simT[test_ind, -3]
val <- simT[val_ind, -3]
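The split above is deterministic, which works here because the rows are already ordered and balanced by class. One could instead draw the split at random, giving approximately the target proportions; a minimal sketch (the seed and the object names ending in _r are arbitrary):
set.seed(1) # arbitrary seed, only for reproducibility
split <- sample(c("train", "test", "val"), nrow(simT), replace = TRUE, prob = c(0.5, 0.25, 0.25))
train_r <- simT[split == "train", -3]
test_r <- simT[split == "test", -3]
val_r <- simT[split == "val", -3]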
The knn function from the class package is used to perform k-NN classification. It takes the training data train, the test data test, the training class labels cl and the number of neighbours k as arguments, and returns the predicted class labels for the test data.
knn_pred <- knn(train = train, test, cl = simT[train_ind, 3], k = 5)
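As an aside, knn can also report the share of the k neighbours that voted for the winning class by passing prob = TRUE; the proportions are attached to the returned factor as the "prob" attribute (knn_prob is just an illustrative name).
knn_prob <- knn(train = train, test = test, cl = simT[train_ind, 3], k = 5, prob = TRUE)
head(attr(knn_prob, "prob")) # vote share of the winning class for the first few test points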
Let’s look at the misclassification rate for k = 5.
table <- table(knn_pred, simT[test_ind, 3])
print(table)
##
## knn_pred 1 2
## 1 223 51
## 2 27 199
misclassification_rate <- 1 - sum(diag(table)) / sum(table)
print(paste("Misclassification rate: ", misclassification_rate))
## [1] "Misclassification rate: 0.156"
Testing the misclassification rate for different values of k.
kmax <- 50
k <- 1:kmax
p <- rep(0, kmax)
ntest <- nrow(test)
k_summary <- cbind(k, p)
colnames(k_summary) <- c("k", "% misclassified")
for (i in 1:kmax) {
  result <- knn(train, test, cl = simT[train_ind, 3], k = i)
  table <- table(result, simT[test_ind, 3])
  sum_agree <- sum(diag(table))
  k_summary[i, 2] <- (ntest - sum_agree) / ntest
}
k_summary[1:10, ]
## k % misclassified
## [1,] 1 0.164
## [2,] 2 0.178
## [3,] 3 0.164
## [4,] 4 0.178
## [5,] 5 0.156
## [6,] 6 0.160
## [7,] 7 0.164
## [8,] 8 0.164
## [9,] 9 0.158
## [10,] 10 0.154
We can plot the results:
plot(k_summary[, 1], k_summary[, 2], type = "l", xlab = "k", ylab = "% misclassified")
We pick the smallest value of k that achieves the lowest misclassification rate. We can use which.min, which returns the index of the first occurrence of the minimum value in a vector, so ties are resolved in favour of the smallest k.
min_k <- which.min(k_summary[, 2])
print(paste("Best k: ", min_k))
## [1] "Best k: 22"
Now we can use the best \(k\) to classify the validation set to estimate the classification error.
val_pred <- knn(train, val, cl = simT[train_ind, 3], k = min_k)
table <- table(val_pred, simT[val_ind, 3])
error <- 1 - sum(diag(table)) / sum(table)
print(paste("Validation error: ", error))
## [1] "Validation error: 0.15"
We can plot how the model performs on the validation set.
par(mfrow = c(1, 2))
plot(val[, 1], val[, 2], col = val_pred, main = "Predicted")
plot(val[, 1], val[, 2], col = simT[val_ind, 3], main = "Actual")
We can use cross-validation to choose the best value of \(k\). Let’s use 10-fold cross-validation.
# split the data into 10 folds
folds <- cut(1:1000, breaks = 10, labels = FALSE)
folds <- rep(folds, 2)
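The folds above are contiguous blocks of 100 rows within each class, so every fold is balanced between the two classes. One could instead assign the folds at random within each class; a minimal sketch (folds_random is an illustrative name):
set.seed(1) # arbitrary seed, only for reproducibility
folds_random <- c(sample(rep(1:10, each = 100)), # shuffled fold labels for the class A rows
                  sample(rep(1:10, each = 100))) # and, independently, for the class B rows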
kmax <- 50
k <- 1:kmax
p <- rep(0, kmax)
k_summary <- cbind(k, p)
colnames(k_summary) <- c("k", "% misclassified")
for (i in 1:kmax) {
  proportion_classified <- 0
  for (j in 1:10) {
    train_ind <- which(folds != j) # train on all folds except the j-th fold
    test_ind <- which(folds == j) # test on the j-th fold
    result <- knn(train = simT[train_ind, -3], simT[test_ind, -3], cl = simT[train_ind, 3], k = i)
    table <- table(result, simT[test_ind, 3])
    proportion_classified <- proportion_classified + sum(diag(table)) / sum(table)
  }
  k_summary[i, 2] <- 1 - proportion_classified / 10
}
k_summary[1:10, ]
## k % misclassified
## [1,] 1 0.1795
## [2,] 2 0.1895
## [3,] 3 0.1695
## [4,] 4 0.1635
## [5,] 5 0.1640
## [6,] 6 0.1695
## [7,] 7 0.1605
## [8,] 8 0.1550
## [9,] 9 0.1580
## [10,] 10 0.1530
plot(k_summary[, 1], k_summary[, 2], type = "l", xlab = "k", ylab = "% misclassified")
min_k <- which.min(k_summary[, 2])
print(paste("Best k: ", min_k))
## [1] "Best k: 34"