olive <- read.csv("datasets/olive.csv")
set.seed(123)
N <- sample(1:323, size = 50)
D <- sample(324:421, size = 50)
S <- sample(422:572, size = 50)
acidsamp <- olive[c(N, D, S), 3:10]
acidsamp_cmds <- cmdscale(dist(acidsamp), k = 2, eig = TRUE)
acidsamp_cmds
## $points
## [,1] [,2]
## 179 427.219756 -44.268611
## 14 -774.637419 120.458029
## 195 686.795336 -2.781312
## 306 1219.105580 -319.686505
## 118 392.154160 -141.274308
## 299 1089.176977 -323.934324
## 229 506.842424 -208.847004
## 244 484.430420 -93.558750
## 322 1075.800125 -296.656537
## 153 435.665447 -161.163374
## 90 364.873190 -98.666963
## 91 270.373979 -67.905024
## 256 389.098466 -139.379592
## 197 590.715813 -77.391031
## 312 751.874960 -292.892171
## 137 382.132528 -188.252377
## 26 26.174153 -202.669378
## 7 -757.133097 55.261494
## 308 431.518991 -163.302791
## 254 593.262950 -106.793674
## 211 51.178747 -77.543614
## 78 -222.090902 -194.968444
## 81 -64.018387 -127.263716
## 43 184.688919 -154.389260
## 143 626.005044 -87.633464
## 32 250.171180 -200.031743
## 109 379.576075 -44.945363
## 263 -609.346168 -236.752863
## 23 -444.979120 -32.644826
## 135 319.258384 -77.549607
## 224 500.535240 -256.941872
## 166 565.084825 -56.622817
## 217 451.501941 -292.752609
## 290 -266.203569 -98.263400
## 69 91.145814 -268.776876
## 72 100.679873 -293.482838
## 76 -23.721782 -175.037384
## 63 -133.840514 -392.492452
## 141 425.268722 -39.766019
## 210 436.856245 -71.892445
## 314 1108.083706 -428.326553
## 277 508.760902 -240.503552
## 41 -17.848661 -123.118838
## 313 858.175142 -317.766400
## 223 415.425845 -241.818721
## 16 -280.489499 -39.546744
## 116 589.029633 1.786511
## 94 610.958898 -68.690775
## 262 43.004955 -87.163607
## 235 527.088039 -21.281688
## 395 127.872631 194.066222
## 409 76.859906 168.962582
## 420 497.127293 222.372172
## 362 75.368960 118.561012
## 354 -2.134092 160.065338
## 404 111.609434 159.096692
## 373 647.883161 261.659441
## 357 67.011263 158.977858
## 327 147.659353 159.167942
## 336 419.885033 209.777621
## 392 128.163358 171.847704
## 348 64.011678 170.837402
## 375 337.282853 241.241274
## 345 100.613568 107.602530
## 355 39.011009 115.056234
## 410 314.950956 238.852086
## 358 38.115945 168.496251
## 363 6.867943 197.035204
## 353 100.919520 122.725660
## 335 361.343696 228.914538
## 417 440.017451 269.418327
## 403 66.696685 240.485578
## 387 -16.896631 223.376897
## 337 409.082452 240.193954
## 394 176.361905 251.712727
## 390 189.868121 109.396541
## 346 21.142985 135.025079
## 360 8.665299 198.485916
## 331 22.491692 115.530435
## 374 452.881875 227.564772
## 415 469.578415 260.461076
## 369 49.638262 103.131606
## 340 486.260582 167.432878
## 385 403.718597 188.627625
## 396 65.174450 243.913559
## 377 301.650065 240.047158
## 405 61.392334 209.216907
## 416 334.150102 246.306944
## 399 83.533119 213.949159
## 411 406.064910 187.143473
## 400 66.172612 145.626919
## 338 349.985982 245.193920
## 347 14.625329 153.238265
## 372 329.812841 172.222394
## 421 474.413587 228.843407
## 366 4.271817 181.695336
## 330 88.429568 131.575296
## 352 89.605056 101.305931
## 380 361.634563 234.305842
## 386 6.976311 229.416961
## 523 51.565147 22.198800
## 556 -502.926749 145.006246
## 426 -696.445922 -71.898040
## 491 -450.229450 -88.482708
## 437 -687.022863 -77.666144
## 445 -649.382348 -96.164977
## 453 -698.222231 -54.916308
## 442 -683.052752 -77.835543
## 476 -428.415193 -49.050783
## 496 -504.351605 -48.313261
## 504 -191.090668 -220.458692
## 460 -700.043563 -160.073474
## 475 -383.715599 -25.376590
## 558 -358.446948 145.048289
## 469 -702.098674 -79.091777
## 498 -392.701630 -61.632515
## 562 -336.176871 -33.046697
## 532 -361.000012 191.043846
## 561 -393.792080 15.565675
## 422 -694.172605 -66.045382
## 451 -719.900903 -53.985054
## 515 -616.517495 -150.547081
## 568 -166.614527 -164.052463
## 509 -612.137568 -164.895668
## 560 -321.719398 5.365754
## 563 -505.647845 11.340701
## 559 -461.572698 115.957008
## 441 -687.209425 -77.248699
## 525 -147.718868 186.502279
## 488 -447.068514 -37.236701
## 514 -633.510846 -145.200719
## 457 -713.809853 -55.987282
## 473 -274.478253 -237.538272
## 529 -282.386756 66.841792
## 443 -702.589075 -59.082174
## 470 -690.464522 -75.236871
## 463 -743.679585 -91.710294
## 480 -297.206173 -117.167680
## 505 -608.078373 -34.725715
## 432 -696.482549 -67.175432
## 564 -382.300160 -13.983427
## 429 -720.687109 -38.687833
## 467 -720.040168 -28.940568
## 506 -517.050842 -113.692796
## 487 -417.946614 -14.370959
## 557 -470.474336 101.678694
## 530 -383.873683 131.185578
## 569 -420.147225 47.227827
## 493 -580.531040 -109.876978
## 465 -739.641050 -93.841396
##
## $eig
## [1] 3.279952e+07 4.242974e+06 2.778704e+05 1.157159e+05 8.263281e+04
## [6] 2.346772e+04 7.582906e+03 5.395512e+03 1.117523e-08 2.013365e-09
## [11] 1.685660e-09 1.537039e-09 1.417738e-09 1.222937e-09 1.090113e-09
## [16] 8.290391e-10 7.759173e-10 6.801184e-10 4.864411e-10 4.213739e-10
## [21] 4.094898e-10 3.572302e-10 3.490804e-10 3.161002e-10 2.792696e-10
## [26] 2.755511e-10 2.658948e-10 2.557513e-10 2.541600e-10 2.226078e-10
## [31] 2.146574e-10 2.115814e-10 2.048817e-10 1.520932e-10 1.370202e-10
## [36] 1.198429e-10 1.166664e-10 1.118213e-10 1.058304e-10 1.023346e-10
## [41] 8.328555e-11 8.149369e-11 8.148143e-11 7.922675e-11 7.810205e-11
## [46] 7.803348e-11 6.398549e-11 6.169093e-11 5.817167e-11 5.480546e-11
## [51] 4.714920e-11 4.696581e-11 4.483958e-11 4.007372e-11 3.580961e-11
## [56] 3.445920e-11 3.414161e-11 3.388443e-11 3.096850e-11 2.972932e-11
## [61] 2.630018e-11 2.613223e-11 2.392226e-11 2.136580e-11 1.770987e-11
## [66] 1.673932e-11 1.658988e-11 1.586081e-11 1.497265e-11 1.413226e-11
## [71] 1.302525e-11 1.186214e-11 1.184481e-11 1.084021e-11 6.886057e-12
## [76] 6.452765e-12 6.186574e-12 6.111511e-12 5.337698e-12 4.745252e-12
## [81] 3.968809e-12 6.198418e-13 2.138177e-13 -2.301836e-13 -2.495897e-12
## [86] -3.073930e-12 -3.604083e-12 -3.838081e-12 -5.058081e-12 -9.297083e-12
## [91] -9.304059e-12 -9.889393e-12 -9.914366e-12 -1.108673e-11 -1.191921e-11
## [96] -1.210580e-11 -1.266052e-11 -1.320901e-11 -1.688944e-11 -1.806698e-11
## [101] -1.817761e-11 -2.168352e-11 -2.244422e-11 -2.338238e-11 -2.869512e-11
## [106] -2.970929e-11 -3.008708e-11 -3.195371e-11 -3.461473e-11 -3.749660e-11
## [111] -3.877996e-11 -4.016813e-11 -4.073082e-11 -4.701789e-11 -5.369347e-11
## [116] -5.388519e-11 -5.511856e-11 -5.904619e-11 -6.182352e-11 -7.704097e-11
## [121] -7.775544e-11 -7.806119e-11 -7.924059e-11 -8.317791e-11 -9.084675e-11
## [126] -9.602659e-11 -1.001562e-10 -1.033909e-10 -1.041977e-10 -1.101353e-10
## [131] -1.257649e-10 -1.372324e-10 -1.426847e-10 -1.736353e-10 -1.784441e-10
## [136] -1.914889e-10 -2.759313e-10 -2.766728e-10 -3.242833e-10 -3.521565e-10
## [141] -3.786000e-10 -4.186406e-10 -4.580609e-10 -4.764192e-10 -8.292115e-10
## [146] -1.015324e-09 -2.095976e-09 -2.224213e-09 -4.286572e-09 -7.156656e-09
##
## $x
## NULL
##
## $ac
## [1] 0
##
## $GOF
## [1] 0.986349 0.986349
We can now plot the observations in the 2 dimensional space.
x <- acidsamp_cmds$points[, 1]
y <- acidsamp_cmds$points[, 2]
cols <- c(rep(1, 50), rep(2, 50), rep(3, 50))
plot(x, y, type = "n", xlab = "", ylab = "", main = "Classical")
text(x, y, olive[c(N, D, S), 1], cex = 1, col = cols)
Fix \(x_1\) at the origin and construct a matrix \(B\) such that
\[b_{ij}=-\frac{1}{2}(d_{ij}^2-d_{1i}^2-d_{1j}^2)\]
d <- dist(acidsamp[1:10, ])
d2 <- as.matrix(d^2)
n <- dim(d2)[1]
m <- dim(acidsamp)[2]
B <- matrix(0, nrow = n, ncol = n)
for (i in 1:n) {
for (j in 1:n) {
B[i, j] <- -0.5 * (d2[i, j] - d2[i, 1] - d2[j, 1])
}
}
Now compute the eigenvalues and eigenvectors of \(B\).
res <- eigen(B)
res$values
## [1] 3.235186e+06 1.117165e+05 8.299727e+03 4.867870e+03 8.660542e+02
## [6] 5.124428e+02 1.719652e+02 2.866762e+01 4.485425e-11 0.000000e+00
Calculate new observations of the same dimensionality as the original data.
newdata <- matrix(0, nrow = n, ncol = m)
for (i in 1:n) {
for (j in 1:m) {
newdata[i, j] <- sqrt(res$values[j]) * res$vectors[i, j]
}
}
Compare the distance matrices:
sum(dist(newdata))
## [1] 28863.9
sum(dist(acidsamp[1:n, ]))
## [1] 28863.9
Define a function predict for projecting new data into the MDS space.
predict <- function(BMatrix, k) {
res <- eigen(BMatrix)
n <- dim(BMatrix)[1]
prediciton <- matrix(0, nrow = n, ncol = k)
for (i in 1:n) {
for (j in 1:k) {
prediciton[i, j] <- sqrt(res$values[j]) * res$vectors[i, j]
}
}
return(prediciton)
}
Run the function for \(k=2\).
predict(B, 2)
## [,1] [,2]
## [1,] 0.000000 0.00000
## [2,] 1203.811064 -167.58632
## [3,] -240.640299 95.22897
## [4,] -837.895678 -58.55267
## [5,] 7.297074 -108.74612
## [6,] -720.584408 -126.36960
## [7,] -122.823548 -144.38341
## [8,] -70.031481 -41.73776
## [9,] -696.525530 -86.19368
## [10,] -40.184159 -115.41847
Do the same with the cmdscale
function.
cmdscale(dist(acidsamp[1:10, ]), k = 2, eig = TRUE)
## $points
## [,1] [,2]
## 179 -148.67770 -78.26797
## 14 -1358.17277 37.96532
## 195 95.20482 -171.87990
## 306 686.30871 10.70529
## 118 -160.21337 27.39087
## 299 566.34888 68.76667
## 229 -31.44253 71.57468
## 244 -80.41337 -37.88076
## 322 543.99687 33.00112
## 153 -112.93953 38.62468
##
## $eig
## [1] 3.009384e+06 5.184249e+04 8.147892e+03 3.025409e+03 5.163232e+02
## [6] 4.298861e+02 1.612138e+02 2.330035e+01 2.820563e-10 -1.557510e-11
##
## $x
## NULL
##
## $ac
## [1] 0
##
## $GOF
## [1] 0.9959968 0.9959968
The results are a bit different, but not complete off. We can for example see that the order of the eigenvalues is the same.
res$values
## [1] 3.235186e+06 1.117165e+05 8.299727e+03 4.867870e+03 8.660542e+02
## [6] 5.124428e+02 1.719652e+02 2.866762e+01 4.485425e-11 0.000000e+00
Why is this? We need to realize that fi you take any MDS solution and rotate, translate or reflect it, you will get an equivalent solution.
Procrustes analysis is a technique used to compare two MDS solutions. The goal is to find the best transformation that aligns the two configurations.
library(vegan)
## Loading required package: permute
## Loading required package: lattice
sol1 <- cmdscale(dist(acidsamp[1:10, ]), k = 2, eig = TRUE)
sol2 <- predict(B, 2)
procrustes(sol1$points, sol2, symmetric = TRUE)
##
## Call:
## procrustes(X = sol1$points, Y = sol2, symmetric = TRUE)
##
## Procrustes sum of squares:
## 2.565e-05
The result of a procrustes analysis is the sum of the squared differences between the two configurations. The smaller the value, the better the alignment. We can see that the two configurations are in fact identical.
These two methods are available in the MASS
package.
library(MASS)
acidsamp_samm <- sammon(dist(acidsamp), k = 2)
## Initial stress : 0.00184
## stress after 10 iters: 0.00161, magic = 0.018
## stress after 20 iters: 0.00100, magic = 0.500
## stress after 30 iters: 0.00098, magic = 0.500
acidsamp_krus <- isoMDS(dist(acidsamp), k = 2)
## initial value 1.488856
## final value 1.488816
## converged
t-SNE is a dimension reduction method which uses the idea that the distances between points can be viewed as probabilities. The method tries to minimize the difference between the high-dimensional and low-dimensional probabilities. Similarity between \(x_i\) and \(x_j\) is defined as the probability that \(x_i\) would pick \(x_j\) as its neighbor if neighbors were picked in proportion to their probability density under a Gaussian centered at \(x_i\). If we denote this probability as \(p_{i|j}\), we can see that \(p_{i|j} \neq p_{j|i}\). That is why
\[p_{ij} = \frac{p_{i|j} + p_{j|i}}{2}\]
is used in practice.
The method has a parameter called perplexity which can be viewed as the number of neighbors that should be considered in the high-dimensional space. The method is sensitive to this parameter and it should be fine tuned. Usually the perplexity should be between 5 and 50.
library(Rtsne)
set.seed(123)
acidsamp_tsne <- Rtsne(acidsamp, perplexity = 30)
plot(acidsamp_tsne$Y, col = cols, pch = cols, xlab = "tSNE1", ylab = "tSNE2")
legend("topleft",
legend = unique(olive[c(N, D, S), 1]),
col = 1:3, pch = 1:3
)
Compare the results of the 4 different methods in a plot.
par(mfrow = c(2, 2))
plot(acidsamp_cmds$points, col = cols, main = "Classical")
plot(acidsamp_samm$points, col = cols, main = "Sammon")
plot(acidsamp_krus$points, col = cols, main = "Kruskal")
plot(acidsamp_tsne$Y, col = cols, main = "t-SNE")
Given a target matrix X
and another matrix
Y
, procrustes will attempt to find the optimal scaling,
rotation, and translation of Y
to make it as similar to
X
as possible. If symmetric = TRUE
, then
procrustes will find the optimal transformation of \(\frac{Y}{\sqrt{Y^TY}}\) with respect to the
target matrix, \(\frac{X}{\sqrt{X^TX}}\).
dimred <- list(
cmds = acidsamp_cmds$points,
samm = acidsamp_samm$points,
krus = acidsamp_krus$points,
tsne = acidsamp_tsne$Y
)
# SYMMETRIC
proc <- list()
index <- 0
for (i in 1:4) {
for (j in 1:4) {
index <- index + 1
proc[[index]] <- procrustes(X = dimred[[i]], Y = dimred[[j]], symmetric = TRUE)
}
}
names(proc) <- outer(names(dimred), names(dimred),
FUN = function(x, y) {
paste(x, y, sep = "_")
}
)
round(
matrix(
sapply(proc, FUN = function(x) {
x$ss
}),
nrow = 4, byrow = TRUE,
dimnames = list(names(dimred), names(dimred))
),
digits = 5
)
## cmds samm krus tsne
## cmds 0.00000 0.00092 0.00000 0.08907
## samm 0.00092 0.00000 0.00092 0.09657
## krus 0.00000 0.00092 0.00000 0.08907
## tsne 0.08907 0.09657 0.08907 0.00000
# ASYMMETRIC
proc2 <- list()
index <- 0
for (i in 1:4) {
for (j in 1:4) {
index <- index + 1
proc2[[index]] <- procrustes(X = dimred[[i]], Y = dimred[[j]], symmetric = FALSE)
}
}
names(proc2) <- outer(names(dimred), names(dimred),
FUN = function(x, y) {
paste(x, y, sep = "_")
}
)
round(
matrix(
sapply(proc2, FUN = function(x) {
x$ss
}),
nrow = 4, byrow = TRUE,
dimnames = list(names(dimred), names(dimred))
),
digits = 0
)
## cmds samm krus tsne
## cmds 0 34108 0 3299411
## samm 34776 0 34774 3647096
## krus 0 34106 0 3299415
## tsne 907 983 907 0
Let’s plot the Procrustes errors between classical and Sammon.
plot(proc$cmds_samm)
plot(proc$cmds_samm, kind = 2)
p <- plot(proc$cmds_samm,
type = "n", to.target = TRUE,
main = "Procrustes Errors: Classical versus Sammon"
)
points(p$heads, col = cols, pch = 1) # equivalent to points(proc$cmds_samm$X, ...)
points(p$points, col = cols, pch = 2) # equivalent to points(proc$cmds_samm$Yrot, ...)
lines(proc$cmds_samm, type = "segments", col = cols, cex = 0.1)
legend("topleft", legend = c("Classical", "Sammon"), pch = c(1, 2))