Multidimensional Scaling

olive <- read.csv("datasets/olive.csv")
set.seed(123)
N <- sample(1:323, size = 50)
D <- sample(324:421, size = 50)
S <- sample(422:572, size = 50)

acidsamp <- olive[c(N, D, S), 3:10]
acidsamp_cmds <- cmdscale(dist(acidsamp), k = 2, eig = TRUE)
acidsamp_cmds
## $points
##            [,1]        [,2]
## 179  427.219756  -44.268611
## 14  -774.637419  120.458029
## 195  686.795336   -2.781312
## 306 1219.105580 -319.686505
## 118  392.154160 -141.274308
## 299 1089.176977 -323.934324
## 229  506.842424 -208.847004
## 244  484.430420  -93.558750
## 322 1075.800125 -296.656537
## 153  435.665447 -161.163374
## 90   364.873190  -98.666963
## 91   270.373979  -67.905024
## 256  389.098466 -139.379592
## 197  590.715813  -77.391031
## 312  751.874960 -292.892171
## 137  382.132528 -188.252377
## 26    26.174153 -202.669378
## 7   -757.133097   55.261494
## 308  431.518991 -163.302791
## 254  593.262950 -106.793674
## 211   51.178747  -77.543614
## 78  -222.090902 -194.968444
## 81   -64.018387 -127.263716
## 43   184.688919 -154.389260
## 143  626.005044  -87.633464
## 32   250.171180 -200.031743
## 109  379.576075  -44.945363
## 263 -609.346168 -236.752863
## 23  -444.979120  -32.644826
## 135  319.258384  -77.549607
## 224  500.535240 -256.941872
## 166  565.084825  -56.622817
## 217  451.501941 -292.752609
## 290 -266.203569  -98.263400
## 69    91.145814 -268.776876
## 72   100.679873 -293.482838
## 76   -23.721782 -175.037384
## 63  -133.840514 -392.492452
## 141  425.268722  -39.766019
## 210  436.856245  -71.892445
## 314 1108.083706 -428.326553
## 277  508.760902 -240.503552
## 41   -17.848661 -123.118838
## 313  858.175142 -317.766400
## 223  415.425845 -241.818721
## 16  -280.489499  -39.546744
## 116  589.029633    1.786511
## 94   610.958898  -68.690775
## 262   43.004955  -87.163607
## 235  527.088039  -21.281688
## 395  127.872631  194.066222
## 409   76.859906  168.962582
## 420  497.127293  222.372172
## 362   75.368960  118.561012
## 354   -2.134092  160.065338
## 404  111.609434  159.096692
## 373  647.883161  261.659441
## 357   67.011263  158.977858
## 327  147.659353  159.167942
## 336  419.885033  209.777621
## 392  128.163358  171.847704
## 348   64.011678  170.837402
## 375  337.282853  241.241274
## 345  100.613568  107.602530
## 355   39.011009  115.056234
## 410  314.950956  238.852086
## 358   38.115945  168.496251
## 363    6.867943  197.035204
## 353  100.919520  122.725660
## 335  361.343696  228.914538
## 417  440.017451  269.418327
## 403   66.696685  240.485578
## 387  -16.896631  223.376897
## 337  409.082452  240.193954
## 394  176.361905  251.712727
## 390  189.868121  109.396541
## 346   21.142985  135.025079
## 360    8.665299  198.485916
## 331   22.491692  115.530435
## 374  452.881875  227.564772
## 415  469.578415  260.461076
## 369   49.638262  103.131606
## 340  486.260582  167.432878
## 385  403.718597  188.627625
## 396   65.174450  243.913559
## 377  301.650065  240.047158
## 405   61.392334  209.216907
## 416  334.150102  246.306944
## 399   83.533119  213.949159
## 411  406.064910  187.143473
## 400   66.172612  145.626919
## 338  349.985982  245.193920
## 347   14.625329  153.238265
## 372  329.812841  172.222394
## 421  474.413587  228.843407
## 366    4.271817  181.695336
## 330   88.429568  131.575296
## 352   89.605056  101.305931
## 380  361.634563  234.305842
## 386    6.976311  229.416961
## 523   51.565147   22.198800
## 556 -502.926749  145.006246
## 426 -696.445922  -71.898040
## 491 -450.229450  -88.482708
## 437 -687.022863  -77.666144
## 445 -649.382348  -96.164977
## 453 -698.222231  -54.916308
## 442 -683.052752  -77.835543
## 476 -428.415193  -49.050783
## 496 -504.351605  -48.313261
## 504 -191.090668 -220.458692
## 460 -700.043563 -160.073474
## 475 -383.715599  -25.376590
## 558 -358.446948  145.048289
## 469 -702.098674  -79.091777
## 498 -392.701630  -61.632515
## 562 -336.176871  -33.046697
## 532 -361.000012  191.043846
## 561 -393.792080   15.565675
## 422 -694.172605  -66.045382
## 451 -719.900903  -53.985054
## 515 -616.517495 -150.547081
## 568 -166.614527 -164.052463
## 509 -612.137568 -164.895668
## 560 -321.719398    5.365754
## 563 -505.647845   11.340701
## 559 -461.572698  115.957008
## 441 -687.209425  -77.248699
## 525 -147.718868  186.502279
## 488 -447.068514  -37.236701
## 514 -633.510846 -145.200719
## 457 -713.809853  -55.987282
## 473 -274.478253 -237.538272
## 529 -282.386756   66.841792
## 443 -702.589075  -59.082174
## 470 -690.464522  -75.236871
## 463 -743.679585  -91.710294
## 480 -297.206173 -117.167680
## 505 -608.078373  -34.725715
## 432 -696.482549  -67.175432
## 564 -382.300160  -13.983427
## 429 -720.687109  -38.687833
## 467 -720.040168  -28.940568
## 506 -517.050842 -113.692796
## 487 -417.946614  -14.370959
## 557 -470.474336  101.678694
## 530 -383.873683  131.185578
## 569 -420.147225   47.227827
## 493 -580.531040 -109.876978
## 465 -739.641050  -93.841396
## 
## $eig
##   [1]  3.279952e+07  4.242974e+06  2.778704e+05  1.157159e+05  8.263281e+04
##   [6]  2.346772e+04  7.582906e+03  5.395512e+03  1.117523e-08  2.013365e-09
##  [11]  1.685660e-09  1.537039e-09  1.417738e-09  1.222937e-09  1.090113e-09
##  [16]  8.290391e-10  7.759173e-10  6.801184e-10  4.864411e-10  4.213739e-10
##  [21]  4.094898e-10  3.572302e-10  3.490804e-10  3.161002e-10  2.792696e-10
##  [26]  2.755511e-10  2.658948e-10  2.557513e-10  2.541600e-10  2.226078e-10
##  [31]  2.146574e-10  2.115814e-10  2.048817e-10  1.520932e-10  1.370202e-10
##  [36]  1.198429e-10  1.166664e-10  1.118213e-10  1.058304e-10  1.023346e-10
##  [41]  8.328555e-11  8.149369e-11  8.148143e-11  7.922675e-11  7.810205e-11
##  [46]  7.803348e-11  6.398549e-11  6.169093e-11  5.817167e-11  5.480546e-11
##  [51]  4.714920e-11  4.696581e-11  4.483958e-11  4.007372e-11  3.580961e-11
##  [56]  3.445920e-11  3.414161e-11  3.388443e-11  3.096850e-11  2.972932e-11
##  [61]  2.630018e-11  2.613223e-11  2.392226e-11  2.136580e-11  1.770987e-11
##  [66]  1.673932e-11  1.658988e-11  1.586081e-11  1.497265e-11  1.413226e-11
##  [71]  1.302525e-11  1.186214e-11  1.184481e-11  1.084021e-11  6.886057e-12
##  [76]  6.452765e-12  6.186574e-12  6.111511e-12  5.337698e-12  4.745252e-12
##  [81]  3.968809e-12  6.198418e-13  2.138177e-13 -2.301836e-13 -2.495897e-12
##  [86] -3.073930e-12 -3.604083e-12 -3.838081e-12 -5.058081e-12 -9.297083e-12
##  [91] -9.304059e-12 -9.889393e-12 -9.914366e-12 -1.108673e-11 -1.191921e-11
##  [96] -1.210580e-11 -1.266052e-11 -1.320901e-11 -1.688944e-11 -1.806698e-11
## [101] -1.817761e-11 -2.168352e-11 -2.244422e-11 -2.338238e-11 -2.869512e-11
## [106] -2.970929e-11 -3.008708e-11 -3.195371e-11 -3.461473e-11 -3.749660e-11
## [111] -3.877996e-11 -4.016813e-11 -4.073082e-11 -4.701789e-11 -5.369347e-11
## [116] -5.388519e-11 -5.511856e-11 -5.904619e-11 -6.182352e-11 -7.704097e-11
## [121] -7.775544e-11 -7.806119e-11 -7.924059e-11 -8.317791e-11 -9.084675e-11
## [126] -9.602659e-11 -1.001562e-10 -1.033909e-10 -1.041977e-10 -1.101353e-10
## [131] -1.257649e-10 -1.372324e-10 -1.426847e-10 -1.736353e-10 -1.784441e-10
## [136] -1.914889e-10 -2.759313e-10 -2.766728e-10 -3.242833e-10 -3.521565e-10
## [141] -3.786000e-10 -4.186406e-10 -4.580609e-10 -4.764192e-10 -8.292115e-10
## [146] -1.015324e-09 -2.095976e-09 -2.224213e-09 -4.286572e-09 -7.156656e-09
## 
## $x
## NULL
## 
## $ac
## [1] 0
## 
## $GOF
## [1] 0.986349 0.986349

We can now plot the observations in the 2 dimensional space.

x <- acidsamp_cmds$points[, 1]
y <- acidsamp_cmds$points[, 2]
cols <- c(rep(1, 50), rep(2, 50), rep(3, 50))

plot(x, y, type = "n", xlab = "", ylab = "", main = "Classical")
text(x, y, olive[c(N, D, S), 1], cex = 1, col = cols)

Manual implementation of MDS

Fix \(x_1\) at the origin and construct a matrix \(B\) such that

\[b_{ij}=-\frac{1}{2}(d_{ij}^2-d_{1i}^2-d_{1j}^2)\]

d <- dist(acidsamp[1:10, ])
d2 <- as.matrix(d^2)
n <- dim(d2)[1]
m <- dim(acidsamp)[2]
B <- matrix(0, nrow = n, ncol = n)
for (i in 1:n) {
  for (j in 1:n) {
    B[i, j] <- -0.5 * (d2[i, j] - d2[i, 1] - d2[j, 1])
  }
}

Now compute the eigenvalues and eigenvectors of \(B\).

res <- eigen(B)
res$values
##  [1] 3.235186e+06 1.117165e+05 8.299727e+03 4.867870e+03 8.660542e+02
##  [6] 5.124428e+02 1.719652e+02 2.866762e+01 4.485425e-11 0.000000e+00

Calculate new observations of the same dimensionality as the original data.

newdata <- matrix(0, nrow = n, ncol = m)
for (i in 1:n) {
  for (j in 1:m) {
    newdata[i, j] <- sqrt(res$values[j]) * res$vectors[i, j]
  }
}

Compare the distance matrices:

sum(dist(newdata))
## [1] 28863.9
sum(dist(acidsamp[1:n, ]))
## [1] 28863.9

Now do it for \(k\) dimensions

Define a function predict for projecting new data into the MDS space.

predict <- function(BMatrix, k) {
  res <- eigen(BMatrix)
  n <- dim(BMatrix)[1]
  prediciton <- matrix(0, nrow = n, ncol = k)
  for (i in 1:n) {
    for (j in 1:k) {
      prediciton[i, j] <- sqrt(res$values[j]) * res$vectors[i, j]
    }
  }
  return(prediciton)
}

Run the function for \(k=2\).

predict(B, 2)
##              [,1]       [,2]
##  [1,]    0.000000    0.00000
##  [2,] 1203.811064 -167.58632
##  [3,] -240.640299   95.22897
##  [4,] -837.895678  -58.55267
##  [5,]    7.297074 -108.74612
##  [6,] -720.584408 -126.36960
##  [7,] -122.823548 -144.38341
##  [8,]  -70.031481  -41.73776
##  [9,] -696.525530  -86.19368
## [10,]  -40.184159 -115.41847

Do the same with the cmdscale function.

cmdscale(dist(acidsamp[1:10, ]), k = 2, eig = TRUE)
## $points
##            [,1]       [,2]
## 179  -148.67770  -78.26797
## 14  -1358.17277   37.96532
## 195    95.20482 -171.87990
## 306   686.30871   10.70529
## 118  -160.21337   27.39087
## 299   566.34888   68.76667
## 229   -31.44253   71.57468
## 244   -80.41337  -37.88076
## 322   543.99687   33.00112
## 153  -112.93953   38.62468
## 
## $eig
##  [1]  3.009384e+06  5.184249e+04  8.147892e+03  3.025409e+03  5.163232e+02
##  [6]  4.298861e+02  1.612138e+02  2.330035e+01  2.820563e-10 -1.557510e-11
## 
## $x
## NULL
## 
## $ac
## [1] 0
## 
## $GOF
## [1] 0.9959968 0.9959968

The results are a bit different, but not complete off. We can for example see that the order of the eigenvalues is the same.

res$values
##  [1] 3.235186e+06 1.117165e+05 8.299727e+03 4.867870e+03 8.660542e+02
##  [6] 5.124428e+02 1.719652e+02 2.866762e+01 4.485425e-11 0.000000e+00

Why is this? We need to realize that fi you take any MDS solution and rotate, translate or reflect it, you will get an equivalent solution.

Procrustes analysis

Procrustes analysis is a technique used to compare two MDS solutions. The goal is to find the best transformation that aligns the two configurations.

library(vegan)
## Loading required package: permute
## Loading required package: lattice
sol1 <- cmdscale(dist(acidsamp[1:10, ]), k = 2, eig = TRUE)
sol2 <- predict(B, 2)
procrustes(sol1$points, sol2, symmetric = TRUE)
## 
## Call:
## procrustes(X = sol1$points, Y = sol2, symmetric = TRUE) 
## 
## Procrustes sum of squares:
## 2.565e-05

The result of a procrustes analysis is the sum of the squared differences between the two configurations. The smaller the value, the better the alignment. We can see that the two configurations are in fact identical.

Sammon Metric Least Squares & Kruskal Non-Metric Scaling

These two methods are available in the MASS package.

library(MASS)
acidsamp_samm <- sammon(dist(acidsamp), k = 2)
## Initial stress        : 0.00184
## stress after  10 iters: 0.00161, magic = 0.018
## stress after  20 iters: 0.00100, magic = 0.500
## stress after  30 iters: 0.00098, magic = 0.500
acidsamp_krus <- isoMDS(dist(acidsamp), k = 2)
## initial  value 1.488856 
## final  value 1.488816 
## converged

t-SNE dimension reduction

t-SNE is a dimension reduction method which uses the idea that the distances between points can be viewed as probabilities. The method tries to minimize the difference between the high-dimensional and low-dimensional probabilities. Similarity between \(x_i\) and \(x_j\) is defined as the probability that \(x_i\) would pick \(x_j\) as its neighbor if neighbors were picked in proportion to their probability density under a Gaussian centered at \(x_i\). If we denote this probability as \(p_{i|j}\), we can see that \(p_{i|j} \neq p_{j|i}\). That is why

\[p_{ij} = \frac{p_{i|j} + p_{j|i}}{2}\]

is used in practice.

The method has a parameter called perplexity which can be viewed as the number of neighbors that should be considered in the high-dimensional space. The method is sensitive to this parameter and it should be fine tuned. Usually the perplexity should be between 5 and 50.

library(Rtsne)

set.seed(123)
acidsamp_tsne <- Rtsne(acidsamp, perplexity = 30)
plot(acidsamp_tsne$Y, col = cols, pch = cols, xlab = "tSNE1", ylab = "tSNE2")
legend("topleft",
  legend = unique(olive[c(N, D, S), 1]),
  col = 1:3, pch = 1:3
)

Compare results

Compare the results of the 4 different methods in a plot.

par(mfrow = c(2, 2))
plot(acidsamp_cmds$points, col = cols, main = "Classical")
plot(acidsamp_samm$points, col = cols, main = "Sammon")
plot(acidsamp_krus$points, col = cols, main = "Kruskal")
plot(acidsamp_tsne$Y, col = cols, main = "t-SNE")

Apply Procrustes

Given a target matrix X and another matrix Y, procrustes will attempt to find the optimal scaling, rotation, and translation of Y to make it as similar to X as possible. If symmetric = TRUE, then procrustes will find the optimal transformation of \(\frac{Y}{\sqrt{Y^TY}}\) with respect to the target matrix, \(\frac{X}{\sqrt{X^TX}}\).

dimred <- list(
  cmds = acidsamp_cmds$points,
  samm = acidsamp_samm$points,
  krus = acidsamp_krus$points,
  tsne = acidsamp_tsne$Y
)

# SYMMETRIC
proc <- list()
index <- 0
for (i in 1:4) {
  for (j in 1:4) {
    index <- index + 1
    proc[[index]] <- procrustes(X = dimred[[i]], Y = dimred[[j]], symmetric = TRUE)
  }
}
names(proc) <- outer(names(dimred), names(dimred),
  FUN = function(x, y) {
    paste(x, y, sep = "_")
  }
)
round(
  matrix(
    sapply(proc, FUN = function(x) {
      x$ss
    }),
    nrow = 4, byrow = TRUE,
    dimnames = list(names(dimred), names(dimred))
  ),
  digits = 5
)
##         cmds    samm    krus    tsne
## cmds 0.00000 0.00092 0.00000 0.08907
## samm 0.00092 0.00000 0.00092 0.09657
## krus 0.00000 0.00092 0.00000 0.08907
## tsne 0.08907 0.09657 0.08907 0.00000
# ASYMMETRIC
proc2 <- list()
index <- 0
for (i in 1:4) {
  for (j in 1:4) {
    index <- index + 1
    proc2[[index]] <- procrustes(X = dimred[[i]], Y = dimred[[j]], symmetric = FALSE)
  }
}
names(proc2) <- outer(names(dimred), names(dimred),
  FUN = function(x, y) {
    paste(x, y, sep = "_")
  }
)
round(
  matrix(
    sapply(proc2, FUN = function(x) {
      x$ss
    }),
    nrow = 4, byrow = TRUE,
    dimnames = list(names(dimred), names(dimred))
  ),
  digits = 0
)
##       cmds  samm  krus    tsne
## cmds     0 34108     0 3299411
## samm 34776     0 34774 3647096
## krus     0 34106     0 3299415
## tsne   907   983   907       0

Visualizing Procrustes

Let’s plot the Procrustes errors between classical and Sammon.

plot(proc$cmds_samm)

plot(proc$cmds_samm, kind = 2)

p <- plot(proc$cmds_samm,
  type = "n", to.target = TRUE,
  main = "Procrustes Errors: Classical versus Sammon"
)
points(p$heads, col = cols, pch = 1) # equivalent to points(proc$cmds_samm$X, ...)
points(p$points, col = cols, pch = 2) # equivalent to points(proc$cmds_samm$Yrot, ...)
lines(proc$cmds_samm, type = "segments", col = cols, cex = 0.1)
legend("topleft", legend = c("Classical", "Sammon"), pch = c(1, 2))