Data manipulation

Load data

music <- read.csv("datasets/music.csv")
# str shows variables on rows
str(music)
## 'data.frame':    62 obs. of  8 variables:
##  $ X     : chr  "Dancing Queen" "Knowing Me" "Take a Chance" "Mamma Mia" ...
##  $ Artist: chr  "Abba" "Abba" "Abba" "Abba" ...
##  $ Type  : chr  "Rock" "Rock" "Rock" "Rock" ...
##  $ LVar  : num  17600756 9543021 9049482 7557437 6282286 ...
##  $ LAve  : num  -90 -75.8 -98.1 -90.5 -89 ...
##  $ LMax  : int  29921 27626 26372 28898 27940 25531 14699 8928 22962 15517 ...
##  $ LFEner: num  106 103 102 102 100 ...
##  $ LFreq : num  59.6 58.5 124.6 48.8 74 ...
# head shows the first few observations in a table
head(music)
##               X Artist Type     LVar      LAve  LMax   LFEner     LFreq
## 1 Dancing Queen   Abba Rock 17600756 -90.00687 29921 105.9210  59.57379
## 2    Knowing Me   Abba Rock  9543021 -75.76672 27626 102.8362  58.48031
## 3 Take a Chance   Abba Rock  9049482 -98.06292 26372 102.3249 124.59397
## 4     Mamma Mia   Abba Rock  7557437 -90.47106 28898 101.6165  48.76513
## 5   Lay All You   Abba Rock  6282286 -88.95263 27940 100.3008  74.02039
## 6 Super Trouper   Abba Rock  4665867 -69.02084 25531 100.2485  81.40140

Beware! In R we index from 1, not 0.

music[1, 1]
## [1] "Dancing Queen"

Splicing works as well:

music[c(1, 2, 3), 1]
## [1] "Dancing Queen" "Knowing Me"    "Take a Chance"
music[1:3, 1]
## [1] "Dancing Queen" "Knowing Me"    "Take a Chance"
music[1, ] # first ROW
##               X Artist Type     LVar      LAve  LMax  LFEner    LFreq
## 1 Dancing Queen   Abba Rock 17600756 -90.00687 29921 105.921 59.57379
music[, 2] # second COLUMN
##  [1] "Abba"      "Abba"      "Abba"      "Abba"      "Abba"      "Abba"     
##  [7] "Abba"      "Abba"      "Abba"      "Abba"      "Vivaldi"   "Vivaldi"  
## [13] "Vivaldi"   "Vivaldi"   "Vivaldi"   "Vivaldi"   "Vivaldi"   "Vivaldi"  
## [19] "Vivaldi"   "Vivaldi"   "Mozart"    "Mozart"    "Mozart"    "Mozart"   
## [25] "Mozart"    "Mozart"    "Eels"      "Eels"      "Eels"      "Eels"     
## [31] "Eels"      "Eels"      "Eels"      "Eels"      "Eels"      "Eels"     
## [37] "Beatles"   "Beatles"   "Beatles"   "Beatles"   "Beatles"   "Beatles"  
## [43] "Beatles"   "Beatles"   "Beatles"   "Beatles"   "Beethoven" "Beethoven"
## [49] "Beethoven" "Beethoven" "Beethoven" "Beethoven" "Beethoven" "Beethoven"
## [55] "Enya"      "Enya"      "Enya"      NA          NA          NA         
## [61] NA          NA

Can also index using col names

colnames(music)
## [1] "X"      "Artist" "Type"   "LVar"   "LAve"   "LMax"   "LFEner" "LFreq"
music$Type
##  [1] "Rock"      "Rock"      "Rock"      "Rock"      "Rock"      "Rock"     
##  [7] "Rock"      "Rock"      "Rock"      "Rock"      "Classical" "Classical"
## [13] "Classical" "Classical" "Classical" "Classical" "Classical" "Classical"
## [19] "Classical" "Classical" "Classical" "Classical" "Classical" "Classical"
## [25] "Classical" "Classical" "Rock"      "Rock"      "Rock"      "Rock"     
## [31] "Rock"      "Rock"      "Rock"      "Rock"      "Rock"      "Rock"     
## [37] "Rock"      "Rock"      "Rock"      "Rock"      "Rock"      "Rock"     
## [43] "Rock"      "Rock"      "Rock"      "Rock"      "Classical" "Classical"
## [49] "Classical" "Classical" "Classical" "Classical" "Classical" "Classical"
## [55] "New wave"  "New wave"  "New wave"  NA          NA          NA         
## [61] NA          NA

All music$Type[1:10], music[1:10,"Type"], and music[1:10,3] give the first 10 entries of the 3rd column.

How to get the last row

music[nrow(music), ]
##    X Artist Type    LVar      LAve  LMax   LFEner    LFreq
## 62 5   <NA> <NA> 8651854 -6.132241 18509 83.88195 219.5377
music[-(1:nrow(music) - 1), ] # the minus sign means "not" = everything except the first nrow(music)-1 rows
##    X Artist Type    LVar      LAve  LMax   LFEner    LFreq
## 62 5   <NA> <NA> 8651854 -6.132241 18509 83.88195 219.5377

Exercise

music[c(1, 3, 4), 2:5]
##   Artist Type     LVar      LAve
## 1   Abba Rock 17600756 -90.00687
## 3   Abba Rock  9049482 -98.06292
## 4   Abba Rock  7557437 -90.47106
music[, 4:8]
##           LVar        LAve  LMax    LFEner     LFreq
## 1   17600755.6 -90.0068667 29921 105.92095  59.57379
## 2    9543020.9 -75.7667187 27626 102.83616  58.48031
## 3    9049481.5 -98.0629244 26372 102.32488 124.59397
## 4    7557437.3 -90.4710616 28898 101.61648  48.76513
## 5    6282285.6 -88.9526309 27940 100.30076  74.02039
## 6    4665866.7 -69.0208450 25531 100.24848  81.40140
## 7    3369670.3 -71.6828788 14699 104.59686 305.18689
## 8    1135862.0 -67.8190467  8928 104.34921 277.66056
## 9    6146942.6 -76.2807500 22962 102.24066 165.15799
## 10   3482881.8 -74.1300038 15517 104.36243 146.73700
## 11   3677296.2  66.6530970 24229  99.25243 329.53792
## 12    771491.8  21.6560941  6936 104.36737 843.83240
## 13   5227573.2  88.6465556 17721 104.61255 165.76781
## 14    334719.1  13.8318847  4123 104.35005 293.99972
## 15    836904.9  34.5677377 17306  88.82821 198.38305
## 16  13936436.7 216.2317586 30355 104.82354 198.46716
## 17   3636324.3   9.8366363 21450 100.52727 877.77243
## 18    295397.4   2.8143227  2985 108.28717  58.41722
## 19   4335879.0  10.9015767 22271 101.32881 176.53441
## 20    472630.0   3.8890862  8194  98.83731 526.04942
## 21   2819795.0  -5.8667602 14939 102.25358 342.26017
## 22   2836957.9  -5.6074580 13382 102.04646 511.85517
## 23   9089372.1  -5.9719205 25265 102.23796 429.27618
## 24   4056229.7  -6.2272904 21328  97.59887 343.75319
## 25   1568925.6  -6.1993790 15839  99.47580 288.44819
## 26   7758409.1  -5.7700183 22496 101.66942 459.24182
## 27  40275677.4 -10.6893416 32759 106.07617  65.48281
## 28 129472199.3  50.2115773 32759 114.00229  41.40515
## 29  18838849.0  -0.0222764 30386 105.45611 165.24210
## 30  43201194.3   1.8926897 32759 108.37835 174.64185
## 31  88547131.0   0.3358761 32744 112.00916 392.28702
## 32  16285811.4  -0.1405876 30106 103.94171 312.06322
## 33  54651552.8   1.9848514 32759 108.87503 312.37864
## 34  12322434.4   1.0259790 23221 103.29906 185.59771
## 35  63878899.8   3.3592505 32759 109.53291  66.13469
## 36  43668186.9  -2.0125667 32759 108.35559  98.83404
## 37  28806811.2  -5.7452189 24159 109.95808 126.50757
## 38  61257693.6  -6.0340682 28502 111.81813 294.67263
## 39  76729438.0  -5.9583971 30102 112.71792 100.20089
## 40  52497242.7  -5.7314591 29911 110.66333 110.39972
## 41  68104547.0  -6.1449114 30415 111.59957 107.30853
## 42  52569372.5  -5.7166301 32318 110.34618 137.10594
## 43  23080907.6  -6.0298337 28169 107.30541 173.50631
## 44  39908667.5  -6.2616915 29061 109.47442  91.49508
## 45  18819753.2  -6.1265193 21680 108.65894 164.92667
## 46  58614798.7  -5.9971242 31131 111.18901  92.46240
## 47   8368952.9  -0.9538330 26645 101.74095 354.12025
## 48    293608.3  -0.1247094  4554 103.26160 316.03761
## 49   8051764.6  -0.3316964 24194 101.20132 397.18666
## 50  23493873.6  -0.9411538 32766 106.18220 529.26679
## 51   1640232.8   1.3899979 20877  94.59029 445.00551
## 52    343973.1  -2.4748955  9225  93.38874 331.68283
## 53   3644784.2  -1.0426907 24633  97.86394 181.01349
## 54  15030950.3  -1.4394652 26066 106.39731 249.08280
## 55   1135493.1 -10.6183398  9994 102.16132 155.06430
## 56  12230252.2 -17.8372700 24968 105.75748  79.31957
## 57   1723627.9  -6.8327065 13227 101.86845  49.01748
## 58  24898675.9 -93.9961871 29830 107.73299 146.04306
## 59   1879989.2  12.7213373  8601 105.81750  58.83780
## 60    737349.6   5.7190022  7089 102.92123 175.94562
## 61   2865979.9  21.4467629 17282 102.11314  61.44533
## 62   8651854.1  -6.1322408 18509  83.88195 219.53773

Plotting

X1 <- sample(-1000:1000, size = 200, replace = FALSE)
X1 <- sort(X1)
Y1 <- X1^2 + 3 * X1 + 1
plot(x = X1, y = Y1, type = "l", col = "red", main = "Plot of Y1 vs X1")

We can also use the ggplot2 package to create a variety of aesthetically pleasing plots. ggplot2 requires the data to be in the data.frame format. These plots are constructed via layers as seen below, where the ggplot function establishes basic details such as providing the data.frame and identifying the x and y variables, then geom_line adds the curve, and labs adds the title. This layering approach and the range of components which are available allows us to build up more complicated plots by combining different layers. Some nice examples of these plots can be seen at https://r-graph-gallery.com/, such as the final plot from https://r-graph-gallery.com/web-scatterplot-and-ggrepel.html, and more information about ggplot2 can be found at https://ggplot2.tidyverse.org/.

library(ggplot2)

df <- data.frame(X1, Y1)
gg <- ggplot(data = df, mapping = aes(x = X1, y = Y1)) +
  geom_line(colour = "red") +
  labs(title = "Plot of Y1 vs X1")
gg

Summary statistics

R has builtin functions mean, sd, var, cov and cor

The music dataset has numeric variables 4:8. Get the covariance matrix of these variables.

C <- cov(music[, 4:8])

Scale the matrix to get the corelation matrix

R <- matrix(0, nrow = dim(C)[1], ncol = dim(C)[2])
for (i in 1:dim(C)[1]) {
  for (j in 1:dim(C)[2]) {
    R[i, j] <- C[i, j] / sqrt(C[i, i] * C[j, j])
  }
}
print(R)
##            [,1]        [,2]       [,3]        [,4]       [,5]
## [1,]  1.0000000  0.11168505  0.6426304  0.69611717 -0.2609936
## [2,]  0.1116850  1.00000000 -0.0358804  0.01790643  0.1843789
## [3,]  0.6426304 -0.03588040  1.0000000  0.41376440 -0.2663298
## [4,]  0.6961172  0.01790643  0.4137644  1.00000000 -0.2682983
## [5,] -0.2609936  0.18437887 -0.2663298 -0.26829833  1.0000000
cor(music[, 4:8])
##              LVar        LAve       LMax      LFEner      LFreq
## LVar    1.0000000  0.11168505  0.6426304  0.69611717 -0.2609936
## LAve    0.1116850  1.00000000 -0.0358804  0.01790643  0.1843789
## LMax    0.6426304 -0.03588040  1.0000000  0.41376440 -0.2663298
## LFEner  0.6961172  0.01790643  0.4137644  1.00000000 -0.2682983
## LFreq  -0.2609936  0.18437887 -0.2663298 -0.26829833  1.0000000

Plot LVar and LMax against each other.

plot(music$LVar, music$LMax, xlab = "LVar", ylab = "LMax", main = "LMax vs LVar")

Scale the data and plot again.

music_scaled <- scale(music[, 4:8])
print(cov(music_scaled))
##              LVar        LAve       LMax      LFEner      LFreq
## LVar    1.0000000  0.11168505  0.6426304  0.69611717 -0.2609936
## LAve    0.1116850  1.00000000 -0.0358804  0.01790643  0.1843789
## LMax    0.6426304 -0.03588040  1.0000000  0.41376440 -0.2663298
## LFEner  0.6961172  0.01790643  0.4137644  1.00000000 -0.2682983
## LFreq  -0.2609936  0.18437887 -0.2663298 -0.26829833  1.0000000
plot(music_scaled[, 1], music_scaled[, 3], xlab = "LVar", ylab = "LMax", main = "LMax vs LVar")

Basic Linear Algebra

The operator for matrix multiplication is %*%. The operator for element-wise multiplication is *.

a <- c(1, 2, 3)
b <- c(4, 5, 6)
# element-wise multiplication
print(a * b)
## [1]  4 10 18
# dot product
print(sum(a * b))
## [1] 32
# or using matrix multiplication
print(a %*% b)
##      [,1]
## [1,]   32
# transpose vector
print(t(b))
##      [,1] [,2] [,3]
## [1,]    4    5    6
print(a %*% t(b))
##      [,1] [,2] [,3]
## [1,]    4    5    6
## [2,]    8   10   12
## [3,]   12   15   18
# matrix multiplication
A <- matrix(c(1, 2, 3, 4, 5, 6), nrow = 2)
print(A)
##      [,1] [,2] [,3]
## [1,]    1    3    5
## [2,]    2    4    6
B <- matrix(c(1, 2, 3, 4, 5, 6), nrow = 3)
print(B)
##      [,1] [,2]
## [1,]    1    4
## [2,]    2    5
## [3,]    3    6
print(A %*% B)
##      [,1] [,2]
## [1,]   22   49
## [2,]   28   64

Matrix Eigendecomposition

Use builtin eigen function. It returns a list with two components: values and vectors. Eigenvalues are in decreasing order and all eigenvectors are normalized to have length 1.

res <- eigen(R)
print(res)
## eigen() decomposition
## $values
## [1] 2.3336465 1.1435388 0.7156333 0.5828106 0.2243708
## 
## $vectors
##              [,1]        [,2]        [,3]        [,4]        [,5]
## [1,] -0.588904040 0.196723647 -0.12513976 -0.04870916  0.77230777
## [2,]  0.002332504 0.827745489  0.53255700  0.13474432 -0.11427589
## [3,] -0.513915382 0.002231933 -0.26614599  0.71598368 -0.39040964
## [4,] -0.532235892 0.099177521 -0.08449907 -0.67965669 -0.48766326
## [5,]  0.325273003 0.516038806 -0.78914467 -0.07003955 -0.01570279

We can access the eigenvectors and eigenvalues as follows:

res$values
## [1] 2.3336465 1.1435388 0.7156333 0.5828106 0.2243708
res$vectors
##              [,1]        [,2]        [,3]        [,4]        [,5]
## [1,] -0.588904040 0.196723647 -0.12513976 -0.04870916  0.77230777
## [2,]  0.002332504 0.827745489  0.53255700  0.13474432 -0.11427589
## [3,] -0.513915382 0.002231933 -0.26614599  0.71598368 -0.39040964
## [4,] -0.532235892 0.099177521 -0.08449907 -0.67965669 -0.48766326
## [5,]  0.325273003 0.516038806 -0.78914467 -0.07003955 -0.01570279

Check that the eigenvectors are normalized.

for (i in 1:dim(res$vectors)[1]) {
  v <- res$vectors[, i]
  print(paste("Norm of vector", i, "is", sqrt(sum(v * v))))
}
## [1] "Norm of vector 1 is 1"
## [1] "Norm of vector 2 is 1"
## [1] "Norm of vector 3 is 1"
## [1] "Norm of vector 4 is 1"
## [1] "Norm of vector 5 is 1"

Breaks down if A is not diagonalizable.

A <- matrix(c(3, 1, 2, 0, 3, 4, 0, 0, 3), nrow = 3)
eigen(A)
## eigen() decomposition
## $values
## [1] 3 3 3
## 
## $vectors
##      [,1]          [,2]          [,3]
## [1,]    0  0.000000e+00  1.109336e-31
## [2,]    0  1.665335e-16 -1.665335e-16
## [3,]    1 -1.000000e+00  1.000000e+00

Let’s check that they are indeed eigenvectors and eigenvalues.

R %*% res$vectors
##             [,1]        [,2]        [,3]        [,4]         [,5]
## [1,] -1.37429386 0.224961119 -0.08955417 -0.02838821  0.173283336
## [2,]  0.00544324 0.946559063  0.38111550  0.07853042 -0.025640175
## [3,] -1.19929684 0.002552302 -0.19046292  0.41728289 -0.087596535
## [4,] -1.24205043 0.113413341 -0.06047035 -0.39611114 -0.109417411
## [5,]  0.75907221 0.590110384 -0.56473818 -0.04081979 -0.003523249
res$vectors * res$values
##              [,1]        [,2]        [,3]        [,4]         [,5]
## [1,] -1.374293862 0.459083454 -0.29203196 -0.11366996  1.802293345
## [2,]  0.002667309 0.946559063  0.60899957  0.15408536 -0.130678906
## [3,] -0.367774943 0.001597246 -0.19046292  0.51238173 -0.279390126
## [4,] -0.310192727 0.057801712 -0.04924696 -0.39611114 -0.284215326
## [5,]  0.072981773 0.115784055 -0.17706104 -0.01571483 -0.003523249

Exercise

  • Write a function for returning the largest eigenvalue of a matrix.
largest_eigenvalue <- function(A) {
  res <- eigen(A)
  return(res$values[1])
}

largest_eigenvalue(R)
## [1] 2.333647
  • check that the eigenvectors are indeed eigenvectors and the eigenvalues are indeed eigenvalues using diagonalization.

Because the covariance matrix is symmetric, it can be diagonalized as \(R = PDP^T\), where \(P\) is the matrix of eigenvectors and \(D\) is the diagonal matrix of eigenvalues.

print(R)
##            [,1]        [,2]       [,3]        [,4]       [,5]
## [1,]  1.0000000  0.11168505  0.6426304  0.69611717 -0.2609936
## [2,]  0.1116850  1.00000000 -0.0358804  0.01790643  0.1843789
## [3,]  0.6426304 -0.03588040  1.0000000  0.41376440 -0.2663298
## [4,]  0.6961172  0.01790643  0.4137644  1.00000000 -0.2682983
## [5,] -0.2609936  0.18437887 -0.2663298 -0.26829833  1.0000000
print(diag(res$values))
##          [,1]     [,2]      [,3]      [,4]      [,5]
## [1,] 2.333647 0.000000 0.0000000 0.0000000 0.0000000
## [2,] 0.000000 1.143539 0.0000000 0.0000000 0.0000000
## [3,] 0.000000 0.000000 0.7156333 0.0000000 0.0000000
## [4,] 0.000000 0.000000 0.0000000 0.5828106 0.0000000
## [5,] 0.000000 0.000000 0.0000000 0.0000000 0.2243708
print(res$vectors %*% diag(res$values) %*% t(res$vectors))
##            [,1]        [,2]       [,3]        [,4]       [,5]
## [1,]  1.0000000  0.11168505  0.6426304  0.69611717 -0.2609936
## [2,]  0.1116850  1.00000000 -0.0358804  0.01790643  0.1843789
## [3,]  0.6426304 -0.03588040  1.0000000  0.41376440 -0.2663298
## [4,]  0.6961172  0.01790643  0.4137644  1.00000000 -0.2682983
## [5,] -0.2609936  0.18437887 -0.2663298 -0.26829833  1.0000000