Load data
music <- read.csv("datasets/music.csv")
# str shows variables on rows
str(music)
## 'data.frame': 62 obs. of 8 variables:
## $ X : chr "Dancing Queen" "Knowing Me" "Take a Chance" "Mamma Mia" ...
## $ Artist: chr "Abba" "Abba" "Abba" "Abba" ...
## $ Type : chr "Rock" "Rock" "Rock" "Rock" ...
## $ LVar : num 17600756 9543021 9049482 7557437 6282286 ...
## $ LAve : num -90 -75.8 -98.1 -90.5 -89 ...
## $ LMax : int 29921 27626 26372 28898 27940 25531 14699 8928 22962 15517 ...
## $ LFEner: num 106 103 102 102 100 ...
## $ LFreq : num 59.6 58.5 124.6 48.8 74 ...
# head shows the first few observations in a table
head(music)
## X Artist Type LVar LAve LMax LFEner LFreq
## 1 Dancing Queen Abba Rock 17600756 -90.00687 29921 105.9210 59.57379
## 2 Knowing Me Abba Rock 9543021 -75.76672 27626 102.8362 58.48031
## 3 Take a Chance Abba Rock 9049482 -98.06292 26372 102.3249 124.59397
## 4 Mamma Mia Abba Rock 7557437 -90.47106 28898 101.6165 48.76513
## 5 Lay All You Abba Rock 6282286 -88.95263 27940 100.3008 74.02039
## 6 Super Trouper Abba Rock 4665867 -69.02084 25531 100.2485 81.40140
Beware! In R we index from 1, not 0.
music[1, 1]
## [1] "Dancing Queen"
Splicing works as well:
music[c(1, 2, 3), 1]
## [1] "Dancing Queen" "Knowing Me" "Take a Chance"
music[1:3, 1]
## [1] "Dancing Queen" "Knowing Me" "Take a Chance"
music[1, ] # first ROW
## X Artist Type LVar LAve LMax LFEner LFreq
## 1 Dancing Queen Abba Rock 17600756 -90.00687 29921 105.921 59.57379
music[, 2] # second COLUMN
## [1] "Abba" "Abba" "Abba" "Abba" "Abba" "Abba"
## [7] "Abba" "Abba" "Abba" "Abba" "Vivaldi" "Vivaldi"
## [13] "Vivaldi" "Vivaldi" "Vivaldi" "Vivaldi" "Vivaldi" "Vivaldi"
## [19] "Vivaldi" "Vivaldi" "Mozart" "Mozart" "Mozart" "Mozart"
## [25] "Mozart" "Mozart" "Eels" "Eels" "Eels" "Eels"
## [31] "Eels" "Eels" "Eels" "Eels" "Eels" "Eels"
## [37] "Beatles" "Beatles" "Beatles" "Beatles" "Beatles" "Beatles"
## [43] "Beatles" "Beatles" "Beatles" "Beatles" "Beethoven" "Beethoven"
## [49] "Beethoven" "Beethoven" "Beethoven" "Beethoven" "Beethoven" "Beethoven"
## [55] "Enya" "Enya" "Enya" NA NA NA
## [61] NA NA
Can also index using col names
colnames(music)
## [1] "X" "Artist" "Type" "LVar" "LAve" "LMax" "LFEner" "LFreq"
music$Type
## [1] "Rock" "Rock" "Rock" "Rock" "Rock" "Rock"
## [7] "Rock" "Rock" "Rock" "Rock" "Classical" "Classical"
## [13] "Classical" "Classical" "Classical" "Classical" "Classical" "Classical"
## [19] "Classical" "Classical" "Classical" "Classical" "Classical" "Classical"
## [25] "Classical" "Classical" "Rock" "Rock" "Rock" "Rock"
## [31] "Rock" "Rock" "Rock" "Rock" "Rock" "Rock"
## [37] "Rock" "Rock" "Rock" "Rock" "Rock" "Rock"
## [43] "Rock" "Rock" "Rock" "Rock" "Classical" "Classical"
## [49] "Classical" "Classical" "Classical" "Classical" "Classical" "Classical"
## [55] "New wave" "New wave" "New wave" NA NA NA
## [61] NA NA
All music$Type[1:10]
, music[1:10,"Type"]
,
and music[1:10,3]
give the first 10 entries of the 3rd
column.
How to get the last row
music[nrow(music), ]
## X Artist Type LVar LAve LMax LFEner LFreq
## 62 5 <NA> <NA> 8651854 -6.132241 18509 83.88195 219.5377
music[-(1:nrow(music) - 1), ] # the minus sign means "not" = everything except the first nrow(music)-1 rows
## X Artist Type LVar LAve LMax LFEner LFreq
## 62 5 <NA> <NA> 8651854 -6.132241 18509 83.88195 219.5377
music[c(1, 3, 4), 2:5]
## Artist Type LVar LAve
## 1 Abba Rock 17600756 -90.00687
## 3 Abba Rock 9049482 -98.06292
## 4 Abba Rock 7557437 -90.47106
music[, 4:8]
## LVar LAve LMax LFEner LFreq
## 1 17600755.6 -90.0068667 29921 105.92095 59.57379
## 2 9543020.9 -75.7667187 27626 102.83616 58.48031
## 3 9049481.5 -98.0629244 26372 102.32488 124.59397
## 4 7557437.3 -90.4710616 28898 101.61648 48.76513
## 5 6282285.6 -88.9526309 27940 100.30076 74.02039
## 6 4665866.7 -69.0208450 25531 100.24848 81.40140
## 7 3369670.3 -71.6828788 14699 104.59686 305.18689
## 8 1135862.0 -67.8190467 8928 104.34921 277.66056
## 9 6146942.6 -76.2807500 22962 102.24066 165.15799
## 10 3482881.8 -74.1300038 15517 104.36243 146.73700
## 11 3677296.2 66.6530970 24229 99.25243 329.53792
## 12 771491.8 21.6560941 6936 104.36737 843.83240
## 13 5227573.2 88.6465556 17721 104.61255 165.76781
## 14 334719.1 13.8318847 4123 104.35005 293.99972
## 15 836904.9 34.5677377 17306 88.82821 198.38305
## 16 13936436.7 216.2317586 30355 104.82354 198.46716
## 17 3636324.3 9.8366363 21450 100.52727 877.77243
## 18 295397.4 2.8143227 2985 108.28717 58.41722
## 19 4335879.0 10.9015767 22271 101.32881 176.53441
## 20 472630.0 3.8890862 8194 98.83731 526.04942
## 21 2819795.0 -5.8667602 14939 102.25358 342.26017
## 22 2836957.9 -5.6074580 13382 102.04646 511.85517
## 23 9089372.1 -5.9719205 25265 102.23796 429.27618
## 24 4056229.7 -6.2272904 21328 97.59887 343.75319
## 25 1568925.6 -6.1993790 15839 99.47580 288.44819
## 26 7758409.1 -5.7700183 22496 101.66942 459.24182
## 27 40275677.4 -10.6893416 32759 106.07617 65.48281
## 28 129472199.3 50.2115773 32759 114.00229 41.40515
## 29 18838849.0 -0.0222764 30386 105.45611 165.24210
## 30 43201194.3 1.8926897 32759 108.37835 174.64185
## 31 88547131.0 0.3358761 32744 112.00916 392.28702
## 32 16285811.4 -0.1405876 30106 103.94171 312.06322
## 33 54651552.8 1.9848514 32759 108.87503 312.37864
## 34 12322434.4 1.0259790 23221 103.29906 185.59771
## 35 63878899.8 3.3592505 32759 109.53291 66.13469
## 36 43668186.9 -2.0125667 32759 108.35559 98.83404
## 37 28806811.2 -5.7452189 24159 109.95808 126.50757
## 38 61257693.6 -6.0340682 28502 111.81813 294.67263
## 39 76729438.0 -5.9583971 30102 112.71792 100.20089
## 40 52497242.7 -5.7314591 29911 110.66333 110.39972
## 41 68104547.0 -6.1449114 30415 111.59957 107.30853
## 42 52569372.5 -5.7166301 32318 110.34618 137.10594
## 43 23080907.6 -6.0298337 28169 107.30541 173.50631
## 44 39908667.5 -6.2616915 29061 109.47442 91.49508
## 45 18819753.2 -6.1265193 21680 108.65894 164.92667
## 46 58614798.7 -5.9971242 31131 111.18901 92.46240
## 47 8368952.9 -0.9538330 26645 101.74095 354.12025
## 48 293608.3 -0.1247094 4554 103.26160 316.03761
## 49 8051764.6 -0.3316964 24194 101.20132 397.18666
## 50 23493873.6 -0.9411538 32766 106.18220 529.26679
## 51 1640232.8 1.3899979 20877 94.59029 445.00551
## 52 343973.1 -2.4748955 9225 93.38874 331.68283
## 53 3644784.2 -1.0426907 24633 97.86394 181.01349
## 54 15030950.3 -1.4394652 26066 106.39731 249.08280
## 55 1135493.1 -10.6183398 9994 102.16132 155.06430
## 56 12230252.2 -17.8372700 24968 105.75748 79.31957
## 57 1723627.9 -6.8327065 13227 101.86845 49.01748
## 58 24898675.9 -93.9961871 29830 107.73299 146.04306
## 59 1879989.2 12.7213373 8601 105.81750 58.83780
## 60 737349.6 5.7190022 7089 102.92123 175.94562
## 61 2865979.9 21.4467629 17282 102.11314 61.44533
## 62 8651854.1 -6.1322408 18509 83.88195 219.53773
X1 <- sample(-1000:1000, size = 200, replace = FALSE)
X1 <- sort(X1)
Y1 <- X1^2 + 3 * X1 + 1
plot(x = X1, y = Y1, type = "l", col = "red", main = "Plot of Y1 vs X1")
We can also use the ggplot2
package to create a variety
of aesthetically pleasing plots. ggplot2
requires the data
to be in the data.frame
format. These plots are constructed
via layers as seen below, where the ggplot
function
establishes basic details such as providing the data.frame
and identifying the x and y variables, then geom_line
adds
the curve, and labs
adds the title. This layering approach
and the range of components which are available allows us to build up
more complicated plots by combining different layers. Some nice examples
of these plots can be seen at https://r-graph-gallery.com/, such as the final plot
from https://r-graph-gallery.com/web-scatterplot-and-ggrepel.html,
and more information about ggplot2
can be found at https://ggplot2.tidyverse.org/.
library(ggplot2)
df <- data.frame(X1, Y1)
gg <- ggplot(data = df, mapping = aes(x = X1, y = Y1)) +
geom_line(colour = "red") +
labs(title = "Plot of Y1 vs X1")
gg
R
has builtin functions mean
,
sd
, var
, cov
and
cor
The music
dataset has numeric variables 4:8. Get the
covariance matrix of these variables.
C <- cov(music[, 4:8])
Scale the matrix to get the corelation matrix
R <- matrix(0, nrow = dim(C)[1], ncol = dim(C)[2])
for (i in 1:dim(C)[1]) {
for (j in 1:dim(C)[2]) {
R[i, j] <- C[i, j] / sqrt(C[i, i] * C[j, j])
}
}
print(R)
## [,1] [,2] [,3] [,4] [,5]
## [1,] 1.0000000 0.11168505 0.6426304 0.69611717 -0.2609936
## [2,] 0.1116850 1.00000000 -0.0358804 0.01790643 0.1843789
## [3,] 0.6426304 -0.03588040 1.0000000 0.41376440 -0.2663298
## [4,] 0.6961172 0.01790643 0.4137644 1.00000000 -0.2682983
## [5,] -0.2609936 0.18437887 -0.2663298 -0.26829833 1.0000000
cor(music[, 4:8])
## LVar LAve LMax LFEner LFreq
## LVar 1.0000000 0.11168505 0.6426304 0.69611717 -0.2609936
## LAve 0.1116850 1.00000000 -0.0358804 0.01790643 0.1843789
## LMax 0.6426304 -0.03588040 1.0000000 0.41376440 -0.2663298
## LFEner 0.6961172 0.01790643 0.4137644 1.00000000 -0.2682983
## LFreq -0.2609936 0.18437887 -0.2663298 -0.26829833 1.0000000
Plot LVar and LMax against each other.
plot(music$LVar, music$LMax, xlab = "LVar", ylab = "LMax", main = "LMax vs LVar")
Scale the data and plot again.
music_scaled <- scale(music[, 4:8])
print(cov(music_scaled))
## LVar LAve LMax LFEner LFreq
## LVar 1.0000000 0.11168505 0.6426304 0.69611717 -0.2609936
## LAve 0.1116850 1.00000000 -0.0358804 0.01790643 0.1843789
## LMax 0.6426304 -0.03588040 1.0000000 0.41376440 -0.2663298
## LFEner 0.6961172 0.01790643 0.4137644 1.00000000 -0.2682983
## LFreq -0.2609936 0.18437887 -0.2663298 -0.26829833 1.0000000
plot(music_scaled[, 1], music_scaled[, 3], xlab = "LVar", ylab = "LMax", main = "LMax vs LVar")
The operator for matrix multiplication is %*%
. The
operator for element-wise multiplication is *
.
a <- c(1, 2, 3)
b <- c(4, 5, 6)
# element-wise multiplication
print(a * b)
## [1] 4 10 18
# dot product
print(sum(a * b))
## [1] 32
# or using matrix multiplication
print(a %*% b)
## [,1]
## [1,] 32
# transpose vector
print(t(b))
## [,1] [,2] [,3]
## [1,] 4 5 6
print(a %*% t(b))
## [,1] [,2] [,3]
## [1,] 4 5 6
## [2,] 8 10 12
## [3,] 12 15 18
# matrix multiplication
A <- matrix(c(1, 2, 3, 4, 5, 6), nrow = 2)
print(A)
## [,1] [,2] [,3]
## [1,] 1 3 5
## [2,] 2 4 6
B <- matrix(c(1, 2, 3, 4, 5, 6), nrow = 3)
print(B)
## [,1] [,2]
## [1,] 1 4
## [2,] 2 5
## [3,] 3 6
print(A %*% B)
## [,1] [,2]
## [1,] 22 49
## [2,] 28 64
Use builtin eigen
function. It returns a list with two
components: values
and vectors
. Eigenvalues
are in decreasing order and all eigenvectors are normalized to have
length 1.
res <- eigen(R)
print(res)
## eigen() decomposition
## $values
## [1] 2.3336465 1.1435388 0.7156333 0.5828106 0.2243708
##
## $vectors
## [,1] [,2] [,3] [,4] [,5]
## [1,] -0.588904040 0.196723647 -0.12513976 -0.04870916 0.77230777
## [2,] 0.002332504 0.827745489 0.53255700 0.13474432 -0.11427589
## [3,] -0.513915382 0.002231933 -0.26614599 0.71598368 -0.39040964
## [4,] -0.532235892 0.099177521 -0.08449907 -0.67965669 -0.48766326
## [5,] 0.325273003 0.516038806 -0.78914467 -0.07003955 -0.01570279
We can access the eigenvectors and eigenvalues as follows:
res$values
## [1] 2.3336465 1.1435388 0.7156333 0.5828106 0.2243708
res$vectors
## [,1] [,2] [,3] [,4] [,5]
## [1,] -0.588904040 0.196723647 -0.12513976 -0.04870916 0.77230777
## [2,] 0.002332504 0.827745489 0.53255700 0.13474432 -0.11427589
## [3,] -0.513915382 0.002231933 -0.26614599 0.71598368 -0.39040964
## [4,] -0.532235892 0.099177521 -0.08449907 -0.67965669 -0.48766326
## [5,] 0.325273003 0.516038806 -0.78914467 -0.07003955 -0.01570279
Check that the eigenvectors are normalized.
for (i in 1:dim(res$vectors)[1]) {
v <- res$vectors[, i]
print(paste("Norm of vector", i, "is", sqrt(sum(v * v))))
}
## [1] "Norm of vector 1 is 1"
## [1] "Norm of vector 2 is 1"
## [1] "Norm of vector 3 is 1"
## [1] "Norm of vector 4 is 1"
## [1] "Norm of vector 5 is 1"
Breaks down if A
is not diagonalizable.
A <- matrix(c(3, 1, 2, 0, 3, 4, 0, 0, 3), nrow = 3)
eigen(A)
## eigen() decomposition
## $values
## [1] 3 3 3
##
## $vectors
## [,1] [,2] [,3]
## [1,] 0 0.000000e+00 1.109336e-31
## [2,] 0 1.665335e-16 -1.665335e-16
## [3,] 1 -1.000000e+00 1.000000e+00
Let’s check that they are indeed eigenvectors and eigenvalues.
R %*% res$vectors
## [,1] [,2] [,3] [,4] [,5]
## [1,] -1.37429386 0.224961119 -0.08955417 -0.02838821 0.173283336
## [2,] 0.00544324 0.946559063 0.38111550 0.07853042 -0.025640175
## [3,] -1.19929684 0.002552302 -0.19046292 0.41728289 -0.087596535
## [4,] -1.24205043 0.113413341 -0.06047035 -0.39611114 -0.109417411
## [5,] 0.75907221 0.590110384 -0.56473818 -0.04081979 -0.003523249
res$vectors * res$values
## [,1] [,2] [,3] [,4] [,5]
## [1,] -1.374293862 0.459083454 -0.29203196 -0.11366996 1.802293345
## [2,] 0.002667309 0.946559063 0.60899957 0.15408536 -0.130678906
## [3,] -0.367774943 0.001597246 -0.19046292 0.51238173 -0.279390126
## [4,] -0.310192727 0.057801712 -0.04924696 -0.39611114 -0.284215326
## [5,] 0.072981773 0.115784055 -0.17706104 -0.01571483 -0.003523249
largest_eigenvalue <- function(A) {
res <- eigen(A)
return(res$values[1])
}
largest_eigenvalue(R)
## [1] 2.333647
Because the covariance matrix is symmetric, it can be diagonalized as \(R = PDP^T\), where \(P\) is the matrix of eigenvectors and \(D\) is the diagonal matrix of eigenvalues.
print(R)
## [,1] [,2] [,3] [,4] [,5]
## [1,] 1.0000000 0.11168505 0.6426304 0.69611717 -0.2609936
## [2,] 0.1116850 1.00000000 -0.0358804 0.01790643 0.1843789
## [3,] 0.6426304 -0.03588040 1.0000000 0.41376440 -0.2663298
## [4,] 0.6961172 0.01790643 0.4137644 1.00000000 -0.2682983
## [5,] -0.2609936 0.18437887 -0.2663298 -0.26829833 1.0000000
print(diag(res$values))
## [,1] [,2] [,3] [,4] [,5]
## [1,] 2.333647 0.000000 0.0000000 0.0000000 0.0000000
## [2,] 0.000000 1.143539 0.0000000 0.0000000 0.0000000
## [3,] 0.000000 0.000000 0.7156333 0.0000000 0.0000000
## [4,] 0.000000 0.000000 0.0000000 0.5828106 0.0000000
## [5,] 0.000000 0.000000 0.0000000 0.0000000 0.2243708
print(res$vectors %*% diag(res$values) %*% t(res$vectors))
## [,1] [,2] [,3] [,4] [,5]
## [1,] 1.0000000 0.11168505 0.6426304 0.69611717 -0.2609936
## [2,] 0.1116850 1.00000000 -0.0358804 0.01790643 0.1843789
## [3,] 0.6426304 -0.03588040 1.0000000 0.41376440 -0.2663298
## [4,] 0.6961172 0.01790643 0.4137644 1.00000000 -0.2682983
## [5,] -0.2609936 0.18437887 -0.2663298 -0.26829833 1.0000000