PCA/LDA Lab CSCE 587 Fall 2018
PCA/LDA Lab # PCA & LDA Lab # We will need the following packages: # stats # ggplot2 # ggfortify # lfda # MASS # stats is already installed and loaded. # The others should already be installed and only need to be loaded.
# PCA ############################################################
# Example: iris data

# Keep only the independent (measurement) variables by dropping the
# Species column, which is column 5 of iris.
iris_data <- iris[, names(iris) != "Species"]

# Generate the principal components on centered, unit-variance data.
iris_pca <- prcomp(
  iris_data,
  center = TRUE,
  scale. = TRUE
)

# Display the PCA object.
print(iris_pca)
# PCA
# Convert the component standard deviations (iris_pca$sdev) to variances,
# then express each component's variance as a fraction of the total.
# Arithmetic on a numeric vector is already element-wise in R, so the
# squaring is done directly instead of through sapply().

# Total variance across all components.
tot_var <- sum(iris_pca$sdev^2)

# Fraction of the total variance carried by each component.
pct_var <- iris_pca$sdev^2 / tot_var

# Plot the variance fractions as a line (scree-style plot).
plot(pct_var, type = "l")
# PCA
# The first two PCs account for most of the variance.
sum(pct_var[1:2])

# Plot the observations on the first two principal components.
# iris_pca already holds this exact fit (iris_data, center = TRUE,
# scale. = TRUE), so it is reused here instead of refitting per plot.

# autoplot version (the lab notes refer to this as the work-around plot
# command); needs ggfortify loaded, as listed at the top of the lab.
autoplot(iris_pca)

# Ugly version: base-graphics scatter of the first two score columns.
plot(iris_pca$x[, 1:2])
# PCA
# Plot the first two principal components and colour by species.
# iris_pca already holds this exact fit (iris_data, center = TRUE,
# scale. = TRUE), so it is reused instead of refitting for every plot.

# Pretty version: autoplot
autoplot(iris_pca, data = iris, colour = "Species")

# Ugly version: standard R graphics, coloured by the Species factor
# (column 5 of iris).
plot(iris_pca$x[, 1:2], col = iris[, 5])

# autoplot from ggfortify again: first two components, coloured by
# species, with every point labelled by its observation index.
autoplot(iris_pca, data = iris, colour = "Species",
         label = TRUE, label.size = 3)
# ggfortify plots for our PCA example
# Plot the first two PCs, draw the eigenvectors (loadings) in blue, and
# label the loadings.
# iris_pca already holds this exact fit (iris_data, center = TRUE,
# scale. = TRUE), so it is reused rather than calling prcomp() again.
autoplot(iris_pca, data = iris, colour = "Species",
         loadings = TRUE, loadings.colour = "blue",
         loadings.label = TRUE, loadings.label.size = 3)
# LDA using the lfda package ##########################################
#
# Create the discriminant model with lfda():
#   1st argument - the set of 4 independent variables
#   2nd argument - the dependent variable (the species label)
#   r            - dimensionality of the reduced space
#   metric       - type of metric in the embedding space
#
iris_LDA <- lfda(
  iris_data,
  iris$Species,
  r = 2,
  metric = "plain"
)
# LDA using the lfda package

# Plot the model: points are left uncoloured, but a frame is drawn around
# each class and coloured by species.
autoplot(
  iris_LDA,
  data = iris,
  frame = TRUE,
  frame.colour = "Species"
)

# Plot the model again, this time also colouring the data points by class.
autoplot(
  iris_LDA,
  data = iris,
  colour = "Species",
  frame = TRUE,
  frame.colour = "Species"
)
# LDA from MASS package

# Build a data frame for the iris data set: stack the three species slices
# of the iris3 array row-wise and tag the rows with a one-letter species
# code in column Sp (50 rows each of "s", "c", "v").
Iris <- data.frame(
  rbind(iris3[, , 1], iris3[, , 2], iris3[, , 3]),
  Sp = rep(c("s", "c", "v"), each = 50)
)

# Decide which observations comprise the training set: a reproducible
# random draw of 75 of the 150 row indices.
set.seed(587)
train <- sample(seq_len(150), size = 75)

# Display the class membership of the training data set.
table(Iris[train, "Sp"])
# LDA from MASS package
# Create the LDA model.
#   Formula: Sp ~ .  i.e. Sp is the classification and all the other
#            columns are the independent data
#   Iris:    the data set
#   prior:   c(1, 1, 1) / 3 gives even priors for the three classes
#   subset:  which observations are used to train with
# lda() is called through the MASS namespace so this step does not depend
# on MASS having been attached with library() beforehand.
z <- MASS::lda(Sp ~ ., Iris, prior = c(1, 1, 1) / 3, subset = train)
# LDA from MASS package

# Project from the 4 iris dimensions onto the dimension of the first
# eigenvector (column 1 of z$scaling).
# Note: this maps all 150 flowers, not only the training observations.
iris_x <- z$scaling[, 1] %*% t(iris[, 1:4])

# Project from the 4 iris dimensions onto the dimension of the second
# eigenvector (column 2 of z$scaling).
iris_y <- z$scaling[, 2] %*% t(iris[, 1:4])
# LDA from MASS package

# Plot the 150 projected iris data points (second direction against first).
plot(iris_y ~ iris_x)

# Plot the same 150 points again, coloured by species.
plot(iris_y ~ iris_x, col = iris$Species)
# Create indices for a test set: the observations NOT in the training set.
# (The split and the confusion matrix were previously executed twice in a
# row with identical results; each is now done once.)
test <- seq_len(150)[-train]

# Classify the test set and display the result as a confusion matrix:
# rows are the true classes, columns are the classes predicted by z.
table(Original = Iris$Sp[test], Predicted = predict(z, Iris[test, ])$class)
# More difficult PCA example ###########################################
# PCA example using the icu data from earlier in the semester.
# NOTE: icu is not created in this script; it must already be loaded.

# Drop the first two columns before fitting.
# (presumably an identifier and the STA outcome - confirm against the data)
tmp <- icu[, -c(1, 2)]

# Principal components on centered, unit-variance data.
pca_tmp <- prcomp(tmp, center = TRUE, scale. = TRUE)
print(pca_tmp)

# Line plot of the prcomp object (component variances).
plot(pca_tmp, type = "l")

# Total variance: squaring the sdev vector is already element-wise, so no
# sapply() is needed.
tot_var <- sum(pca_tmp$sdev^2)
# More difficult PCA example

# Fraction of the total variance carried by each component (vectorized
# arithmetic instead of sapply()).
pct_var <- pca_tmp$sdev^2 / tot_var
plot(pct_var, type = "l")

# Share of the variance covered by the first 10 components.
sum(pct_var[1:10])

# pca_tmp already holds this exact fit (tmp, center = TRUE, scale. = TRUE),
# so it is reused below instead of refitting for each of the four plots.

# autoplot
autoplot(pca_tmp)

# regular plot
plot(pca_tmp$x[, 1:2])

# autoplot coloured by the STA column of icu
autoplot(pca_tmp, data = icu, colour = "STA")

# regular plot coloured by column 2 of icu; the + 1 shifts the values by one
# before they are used as colour indices
# (presumably column 2 is STA coded 0/1 - confirm against the data)
plot(pca_tmp$x[, 1:2], col = icu[, 2] + 1)
# More difficult LDA example

# Find linear discriminants: every column of icu except the first two is a
# predictor, column 2 is the class label, and the data are reduced to
# 2 dimensions.
icu_LDA <- lfda(
  icu[, -c(1, 2)],
  icu[, 2],
  r = 2,
  metric = "plain"
)

# Plot classification: points and class frames both coloured by STA.
autoplot(
  icu_LDA,
  data = icu,
  colour = "STA",
  frame = TRUE,
  frame.colour = "STA"
)