# K-Means Lab
# Start with 1D Data ----
# input data: https://cse.sc.edu/~rose/587/CSV/state_income.csv
# 1) use wget to copy the dataset to the vm
# 2) use GUI import from Text (base) to import state_income.csv

# Sort data
tmp <- sort(state_income$V2)

# Visualize data (x axis = position in the sorted vector, y axis = value)
plot(tmp)

# create 4 clusters (at most 15 iterations)
# NOTE: the result is stored under the name `kmeans`, which masks nothing for
# calls -- R still finds the function when `kmeans(...)` is used as a call.
kmeans <- kmeans(tmp, centers = 4, iter.max = 15)

# Visualize the cluster centers
# For 1D data kmeans$centers is a 4 x 1 matrix; handing it straight to
# points() would draw the centers at x = 1, 2, 3, 4. Instead place each
# center at the mean index of the observations assigned to its cluster, so
# it sits on top of the run of points it summarises.
centerIdx <- tapply(seq_along(tmp), kmeans$cluster, mean)
points(centerIdx, kmeans$centers[, 1], col = seq_along(centerIdx), pch = 20)
# Higher Dimensional Data ----
# input data: https://cse.sc.edu/~rose/587/CSV/iris.csv
# 1) use wget to copy to vm
# 2) use GUI import from Text (base) to import iris.csv

# Let's start with 2 dimensions: visualize data
plot(iris[, 1:2])

# create 4 clusters (at most 15 iterations)
# The fit is stored as `kmeans`; later slides read kmeans$cluster and
# kmeans$centers from it. Calls to kmeans(...) still reach the function.
kmeans <- kmeans(iris[, 1:2], centers = 4, iter.max = 15)

# Visualize the cluster centers (one colour per center)
points(kmeans$centers, col = seq_len(nrow(kmeans$centers)), pch = 20)
# examine kmeans object
kmeans
# note, in the printed output:
#   - clustering statistics
#   - cluster sizes
#   - cluster means
#   - clustering vector
#   - within cluster sum of squares
# Plot Results ----
# Visualize clusters: colour every observation by its assigned cluster
plot(iris[, 1:2], col = kmeans$cluster)

# Visualize the cluster centers
# Colour indices follow the number of fitted centers, so the overlay stays
# consistent with kmeans$cluster whatever k was used for the fit.
points(kmeans$centers, col = seq_len(nrow(kmeans$centers)), pch = 20)
# Best K? ----
# Is 4 the best number of clusters?
# explore different number of clusters
#
# For every k from 1 to 20, fit k-means on the first two columns and record
# the total within-cluster sum of squares. kmeans() starts from random
# centers, so several restarts (nstart) and a higher iteration cap are used
# to keep the curve from being dominated by one unlucky start.
# ($tot.withinss is the sum of the per-cluster $withinss values.)
withinSumSqrs <- vapply(
  1:20,
  function(k) {
    kmeans(iris[, 1:2], centers = k, iter.max = 15, nstart = 10)$tot.withinss
  },
  numeric(1)
)

# Visualize within cluster sum of square
plot(seq_along(withinSumSqrs), withinSumSqrs, type = "b",
     xlab = "# Clusters", ylab = "Within sum of square")
# Best K? Elbow Method: look for the "elbow" in the plot of WSS (within-cluster sum of squares) against k. We want a small k that still has a low WSS value. What does a low WSS value indicate?
# Higher Dimensional Data ----
# Let's consider 3 dimensions: visualize data
# (a data frame with more than two columns is drawn as a scatterplot matrix)
plot(iris[, 1:3])

# create 4 clusters (at most 15 iterations)
# Stored as `kmeans`; calls to kmeans(...) still reach the function.
kmeans <- kmeans(iris[, 1:3], centers = 4, iter.max = 15)

# Visualize the clusters
plot(iris[, 1:3], col = kmeans$cluster)
# Higher Dimensional Data ----
# Examine a single 2D projection (columns 1 and 2)
plot(iris[, 1:2], col = kmeans$cluster)

# Visualize the cluster centers
# Select the same two columns explicitly so the overlay matches the axes.
points(kmeans$centers[, 1:2], col = seq_len(nrow(kmeans$centers)), pch = 20)

# Examine another single 2D projection (columns 1 and 3)
# The data plotted must be the same two columns as the centers overlaid on
# it; plotting iris[, 1:3] would produce a scatterplot matrix instead.
plot(iris[, c(1, 3)], col = kmeans$cluster)
points(kmeans$centers[, c(1, 3)], col = seq_len(nrow(kmeans$centers)),
       pch = 20)
# Best K? ----
# Is 4 the best number of clusters?
# explore different number of clusters
#
# For every k from 1 to 20, fit k-means on the first three columns and record
# the total within-cluster sum of squares. kmeans() starts from random
# centers, so several restarts (nstart) and a higher iteration cap are used
# to keep the curve from being dominated by one unlucky start.
# ($tot.withinss is the sum of the per-cluster $withinss values.)
withinSumSqrs <- vapply(
  1:20,
  function(k) {
    kmeans(iris[, 1:3], centers = k, iter.max = 15, nstart = 10)$tot.withinss
  },
  numeric(1)
)

# Visualize within cluster sum of square
plot(seq_along(withinSumSqrs), withinSumSqrs, type = "b",
     xlab = "# Clusters", ylab = "Within sum of square")
# Higher Dimensional Data ----
# Finally consider 4 dimensions: visualize data
plot(iris[, 1:4])

# create 4 clusters (at most 15 iterations)
# Stored as `kmeans`; calls to kmeans(...) still reach the function.
kmeans <- kmeans(iris[, 1:4], centers = 4, iter.max = 15)

# Visualize the clusters
plot(iris[, 1:4], col = kmeans$cluster)
# Normalization? ----
# notice that the columns are not the same scale
summary(iris[, 1:4])
# Output shown on the slide for the imported iris.csv:
#   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width
#   Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100
#   1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300
#   Median :5.800   Median :3.000   Median :4.350   Median :1.300
#   Mean   :5.843   Mean   :3.054   Mean   :3.759   Mean   :1.199
#   3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800
#   Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500

# Which will have more influence: petal width or petal length?
# Normalization? ----
# Let's shift and scale the data to be in the range [0,1]
# each dimension should be able to exert equal influence

# function to "shift" a vector
# subtract the min value so that it ranges from 0 to ??
#   x: numeric vector; NA values are ignored when finding the minimum and
#      stay NA in the result.
#   returns: a vector the same length as x whose smallest value is 0.
myShift <- function(x) {
  x - min(x, na.rm = TRUE)
}

# apply this function to one column
myShift(iris[, 3])

# apply this function to multiple columns
# (lapply runs myShift on each column; as.data.frame reassembles them)
as.data.frame(lapply(iris[, 1:4], myShift))

# verify with summary statistics: every column's Min. should now be 0
summary(as.data.frame(lapply(iris[, 1:4], myShift)))
# Normalization? ----
# Still need to scale data

# function to "scale" a vector
#   x: numeric vector; NA values are ignored.
#   returns: a single number, the width of x's range (max - min). This is
#            the divisor needed to bring shifted data into [0,1].
myScale <- function(x) {
  diff(range(x, na.rm = TRUE))
}

# apply this function to one column
myScale(iris[, 3])

# apply this function to multiple columns
# (one value per column, returned as a one-row data frame)
as.data.frame(lapply(iris[, 1:4], myScale))
# Normalization? ----
# Put it all together
# myShift = function(x) { x - min(x, na.rm=TRUE)}
# myScale = function(x) { max(x,na.rm=TRUE) - min(x,na.rm=TRUE)}

# function to normalize a vector into the range [0,1]
#   x: numeric vector; NA values are ignored when finding min/max and stay
#      NA in the result.
#   returns: (x - min) / (max - min), same length as x.
myNorm <- function(x) {
  shifted <- myShift(x)
  width <- myScale(x)
  # A constant column has zero width; dividing would give 0/0 = NaN for
  # every entry. Its shifted values are already all 0, so return those.
  if (isTRUE(width == 0)) {
    return(shifted)
  }
  shifted / width
}

# apply this function to a column
myNorm(iris[, 3])

# apply this function to multiple columns
tmp <- as.data.frame(lapply(iris[, 1:4], myNorm))

# verify with summary statistics: every column should span 0 to 1
summary(tmp)
# Higher Dimensional Data ----
# visualize "normalized" data
plot(tmp[, 1:4])

# try from 1 to 20 clusters
# The result vector is built here rather than reusing one left over from an
# earlier slide, so this section runs on its own once `tmp` exists.
# kmeans() starts from random centers, so several restarts (nstart) and a
# higher iteration cap are used to keep the curve from being dominated by
# one unlucky start. ($tot.withinss is the sum of the $withinss values.)
withinSumSqrs <- vapply(
  1:20,
  function(k) {
    kmeans(tmp[, 1:4], centers = k, iter.max = 15, nstart = 10)$tot.withinss
  },
  numeric(1)
)

# Visualize within cluster sum of square
plot(seq_along(withinSumSqrs), withinSumSqrs, type = "b",
     xlab = "# Clusters", ylab = "Within sum of square")