Download presentation
Presentation is loading. Please wait.
1
K-Means Lab
2
# Start with 1D Data ----
# Input data:
#   1) use wget to copy the dataset to the VM
#   2) use the GUI import from Text (base) to import state_income.csv

# Sort the V2 column of state_income in ascending order.
tmp <- sort(state_income[["V2"]])

# Visualize the sorted values (a bare vector is plotted against its index).
plot(tmp)

# Create 4 clusters, allowing at most 15 iterations.
# Note: the fit is stored under the name `kmeans`; R still resolves
# `kmeans(...)` to the function when it is called later.
kmeans <- kmeans(tmp, centers = 4, iter.max = 15)

# Visualize the cluster centers, one color per cluster. `centers` has a
# single column here, so the center values are drawn against index 1..4.
points(kmeans$centers, pch = 20, col = 1:4)
3
Higher Dimensional Data
# Input data:
#   1) use wget to copy to the VM
#   2) use the GUI import from Text (base) to import iris.csv

# Start with 2 dimensions: scatter plot of the first two columns.
plot(iris[1:2])

# Create 4 clusters, allowing at most 15 iterations.
kmeans <- kmeans(iris[1:2], centers = 4, iter.max = 15)

# Visualize the cluster centers, one color per cluster.
points(kmeans$centers, pch = 20, col = 1:4)
4
# Examine the fitted k-means object (typing its name prints it at the console).
kmeans
# Note the clustering statistics in the printed output:
#   - cluster sizes
#   - cluster means
#   - clustering vector
#   - within cluster sum of squares
5
# Plot Results ----
# Visualize the clusters: color each observation by its assigned cluster.
plot(iris[1:2], col = kmeans$cluster)

# Visualize the cluster centers, one color per cluster.
points(kmeans$centers, pch = 20, col = 1:4)
6
# Best K? ----
# Is 4 the best number of clusters?
#
# Explore different numbers of clusters (k = 1..20) and record the total
# within-cluster sum of squares (WSS) for each k.
# k-means starts from randomly chosen centers, so a single run per k can stop
# in a poor local optimum and make the curve jagged and different on every
# run. nstart = 10 keeps the best of 10 random starts for each k, and
# iter.max = 15 matches the iteration limit used for the fits above.
withinSumSqrs <- vapply(
  1:20,
  function(k) {
    kmeans(iris[, 1:2], centers = k, iter.max = 15, nstart = 10)$tot.withinss
  },
  numeric(1)
)

# Visualize the within-cluster sum of squares against the number of clusters.
plot(1:20, withinSumSqrs, type = "b",
     xlab = "# Clusters", ylab = "Within sum of square")
7
Best K? Elbow Method Look for the “elbow”
We want a small k that has a low WSS value. What does a low WSS value indicate?
8
Higher Dimensional Data
# Let's consider 3 dimensions: visualize the data
# (a data frame with more than two columns is drawn as a scatter-plot matrix).
plot(iris[1:3])

# Create 4 clusters, allowing at most 15 iterations.
kmeans <- kmeans(iris[1:3], centers = 4, iter.max = 15)

# Visualize the clusters: same scatter-plot matrix, colored by cluster.
plot(iris[1:3], col = kmeans$cluster)
9
Higher Dimensional Data
# Examine a single 2D projection (columns 1 and 2), colored by cluster.
plot(iris[, 1:2], col = kmeans$cluster)

# Visualize the cluster centers in the same two coordinates. The columns are
# selected explicitly so the overlay does not depend on points() silently
# using only the first two columns of the centers matrix.
points(kmeans$centers[, 1:2], col = 1:4, pch = 20)

# Examine another single 2D projection (columns 1 and 3).
# The plotted columns must be the same as the center columns overlaid below:
# plotting iris[, 1:3] would draw a scatter-plot matrix, and points() would
# then place the centers in the wrong coordinate system.
plot(iris[, c(1, 3)], col = kmeans$cluster)
points(kmeans$centers[, c(1, 3)], col = 1:4, pch = 20)
10
# Best K? ----
# Is 4 the best number of clusters?
#
# Explore different numbers of clusters (k = 1..20) on the first three
# columns and record the total within-cluster sum of squares (WSS) for each k.
# k-means starts from randomly chosen centers, so a single run per k can stop
# in a poor local optimum and make the curve jagged and different on every
# run. nstart = 10 keeps the best of 10 random starts for each k, and
# iter.max = 15 matches the iteration limit used for the fits above.
withinSumSqrs <- vapply(
  1:20,
  function(k) {
    kmeans(iris[, 1:3], centers = k, iter.max = 15, nstart = 10)$tot.withinss
  },
  numeric(1)
)

# Visualize the within-cluster sum of squares against the number of clusters.
plot(1:20, withinSumSqrs, type = "b",
     xlab = "# Clusters", ylab = "Within sum of square")
11
Higher Dimensional Data
# Finally consider 4 dimensions: visualize the data as a scatter-plot matrix.
plot(iris[1:4])

# Create 4 clusters, allowing at most 15 iterations.
kmeans <- kmeans(iris[1:4], centers = 4, iter.max = 15)

# Visualize the clusters: same scatter-plot matrix, colored by cluster.
plot(iris[1:4], col = kmeans$cluster)
12
Normalization? # notice that the columns are not the same scale
> summary(iris[,1:4])
  Sepal.Length    Sepal.Width     Petal.Length    Petal.Width
 Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100
 1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300
 Median :5.800   Median :3.000   Median :4.350   Median :1.300
 Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199
 3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800
 Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500
# Which will have more influence: petal width or petal length?
13
Normalization? # apply this function to one column
# Let's shift and scale the data to be in the range [0, 1]:
# each dimension should be able to exert equal influence.

# "Shift" a vector: subtract its minimum value (NAs are ignored when finding
# the minimum) so that the result ranges from 0 to ??
myShift <- function(x) {
  lowest <- min(x, na.rm = TRUE)
  x - lowest
}

# Apply this function to one column.
myShift(iris[[3]])

# Apply this function to multiple columns.
as.data.frame(lapply(iris[1:4], myShift))

# Verify with summary statistics.
summary(as.data.frame(lapply(iris[1:4], myShift)))
14
Normalization? # Still need to scale data
# Scale factor for a vector: the width of its range (max minus min, with NAs
# ignored). Note that this returns a single number, not a rescaled vector;
# it is the amount the shifted values still need to be divided by.
myScale <- function(x) {
  diff(range(x, na.rm = TRUE))
}

# Apply this function to one column.
myScale(iris[[3]])

# Apply this function to multiple columns (one range width per column).
as.data.frame(lapply(iris[1:4], myScale))
15
Normalization? # apply this function to a column
# Put it all together.
# Reminder of the two helpers this builds on:
# myShift = function(x) { x - min(x, na.rm=TRUE)}
# myScale = function(x) { max(x,na.rm=TRUE) - min(x,na.rm=TRUE)}

# Normalize a vector: shift it so the minimum is 0, then divide by the width
# of its range.
myNorm <- function(x) {
  shifted <- myShift(x)
  shifted / myScale(x)
}

# Apply this function to a column.
myNorm(iris[[3]])

# Apply this function to multiple columns and keep the result in `tmp`.
tmp <- as.data.frame(lapply(iris[1:4], myNorm))

# Verify with summary statistics.
summary(tmp)
16
Higher Dimensional Data
# Visualize the "normalized" data as a scatter-plot matrix.
plot(tmp[, 1:4])

# Try from 1 to 20 clusters and record the total within-cluster sum of
# squares (WSS) for each k. The vector is built here in full, so this step no
# longer relies on `withinSumSqrs` having been created by an earlier step.
# k-means starts from randomly chosen centers, so a single run per k can stop
# in a poor local optimum and make the curve jagged and different on every
# run. nstart = 10 keeps the best of 10 random starts for each k, and
# iter.max = 15 matches the iteration limit used for the earlier fits.
withinSumSqrs <- vapply(
  1:20,
  function(k) {
    kmeans(tmp[, 1:4], centers = k, iter.max = 15, nstart = 10)$tot.withinss
  },
  numeric(1)
)

# Visualize the within-cluster sum of squares against the number of clusters.
plot(1:20, withinSumSqrs, type = "b",
     xlab = "# Clusters", ylab = "Within sum of square")
Similar presentations
© 2024 SlidePlayer.com. Inc.
All rights reserved.