SVM Lab: material borrowed from the tutorial by David Meyer (FH Technikum Wien, Austria); see: http://cran.r-project.org/web/packages/e1071/vignettes/svmdoc.pdf
Packages # Start by loading relevant libraries: # e1071 # mlbench # # If mlbench isn’t available then you will have to install it
# Glass Dataset ----
# Retrieve/access the "Glass" data from the mlbench package
data("Glass", package = "mlbench")
# The description of the Glass data set is on the following slide
# Number of Attributes: 10 (including an Id#) plus the class
# attribute -- all attributes are continuously valued
# NOTE(review): later slides treat "Type" as column 10, which suggests the
# mlbench copy has no Id column -- confirm with str(Glass)
Attribute Information: 1. Id number: 1 to 214 2. RI: refractive index 3. Na: Sodium (unit measurement: weight percent in corresponding oxide, as are attributes 4-10) 4. Mg: Magnesium 5. Al: Aluminum 6. Si: Silicon 7. K: Potassium 8. Ca: Calcium 9. Ba: Barium 10. Fe: Iron
Class Information: Type of glass: (class attribute) Type 1 building_windows_float_processed 2 building_windows_non_float_processed 3 vehicle_windows_float_processed 4 vehicle_windows_non_float_processed (none in this database) 5 containers 6 tableware 7 headlamps
# Create Training and Test Sets ----
# Split Glass at random: one third of the rows (rounded down) become the
# test set and all remaining rows become the training set. sample() draws
# from R's random number generator, so the split differs between runs;
# call set.seed() beforehand if a reproducible split is needed.

# Create a row index. seq_len() stays empty for a zero-row data frame,
# whereas 1:nrow(Glass) would produce c(1, 0).
index <- seq_len(nrow(Glass))

# Create an index of test samples by randomly selecting 1/3 of the samples
# (integer division rounds the test-set size down)
testindex <- sample(index, length(index) %/% 3)

# Create test set
testset <- Glass[testindex, ]

# Create training set from every row that was not drawn for the test set.
# setdiff() is used instead of Glass[-testindex, ] because a negative
# subscript built from an empty index selects no rows instead of all rows.
trainset <- Glass[setdiff(index, testindex), ]
# Train the SVM model ----
# Fit the svm model on the training set using:
#   "Type" (column 10) as the dependent variable; the `.` on the right-hand
#   side of the formula stands for all remaining columns of trainset
#
#   cost = 100 as the penalty cost for C-classification
#     (this is the 'C'-constant of the regularization term in
#     the Lagrange formulation)
#   gamma = 1 as the radial basis kernel function-specific parameter
svm.model <- svm(
  Type ~ .,
  data = trainset,
  cost = 100,
  gamma = 1
)
# Apply SVM Model ----
# Use the SVM to predict the classification for the testset.
# Column 10 is dropped from the data handed to predict(); the same column
# supplies the true labels below.
svm.pred <- predict(svm.model, testset[, -10])

# Compute the SVM confusion matrix (rows = predicted, columns = true).
# The table is built once and stored under a descriptive name instead of
# being computed twice and assigned to `t`, which would overwrite the name
# of base R's transpose function.
confusion <- table(pred = svm.pred, true = testset[, 10])
confusion

# Determine accuracy: the diagonal holds the cases where prediction and
# truth agree, so its sum divided by the total count is the hit rate.
sum(diag(confusion)) / sum(confusion)
# Optimize Parameters ----
# Approach: grid search with 10-fold cross validation
# Note: a random mixing precedes the partitioning of the data
# Optimize parameters to the svm with RBF kernel
# The grid search iterates with
#   gamma = 2^-4 through 2    (2^(-4:1))
#   cost  = 2 through 2^7     (2^(1:7))
# The returned object reports the best gamma & cost
# and the corresponding classification error
obj <- tune.svm(
  Type ~ .,
  data = Glass,
  gamma = 2^(-4:1),
  cost = 2^(1:7)
)
Optimize Parameters # Inspect the results # Note the results will vary unless you set the seed for the # random number generator which is used to mix the data # before the partitioning > obj Parameter tuning of ‘svm’: - sampling method: 10-fold cross validation - best parameters: gamma cost 0.0625 128 - best performance: 0.2898268 Note: The performance is reported as the error. The accuracy is 1 – error, in this case 1 – 0.2898268 = 0.7101732
# Investigate Data From Midterm Exam ----
# Recall the Mystery Data Set: a leading slice of rows is held out for
# testing and a later slice is used for training.
# NOTE(review): rows 437-466 end up in neither subset -- confirm that the
# gap between the two ranges is intentional.
trainSet <- Mystery[seq(467, 1389), ]
testSet <- Mystery[seq_len(436), ]
# The accuracies were
#   Naïve Bayes:       26.7%
#   1st Decision Tree: 38.4%
#   2nd Decision Tree: 67%
# Investigate Data From Midterm Exam ----
# Learn the model using trainSet: `class` is the dependent variable and
# Feature1 through Feature9 are the predictors.
svm.model <- svm(
  class ~ Feature1 + Feature2 + Feature3 + Feature4 + Feature5 +
    Feature6 + Feature7 + Feature8 + Feature9,
  data = trainSet,
  cost = 2,
  gamma = 0.25
)

# Classify the data in testSet.
# Column 1 is dropped from the data handed to predict() and is used as the
# true label below -- presumably it holds `class`; verify against Mystery.
svm.pred <- predict(svm.model, testSet[, -1])

# Create the confusion matrix (rows = predicted, columns = true).
# Stored as `confusion` rather than `t` so the name of base R's transpose
# function is not overwritten.
confusion <- table(pred = svm.pred, true = testSet[, 1])

# Calculate the accuracy: correct predictions (diagonal) over all cases
sum(diag(confusion)) / sum(confusion)
# What do you think?
# Try training with the entire dataset ----
# (This is for educational purpose only. DO NOT DO THIS IN PRACTICE)

# Learn the model using the entire Mystery data set (not just trainSet):
# `class` is the dependent variable, Feature1 through Feature9 the predictors.
svm.model <- svm(
  class ~ Feature1 + Feature2 + Feature3 + Feature4 + Feature5 +
    Feature6 + Feature7 + Feature8 + Feature9,
  data = Mystery,
  cost = 2,
  gamma = 0.25
)

# Classify the data in testSet.
# Column 1 is dropped from the data handed to predict() and is used as the
# true label below -- presumably it holds `class`; verify against Mystery.
# NOTE(review): if testSet was taken from Mystery, the model has already
# seen these rows during training, so the accuracy below is optimistic.
svm.pred <- predict(svm.model, testSet[, -1])

# Create the confusion matrix (rows = predicted, columns = true).
# Stored as `confusion` rather than `t` so the name of base R's transpose
# function is not overwritten.
confusion <- table(pred = svm.pred, true = testSet[, 1])

# Calculate the accuracy: correct predictions (diagonal) over all cases
sum(diag(confusion)) / sum(confusion)
# What do you think?
Another online resource is: http://en.wikibooks.org/wiki/Data_Mining_Algorithms_In_R/Classification/SVM