```{r} # Spiral example from: # https://github.com/WinVector/zmPDSwR/blob/master/CodeExamples/c09_Exploring_advanced_methods/00196_example_9.21_of_section_9.4.2.R library('kernlab') library('ggplot2') data('spirals') sc <- specc(spirals, centers = 2) s <- data.frame(x=spirals[,1],y=spirals[,2], class=as.factor(sc)) set.seed(2335246L) s$group <- sample.int(100,size=dim(s)[[1]],replace=T) sTrain <- subset(s,group>10) sTest <- subset(s,group<=10) mSVMG <- ksvm(class~x+y,data=sTrain,kernel='rbfdot',type='nu-svc') sTest$predSVMG <- predict(mSVMG,newdata=sTest,type='response') ggplot() + geom_text(data=sTest,aes(x=x,y=y, label=predSVMG),size=12) + geom_text(data=s,aes(x=x,y=y, label=class,color=class),alpha=0.7) + coord_fixed() + theme_bw() + theme(legend.position='none') ``` ```{r} # Actual example # isolet data from https://archive.ics.uci.edu/ml/datasets/ISOLET library('ggplot2') library('reshape2') library('kernlab') library('ROCR') dTrain = read.table("isolet1+2+3+4.data.gz", header=FALSE,sep=',', stringsAsFactors=FALSE,blank.lines.skip=TRUE) dTrain$isTest <- FALSE dTest = read.table("isolet5.data.gz", header=FALSE,sep=',', stringsAsFactors=FALSE,blank.lines.skip=TRUE) dTest$isTest <- TRUE d <- rbind(dTrain,dTest) rm(list=c('dTest','dTrain')) d$V618 <- letters[d$V618] vars <- colnames(d)[1:617] yColumn <- 'isLetter' d <- d[d$V618 %in% c('m','n'),,drop=FALSE] d[,yColumn] <- d[,'V618']=='n' formula <- paste(yColumn,paste(vars,collapse=' + '),sep=' ~ ') ``` ```{r} # define some helper and reporting functions # calulcate area under the curve of numeric vectors x,y # length(x)==length(y) # y>=0, 0<=x<=1 and x increasing areaCalc <- function(x,y) { # append extra points to get rid of degenerate cases x <- c(0,x,1) y <- c(0,y,1) n <- length(x) sum(0.5*(y[-1]+y[-n])*(x[-1]-x[-n])) } # needs ggplot2 and reshape2 # plot the gain-curve which is what proportion of the target (make target numric) # is sorted into position by the prediction # example: # d <- data.frame(pred=runif(100)) # d$y <- runif(100)0.5) accuracy <- (tab[1,1] + tab[2,2])/sum(tab) note = ifelse(test,'test','train') print(paste('\t',note,'accuracy',modelName,format(accuracy,digits=2))) residual.deviance <- deviance(dSub[,yColumn],dSub[,modelName],epsilon) #print(paste('\tresidual.deviance',residual.deviance)) null.deviance <- deviance(dSub[,yColumn],mean(dSub[,yColumn]),epsilon) #print(paste('\tnull.deviance',null.deviance)) print(paste("\tmodel explained a", format((1-residual.deviance/null.deviance),digits=2), "fraction of the variation on",note)) } report <- function(d,modelName,title,epsilon=1.0e-2) { print("***********") print(paste("model",modelName,title)) reportStats(d,FALSE,modelName,title,epsilon) reportStats(d,TRUE,modelName,title,epsilon) print(ggplot(data=d[d$isTest==TRUE,,drop=FALSE], aes_string(x=modelName,color=yColumn)) + geom_density() + ggtitle(paste(title,'test'))) print(plotROC(paste(title,'train'), d[d$isTest==FALSE,yColumn], d[d$isTest==FALSE,modelName])$plot) print(plotROC(paste(title,'test'), d[d$isTest==TRUE,yColumn], d[d$isTest==TRUE,modelName])$plot) print(gainCurve(d[d$isTest==FALSE,yColumn], d[d$isTest==FALSE,modelName], paste(title,'train'))) print(gainCurve(d[d$isTest==TRUE,yColumn], d[d$isTest==TRUE,modelName], paste(title,'test'))) print("***********") } ``` ```{r} # do the SVM modeling modelSVM <- ksvm(as.formula(formula),data=d[!d$isTest,],type='C-svc',C=1.0, kernel='rbfdot', prob.model=TRUE) d$modelSVM <- predict(modelSVM,newdata=d,type='prob')[,'TRUE',drop=TRUE] report(d,'modelSVM',"svm") help(svm) ``` ```{r} # As an example: set C=0.1, which means # to make correction terms cheap, or coefficient magnitudes expensive # should prefer minimizing exess genaralization error to training performance modelCsmall <- ksvm(as.formula(formula),data=d[!d$isTest,],type='C-svc',C=0.1, kernel='rbfdot', prob.model=TRUE) d$modelCsmall <- predict(modelCsmall,newdata=d,type='prob')[,'TRUE',drop=TRUE] report(d,'modelCsmall',"svm") ``` ```{r} # As an example: set C=10.0, which means # to make correction terms expensive, or coefficient magnitudes cheap # should prefer optimizing training performance over generalization peformance modelClarge <- ksvm(as.formula(formula),data=d[!d$isTest,],type='C-svc',C=10.0, kernel='rbfdot', prob.model=TRUE) d$modelClarge <- predict(modelClarge,newdata=d,type='prob')[,'TRUE',drop=TRUE] report(d,'modelClarge',"svm") ```