# Spiral example from:
# https://github.com/WinVector/zmPDSwR/blob/master/CodeExamples/c09_Exploring_advanced_methods/00196_example_9.21_of_section_9.4.2.R
library('kernlab')
library('ggplot2')
data('spirals')
sc <- specc(spirals, centers = 2)
s <- data.frame(x=spirals[,1], y=spirals[,2],
                class=as.factor(sc))
set.seed(2335246L)
s$group <- sample.int(100, size=nrow(s), replace=TRUE)
sTrain <- subset(s,group>10)
sTest <- subset(s,group<=10)
mSVMG <- ksvm(class~x+y,data=sTrain,kernel='rbfdot',type='nu-svc')
## Using automatic sigma estimation (sigest) for RBF or laplace kernel
sTest$predSVMG <- predict(mSVMG,newdata=sTest,type='response')
ggplot() +
  geom_text(data=sTest, aes(x=x, y=y, label=predSVMG),
            size=12) +
  geom_text(data=s, aes(x=x, y=y, label=class, color=class),
            alpha=0.7) +
  coord_fixed() +
  theme_bw() + theme(legend.position='none')
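# For contrast (a sketch, not part of the original example): the same split fit
# with a linear kernel ('vanilladot'). A linear decision boundary cannot separate
# the two interleaved spirals, so its confusion matrix should look far worse than
# the radial-kernel fit above.
mSVMV <- ksvm(class~x+y, data=sTrain, kernel='vanilladot', type='nu-svc')
sTrain$predSVMV <- predict(mSVMV, newdata=sTrain, type='response')
print(table(truth=sTrain$class, linearPred=sTrain$predSVMV))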
# Actual example
# isolet data from https://archive.ics.uci.edu/ml/datasets/ISOLET
library('ggplot2')
library('reshape2')
library('kernlab')
library('ROCR')
## Loading required package: gplots
##
## Attaching package: 'gplots'
##
## The following object is masked from 'package:stats':
##
## lowess
dTrain <- read.table("isolet1+2+3+4.data.gz",
                     header=FALSE, sep=',',
                     stringsAsFactors=FALSE, blank.lines.skip=TRUE)
dTrain$isTest <- FALSE
dTest <- read.table("isolet5.data.gz",
                    header=FALSE, sep=',',
                    stringsAsFactors=FALSE, blank.lines.skip=TRUE)
dTest$isTest <- TRUE
d <- rbind(dTrain,dTest)
rm(list=c('dTest','dTrain'))
d$V618 <- letters[d$V618]
vars <- colnames(d)[1:617]
yColumn <- 'isLetter'
d <- d[d$V618 %in% c('m','n'),,drop=FALSE]
d[,yColumn] <- d[,'V618']=='n'
formula <- paste(yColumn,paste(vars,collapse=' + '),sep=' ~ ')
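# Quick sanity check of the prepared data (added for illustration, not in the
# original write-up): only the 'm' and 'n' recordings should remain, split into
# train (isTest==FALSE) and test (isTest==TRUE) rows.
print(dim(d))
print(table(letter=d$V618, isTest=d$isTest))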
# define some helper and reporting functions
# calculate area under the curve of numeric vectors x,y
# length(x)==length(y)
# y>=0, 0<=x<=1 and x increasing
areaCalc <- function(x,y) {
  # append extra points to get rid of degenerate cases
  x <- c(0,x,1)
  y <- c(0,y,1)
  n <- length(x)
  # trapezoidal rule
  sum(0.5*(y[-1]+y[-n])*(x[-1]-x[-n]))
}
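# Quick checks of areaCalc (added for illustration): the trapezoid rule is exact
# on straight lines, so points on the diagonal should give an area of 0.5.
print(areaCalc(0.5,0.5))                            # single interior point on the diagonal: 0.5
print(areaCalc(seq(0.1,0.9,0.1),seq(0.1,0.9,0.1)))  # finer diagonal: still 0.5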
# needs ggplot2 and reshape2
# plot the gain curve: what proportion of the target (converted to numeric)
# is accumulated as items are sorted into position by the prediction
# example:
# d <- data.frame(pred=runif(100))
# d$y <- runif(100)<d$pred
# gainCurve(d$y,d$pred,'test')
gainCurve = function(truthcol, predcol, title) {
  truthcol <- as.numeric(truthcol)
  # data frame of pred and truth, sorted in order of the predictions
  d = data.frame(predcol=predcol,truthcol=truthcol)
  predord = order(d[['predcol']], decreasing=TRUE) # reorder, with highest first
  wizard = order(d[['truthcol']], decreasing=TRUE)
  npop = dim(d)[1]
  # data frame the cumulative prediction/truth as a function
  # of the fraction of the population we're considering, highest first
  results = data.frame(pctpop= (1:npop)/npop,
                       model = cumsum(d[predord,'truthcol'])/sum(d[['truthcol']]),
                       wizard = cumsum(d[wizard, 'truthcol'])/sum(d[['truthcol']]))
  # calculate the areas under each curve
  # gini score is 2*(area - 0.5)
  idealArea = areaCalc(results$pctpop,results$wizard) - 0.5
  modelArea = areaCalc(results$pctpop,results$model) - 0.5
  giniScore = modelArea/idealArea # actually, normalized gini score
  # melt the frame into the tall form, for plotting
  results = melt(results, id.vars="pctpop", measure.vars=c("model", "wizard"),
                 variable.name="sort_criterion", value.name="pct_outcome")
  gplot = ggplot(data=results, aes(x=pctpop, y=pct_outcome, color=sort_criterion)) +
    geom_point() + geom_line() +
    geom_abline(color="gray") +
    ggtitle(paste("Gain curve,", title, '\n',
                  'relative Gini score', format(giniScore,digits=2))) +
    xlab("% items in score order") + ylab("% total category") +
    scale_x_continuous(breaks=seq(0,1,0.1)) +
    scale_y_continuous(breaks=seq(0,1,0.1)) +
    scale_color_manual(values=c('model'='darkblue', 'wizard'='darkgreen'))
  gplot
}
plotROC <- function(title,outcol,predcol) {
  pred <- prediction(predcol,outcol)
  perf <- performance(pred,'tpr','fpr')
  auc <- as.numeric(performance(pred,'auc')@y.values)
  pf <- data.frame(
    FalsePositiveRate=perf@x.values[[1]],
    TruePositiveRate=perf@y.values[[1]])
  plot=ggplot() +
    geom_ribbon(data=pf,aes(x=FalsePositiveRate,ymax=TruePositiveRate,ymin=0),
                fill='blue',alpha=0.3) +
    geom_point(data=pf,aes(x=FalsePositiveRate,y=TruePositiveRate)) +
    geom_line(aes(x=c(0,1),y=c(0,1))) + coord_fixed() +
    ggtitle(paste(title,'\nAUC:',format(auc,digits=2)))
  list(pf=pf,plot=plot)
}
deviance <- function(truth,pred,epsilon=0) {
  # clip predictions away from 0 and 1 so log() stays finite
  pred = pmax(pred, epsilon)
  pred = pmin(pred, 1-epsilon)
  S = 0.0 # assumed log-likelihood of saturated model
  -2*(sum(ifelse(truth,log(pred),log(1-pred)))-S)
}
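# Small worked check (added for illustration): confident correct predictions give
# a deviance near zero, while predicting the base rate gives the null deviance,
# so 1 - residual.deviance/null.deviance below acts like a pseudo R-squared.
truthExample <- c(TRUE,TRUE,FALSE,FALSE)
print(deviance(truthExample,c(0.9,0.9,0.1,0.1)))  # about 0.84
print(deviance(truthExample,rep(0.5,4)))          # about 5.5 (null deviance at a 50% base rate)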
reportStats <- function(d,test,modelName,title,epsilon) {
  dSub <- d[d$isTest==test,,drop=FALSE]
  tab <- table(truth=dSub[,yColumn],pred=dSub[,modelName]>0.5)
  accuracy <- (tab[1,1] + tab[2,2])/sum(tab)
  note = ifelse(test,'test','train')
  print(paste('\t',note,'accuracy',modelName,format(accuracy,digits=2)))
  residual.deviance <- deviance(dSub[,yColumn],dSub[,modelName],epsilon)
  #print(paste('\tresidual.deviance',residual.deviance))
  null.deviance <- deviance(dSub[,yColumn],mean(dSub[,yColumn]),epsilon)
  #print(paste('\tnull.deviance',null.deviance))
  print(paste("\tmodel explained a",
              format((1-residual.deviance/null.deviance),digits=2),
              "fraction of the variation on",note))
}
report <- function(d,modelName,title,epsilon=1.0e-2) {
  print("***********")
  print(paste("model",modelName,title))
  reportStats(d,FALSE,modelName,title,epsilon)
  reportStats(d,TRUE,modelName,title,epsilon)
  print(ggplot(data=d[d$isTest==TRUE,,drop=FALSE],
               aes_string(x=modelName,color=yColumn)) +
          geom_density() +
          ggtitle(paste(title,'test')))
  print(plotROC(paste(title,'train'),
                d[d$isTest==FALSE,yColumn],
                d[d$isTest==FALSE,modelName])$plot)
  print(plotROC(paste(title,'test'),
                d[d$isTest==TRUE,yColumn],
                d[d$isTest==TRUE,modelName])$plot)
  print(gainCurve(d[d$isTest==FALSE,yColumn],
                  d[d$isTest==FALSE,modelName],
                  paste(title,'train')))
  print(gainCurve(d[d$isTest==TRUE,yColumn],
                  d[d$isTest==TRUE,modelName],
                  paste(title,'test')))
  print("***********")
}
# do the SVM modeling
modelSVM <- ksvm(as.formula(formula),data=d[!d$isTest,],type='C-svc',C=1.0,
                 kernel='rbfdot',
                 prob.model=TRUE)
## Using automatic sigma estimation (sigest) for RBF or laplace kernel
d$modelSVM <- predict(modelSVM,newdata=d,type='prob')[,'TRUE',drop=TRUE]
report(d,'modelSVM',"svm")
## [1] "***********"
## [1] "model modelSVM svm"
## [1] "\t train accuracy modelSVM 1"
## [1] "\tmodel explained a 0.96 fraction of the variation on train"
## [1] "\t test accuracy modelSVM 0.88"
## [1] "\tmodel explained a 0.64 fraction of the variation on test"
## [1] "***********"
help('ksvm')  # the ksvm documentation describes the C (cost) parameter used below
# As an example: set C=0.1, which makes
# correction terms (margin violations) cheap, or coefficient magnitudes expensive;
# this should prefer minimizing excess generalization error over training performance
modelCsmall <- ksvm(as.formula(formula),data=d[!d$isTest,],type='C-svc',C=0.1,
                    kernel='rbfdot',
                    prob.model=TRUE)
## Using automatic sigma estimation (sigest) for RBF or laplace kernel
d$modelCsmall <- predict(modelCsmall,newdata=d,type='prob')[,'TRUE',drop=TRUE]
report(d,'modelCsmall',"svm")
## [1] "***********"
## [1] "model modelCsmall svm"
## [1] "\t train accuracy modelCsmall 0.9"
## [1] "\tmodel explained a 0.67 fraction of the variation on train"
## [1] "\t test accuracy modelCsmall 0.84"
## [1] "\tmodel explained a 0.48 fraction of the variation on test"
## [1] "***********"
# As an example: set C=10.0, which makes
# correction terms (margin violations) expensive, or coefficient magnitudes cheap;
# this should prefer optimizing training performance over generalization performance
modelClarge <- ksvm(as.formula(formula),data=d[!d$isTest,],type='C-svc',C=10.0,
                    kernel='rbfdot',
                    prob.model=TRUE)
## Using automatic sigma estimation (sigest) for RBF or laplace kernel
d$modelClarge <- predict(modelClarge,newdata=d,type='prob')[,'TRUE',drop=TRUE]
report(d,'modelClarge',"svm")
## [1] "***********"
## [1] "model modelClarge svm"
## [1] "\t train accuracy modelClarge 1"
## [1] "\tmodel explained a 0.99 fraction of the variation on train"
## [1] "\t test accuracy modelClarge 0.88"
## [1] "\tmodel explained a 0.65 fraction of the variation on test"
## [1] "***********"
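# A possible way to pick C without peeking at the test set (a sketch, not part of
# the original analysis): ksvm() can report k-fold cross-validation error on the
# training data via the 'cross' argument, and cross() extracts it. The candidate
# grid below is an arbitrary choice for illustration.
for(Ccandidate in c(0.1,1.0,10.0)) {
  modelCV <- ksvm(as.formula(formula),data=d[!d$isTest,],type='C-svc',
                  C=Ccandidate,kernel='rbfdot',cross=5)
  print(paste('C =',Ccandidate,
              'cross-validated error:',format(cross(modelCV),digits=3)))
}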