# Spiral example from:
# https://github.com/WinVector/zmPDSwR/blob/master/CodeExamples/c09_Exploring_advanced_methods/00196_example_9.21_of_section_9.4.2.R
library('kernlab')
library('ggplot2')
data('spirals')
sc <- specc(spirals, centers = 2)
s <- data.frame(x=spirals[,1], y=spirals[,2],
                class=as.factor(sc))
set.seed(2335246L)
s$group <- sample.int(100, size=nrow(s), replace=TRUE)
sTrain <- subset(s,group>10)
sTest <- subset(s,group<=10)
mSVMG <- ksvm(class~x+y,data=sTrain,kernel='rbfdot',type='nu-svc')
## Using automatic sigma estimation (sigest) for RBF or laplace kernel
sTest$predSVMG <- predict(mSVMG,newdata=sTest,type='response')
ggplot() +
  geom_text(data=sTest, aes(x=x, y=y, label=predSVMG),
            size=12) +
  geom_text(data=s, aes(x=x, y=y, label=class, color=class),
            alpha=0.7) +
  coord_fixed() +
  theme_bw() + theme(legend.position='none')
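# For contrast (a sketch, not part of the original example): the same split fit
# with a linear kernel ('vanilladot'). A linear decision boundary cannot separate
# the two interleaved spirals, so its confusion matrix should look far worse than
# the radial-kernel fit above.
mSVMV <- ksvm(class~x+y, data=sTrain, kernel='vanilladot', type='nu-svc')
sTrain$predSVMV <- predict(mSVMV, newdata=sTrain, type='response')
print(table(truth=sTrain$class, linearPred=sTrain$predSVMV))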
# Actual example
# isolet data from https://archive.ics.uci.edu/ml/datasets/ISOLET
library('ggplot2')
library('reshape2')
library('kernlab')
library('ROCR')
## Loading required package: gplots
##
## Attaching package: 'gplots'
##
## The following object is masked from 'package:stats':
##
## lowess
dTrain <- read.table("isolet1+2+3+4.data.gz",
                     header=FALSE, sep=',',
                     stringsAsFactors=FALSE, blank.lines.skip=TRUE)
dTrain$isTest <- FALSE
dTest <- read.table("isolet5.data.gz",
                    header=FALSE, sep=',',
                    stringsAsFactors=FALSE, blank.lines.skip=TRUE)
dTest$isTest <- TRUE
d <- rbind(dTrain,dTest)
rm(list=c('dTest','dTrain'))
d$V618 <- letters[d$V618]
vars <- colnames(d)[1:617]
yColumn <- 'isLetter'
d <- d[d$V618 %in% c('m','n'),,drop=FALSE]
d[,yColumn] <- d[,'V618']=='n'
formula <- paste(yColumn,paste(vars,collapse=' + '),sep=' ~ ')
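# Quick sanity check of the prepared data (added for illustration, not in the
# original write-up): only the 'm' and 'n' recordings should remain, split into
# train (isTest==FALSE) and test (isTest==TRUE) rows.
print(dim(d))
print(table(letter=d$V618, isTest=d$isTest))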
# define some helper and reporting functions
# calculate area under the curve of numeric vectors x,y
# length(x)==length(y)
# y>=0, 0<=x<=1 and x increasing
areaCalc <- function(x,y) {
  # append extra points to get rid of degenerate cases
  x <- c(0,x,1)
  y <- c(0,y,1)
  n <- length(x)
  # trapezoidal rule
  sum(0.5*(y[-1]+y[-n])*(x[-1]-x[-n]))
}
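# Quick checks of areaCalc (added for illustration): the trapezoid rule is exact
# on straight lines, so points on the diagonal should give an area of 0.5.
print(areaCalc(0.5,0.5))                            # single interior point on the diagonal: 0.5
print(areaCalc(seq(0.1,0.9,0.1),seq(0.1,0.9,0.1)))  # finer diagonal: still 0.5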
# needs ggplot2 and reshape2
# plot the gain curve: what proportion of the target (converted to numeric)
# is accumulated as items are sorted into position by the prediction
# example:
# d <- data.frame(pred=runif(100))
# d$y <- runif(100)<d$pred
# gainCurve(d$y,d$pred,'test')
gainCurve = function(truthcol, predcol, title) {
  truthcol <- as.numeric(truthcol)
  # data frame of pred and truth, sorted in order of the predictions
  d = data.frame(predcol=predcol,truthcol=truthcol)
  predord = order(d[['predcol']], decreasing=TRUE) # reorder, with highest first
  wizard = order(d[['truthcol']], decreasing=TRUE)
  npop = dim(d)[1]
  # data frame the cumulative prediction/truth as a function
  # of the fraction of the population we're considering, highest first
  results = data.frame(pctpop= (1:npop)/npop,
                       model = cumsum(d[predord,'truthcol'])/sum(d[['truthcol']]),
                       wizard = cumsum(d[wizard, 'truthcol'])/sum(d[['truthcol']]))
  # calculate the areas under each curve
  # gini score is 2*(area - 0.5)
  idealArea = areaCalc(results$pctpop,results$wizard) - 0.5
  modelArea = areaCalc(results$pctpop,results$model) - 0.5
  giniScore = modelArea/idealArea # actually, normalized gini score
  # melt the frame into the tall form, for plotting
  results = melt(results, id.vars="pctpop", measure.vars=c("model", "wizard"),
                 variable.name="sort_criterion", value.name="pct_outcome")
  gplot = ggplot(data=results, aes(x=pctpop, y=pct_outcome, color=sort_criterion)) +
    geom_point() + geom_line() +
    geom_abline(color="gray") +
    ggtitle(paste("Gain curve,", title, '\n',
                  'relative Gini score', format(giniScore,digits=2))) +
    xlab("% items in score order") + ylab("% total category") +
    scale_x_continuous(breaks=seq(0,1,0.1)) +
    scale_y_continuous(breaks=seq(0,1,0.1)) +
    scale_color_manual(values=c('model'='darkblue', 'wizard'='darkgreen'))
  gplot
}
plotROC <- function(title,outcol,predcol) {
  pred <- prediction(predcol,outcol)
  perf <- performance(pred,'tpr','fpr')
  auc <- as.numeric(performance(pred,'auc')@y.values)
  pf <- data.frame(
    FalsePositiveRate=perf@x.values[[1]],
    TruePositiveRate=perf@y.values[[1]])
  plot=ggplot() +
    geom_ribbon(data=pf,aes(x=FalsePositiveRate,ymax=TruePositiveRate,ymin=0),
                fill='blue',alpha=0.3) +
    geom_point(data=pf,aes(x=FalsePositiveRate,y=TruePositiveRate)) +
    geom_line(aes(x=c(0,1),y=c(0,1))) + coord_fixed() +
    ggtitle(paste(title,'\nAUC:',format(auc,digits=2)))
  list(pf=pf,plot=plot)
}
deviance <- function(truth,pred,epsilon=0) {
  # clip predictions away from 0 and 1 so log() stays finite
  pred = pmax(pred, epsilon)
  pred = pmin(pred, 1-epsilon)
  S = 0.0 # assumed log-likelihood of saturated model
  -2*(sum(ifelse(truth,log(pred),log(1-pred)))-S)
}
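# Small worked check (added for illustration): confident correct predictions give
# a deviance near zero, while predicting the base rate gives the null deviance,
# so 1 - residual.deviance/null.deviance below acts like a pseudo R-squared.
truthExample <- c(TRUE,TRUE,FALSE,FALSE)
print(deviance(truthExample,c(0.9,0.9,0.1,0.1)))  # about 0.84
print(deviance(truthExample,rep(0.5,4)))          # about 5.5 (null deviance at a 50% base rate)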
reportStats <- function(d,test,modelName,title,epsilon) {
  dSub <- d[d$isTest==test,,drop=FALSE]
  tab <- table(truth=dSub[,yColumn],pred=dSub[,modelName]>0.5)
  accuracy <- (tab[1,1] + tab[2,2])/sum(tab)
  note = ifelse(test,'test','train')
  print(paste('\t',note,'accuracy',modelName,format(accuracy,digits=2)))
  residual.deviance <- deviance(dSub[,yColumn],dSub[,modelName],epsilon)
  #print(paste('\tresidual.deviance',residual.deviance))
  null.deviance <- deviance(dSub[,yColumn],mean(dSub[,yColumn]),epsilon)
  #print(paste('\tnull.deviance',null.deviance))
  print(paste("\tmodel explained a",
              format((1-residual.deviance/null.deviance),digits=2),
              "fraction of the variation on",note))
}
report <- function(d,modelName,title,epsilon=1.0e-2) {
  print("***********")
  print(paste("model",modelName,title))
  reportStats(d,FALSE,modelName,title,epsilon)
  reportStats(d,TRUE,modelName,title,epsilon)
  print(ggplot(data=d[d$isTest==TRUE,,drop=FALSE],
               aes_string(x=modelName,color=yColumn)) +
          geom_density() +
          ggtitle(paste(title,'test')))
  print(plotROC(paste(title,'train'),
                d[d$isTest==FALSE,yColumn],
                d[d$isTest==FALSE,modelName])$plot)
  print(plotROC(paste(title,'test'),
                d[d$isTest==TRUE,yColumn],
                d[d$isTest==TRUE,modelName])$plot)
  print(gainCurve(d[d$isTest==FALSE,yColumn],
                  d[d$isTest==FALSE,modelName],
                  paste(title,'train')))
  print(gainCurve(d[d$isTest==TRUE,yColumn],
                  d[d$isTest==TRUE,modelName],
                  paste(title,'test')))
  print("***********")
}
# do the SVM modeling
modelSVM <- ksvm(as.formula(formula),data=d[!d$isTest,],type='C-svc',C=1.0,
                 kernel='rbfdot',
                 prob.model=TRUE)
## Using automatic sigma estimation (sigest) for RBF or laplace kernel
d$modelSVM <- predict(modelSVM,newdata=d,type='prob')[,'TRUE',drop=TRUE]
report(d,'modelSVM',"svm")
## [1] "***********"
## [1] "model modelSVM svm"
## [1] "\t train accuracy modelSVM 1"
## [1] "\tmodel explained a 0.96 fraction of the variation on train"
## [1] "\t test accuracy modelSVM 0.88"
## [1] "\tmodel explained a 0.64 fraction of the variation on test"
## [1] "***********"
help('ksvm')  # the ksvm documentation describes the C (cost) parameter used below
# As an example: set C=0.1, which makes
# correction terms (margin violations) cheap, or coefficient magnitudes expensive;
# this should prefer minimizing excess generalization error over training performance
modelCsmall <- ksvm(as.formula(formula),data=d[!d$isTest,],type='C-svc',C=0.1,
                    kernel='rbfdot',
                    prob.model=TRUE)
## Using automatic sigma estimation (sigest) for RBF or laplace kernel
d$modelCsmall <- predict(modelCsmall,newdata=d,type='prob')[,'TRUE',drop=TRUE]
report(d,'modelCsmall',"svm")
## [1] "***********"
## [1] "model modelCsmall svm"
## [1] "\t train accuracy modelCsmall 0.9"
## [1] "\tmodel explained a 0.67 fraction of the variation on train"
## [1] "\t test accuracy modelCsmall 0.84"
## [1] "\tmodel explained a 0.48 fraction of the variation on test"
## [1] "***********"
# As an example: set C=10.0, which makes
# correction terms (margin violations) expensive, or coefficient magnitudes cheap;
# this should prefer optimizing training performance over generalization performance
modelClarge <- ksvm(as.formula(formula),data=d[!d$isTest,],type='C-svc',C=10.0,
                    kernel='rbfdot',
                    prob.model=TRUE)
## Using automatic sigma estimation (sigest) for RBF or laplace kernel
d$modelClarge <- predict(modelClarge,newdata=d,type='prob')[,'TRUE',drop=TRUE]
report(d,'modelClarge',"svm")
## [1] "***********"
## [1] "model modelClarge svm"
## [1] "\t train accuracy modelClarge 1"
## [1] "\tmodel explained a 0.99 fraction of the variation on train"
## [1] "\t test accuracy modelClarge 0.88"
## [1] "\tmodel explained a 0.65 fraction of the variation on test"
## [1] "***********"
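# A possible way to pick C without peeking at the test set (a sketch, not part of
# the original analysis): ksvm() can report k-fold cross-validation error on the
# training data via the 'cross' argument, and cross() extracts it. The candidate
# grid below is an arbitrary choice for illustration.
for(Ccandidate in c(0.1,1.0,10.0)) {
  modelCV <- ksvm(as.formula(formula),data=d[!d$isTest,],type='C-svc',
                  C=Ccandidate,kernel='rbfdot',cross=5)
  print(paste('C =',Ccandidate,
              'cross-validated error:',format(cross(modelCV),digits=3)))
}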