See http://www.win-vector.com/blog/2014/02/bad-bayes-an-example-of-why-you-need-hold-out-testing/ for the discussion behind these examples: every model below is fit on data with no real signal, so any structure it appears to find on its own training set should evaporate on the hold-out test set.
library('WVPlots')
## Loading required package: ggplot2
## Loading required package: grid
## Loading required package: gridExtra
## Loading required package: reshape2
## Loading required package: ROCR
## Loading required package: gplots
##
## Attaching package: 'gplots'
##
## The following object is masked from 'package:stats':
##
## lowess
##
## Loading required package: plyr
## Loading required package: stringr
# make an example data set with no meaningful signal
mkData <- function(rows,features,ndup) {
  yValues <- factor(c('A','B'))
  xValues <- factor(c('a','b'))
  yData <- sample(yValues,replace=TRUE,size=rows)
  d <- data.frame(y=yData,stringsAsFactors=FALSE)
  mkRandVar <- function(v) {
    sample(xValues,replace=TRUE,size=rows)
  }
  varNames <- paste('v',seq_len(features),sep='_')
  # don't use replicate as it turns factors back to strings
  varValues <- data.frame(lapply(varNames,mkRandVar),
                          stringsAsFactors = FALSE)
  colnames(varValues) <- varNames
  d <- cbind(d,varValues)
  if(ndup>0) {
    d <- d[rep(seq_len(rows),ndup+1),,drop=FALSE]
  }
  list(d=d,varNames=varNames,xValues=xValues,yValues=yValues)
}
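# Quick look at what the generator produces (an illustrative check added here,
# not part of the original script): a handful of rows, two noise features,
# no duplication.
str(mkData(rows=5,features=2,ndup=0)$d)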
runExample <- function(rows,features,trainer,predictor,ndup=0,title) {
  print('********************')
  print(title)
  print(sys.call(0)) # print call and arguments
  defs <- mkData(rows,features,ndup)
  dTrain <- defs$d
  xValues <- defs$xValues
  yValues <- defs$yValues
  varNames <- defs$varNames
  dTest <- mkData(rows,features,ndup)$d
  model <- trainer(yName='y',varNames=varNames,yValues=yValues,
                   data=dTrain)
  dTrain$predScore <- predictor(model,newdata=dTrain,yValues=yValues)
  # threshold at the median training score, so roughly half the training
  # rows are predicted as the second class
  scoreThreshold <- median(dTrain$predScore)
  dTrain$pred <- ifelse(dTrain$predScore>=scoreThreshold,
                        as.character(yValues[[2]]),
                        as.character(yValues[[1]]))
  tabTrain <- table(truth=dTrain$y,
                    predict=dTrain$pred)
  print('train set results')
  print(tabTrain)
  if(length(unique(dTrain$pred))>1) {
    print(fisher.test(tabTrain))
    print(ROCPlot(dTrain,'predScore','y',title=paste('Train',title,'ROC plot')))
  }
  # score the independent hold-out set with the same model and threshold
  dTest$predScore <- predictor(model,newdata=dTest,yValues=yValues)
  dTest$pred <- ifelse(dTest$predScore>=scoreThreshold,
                       as.character(yValues[[2]]),
                       as.character(yValues[[1]]))
  tabTest <- table(truth=dTest$y,
                   predict=dTest$pred)
  print('hold-out test set results')
  print(tabTest)
  if(length(unique(dTest$pred))>1) {
    print(fisher.test(tabTest))
    print(ROCPlot(dTest,'predScore','y',title=paste('Test',title,'ROC plot')))
  }
  print('********************')
  list(tabTrain=tabTrain,tabTest=tabTest)
}
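# Interface runExample() assumes of its arguments (sketched here for reference;
# the baseline* names below are hypothetical and not part of the original
# script): trainer(yName,varNames,yValues,data) may return any model object,
# and predictor(model,newdata,yValues) must return a numeric score where larger
# means "more like yValues[[2]]".  A do-nothing baseline satisfying that
# contract:
baselineTrainer <- function(yName,varNames,yValues,data) { list() }
baselinePredictor <- function(model,newdata,yValues) { runif(nrow(newdata)) }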
library(e1071)
set.seed(123525) # make result more repeatable
res <- runExample(rows=800,features=400,title='Naive Bayes',
                  trainer=function(yName,varNames,yValues,data) {
                    formula <- as.formula(paste(yName,paste(varNames,collapse=' + '),
                                                sep=' ~ '))
                    naiveBayes(formula,data)
                  },
                  predictor=function(model,newdata,yValues) {
                    predict(model,newdata,type='raw')[,yValues[[2]],drop=TRUE]
                  }
)
## [1] "********************"
## [1] "Naive Bayes"
## runExample(rows = 800, features = 400, title = "Naive Bayes",
## trainer = function(yName, varNames, yValues, data) {
## formula <- as.formula(paste(yName, paste(varNames, collapse = " + "),
## sep = " ~ "))
## naiveBayes(formula, data)
## }, predictor = function(model, newdata, yValues) {
## predict(model, newdata, type = "raw")[, yValues[[2]],
## drop = TRUE]
## })
## [1] "train set results"
## predict
## truth A B
## A 303 89
## B 97 311
##
## Fisher's Exact Test for Count Data
##
## data: tabTrain
## p-value < 2.2e-16
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 7.758333 15.368197
## sample estimates:
## odds ratio
## 10.87203
## [1] "hold-out test set results"
## predict
## truth A B
## A 206 190
## B 213 191
##
## Fisher's Exact Test for Count Data
##
## data: tabTest
## p-value = 0.8874
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.7295008 1.2957197
## sample estimates:
## odds ratio
## 0.9722492
## [1] "********************"
library(rpart)
res <- runExample(rows=800,features=400,title='Decision Tree',
                  trainer=function(yName,varNames,yValues,data) {
                    formula <- as.formula(paste(yName,paste(varNames,collapse=' + '),
                                                sep=' ~ '))
                    rpart(formula,data)
                  },
                  predictor=function(model,newdata,yValues) {
                    predict(model,newdata,type='prob')[,yValues[[2]],drop=TRUE]
                  }
)
## [1] "********************"
## [1] "Decision Tree"
## runExample(rows = 800, features = 400, title = "Decision Tree",
## trainer = function(yName, varNames, yValues, data) {
## formula <- as.formula(paste(yName, paste(varNames, collapse = " + "),
## sep = " ~ "))
## rpart(formula, data)
## }, predictor = function(model, newdata, yValues) {
## predict(model, newdata, type = "prob")[, yValues[[2]],
## drop = TRUE]
## })
## [1] "train set results"
## predict
## truth A B
## A 291 102
## B 96 311
##
## Fisher's Exact Test for Count Data
##
## data: tabTrain
## p-value < 2.2e-16
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 6.618877 12.913771
## sample estimates:
## odds ratio
## 9.209628
## [1] "hold-out test set results"
## predict
## truth A B
## A 196 192
## B 199 213
##
## Fisher's Exact Test for Count Data
##
## data: tabTest
## p-value = 0.5715
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.8199783 1.4560350
## sample estimates:
## odds ratio
## 1.092529
## [1] "********************"
# glm example
set.seed(123525) # make result more repeatable
res <- runExample(rows=800,features=400,title='GLM',
                  trainer=function(yName,varNames,yValues,data) {
                    formula <- as.formula(paste(yName,paste(varNames,collapse=' + '),
                                                sep=' ~ '))
                    glm(formula,data,family=binomial(link='logit'))
                  },
                  predictor=function(model,newdata,yValues) {
                    predict(model,newdata=newdata,type='response')
                  }
)
## [1] "********************"
## [1] "GLM"
## runExample(rows = 800, features = 400, title = "GLM", trainer = function(yName,
## varNames, yValues, data) {
## formula <- as.formula(paste(yName, paste(varNames, collapse = " + "),
## sep = " ~ "))
## glm(formula, data, family = binomial(link = "logit"))
## }, predictor = function(model, newdata, yValues) {
## predict(model, newdata = newdata, type = "response")
## })
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## [1] "train set results"
## predict
## truth A B
## A 309 83
## B 84 324
##
## Fisher's Exact Test for Count Data
##
## data: tabTrain
## p-value < 2.2e-16
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 10.06874 20.49801
## sample estimates:
## odds ratio
## 14.29896
## [1] "hold-out test set results"
## predict
## truth A B
## A 195 201
## B 209 195
##
## Fisher's Exact Test for Count Data
##
## data: tabTest
## p-value = 0.5245
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.679335 1.206029
## sample estimates:
## odds ratio
## 0.9052771
## [1] "********************"
library(randomForest)
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
res <- runExample(rows=800,features=400,title='Random Forest',
                  trainer=function(yName,varNames,yValues,data) {
                    formula <- as.formula(paste(yName,paste(varNames,collapse=' + '),
                                                sep=' ~ '))
                    randomForest(formula,data)
                  },
                  predictor=function(model,newdata,yValues) {
                    predict(model,newdata,type='prob')[,yValues[[2]],drop=TRUE]
                  }
)
## [1] "********************"
## [1] "Random Forest"
## runExample(rows = 800, features = 400, title = "Random Forest",
## trainer = function(yName, varNames, yValues, data) {
## formula <- as.formula(paste(yName, paste(varNames, collapse = " + "),
## sep = " ~ "))
## randomForest(formula, data)
## }, predictor = function(model, newdata, yValues) {
## predict(model, newdata, type = "prob")[, yValues[[2]],
## drop = TRUE]
## })
## [1] "train set results"
## predict
## truth A B
## A 393 0
## B 7 400
##
## Fisher's Exact Test for Count Data
##
## data: tabTrain
## p-value < 2.2e-16
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 4201.026 Inf
## sample estimates:
## odds ratio
## Inf
## [1] "hold-out test set results"
## predict
## truth A
## A 388
## B 412
## [1] "********************"
library('kernlab')
set.seed(123525) # make result more repeatable
res <- runExample(rows=800,features=400,title='SVM',
                  trainer=function(yName,varNames,yValues,data) {
                    formula <- as.formula(paste(yName,paste(varNames,collapse=' + '),
                                                sep=' ~ '))
                    ksvm(formula,data=data,kernel="rbfdot", prob.model=TRUE)
                  },
                  predictor=function(model,newdata,yValues) {
                    predict(model,newdata,type='prob')[,yValues[[2]],drop=TRUE]
                  }
)
## [1] "********************"
## [1] "SVM"
## runExample(rows = 800, features = 400, title = "SVM", trainer = function(yName,
## varNames, yValues, data) {
## formula <- as.formula(paste(yName, paste(varNames, collapse = " + "),
## sep = " ~ "))
## ksvm(formula, data = data, kernel = "rbfdot", prob.model = TRUE)
## }, predictor = function(model, newdata, yValues) {
## predict(model, newdata, type = "prob")[, yValues[[2]], drop = TRUE]
## })
## Using automatic sigma estimation (sigest) for RBF or laplace kernel
## [1] "train set results"
## predict
## truth A B
## A 363 29
## B 37 371
##
## Fisher's Exact Test for Count Data
##
## data: tabTrain
## p-value < 2.2e-16
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 73.63263 214.91591
## sample estimates:
## odds ratio
## 123.477
## [1] "hold-out test set results"
## predict
## truth A B
## A 218 178
## B 224 180
##
## Fisher's Exact Test for Count Data
##
## data: tabTest
## p-value = 0.9433
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.7374922 1.3133196
## sample estimates:
## odds ratio
## 0.9841653
## [1] "********************"
library('glmnet')
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-2
set.seed(123525) # make result more repeatable
res <- runExample(rows=800,features=400,ndup=5,title='Elastic Net',
                  trainer=function(yName,varNames,yValues,data) {
                    formula <- as.formula(paste('',paste(c(varNames),collapse=' + '),
                                                sep=' ~ '))
                    z <- model.matrix(formula,data)
                    z <- z[,setdiff(colnames(z),'(Intercept)'),drop=FALSE]
                    cv <- cv.glmnet(x=z,y=data$y,alpha=0.5,family='binomial')
                    model <- list(model=cv,formula=formula)
                    model
                  },
                  predictor=function(model,newdata,yValues) {
                    z <- model.matrix(model$formula,newdata)
                    z <- z[,setdiff(colnames(z),'(Intercept)'),drop=FALSE]
                    predict(model$model,newx=z,type='response')
                  }
)
## [1] "********************"
## [1] "Elastic Net"
## print(sys.call(0))
## [1] "train set results"
## predict
## truth A B
## A 2352 0
## B 48 2400
##
## Fisher's Exact Test for Count Data
##
## data: tabTrain
## p-value < 2.2e-16
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 8402.051 Inf
## sample estimates:
## odds ratio
## Inf
## [1] "hold-out test set results"
## predict
## truth A B
## A 1152 1224
## B 1326 1098
##
## Fisher's Exact Test for Count Data
##
## data: tabTest
## p-value = 1.678e-05
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.6946184 0.8744151
## sample estimates:
## odds ratio
## 0.7793948
## [1] "********************"
library(gbm)
## Loading required package: survival
## Loading required package: lattice
## Loading required package: splines
## Loading required package: parallel
## Loaded gbm 2.1.1
set.seed(123525) # make result more repeatable
res <- runExample(rows=800,features=400,title='Gradient Boost',
                  trainer=function(yName,varNames,yValues,data) {
                    data[[yName]] <- ifelse(data[[yName]]==yValues[[2]],1,0)
                    formula <- as.formula(paste(yName,paste(varNames,collapse=' + '),
                                                sep=' ~ '))
                    # GBM has problems with complicated formulas or extra columns in frame
                    # http://stackoverflow.com/questions/25514484/error-in-r-gbm-function-when-cv-folds-0
                    model <- gbm(formula,data=data[,c(yName,varNames),drop=FALSE],
                                 distribution='bernoulli',
                                 n.trees=100,cv.folds=3,
                                 interaction.depth=3)
                    ntrees <- gbm.perf(model,plot.it=FALSE)
                    #print(paste('ntrees',ntrees))
                    list(model=model,ntrees=ntrees)
                  },
                  predictor=function(model,newdata,yValues) {
                    predict(model$model,newdata,n.trees=model$ntrees,type='response')
                  }
)
## [1] "********************"
## [1] "Gradient Boost"
## print(sys.call(0))
## Using cv method...
## [1] "train set results"
## predict
## truth A B
## A 300 92
## B 100 308
##
## Fisher's Exact Test for Count Data
##
## data: tabTrain
## p-value < 2.2e-16
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 7.165884 14.087836
## sample estimates:
## odds ratio
## 10.0059
## [1] "hold-out test set results"
## predict
## truth A B
## A 228 168
## B 227 177
##
## Fisher's Exact Test for Count Data
##
## data: tabTest
## p-value = 0.7213
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.7920231 1.4139307
## sample estimates:
## odds ratio
## 1.058113
## [1] "********************"