# See http://www.win-vector.com/blog/2014/02/bad-bayes-an-example-of-why-you-need-hold-out-testing/

library('WVPlots')
## Loading required package: ggplot2
## Loading required package: grid
## Loading required package: gridExtra
## Loading required package: reshape2
## Loading required package: ROCR
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## 
## The following object is masked from 'package:stats':
## 
##     lowess
## 
## Loading required package: plyr
## Loading required package: stringr
# make an example data set with no meaningful signal
# Build an example data set with NO meaningful signal: a two-level outcome y
# and `features` independent random two-level predictor columns.
#
# Args:
#   rows:     number of (distinct) rows to generate
#   features: number of random predictor columns v_1 .. v_features
#   ndup:     number of extra exact copies of each row to append (0 = none)
#
# Returns a list with:
#   d:        data.frame with factor column y and factor columns v_1..v_features
#   varNames: character vector of the predictor column names
#   xValues:  factor of the predictor levels ('a','b')
#   yValues:  factor of the outcome levels ('A','B')
mkData <- function(rows,features,ndup) {
  yValues <- factor(c('A','B'))
  xValues <- factor(c('a','b'))
  yData <- sample(yValues,replace=TRUE,size=rows)
  d <- data.frame(y=yData,stringsAsFactors=FALSE)
  # one independent random predictor column; the argument is ignored
  # (present only so the function can be mapped over varNames)
  mkRandVar <- function(v) {
    sample(xValues,replace=TRUE,size=rows)
  }
  varNames <- paste('v',seq_len(features),sep='_')
  # don't use replicate as it turns factors back to strings 
  varValues <- data.frame(lapply(varNames,mkRandVar),
                          stringsAsFactors = FALSE)
  colnames(varValues) <- varNames
  d <- cbind(d,varValues)
  if(ndup>0) {
    # append ndup exact copies of every row (inflates apparent sample size)
    d <- d[rep(seq_len(rows),ndup+1),,drop=FALSE]
  }
  list(d=d,varNames=varNames,xValues=xValues,yValues=yValues)
}

# Run one train/hold-out experiment on pure-noise data and print the results.
#
# Args:
#   rows:      rows per data set (train and test are generated independently)
#   features:  number of noise predictor columns
#   trainer:   function(yName,varNames,yValues,data) -> fitted model
#   predictor: function(model,newdata,yValues) -> numeric score per row
#   ndup:      extra exact copies of each row (passed through to mkData)
#   title:     label used in printed headers and plot titles
#
# Returns (invisibly printed along the way) a list with the train and test
# confusion tables: list(tabTrain=..., tabTest=...).
runExample <- function(rows,features,trainer,predictor,ndup=0,title) {
  # turn a numeric score into a class label using a fixed threshold:
  # at/above threshold -> second y level, below -> first y level
  labelFromScore <- function(score,threshold,yValues) {
    ifelse(score>=threshold,
           as.character(yValues[[2]]),
           as.character(yValues[[1]]))
  }
  # tabulate truth vs prediction for one split, print the table, and (when
  # predictions are not all one class) print a Fisher test and ROC plot;
  # returns the confusion table
  reportSplit <- function(d,splitName,msg) {
    tab <- table(truth=d$y,
                 predict=d$pred)
    print(msg)
    print(tab)
    if(length(unique(d$pred))>1) {
      print(fisher.test(tab))
      print(ROCPlot(d,'predScore','y',title=paste(splitName,title,'ROC plot')))
    }
    tab
  }
  print('********************')
  print(title)
  print(sys.call(0)) # print call and arguments
  defs <- mkData(rows,features,ndup)
  dTrain <- defs$d
  yValues <- defs$yValues
  varNames <- defs$varNames
  # independent hold-out set drawn from the same (signal-free) distribution
  dTest <- mkData(rows,features,ndup)$d
  model <- trainer(yName='y',varNames=varNames,yValues=yValues,
                   data=dTrain)
  dTrain$predScore <- predictor(model,newdata=dTrain,yValues=yValues)
  # threshold at the train-score median; the SAME threshold is reused on test
  scoreThreshold <- median(dTrain$predScore)
  dTrain$pred <- labelFromScore(dTrain$predScore,scoreThreshold,yValues)
  tabTrain <- reportSplit(dTrain,'Train','train set results')
  dTest$predScore <- predictor(model,newdata=dTest,yValues=yValues)
  dTest$pred <- labelFromScore(dTest$predScore,scoreThreshold,yValues)
  tabTest <- reportSplit(dTest,'Test','hold-out test set results')
  print('********************')
  list(tabTrain=tabTrain,tabTest=tabTest)
}
library(e1071)
set.seed(123525)   # make result more repeatable
# Fit Naive Bayes on 800 rows x 400 noise features.  With this many columns
# the model fits noise, so the train-set results below look strongly
# significant while the hold-out test set shows no signal.
res <- runExample(rows=800,features=400,title='Naive Bayes',
                  trainer=function(yName,varNames,yValues,data) {
                    # build the formula y ~ v_1 + v_2 + ... from the names
                    formula <- as.formula(paste(yName,paste(varNames,collapse=' + '),
                                                sep=' ~ '))
                    naiveBayes(formula,data) 
                  },
                  predictor=function(model,newdata,yValues) { 
                    # score = predicted raw value for the second y level
                    predict(model,newdata,type='raw')[,yValues[[2]],drop=TRUE]
                  }
)
## [1] "********************"
## [1] "Naive Bayes"
## runExample(rows = 800, features = 400, title = "Naive Bayes", 
##     trainer = function(yName, varNames, yValues, data) {
##         formula <- as.formula(paste(yName, paste(varNames, collapse = " + "), 
##             sep = " ~ "))
##         naiveBayes(formula, data)
##     }, predictor = function(model, newdata, yValues) {
##         predict(model, newdata, type = "raw")[, yValues[[2]], 
##             drop = TRUE]
##     })
## [1] "train set results"
##      predict
## truth   A   B
##     A 303  89
##     B  97 311
## 
##  Fisher's Exact Test for Count Data
## 
## data:  tabTrain
## p-value < 2.2e-16
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##   7.758333 15.368197
## sample estimates:
## odds ratio 
##   10.87203

## [1] "hold-out test set results"
##      predict
## truth   A   B
##     A 206 190
##     B 213 191
## 
##  Fisher's Exact Test for Count Data
## 
## data:  tabTest
## p-value = 0.8874
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.7295008 1.2957197
## sample estimates:
## odds ratio 
##  0.9722492

## [1] "********************"
library(rpart)
# Same experiment with a decision tree: again strong apparent train
# performance on pure noise, and no signal on the hold-out test set.
res <- runExample(rows=800,features=400,title='Decision Tree',
   trainer=function(yName,varNames,yValues,data) {
     # build the formula y ~ v_1 + v_2 + ... from the names
     formula <- as.formula(paste(yName,paste(varNames,collapse=' + '),
        sep=' ~ '))
     rpart(formula,data) 
   },
   predictor=function(model,newdata,yValues) { 
      # score = predicted class probability for the second y level
      predict(model,newdata,type='prob')[,yValues[[2]],drop=TRUE]
   }
)
## [1] "********************"
## [1] "Decision Tree"
## runExample(rows = 800, features = 400, title = "Decision Tree", 
##     trainer = function(yName, varNames, yValues, data) {
##         formula <- as.formula(paste(yName, paste(varNames, collapse = " + "), 
##             sep = " ~ "))
##         rpart(formula, data)
##     }, predictor = function(model, newdata, yValues) {
##         predict(model, newdata, type = "prob")[, yValues[[2]], 
##             drop = TRUE]
##     })
## [1] "train set results"
##      predict
## truth   A   B
##     A 291 102
##     B  96 311
## 
##  Fisher's Exact Test for Count Data
## 
## data:  tabTrain
## p-value < 2.2e-16
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##   6.618877 12.913771
## sample estimates:
## odds ratio 
##   9.209628

## [1] "hold-out test set results"
##      predict
## truth   A   B
##     A 196 192
##     B 199 213
## 
##  Fisher's Exact Test for Count Data
## 
## data:  tabTest
## p-value = 0.5715
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.8199783 1.4560350
## sample estimates:
## odds ratio 
##   1.092529