library('ggplot2')
library('reshape2')
library('randomForest')
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
library('glmnet')
## Loading required package: Matrix
## Loaded glmnet 1.9-8
library('kernlab')
library('ROCR')
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## 
## The following object is masked from 'package:stats':
## 
##     lowess
library('plyr')
library('Hmisc')
## Loading required package: grid
## Loading required package: lattice
## Loading required package: survival
## Loading required package: splines
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## 
## The following objects are masked from 'package:plyr':
## 
##     is.discrete, summarize
## 
## The following object is masked from 'package:randomForest':
## 
##     combine
## 
## The following objects are masked from 'package:base':
## 
##     format.pval, round.POSIXt, trunc.POSIXt, units
dTrain = read.table("isolet1+2+3+4.data.gz",
              header=FALSE,sep=',',
              stringsAsFactors=FALSE,blank.lines.skip=TRUE)
dTrain$isTest <- FALSE
dTest = read.table("isolet5.data.gz",
              header=FALSE,sep=',',
              stringsAsFactors=FALSE,blank.lines.skip=TRUE)
dTest$isTest <- TRUE
d <- rbind(dTrain,dTest)
rm(list=c('dTest','dTrain'))
# V618 encodes the spoken letter as an integer 1..26; map it to 'a'..'z'
d$V618 <- letters[d$V618]
vars <- colnames(d)[1:617]
# target: was the spoken letter 'n'?
yColumn <- 'isN'
d[,yColumn] <- d[,'V618']=='n'

# true prevalence of 'n' -- about 1/26, since the 26 letters
# are roughly equally represented in ISOLET
mean(d$isN)
## [1] 0.03847634

Functions

# draw a sample of size N (with replacement) in which the target
# variable has the given prevalence; assumes target is a TRUE/FALSE column
makePrevalence = function(dataf, target, prevalence, N) {
  # indices of T/F
  tset_ix = which(dataf[[target]])
  others_ix = which(!dataf[[target]])
  
  ntarget = round(N*prevalence)
  
  heads = sample(tset_ix, size=ntarget, replace=TRUE)
  tails = sample(others_ix, size=(N-ntarget), replace=TRUE)
  
  dataf[c(heads, tails),]
}
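As a quick sanity check (not part of the original run; checkSample is a name introduced here), a draw at 50% prevalence contains exactly round(N*prevalence) positive rows by construction:

# sanity check: a 50% draw of 1000 rows has exactly 500 positives
checkSample = makePrevalence(d, yColumn, 0.5, 1000)
mean(checkSample[[yColumn]])  # 0.5 by construction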

# convert a logical column to a factor with both levels TRUE and FALSE,
# even if the column is all TRUE or all FALSE
bool2factor = function(boolcol) {
  bf = as.factor(c(T, F, boolcol))
  bf[-(1:2)]
}
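To see why this helper is needed, consider the degenerate case: plain as.factor() drops the absent level, which would break the 2x2 confusion-matrix indexing used below.

# all-TRUE input: as.factor() alone yields a one-level factor
table(as.factor(c(TRUE, TRUE)))    # only the level TRUE
table(bool2factor(c(TRUE, TRUE)))  # levels FALSE and TRUE (FALSE with count 0)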


metrics = function(y, pred, prevalence, label, threshold=0.5) {
  cmat = table(outcome=bool2factor(y), 
               prediction=bool2factor(pred>threshold))
  accuracy = sum(diag(cmat))/sum(cmat)
  precision = cmat[2,2]/sum(cmat[,2])
  recall = cmat[2,2]/sum(cmat[2,]) # also sensitivity
  specificity = cmat[1,1]/sum(cmat[1,]) # 1-FPR
  data.frame(prevalence=prevalence, accuracy=accuracy, 
             precision=precision,
             recall=recall,
             specificity=specificity,
             label=label)
}
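A small worked example on toy vectors (illustrative values, not from the ISOLET data) to make the bookkeeping concrete:

# two positives, three negatives; scores from a hypothetical model
yToy = c(TRUE, TRUE, FALSE, FALSE, FALSE)
predToy = c(0.9, 0.4, 0.6, 0.2, 0.1)
metrics(yToy, predToy, prevalence=0.4, label="toy")
# at threshold 0.5: 1 true positive, 1 false positive, 1 false negative,
# 2 true negatives -> accuracy 0.6, precision 0.5, recall 0.5, specificity 2/3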

# posit a test with a given sensitivity/specificity
# and populations with different prevalences of the positive class
scenario = function(sens, spec, prevalence, trainPrevalence) {
  npos = prevalence
  nneg = 1-prevalence
  
  tPos = npos*sens  # true positives
  tNeg = nneg*spec  # true negatives
  
  fPos = nneg - tNeg # false positives (negatives mis-diagnosed)
  fNeg = npos - tPos # false negatives (positives mis-diagnosed)
  
 # print(paste("accuracy = ", (tPos + tNeg))) # what we got right
 #  print(paste("precision = ", (tPos/(tPos + fPos))))  # the number predicted positive that really are
 data.frame(train_prevalence=trainPrevalence,
            th_accuracy=(tPos+tNeg),
            th_precision=(tPos/(tPos+fPos)),
            th_sens=sens, th_spec=spec)

}
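For example (numbers assumed purely for illustration): a classifier with 80% sensitivity and 99% specificity, applied at roughly the base prevalence of 'n':

scenario(sens=0.8, spec=0.99, prevalence=0.0385, trainPrevalence=0.0385)
# th_accuracy  = 0.0385*0.8 + 0.9615*0.99        (about 0.983)
# th_precision = 0.0308/(0.0308 + 0.9615*0.01)   (about 0.76)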


fit_logit = function(data, vars, yColumn) {
  # note: as.matrix should only be called if all the
  # vars are numerical; otherwise use model.matrix
  model = cv.glmnet(x=as.matrix(data[,vars]),y=data[[yColumn]],
                         alpha=0, 
                         family='binomial')
  
  function(d) { 
    predict(model,newx=as.matrix(d[,vars]),type='response')[,1]
  }
}
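All 617 ISOLET variables are numeric, so as.matrix is safe here. For the hypothetical case of mixed-type variables, a minimal sketch of the model.matrix alternative mentioned in the comment (toDesignMatrix is a name introduced here):

# hypothetical helper for mixed-type vars: build a numeric design matrix
# ("- 1" drops the intercept column, since glmnet fits its own)
toDesignMatrix = function(d, vars) {
  model.matrix(~ . - 1, data=d[,vars])
}
# note: at predict time the factor levels must match the training data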

fit_rf = function(data, vars, yColumn) {
   model = randomForest(x=data[,vars],y=as.factor(data[[yColumn]]))
   
   function(d) {
     predict(model,newdata=d,type='prob')[,'TRUE',drop=TRUE]
  }
}


fit_svm = function(data, vars, yColumn) {
  formula = paste(yColumn, "~", paste(vars, collapse="+"))
  model = ksvm(as.formula(formula),data=data,type='C-svc',C=1.0,
               kernel='rbfdot',
               prob.model=TRUE)
  
  function(d) {
    predict(model,newdata=d,type='prob')[,'TRUE',drop=TRUE]
    }
}

#
# stats is a dataframe whose rows are the output of metrics()
#
statsPlot = function(stats, metric, baseprev) {
  
  baseline = function(column) {
    fmla = as.formula(paste(column, "~ label"))
    aggregate(fmla, data=subset(stats, stats$prevalence==baseprev), FUN=mean)
    }
  
  ggplot(stats, aes_string(x="prevalence", y=metric, color="label")) + 
    geom_point(alpha=0.2) + 
    stat_summary(fun.y="mean", geom="line") + 
    stat_summary(fun.data="mean_cl_boot", geom="errorbar") + 
    geom_hline(data=baseline(metric), aes_string(yintercept=metric, color="label"), linetype=2) + 
    scale_x_continuous("training prevalence") + 
    ggtitle(metric) + facet_wrap(~label, ncol=1) 
  }

Do the run

Setup.

# set the random seed
set.seed(5246253)

# use the test set at the base prevalence
test = d[d$isTest,]
basePrev = mean(test[[yColumn]])

trainall = d[!d$isTest,]

# the training set will have various enriched prevalences
prevalences = c(c(1, 2, 5, 10)*basePrev, 0.5)

N=2000

# note: reaches into the global environment for trainall, test, vars, yColumn, and N
makeModelsAndScore = function(prev, verbose=F) {
  train = makePrevalence(trainall, yColumn, prev, N)
  
  models = list(fit_logit(train, vars, yColumn),
                fit_rf(train, vars, yColumn),
                fit_svm(train, vars, yColumn))
  names(models) = c("logistic",
                    "random forest",
                    "svm")
  nmodels = length(models)
  
  if(verbose) {
    # helper: plot the score distribution, conditioned on the true class
    # (defined here, but also reused in the second verbose block below)
    densityPlot = function(predcolumn, dataf, title) {
      ggplot(cbind(dataf, pred=predcolumn), aes_string(x="pred", color=yColumn)) +
        geom_density() + ggtitle(title)
      }
    
    # training prevalences
    print("Metrics on training data")
    
    for(i in seq_len(nmodels)) {
      print(densityPlot(models[[i]](train), train, paste(names(models)[i], ": training")))
      }
    
    f <- function(i) {metrics(train[[yColumn]],
                              models[[i]](train), 
                              prev, 
                              names(models)[i])}
    
    metricf = ldply(seq_len(nmodels), .fun=f)
    print(metricf)
    
 }
  
  f <- function(i) {metrics(test[[yColumn]],
                            models[[i]](test), 
                            prev, 
                            names(models)[i])}
  
  metricf = ldply(seq_len(nmodels), .fun=f)
  
  
 if(verbose) {
    # test prevalences
    print("Metrics on test data")
    print(metricf)
    
    for(i in seq_len(nmodels)) {
      print(densityPlot(models[[i]](test), test, paste(names(models)[i], ": test")))
    }
  }
  
  metricf
}

Run.

makeModelsAndScore(basePrev, verbose=T)
## Using automatic sigma estimation (sigest) for RBF or laplace kernel 
## [1] "Metrics on training data"

##   prevalence accuracy precision   recall specificity         label
## 1 0.03848621   0.9985 1.0000000 0.961039     1.00000      logistic
## 2 0.03848621   1.0000 1.0000000 1.000000     1.00000 random forest
## 3 0.03848621   0.9975 0.9736842 0.961039     0.99896           svm
## [1] "Metrics on test data"
##   prevalence  accuracy precision    recall specificity         label
## 1 0.03848621 0.9807569 0.7777778 0.7000000   0.9919947      logistic
## 2 0.03848621 0.9717768 1.0000000 0.2666667   1.0000000 random forest
## 3 0.03848621 0.9846055 0.7903226 0.8166667   0.9913276           svm

##   prevalence  accuracy precision    recall specificity         label
## 1 0.03848621 0.9807569 0.7777778 0.7000000   0.9919947      logistic
## 2 0.03848621 0.9717768 1.0000000 0.2666667   1.0000000 random forest
## 3 0.03848621 0.9846055 0.7903226 0.8166667   0.9913276           svm
# repeat the whole experiment 10 times at each training prevalence
# (a single pass would be: stats = ldply(prevalences, .fun=makeModelsAndScore))
stats = ldply(1:10, .fun=function(x){ldply(prevalences, .fun=makeModelsAndScore)})
## Using automatic sigma estimation (sigest) for RBF or laplace kernel
## (message repeated for each of the 50 ksvm fits: 10 repetitions x 5 prevalences)

Plot.

statsPlot(stats, "accuracy", basePrev)
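The same call pattern works for the other columns of stats, e.g.:

statsPlot(stats, "precision", basePrev)
statsPlot(stats, "recall", basePrev)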