library('ggplot2')
library('reshape2')
library('randomForest')
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
library('glmnet')
## Loading required package: Matrix
## Loaded glmnet 1.9-8
library('kernlab')
library('ROCR')
## Loading required package: gplots
##
## Attaching package: 'gplots'
##
## The following object is masked from 'package:stats':
##
## lowess
library('plyr')
library('Hmisc')
## Loading required package: grid
## Loading required package: lattice
## Loading required package: survival
## Loading required package: splines
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
##
## The following objects are masked from 'package:plyr':
##
## is.discrete, summarize
##
## The following object is masked from 'package:randomForest':
##
## combine
##
## The following objects are masked from 'package:base':
##
## format.pval, round.POSIXt, trunc.POSIXt, units
# Load the ISOLET data: folds 1-4 form the training set, fold 5 the test set.
trainRaw = read.table("isolet1+2+3+4.data.gz",
                      header=FALSE, sep=',',
                      stringsAsFactors=FALSE, blank.lines.skip=TRUE)
trainRaw$isTest <- FALSE
testRaw = read.table("isolet5.data.gz",
                     header=FALSE, sep=',',
                     stringsAsFactors=FALSE, blank.lines.skip=TRUE)
testRaw$isTest <- TRUE
# one combined frame, flagged by train/test membership
d <- rbind(trainRaw, testRaw)
rm(list=c('testRaw', 'trainRaw'))
# V618 holds the spoken-letter code 1..26; map it to the letter itself
d$V618 <- letters[d$V618]
vars <- colnames(d)[1:617]
yColumn <- 'isN'
# target variable: is the spoken letter an 'n'?
d[,yColumn] <- d[,'V618']=='n'
# true prevalence of N
mean(d$isN)
## [1] 0.03847634
# Functions ----
# Draw a bootstrap sample of N rows from dataf in which the T/F target
# column has the requested prevalence of TRUE values.
# Rows may repeat (sampling is with replacement).
makePrevalence = function(dataf, target, prevalence, N) {
  isPos = dataf[[target]]
  nPos = round(N*prevalence)
  # draw the positives first, then the negatives
  posRows = sample(which(isPos), size=nPos, replace=TRUE)
  negRows = sample(which(!isPos), size=N-nPos, replace=TRUE)
  dataf[c(posRows, negRows),]
}
# Coerce a logical vector to a factor that always carries both levels
# "FALSE" and "TRUE", even if the input is purely TRUE or purely FALSE
# (keeps confusion matrices built with table() square).
bool2factor = function(boolcol) {
  # specifying the levels directly replaces the original
  # prepend-c(T,F)-then-drop trick and its non-idiomatic T/F literals
  factor(boolcol, levels=c(FALSE, TRUE))
}
# Summarize classifier performance: threshold the score vector pred,
# cross-tabulate against the logical outcome y, and return one row of
# accuracy/precision/recall/specificity, tagged with prevalence and label.
metrics = function(y, pred, prevalence, label, threshold=0.5) {
  cmat = table(outcome=bool2factor(y),
               prediction=bool2factor(pred>threshold))
  # levels are ordered (FALSE, TRUE), so we can index by level name
  tn = cmat["FALSE", "FALSE"]
  tp = cmat["TRUE", "TRUE"]
  data.frame(prevalence=prevalence,
             accuracy=(tp + tn)/sum(cmat),
             precision=tp/sum(cmat[, "TRUE"]),
             recall=tp/sum(cmat["TRUE", ]),        # also sensitivity
             specificity=tn/sum(cmat["FALSE", ]),  # 1-FPR
             label=label)
}
# Posit a diagnostic test with the given sensitivity/specificity and
# compute its theoretical accuracy and precision on a population with
# the given prevalence of positives; trainPrevalence is carried through
# as a row label.
scenario = function(sens, spec, prevalence, trainPrevalence) {
  posFrac = prevalence       # fraction of true positives in population
  negFrac = 1 - prevalence   # fraction of true negatives
  truePos = posFrac*sens     # positives correctly diagnosed
  trueNeg = negFrac*spec     # negatives correctly diagnosed
  falsePos = negFrac - trueNeg  # negatives mis-diagnosed as positive
  data.frame(train_prevalence=trainPrevalence,
             th_accuracy=(truePos + trueNeg),
             th_precision=truePos/(truePos + falsePos),
             th_sens=sens, th_spec=spec)
}
# Fit a ridge-penalized (alpha=0) logistic regression via cv.glmnet and
# return a scoring closure: data.frame -> vector of predicted P(TRUE).
fit_logit = function(data, vars, yColumn) {
  # as.matrix is safe here only because every column in vars is
  # numerical; switch to model.matrix if categorical vars appear
  fitted = cv.glmnet(x=as.matrix(data[,vars]), y=data[[yColumn]],
                     alpha=0,
                     family='binomial')
  function(newdata) {
    predict(fitted, newx=as.matrix(newdata[,vars]), type='response')[,1]
  }
}
# Fit a random forest classifier and return a scoring closure:
# data.frame -> vector of predicted P(TRUE).
fit_rf = function(data, vars, yColumn) {
  fitted = randomForest(x=data[,vars], y=as.factor(data[[yColumn]]))
  function(newdata) {
    # class-probability column for the TRUE level
    predict(fitted, newdata=newdata, type='prob')[,'TRUE', drop=TRUE]
  }
}
# Fit an RBF-kernel SVM (with Platt-style probability model) and return
# a scoring closure: data.frame -> vector of predicted P(TRUE).
fit_svm = function(data, vars, yColumn) {
  fmla = as.formula(paste(yColumn, "~", paste(vars, collapse="+")))
  fitted = ksvm(fmla, data=data, type='C-svc', C=1.0,
                kernel='rbfdot',
                prob.model=TRUE)
  function(newdata) {
    predict(fitted, newdata=newdata, type='prob')[,'TRUE', drop=TRUE]
  }
}
#
# stats is a dataframe whose rows are the output of metrics()
#
# Plot one metric of stats against training prevalence, one facet per
# model label: individual runs as translucent points, per-prevalence
# means as a line, bootstrap CIs as error bars, and a dashed horizontal
# baseline at the metric's mean for runs trained at prevalence==baseprev.
# NOTE(review): aes_string() and stat_summary(fun.y=) are deprecated in
# modern ggplot2 (>= 3.0) -- fine for the version this script targets.
statsPlot = function(stats, metric, baseprev) {
# per-label mean of <column> over the rows at the base prevalence
baseline = function(column) {
fmla = as.formula(paste(column, "~ label"))
# exact == on doubles works here because baseprev flows through from the
# prevalences vector unchanged -- confirm if prevalences are recomputed
aggregate(fmla, data=subset(stats, stats$prevalence==baseprev), FUN=mean)
}
ggplot(stats, aes_string(x="prevalence", y=metric, color="label")) +
geom_point(alpha=0.2) +
stat_summary(fun.y="mean", geom="line") +
stat_summary(fun.data="mean_cl_boot", geom="errorbar") +
geom_hline(data=baseline(metric), aes_string(yintercept=metric, color="label"), linetype=2) +
scale_x_continuous("training prevalence") +
ggtitle(metric) + facet_wrap(~label, ncol=1)
}
# Do the run ----
# Setup.
# set the random seed
set.seed(5246253)
# use the test set at the base prevalence
test = d[d$isTest,]
# basePrev: true fraction of positive ('n') examples in the test set
basePrev = mean(test[[yColumn]])
trainall = d[!d$isTest,]
# the training set will be drawn at various enriched prevalences:
# 1x, 2x, 5x, 10x the base rate, plus a fully balanced 50/50 draw
prevalences = c(c(1, 2, 5, 10)*basePrev, 0.5)
# size of each enriched training draw
N=2000
# reaches into global environment: trainall, vars, yColumn, N, test
# Train a logistic, random forest, and svm model on an N-row training
# sample enriched to prevalence <prev>, then score each on the global
# test set. Returns a data.frame of metrics() rows, one per model
# (note: the recorded 'prevalence' column is the *training* prevalence).
# When verbose=TRUE, also prints metrics and score-density plots for
# both the training draw and the test set.
makeModelsAndScore = function(prev, verbose=FALSE) {
train = makePrevalence(trainall, yColumn, prev, N)
models = list(fit_logit(train, vars, yColumn),
fit_rf(train, vars, yColumn),
fit_svm(train, vars, yColumn))
names(models) = c("logistic",
"random forest",
"svm")
nmodels = length(models)
if(verbose) {
# density of model scores, colored by the true outcome
densityPlot = function(predcolumn, dataf, title) {
ggplot(cbind(dataf, pred=predcolumn), aes_string(x="pred", color=yColumn)) +
geom_density() + ggtitle(title)
}
# training prevalences
print("Metrics on training data")
for(i in seq_len(nmodels)) {
print(densityPlot(models[[i]](train), train, paste(names(models)[i], ": training")))
}
f <- function(i) {metrics(train[[yColumn]],
models[[i]](train),
prev,
names(models)[i])}
metricf = ldply(seq_len(nmodels), .fun=f)
print(metricf)
}
# always compute the test-set metrics; this is the returned value
f <- function(i) {metrics(test[[yColumn]],
models[[i]](test),
prev,
names(models)[i])}
metricf = ldply(seq_len(nmodels), .fun=f)
if(verbose) {
# test prevalences
print("Metrics on test data")
print(metricf)
for(i in seq_len(nmodels)) {
print(densityPlot(models[[i]](test), test, paste(names(models)[i], ": test")))
}
}
metricf
}
# Run ----
makeModelsAndScore(basePrev, verbose=T)
## Using automatic sigma estimation (sigest) for RBF or laplace kernel
## [1] "Metrics on training data"
## prevalence accuracy precision recall specificity label
## 1 0.03848621 0.9985 1.0000000 0.961039 1.00000 logistic
## 2 0.03848621 1.0000 1.0000000 1.000000 1.00000 random forest
## 3 0.03848621 0.9975 0.9736842 0.961039 0.99896 svm
## [1] "Metrics on test data"
## prevalence accuracy precision recall specificity label
## 1 0.03848621 0.9807569 0.7777778 0.7000000 0.9919947 logistic
## 2 0.03848621 0.9717768 1.0000000 0.2666667 1.0000000 random forest
## 3 0.03848621 0.9846055 0.7903226 0.8166667 0.9913276 svm
## prevalence accuracy precision recall specificity label
## 1 0.03848621 0.9807569 0.7777778 0.7000000 0.9919947 logistic
## 2 0.03848621 0.9717768 1.0000000 0.2666667 1.0000000 random forest
## 3 0.03848621 0.9846055 0.7903226 0.8166667 0.9913276 svm
# stats = ldply(prevalences, .fun=makeModelsAndScore)
# repeat the whole experiment 10 times at every enriched prevalence;
# each inner ldply stacks one metrics() row per model per prevalence
stats = ldply(1:10, .fun=function(x){ldply(prevalences, .fun=makeModelsAndScore)})
## Using automatic sigma estimation (sigest) for RBF or laplace kernel
## Using automatic sigma estimation (sigest) for RBF or laplace kernel
## Using automatic sigma estimation (sigest) for RBF or laplace kernel
## Using automatic sigma estimation (sigest) for RBF or laplace kernel
## Using automatic sigma estimation (sigest) for RBF or laplace kernel
## Using automatic sigma estimation (sigest) for RBF or laplace kernel
## Using automatic sigma estimation (sigest) for RBF or laplace kernel
## Using automatic sigma estimation (sigest) for RBF or laplace kernel
## Using automatic sigma estimation (sigest) for RBF or laplace kernel
## Using automatic sigma estimation (sigest) for RBF or laplace kernel
## Using automatic sigma estimation (sigest) for RBF or laplace kernel
## Using automatic sigma estimation (sigest) for RBF or laplace kernel
## Using automatic sigma estimation (sigest) for RBF or laplace kernel
## Using automatic sigma estimation (sigest) for RBF or laplace kernel
## Using automatic sigma estimation (sigest) for RBF or laplace kernel
## Using automatic sigma estimation (sigest) for RBF or laplace kernel
## Using automatic sigma estimation (sigest) for RBF or laplace kernel
## Using automatic sigma estimation (sigest) for RBF or laplace kernel
## Using automatic sigma estimation (sigest) for RBF or laplace kernel
## Using automatic sigma estimation (sigest) for RBF or laplace kernel
## Using automatic sigma estimation (sigest) for RBF or laplace kernel
## Using automatic sigma estimation (sigest) for RBF or laplace kernel
## Using automatic sigma estimation (sigest) for RBF or laplace kernel
## Using automatic sigma estimation (sigest) for RBF or laplace kernel
## Using automatic sigma estimation (sigest) for RBF or laplace kernel
## Using automatic sigma estimation (sigest) for RBF or laplace kernel
## Using automatic sigma estimation (sigest) for RBF or laplace kernel
## Using automatic sigma estimation (sigest) for RBF or laplace kernel
## Using automatic sigma estimation (sigest) for RBF or laplace kernel
## Using automatic sigma estimation (sigest) for RBF or laplace kernel
## Using automatic sigma estimation (sigest) for RBF or laplace kernel
## Using automatic sigma estimation (sigest) for RBF or laplace kernel
## Using automatic sigma estimation (sigest) for RBF or laplace kernel
## Using automatic sigma estimation (sigest) for RBF or laplace kernel
## Using automatic sigma estimation (sigest) for RBF or laplace kernel
## Using automatic sigma estimation (sigest) for RBF or laplace kernel
## Using automatic sigma estimation (sigest) for RBF or laplace kernel
## Using automatic sigma estimation (sigest) for RBF or laplace kernel
## Using automatic sigma estimation (sigest) for RBF or laplace kernel
## Using automatic sigma estimation (sigest) for RBF or laplace kernel
## Using automatic sigma estimation (sigest) for RBF or laplace kernel
## Using automatic sigma estimation (sigest) for RBF or laplace kernel
## Using automatic sigma estimation (sigest) for RBF or laplace kernel
## Using automatic sigma estimation (sigest) for RBF or laplace kernel
## Using automatic sigma estimation (sigest) for RBF or laplace kernel
## Using automatic sigma estimation (sigest) for RBF or laplace kernel
## Using automatic sigma estimation (sigest) for RBF or laplace kernel
## Using automatic sigma estimation (sigest) for RBF or laplace kernel
## Using automatic sigma estimation (sigest) for RBF or laplace kernel
## Using automatic sigma estimation (sigest) for RBF or laplace kernel
# Plot ----
statsPlot(stats, "accuracy", basePrev)