# Fix the RNG state before any resampling is done.
# NOTE(review): set.seed() seeds only this R session; if the resampling is
# farmed out to a parallel cluster the workers may need their own seeding
# for the recorded numbers to be reproducible -- confirm.
set.seed(25325)

# Load the saved predictions and pull out the train and test frames.
d <- readRDS("KDD2009preds.RDS") # from KDD2009Cross.Rmd
treatedTrainP <- d$treatedTrainP
treatedTestP <- d$treatedTestP
# Paired ROC comparison of the "gbm" and "randomForest" score columns
# against the "churn" outcome (target value TRUE) on the training frame.
# NOTE(review): `cl` is not defined in this section -- presumably a parallel
# cluster created elsewhere; confirm before running this part on its own.
WVPlots::ROCPlotPair(
  treatedTrainP, "gbm", "randomForest", "churn", TRUE,
  "AUC comparison on train",
  nrep = 1000,
  parallelCluster = cl
)

# Same comparison on the test frame. returnScores = TRUE is requested so the
# result can be picked apart afterwards (its `plot` element is printed here,
# its `aucsig` element is read later in the script).
testD <- WVPlots::ROCPlotPair(
  treatedTestP, "gbm", "randomForest", "churn", TRUE,
  "AUC comparison on test",
  nrep = 1000,
  parallelCluster = cl,
  returnScores = TRUE
)
print(testD$plot)
# Significance summary returned alongside the paired test-set ROC plot.
statsPaired <- testD$aucsig
print(statsPaired$eFreq) # fraction of time we saw reversal
## [1] 0.113
print(statsPaired$eValue) # Student probability of reversal
## [1] 0.1014348

# Observed score difference (score 1 minus score 2); computed once and
# reused as the reference line in each of the plots that follow.
observedDiff <- statsPaired$observedScore1 - statsPaired$observedScore2
observedDiff
## [1] 0.01208274

# Density of the resampled "diff" column with threshold 0, plus a vertical
# line at the observed difference.
# Assumes ggplot2 is attached (geom_vline is called unqualified).
WVPlots::ShadedDensity(
  statsPaired$resampledScores, "diff", 0,
  title = "paired differences in resampled test AUCs"
) +
  geom_vline(xintercept = observedDiff)

# Count view of the resampled differences: red line at zero, dark green line
# at the observed difference.
WVPlots::PlotDistCountNormal(
  statsPaired$resampledScores, "diff",
  "empirical and normal differences"
) +
  geom_vline(xintercept = 0, color = "red") +
  geom_vline(xintercept = observedDiff, color = "darkgreen")

# Density view of the same differences with the same two reference lines.
WVPlots::PlotDistDensityNormal(
  statsPaired$resampledScores, "diff",
  "empirical and normal differences"
) +
  geom_vline(xintercept = 0, color = "red") +
  geom_vline(xintercept = observedDiff, color = "darkgreen")
# posterior-like scores by resampling
# Resample the test-set AUC for each model separately (gbm first, then
# randomForest, so any random draws happen in the same order as before).
# NOTE(review): `cl` is not defined in this section -- presumably a parallel
# cluster created elsewhere; confirm.
statsGBM <- sigr::formatAUCresample(
  treatedTestP, "gbm", "churn", TRUE,
  returnScores = TRUE, parallelCluster = cl
)
statsRF <- sigr::formatAUCresample(
  treatedTestP, "randomForest", "churn", TRUE,
  returnScores = TRUE, parallelCluster = cl
)

# Long-format frame of resampled AUCs: one row per resample, labelled by model.
resamples <- rbind(
  data.frame(
    AUC = statsGBM$eScore$resampledScores,
    model = "gbm",
    stringsAsFactors = FALSE
  ),
  data.frame(
    AUC = statsRF$eScore$resampledScores,
    model = "randomForest",
    stringsAsFactors = FALSE
  )
)

# Observed (non-resampled) AUC per model, used for the vertical markers.
observed <- data.frame(
  model = c("gbm", "randomForest"),
  AUC = c(
    statsGBM$eScore$observedScore,
    statsRF$eScore$observedScore
  ),
  stringsAsFactors = FALSE
)

# Overlay the two resampled-AUC densities and mark each observed AUC in the
# matching color. Assumes ggplot2 is attached.
ggplot() +
  geom_density(
    data = resamples,
    mapping = aes(x = AUC, color = model)
  ) +
  geom_vline(
    data = observed,
    mapping = aes(xintercept = AUC, color = model)
  ) +
  ggtitle("resampled test AUCs")
# Observed test AUC for each model, followed by their difference
# (gbm minus randomForest). The `##` lines are the recorded output.
statsGBM$eScore$observedScore
## [1] 0.7305541
statsRF$eScore$observedScore
## [1] 0.7184713
statsGBM$eScore$observedScore - statsRF$eScore$observedScore
## [1] 0.01208274

# Mean of the resampled AUCs, per model.
aggregate(AUC ~ model, data = resamples, FUN = mean)
## model AUC
## 1 gbm 0.7317959
## 2 randomForest 0.7175508

# Median of the resampled AUCs, per model.
aggregate(AUC ~ model, data = resamples, FUN = median)
## model AUC
## 1 gbm 0.7321129
## 2 randomForest 0.7175756