load('bsteps.Rdata')
source('Lfns.R')
source('Afns.R')
## Loading required package: survival
## Loading required package: lattice
## Loading required package: splines
## Loaded gbm 2.1.3
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:randomForest':
##
## combine
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
##
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
##
## cluster
##
## Attaching package: 'kernlab'
## The following object is masked from 'package:ggplot2':
##
## alpha
library("plyr")
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
# column selection doesn't work the same for data.table
# in particular d[1,'columnTitle'] returns the column title, not value
# Stack the per-run bootstrap result list into one plain data.frame so the
# rest of the script gets data.frame (not data.table) indexing semantics.
# NOTE(review): assumes resList comes from the loaded bsteps.Rdata workspace
# or the sourced helper files — confirm.
results <- data.frame(data.table::rbindlist(resList))
# # build a very crude clustering model on a very small subset of the data
# gVars <- names(treatmentsC$varScores)[treatmentsC$varScores<0.999]
# scI <- doFitApplyDupModelX(yName,gVars,
# treatedTrainM,
# list(train=treatedTrainM),
# outSample=FALSE)
# print("Bayes limit estimate (unreliable)")
# print(scI)
# problems with ddply when input and output column names match
# work around
killMean <- '\\.mean$'
# Strip a regular-expression match out of every matching column name of d.
# Each matching column is copied under its stripped name and the original
# column is then dropped, so renamed columns move to the end of the frame.
#
# d           - a data.frame
# regexpToZap - regular expression; the matched text is removed from names
# returns d with the renamed columns
changeColNames <- function(d,regexpToZap) {
  namesToFix <- grep(regexpToZap, colnames(d), value = TRUE)
  for(oldName in namesToFix) {
    strippedName <- gsub(regexpToZap, '', oldName)
    d[[strippedName]] <- d[[oldName]]
    d[[oldName]] <- NULL  # drop the original column (c() and NULL are equivalent here)
  }
  d
}
# 95% confidence interval from fit normal distribution
# Stack the permutation-test result list into one plain data.frame.
resultsPn <- data.frame(data.table::rbindlist(resListP))
# Per model: mean and variance of the permuted-train deviance and AUC scores.
# The '.mean'/'.var' suffixes avoid the ddply input/output name collision.
resultsPn <- ddply(resultsPn,.(model),summarize,
xptrain.ndeviance.var=var(xptrain.ndeviance),
xptrain.ndeviance.mean=mean(xptrain.ndeviance),
xptrain.auc.var=var(xptrain.auc),
xptrain.auc.mean=mean(xptrain.auc))
# Strip the '.mean' suffix so the point estimates get the base column names.
resultsPn <- changeColNames(resultsPn,killMean)
# Lower/upper 95% bounds for deviance from a normal fit to the permutation runs.
resultsPn$xptrain.ndeviance.lW <-
qnorm(0.025,
mean=resultsPn$xptrain.ndeviance,
sd=sqrt(resultsPn$xptrain.ndeviance.var))
resultsPn$xptrain.ndeviance.uW <-
qnorm(1-0.025,
mean=resultsPn$xptrain.ndeviance,
sd=sqrt(resultsPn$xptrain.ndeviance.var))
# Variance column no longer needed once the interval is computed.
resultsPn$xptrain.ndeviance.var <- c()
# Same 95% normal interval for AUC.
resultsPn$xptrain.auc.lW <- qnorm(0.025,
mean=resultsPn$xptrain.auc,
sd=sqrt(resultsPn$xptrain.auc.var))
resultsPn$xptrain.auc.uW <- qnorm(1-0.025,
mean=resultsPn$xptrain.auc,
sd=sqrt(resultsPn$xptrain.auc.var))
resultsPn$xptrain.auc.var <- c()
# merge frames
# Show the bootstrap-scored results table (per-model scores with bounds).
print(results)
## model train.ndeviance train.ndeviance.lW
## 1 logistic regression 0.4809891 0.4685803
## 2 gbm 0.5062300 0.4927137
## 3 elastic net logistic regression 0.4871893 0.4747094
## 4 GAM logistic regression 0.4783072 0.4656980
## 5 random forest 0.2779918 0.2685069
## 6 null model 0.5249036 0.5103592
## 7 best single variable model 0.5149623 0.5012127
## train.ndeviance.uW train.ndeviance.S train.ndeviance.S.lW
## 1 0.4932056 0.4809891 0.4769529
## 2 0.5198868 0.5062300 0.5050369
## 3 0.4998967 0.4871893 0.4844050
## 4 0.4917976 0.4783072 0.4745005
## 5 0.2880642 0.2779918 0.2748508
## 6 0.5382898 0.5249036 0.5249036
## 7 0.5283797 0.5149623 0.5131440
## train.ndeviance.S.uW train.auc train.auc.lW train.auc.uW train.auc.S
## 1 0.4857352 0.7205609 0.7113054 0.7313737 0.7205609
## 2 0.5073814 0.7160286 0.7067463 0.7252085 0.7160286
## 3 0.4902014 0.7151746 0.7051007 0.7250734 0.7151746
## 4 0.4827925 0.7259392 0.7157407 0.7358966 0.7259392
## 5 0.2812654 0.9952225 0.9945178 0.9959836 0.9952225
## 6 0.5249036 0.5000000 0.5000000 0.5000000 0.5000000
## 7 0.5167553 0.5700278 0.5605000 0.5797416 0.5700278
## train.auc.S.lW train.auc.S.uW test.ndeviance test.ndeviance.lW
## 1 0.7113436 0.7305975 0.4725858 0.4404377
## 2 0.7063980 0.7250764 0.4937040 0.4589759
## 3 0.7049253 0.7251897 0.4765857 0.4424223
## 4 0.7164288 0.7358196 0.4689583 0.4329022
## 5 0.9944704 0.9959528 0.5689824 0.5059207
## 6 0.5000000 0.5000000 0.5125123 0.4770656
## 7 0.5614111 0.5798827 0.5013121 0.4648182
## test.ndeviance.uW test.ndeviance.S test.ndeviance.S.lW
## 1 0.5062230 0.4725858 0.4614082
## 2 0.5319951 0.4937040 0.4906970
## 3 0.5064884 0.4765857 0.4680849
## 4 0.5041888 0.4689583 0.4576748
## 5 0.6273441 0.5689824 0.5468312
## 6 0.5500187 0.5125123 0.5125123
## 7 0.5359320 0.5013121 0.4968736
## test.ndeviance.S.uW test.auc test.auc.lW test.auc.uW test.auc.S
## 1 0.4833136 0.7076793 0.6781363 0.7358586 0.7076793
## 2 0.4965932 0.7134821 0.6860364 0.7428393 0.7134821
## 3 0.4847611 0.7074162 0.6810033 0.7371173 0.7074162
## 4 0.4799969 0.7170659 0.6921127 0.7437722 0.7170659
## 5 0.5924701 0.6644473 0.6347722 0.6970121 0.6644473
## 6 0.5125123 0.5000000 0.5000000 0.5000000 0.5000000
## 7 0.5060314 0.5821818 0.5566792 0.6072647 0.5821818
## test.auc.S.lW test.auc.S.uW
## 1 0.6807231 0.7332472
## 2 0.6831842 0.7434163
## 3 0.6789281 0.7362042
## 4 0.6899575 0.7444696
## 5 0.6309642 0.6955825
## 6 0.5000000 0.5000000
## 7 0.5591748 0.6055104
print(resultsPn)
## model xptrain.ndeviance xptrain.auc
## 1 best single variable model 0.5247282 0.5108289
## 2 elastic net logistic regression 0.5249036 0.5000000
## 3 GAM logistic regression 0.5236637 0.5362849
## 4 gbm 0.5245778 0.5483617
## 5 logistic regression 0.5238892 0.5319086
## 6 null model 0.5249036 0.5000000
## 7 random forest 0.2780146 0.9961301
## xptrain.ndeviance.lW xptrain.ndeviance.uW xptrain.auc.lW xptrain.auc.uW
## 1 0.5245659 0.5248905 0.5031825 0.5184754
## 2 0.5249036 0.5249036 0.5000000 0.5000000
## 3 0.5225868 0.5247407 0.5270605 0.5455092
## 4 0.5237696 0.5253860 0.4809028 0.6158206
## 5 0.5234513 0.5243272 0.5245609 0.5392563
## 6 0.5249036 0.5249036 0.5000000 0.5000000
## 7 0.2691397 0.2868896 0.9939836 0.9982765
# Join the bootstrap results and the permutation-test results on model name
# (both frames carry one row per model, keyed by the 'model' column).
resultsB <- merge(results,resultsPn,by='model')
print(resultsB)
## model train.ndeviance train.ndeviance.lW
## 1 best single variable model 0.5149623 0.5012127
## 2 elastic net logistic regression 0.4871893 0.4747094
## 3 GAM logistic regression 0.4783072 0.4656980
## 4 gbm 0.5062300 0.4927137
## 5 logistic regression 0.4809891 0.4685803
## 6 null model 0.5249036 0.5103592
## 7 random forest 0.2779918 0.2685069
## train.ndeviance.uW train.ndeviance.S train.ndeviance.S.lW
## 1 0.5283797 0.5149623 0.5131440
## 2 0.4998967 0.4871893 0.4844050
## 3 0.4917976 0.4783072 0.4745005
## 4 0.5198868 0.5062300 0.5050369
## 5 0.4932056 0.4809891 0.4769529
## 6 0.5382898 0.5249036 0.5249036
## 7 0.2880642 0.2779918 0.2748508
## train.ndeviance.S.uW train.auc train.auc.lW train.auc.uW train.auc.S
## 1 0.5167553 0.5700278 0.5605000 0.5797416 0.5700278
## 2 0.4902014 0.7151746 0.7051007 0.7250734 0.7151746
## 3 0.4827925 0.7259392 0.7157407 0.7358966 0.7259392
## 4 0.5073814 0.7160286 0.7067463 0.7252085 0.7160286
## 5 0.4857352 0.7205609 0.7113054 0.7313737 0.7205609
## 6 0.5249036 0.5000000 0.5000000 0.5000000 0.5000000
## 7 0.2812654 0.9952225 0.9945178 0.9959836 0.9952225
## train.auc.S.lW train.auc.S.uW test.ndeviance test.ndeviance.lW
## 1 0.5614111 0.5798827 0.5013121 0.4648182
## 2 0.7049253 0.7251897 0.4765857 0.4424223
## 3 0.7164288 0.7358196 0.4689583 0.4329022
## 4 0.7063980 0.7250764 0.4937040 0.4589759
## 5 0.7113436 0.7305975 0.4725858 0.4404377
## 6 0.5000000 0.5000000 0.5125123 0.4770656
## 7 0.9944704 0.9959528 0.5689824 0.5059207
## test.ndeviance.uW test.ndeviance.S test.ndeviance.S.lW
## 1 0.5359320 0.5013121 0.4968736
## 2 0.5064884 0.4765857 0.4680849
## 3 0.5041888 0.4689583 0.4576748
## 4 0.5319951 0.4937040 0.4906970
## 5 0.5062230 0.4725858 0.4614082
## 6 0.5500187 0.5125123 0.5125123
## 7 0.6273441 0.5689824 0.5468312
## test.ndeviance.S.uW test.auc test.auc.lW test.auc.uW test.auc.S
## 1 0.5060314 0.5821818 0.5566792 0.6072647 0.5821818
## 2 0.4847611 0.7074162 0.6810033 0.7371173 0.7074162
## 3 0.4799969 0.7170659 0.6921127 0.7437722 0.7170659
## 4 0.4965932 0.7134821 0.6860364 0.7428393 0.7134821
## 5 0.4833136 0.7076793 0.6781363 0.7358586 0.7076793
## 6 0.5125123 0.5000000 0.5000000 0.5000000 0.5000000
## 7 0.5924701 0.6644473 0.6347722 0.6970121 0.6644473
## test.auc.S.lW test.auc.S.uW xptrain.ndeviance xptrain.auc
## 1 0.5591748 0.6055104 0.5247282 0.5108289
## 2 0.6789281 0.7362042 0.5249036 0.5000000
## 3 0.6899575 0.7444696 0.5236637 0.5362849
## 4 0.6831842 0.7434163 0.5245778 0.5483617
## 5 0.6807231 0.7332472 0.5238892 0.5319086
## 6 0.5000000 0.5000000 0.5249036 0.5000000
## 7 0.6309642 0.6955825 0.2780146 0.9961301
## xptrain.ndeviance.lW xptrain.ndeviance.uW xptrain.auc.lW xptrain.auc.uW
## 1 0.5245659 0.5248905 0.5031825 0.5184754
## 2 0.5249036 0.5249036 0.5000000 0.5000000
## 3 0.5225868 0.5247407 0.5270605 0.5455092
## 4 0.5237696 0.5253860 0.4809028 0.6158206
## 5 0.5234513 0.5243272 0.5245609 0.5392563
## 6 0.5249036 0.5249036 0.5000000 0.5000000
## 7 0.2691397 0.2868896 0.9939836 0.9982765
# naive scoring, only on train
# Plot only the train.* columns without range bars; keep just the AUC and
# normalized-deviance panels from the plot list.
# NOTE(review): plotResultRanges is defined in the sourced helper files.
plts1 <- plotResultRanges(results,plotRanges=FALSE,
plotRestriction='^train\\.')[c('AUC','normalized.deviance')]
print(plts1)
## $AUC
##
## $normalized.deviance
#multiplot(plotlist=plts1)
# # add in the (unreliable Bayes estimate)
# print(plts1$AUC + geom_vline(xintercept=scI$train$auc,linetype=2))
# print(plts1$normalized.deviance + geom_vline(xintercept=scI$train$ndeviance,linetype=2))
# one solution- in train permutation test
# Blank out the in-sample (train.*) lower/upper bounds so the plot shows only
# the permutation-test (xptrain.*) ranges alongside the train point estimates.
resultsBs <- resultsB
resultsBs[,grep('^train\\..*\\.(l|u)W$',colnames(resultsBs))] <- NA
print(plotResultRanges(resultsBs,
plotRestriction='^((train\\.)|(xptrain\\.))'))
## $AUC
## Warning: Removed 7 rows containing missing values (geom_errorbarh).
##
## $AUC.stratified
## Warning: Removed 7 rows containing missing values (geom_errorbarh).
##
## $normalized.deviance
## Warning: Removed 7 rows containing missing values (geom_errorbarh).
##
## $normalized.deviance.stratified
## Warning: Removed 7 rows containing missing values (geom_errorbarh).
#multiplot(plotlist=plotResultRanges(resultsBs,
# plotRestriction='^((train\\.)|(xptrain\\.))'))
# or, could test/train split
# Point estimates only (no range bars) for all train/test columns.
print(plotResultRanges(results,plotRanges=FALSE))
## $AUC
##
## $AUC.stratified
##
## $normalized.deviance
##
## $normalized.deviance.stratified
#multiplot(plotlist=plotResultRanges(results,plotRanges=FALSE))
# and bootstrap all scores
# Same plots with the bootstrap-derived ranges shown.
print(plotResultRanges(results))
## $AUC
##
## $AUC.stratified
##
## $normalized.deviance
##
## $normalized.deviance.stratified
#multiplot(plotlist=plotResultRanges(results))
# and can try to reduce variance by y-stratifying the bootstrap
# or could do whole shmear
# Everything at once: bootstrap plus permutation-test columns from the merge.
print(plotResultRanges(resultsB))
## $AUC
##
## $AUC.stratified
##
## $normalized.deviance
##
## $normalized.deviance.stratified
#multiplot(plotlist=plotResultRanges(resultsB))