load('bsteps.Rdata')
source('Lfns.R')
source('Afns.R')
## Loading required package: survival
## Loading required package: lattice
## Loading required package: splines
## Loaded gbm 2.1.3
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:randomForest':
##
## combine
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
##
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
##
## cluster
##
## Attaching package: 'kernlab'
## The following object is masked from 'package:ggplot2':
##
## alpha
library("plyr")
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
# column selection doesn't work the same for data.table
# in particular d[1,'columnTitle'] returns the column title, not value
# Stack the per-run bootstrap result list into one plain data.frame so the
# rest of the script gets data.frame (not data.table) indexing semantics.
# NOTE(review): assumes resList comes from the loaded bsteps.Rdata workspace
# or the sourced helper files — confirm.
results <- data.frame(data.table::rbindlist(resList))
# # build a very crude clustering model on a very small subset of the data
# gVars <- names(treatmentsC$varScores)[treatmentsC$varScores<0.999]
# scI <- doFitApplyDupModelX(yName,gVars,
# treatedTrainM,
# list(train=treatedTrainM),
# outSample=FALSE)
# print("Bayes limit estimate (unreliable)")
# print(scI)
# problems with ddply when input and output column names match
# work around
killMean <- '\\.mean$'
# Strip a regular-expression match out of every matching column name of d.
# Each matching column is copied under its stripped name and the original
# column is then dropped, so renamed columns move to the end of the frame.
#
# d           - a data.frame
# regexpToZap - regular expression; the matched text is removed from names
# returns d with the renamed columns
changeColNames <- function(d,regexpToZap) {
  namesToFix <- grep(regexpToZap, colnames(d), value = TRUE)
  for(oldName in namesToFix) {
    strippedName <- gsub(regexpToZap, '', oldName)
    d[[strippedName]] <- d[[oldName]]
    d[[oldName]] <- NULL  # drop the original column (c() and NULL are equivalent here)
  }
  d
}
# 95% confidence interval from fit normal distribution
# Stack the permutation-test result list into one plain data.frame.
resultsPn <- data.frame(data.table::rbindlist(resListP))
# Per model: mean and variance of the permuted-train deviance and AUC scores.
# The '.mean'/'.var' suffixes avoid the ddply input/output name collision.
resultsPn <- ddply(resultsPn,.(model),summarize,
xptrain.ndeviance.var=var(xptrain.ndeviance),
xptrain.ndeviance.mean=mean(xptrain.ndeviance),
xptrain.auc.var=var(xptrain.auc),
xptrain.auc.mean=mean(xptrain.auc))
# Strip the '.mean' suffix so the point estimates get the base column names.
resultsPn <- changeColNames(resultsPn,killMean)
# Lower/upper 95% bounds for deviance from a normal fit to the permutation runs.
resultsPn$xptrain.ndeviance.lW <-
qnorm(0.025,
mean=resultsPn$xptrain.ndeviance,
sd=sqrt(resultsPn$xptrain.ndeviance.var))
resultsPn$xptrain.ndeviance.uW <-
qnorm(1-0.025,
mean=resultsPn$xptrain.ndeviance,
sd=sqrt(resultsPn$xptrain.ndeviance.var))
# Variance column no longer needed once the interval is computed.
resultsPn$xptrain.ndeviance.var <- c()
# Same 95% normal interval for AUC.
resultsPn$xptrain.auc.lW <- qnorm(0.025,
mean=resultsPn$xptrain.auc,
sd=sqrt(resultsPn$xptrain.auc.var))
resultsPn$xptrain.auc.uW <- qnorm(1-0.025,
mean=resultsPn$xptrain.auc,
sd=sqrt(resultsPn$xptrain.auc.var))
resultsPn$xptrain.auc.var <- c()
# merge frames
# Show the bootstrap-scored results table (per-model scores with bounds).
print(results)
## model train.ndeviance train.ndeviance.lW
## 1 logistic regression 0.4809891 0.4685803
## 2 gbm 0.5062300 0.4927137
## 3 elastic net logistic regression 0.4871893 0.4747094
## 4 GAM logistic regression 0.4783072 0.4656980
## 5 random forest 0.2779918 0.2685069
## 6 null model 0.5249036 0.5103592
## 7 best single variable model 0.5149623 0.5012127
## train.ndeviance.uW train.ndeviance.S train.ndeviance.S.lW
## 1 0.4932056 0.4809891 0.4769529
## 2 0.5198868 0.5062300 0.5050369
## 3 0.4998967 0.4871893 0.4844050
## 4 0.4917976 0.4783072 0.4745005
## 5 0.2880642 0.2779918 0.2748508
## 6 0.5382898 0.5249036 0.5249036
## 7 0.5283797 0.5149623 0.5131440
## train.ndeviance.S.uW train.auc train.auc.lW train.auc.uW train.auc.S
## 1 0.4857352 0.7205609 0.7113054 0.7313737 0.7205609
## 2 0.5073814 0.7160286 0.7067463 0.7252085 0.7160286
## 3 0.4902014 0.7151746 0.7051007 0.7250734 0.7151746
## 4 0.4827925 0.7259392 0.7157407 0.7358966 0.7259392
## 5 0.2812654 0.9952225 0.9945178 0.9959836 0.9952225
## 6 0.5249036 0.5000000 0.5000000 0.5000000 0.5000000
## 7 0.5167553 0.5700278 0.5605000 0.5797416 0.5700278
## train.auc.S.lW train.auc.S.uW test.ndeviance test.ndeviance.lW
## 1 0.7113436 0.7305975 0.4725858 0.4404377
## 2 0.7063980 0.7250764 0.4937040 0.4589759
## 3 0.7049253 0.7251897 0.4765857 0.4424223
## 4 0.7164288 0.7358196 0.4689583 0.4329022
## 5 0.9944704 0.9959528 0.5689824 0.5059207
## 6 0.5000000 0.5000000 0.5125123 0.4770656
## 7 0.5614111 0.5798827 0.5013121 0.4648182
## test.ndeviance.uW test.ndeviance.S test.ndeviance.S.lW
## 1 0.5062230 0.4725858 0.4614082
## 2 0.5319951 0.4937040 0.4906970
## 3 0.5064884 0.4765857 0.4680849
## 4 0.5041888 0.4689583 0.4576748
## 5 0.6273441 0.5689824 0.5468312
## 6 0.5500187 0.5125123 0.5125123
## 7 0.5359320 0.5013121 0.4968736
## test.ndeviance.S.uW test.auc test.auc.lW test.auc.uW test.auc.S
## 1 0.4833136 0.7076793 0.6781363 0.7358586 0.7076793
## 2 0.4965932 0.7134821 0.6860364 0.7428393 0.7134821
## 3 0.4847611 0.7074162 0.6810033 0.7371173 0.7074162
## 4 0.4799969 0.7170659 0.6921127 0.7437722 0.7170659
## 5 0.5924701 0.6644473 0.6347722 0.6970121 0.6644473
## 6 0.5125123 0.5000000 0.5000000 0.5000000 0.5000000
## 7 0.5060314 0.5821818 0.5566792 0.6072647 0.5821818
## test.auc.S.lW test.auc.S.uW
## 1 0.6807231 0.7332472
## 2 0.6831842 0.7434163
## 3 0.6789281 0.7362042
## 4 0.6899575 0.7444696
## 5 0.6309642 0.6955825
## 6 0.5000000 0.5000000
## 7 0.5591748 0.6055104
print(resultsPn)
## model xptrain.ndeviance xptrain.auc
## 1 best single variable model 0.5247282 0.5108289
## 2 elastic net logistic regression 0.5249036 0.5000000
## 3 GAM logistic regression 0.5236637 0.5362849
## 4 gbm 0.5245778 0.5483617
## 5 logistic regression 0.5238892 0.5319086
## 6 null model 0.5249036 0.5000000
## 7 random forest 0.2780146 0.9961301
## xptrain.ndeviance.lW xptrain.ndeviance.uW xptrain.auc.lW xptrain.auc.uW
## 1 0.5245659 0.5248905 0.5031825 0.5184754
## 2 0.5249036 0.5249036 0.5000000 0.5000000
## 3 0.5225868 0.5247407 0.5270605 0.5455092
## 4 0.5237696 0.5253860 0.4809028 0.6158206
## 5 0.5234513 0.5243272 0.5245609 0.5392563
## 6 0.5249036 0.5249036 0.5000000 0.5000000
## 7 0.2691397 0.2868896 0.9939836 0.9982765
# Join the bootstrap results and the permutation-test results on model name
# (both frames carry one row per model, keyed by the 'model' column).
resultsB <- merge(results,resultsPn,by='model')
print(resultsB)
## model train.ndeviance train.ndeviance.lW
## 1 best single variable model 0.5149623 0.5012127
## 2 elastic net logistic regression 0.4871893 0.4747094
## 3 GAM logistic regression 0.4783072 0.4656980
## 4 gbm 0.5062300 0.4927137
## 5 logistic regression 0.4809891 0.4685803
## 6 null model 0.5249036 0.5103592
## 7 random forest 0.2779918 0.2685069
## train.ndeviance.uW train.ndeviance.S train.ndeviance.S.lW
## 1 0.5283797 0.5149623 0.5131440
## 2 0.4998967 0.4871893 0.4844050
## 3 0.4917976 0.4783072 0.4745005
## 4 0.5198868 0.5062300 0.5050369
## 5 0.4932056 0.4809891 0.4769529
## 6 0.5382898 0.5249036 0.5249036
## 7 0.2880642 0.2779918 0.2748508
## train.ndeviance.S.uW train.auc train.auc.lW train.auc.uW train.auc.S
## 1 0.5167553 0.5700278 0.5605000 0.5797416 0.5700278
## 2 0.4902014 0.7151746 0.7051007 0.7250734 0.7151746
## 3 0.4827925 0.7259392 0.7157407 0.7358966 0.7259392
## 4 0.5073814 0.7160286 0.7067463 0.7252085 0.7160286
## 5 0.4857352 0.7205609 0.7113054 0.7313737 0.7205609
## 6 0.5249036 0.5000000 0.5000000 0.5000000 0.5000000
## 7 0.2812654 0.9952225 0.9945178 0.9959836 0.9952225
## train.auc.S.lW train.auc.S.uW test.ndeviance test.ndeviance.lW
## 1 0.5614111 0.5798827 0.5013121 0.4648182
## 2 0.7049253 0.7251897 0.4765857 0.4424223
## 3 0.7164288 0.7358196 0.4689583 0.4329022
## 4 0.7063980 0.7250764 0.4937040 0.4589759
## 5 0.7113436 0.7305975 0.4725858 0.4404377
## 6 0.5000000 0.5000000 0.5125123 0.4770656
## 7 0.9944704 0.9959528 0.5689824 0.5059207
## test.ndeviance.uW test.ndeviance.S test.ndeviance.S.lW
## 1 0.5359320 0.5013121 0.4968736
## 2 0.5064884 0.4765857 0.4680849
## 3 0.5041888 0.4689583 0.4576748
## 4 0.5319951 0.4937040 0.4906970
## 5 0.5062230 0.4725858 0.4614082
## 6 0.5500187 0.5125123 0.5125123
## 7 0.6273441 0.5689824 0.5468312
## test.ndeviance.S.uW test.auc test.auc.lW test.auc.uW test.auc.S
## 1 0.5060314 0.5821818 0.5566792 0.6072647 0.5821818
## 2 0.4847611 0.7074162 0.6810033 0.7371173 0.7074162
## 3 0.4799969 0.7170659 0.6921127 0.7437722 0.7170659
## 4 0.4965932 0.7134821 0.6860364 0.7428393 0.7134821
## 5 0.4833136 0.7076793 0.6781363 0.7358586 0.7076793
## 6 0.5125123 0.5000000 0.5000000 0.5000000 0.5000000
## 7 0.5924701 0.6644473 0.6347722 0.6970121 0.6644473
## test.auc.S.lW test.auc.S.uW xptrain.ndeviance xptrain.auc
## 1 0.5591748 0.6055104 0.5247282 0.5108289
## 2 0.6789281 0.7362042 0.5249036 0.5000000
## 3 0.6899575 0.7444696 0.5236637 0.5362849
## 4 0.6831842 0.7434163 0.5245778 0.5483617
## 5 0.6807231 0.7332472 0.5238892 0.5319086
## 6 0.5000000 0.5000000 0.5249036 0.5000000
## 7 0.6309642 0.6955825 0.2780146 0.9961301
## xptrain.ndeviance.lW xptrain.ndeviance.uW xptrain.auc.lW xptrain.auc.uW
## 1 0.5245659 0.5248905 0.5031825 0.5184754
## 2 0.5249036 0.5249036 0.5000000 0.5000000
## 3 0.5225868 0.5247407 0.5270605 0.5455092
## 4 0.5237696 0.5253860 0.4809028 0.6158206
## 5 0.5234513 0.5243272 0.5245609 0.5392563
## 6 0.5249036 0.5249036 0.5000000 0.5000000
## 7 0.2691397 0.2868896 0.9939836 0.9982765
# naive scoring, only on train
# Plot only the train.* columns without range bars; keep just the AUC and
# normalized-deviance panels from the plot list.
# NOTE(review): plotResultRanges is defined in the sourced helper files.
plts1 <- plotResultRanges(results,plotRanges=FALSE,
plotRestriction='^train\\.')[c('AUC','normalized.deviance')]
print(plts1)
## $AUC
##
## $normalized.deviance
#multiplot(plotlist=plts1)
# # add in the (unreliable Bayes estimate)
# print(plts1$AUC + geom_vline(xintercept=scI$train$auc,linetype=2))
# print(plts1$normalized.deviance + geom_vline(xintercept=scI$train$ndeviance,linetype=2))
# one solution- in train permutation test
# Blank out the in-sample (train.*) lower/upper bounds so the plot shows only
# the permutation-test (xptrain.*) ranges alongside the train point estimates.
resultsBs <- resultsB
resultsBs[,grep('^train\\..*\\.(l|u)W$',colnames(resultsBs))] <- NA
print(plotResultRanges(resultsBs,
plotRestriction='^((train\\.)|(xptrain\\.))'))
## $AUC
## Warning: Removed 7 rows containing missing values (geom_errorbarh).
##
## $AUC.stratified
## Warning: Removed 7 rows containing missing values (geom_errorbarh).
##
## $normalized.deviance
## Warning: Removed 7 rows containing missing values (geom_errorbarh).
##
## $normalized.deviance.stratified
## Warning: Removed 7 rows containing missing values (geom_errorbarh).
#multiplot(plotlist=plotResultRanges(resultsBs,
# plotRestriction='^((train\\.)|(xptrain\\.))'))
# or, could test/train split
# Point estimates only (no range bars) for all train/test columns.
print(plotResultRanges(results,plotRanges=FALSE))
## $AUC
##
## $AUC.stratified
##
## $normalized.deviance
##
## $normalized.deviance.stratified
#multiplot(plotlist=plotResultRanges(results,plotRanges=FALSE))
# and bootstrap all scores
# Same plots with the bootstrap-derived ranges shown.
print(plotResultRanges(results))
## $AUC
##
## $AUC.stratified
##
## $normalized.deviance
##
## $normalized.deviance.stratified
#multiplot(plotlist=plotResultRanges(results))
# and can try to reduce variance by y-stratifying the bootstrap
# or could do whole shmear
# Everything at once: bootstrap plus permutation-test columns from the merge.
print(plotResultRanges(resultsB))
## $AUC
##
## $AUC.stratified
##
## $normalized.deviance
##
## $normalized.deviance.stratified
#multiplot(plotlist=plotResultRanges(resultsB))