KDD2009 example. Winners had hold-out AUC of 0.7611 on churn. See here for more details.

#load some libraries
# http://www.win-vector.com/blog/2014/08/vtreat-designing-a-package-for-variable-treatment/
# Using dev version of vtreat
# devtools::install_github("WinVector/vtreat")  # to get 0.5.22 version with vtreat::mkCrossFrameCExperiment()
library('vtreat')
# devtools::install_github("WinVector/WVPlots")
library('WVPlots')
## Loading required package: ggplot2
## Loading required package: grid
## Loading required package: gridExtra
## Loading required package: reshape2
## Loading required package: plyr
## Loading required package: stringr
## Loading required package: mgcv
## Loading required package: nlme
## This is mgcv 1.8-14. For overview type 'help("mgcv-package")'.
## Loading required package: sigr
library('parallel')
library('gbm')
## Loading required package: survival
## Loading required package: lattice
## Loading required package: splines
## Loaded gbm 2.1.1
#library('class')
library('ggplot2')
library('randomForest')
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:gridExtra':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
# Load the KDD2009 "orange small" training data as in the book, attach the
# three outcome columns, and split the rows into training and evaluation sets.
# Change this path to match your directory structure.
dataDir <- '~/Documents/work/PracticalDataScienceWithR/zmPDSwR/KDD2009/'

# Feature table: tab-separated with a header row; empty fields are read as NA
# and strings are kept as character (not converted to factors).
d <- read.table(paste0(dataDir, 'orange_small_train.data.gz'),
                header = TRUE, sep = '\t', na.strings = c('NA', ''),
                stringsAsFactors = FALSE)

# Each outcome lives in its own label file, read without a header; the first
# column (V1) is attached to d under the outcome's name.
outcomes <- c('churn', 'appetency', 'upselling')
for (outcome in outcomes) {
  labels <- read.table(paste0(dataDir, 'orange_small_train_', outcome,
                              '.labels.txt'),
                       header = FALSE, sep = '\t')
  # Guard against silent recycling of a label file with the wrong row count.
  stopifnot(nrow(labels) == nrow(d))
  d[[outcome]] <- labels$V1
}

# Reproducible random split: about 90% of rows for training, the rest for test.
set.seed(729375)
d$rgroup <- runif(nrow(d))
dTrain <- d[d$rgroup <= 0.9, , drop = FALSE]  # shared set for impact models and training
dTest <- d[d$rgroup > 0.9, , drop = FALSE]    # set for evaluation

# Debug mode shrinks both sets to 100 random rows for quick runs.
debug <- FALSE
if (debug) {
  dTrain <- dTrain[sample.int(nrow(dTrain), 100), ]
  dTest <- dTest[sample.int(nrow(dTest), 100), ]
}

# Drop the intermediates; only dTrain/dTest and the settings below are kept.
rm(list = c('d', 'labels', 'outcome', 'dataDir'))

# Candidate input variables: every column that is not an outcome or the
# split-group column.
nonvars <- c(outcomes, 'rgroup')
vars <- setdiff(colnames(dTrain), nonvars)
yName <- 'churn'
yTarget <- 1

This is a fun little experiment. We use all of the training data both for variable design and then for modeling. This sets up the potential for a bad nested model bias. We work around this by using a “cross frame”, which is a special data frame that has been treated, but not by the returned treatment plan. Each row in the cross frame was built by a treatment plan built from a disjoint set of rows (a lot like cross validation). This (hopefully) makes the rows in the cross frame exchangeable with future rows, as neither is directly involved in the treatment design. The hope is that this improves generalization error while allowing us to use all of our available data for training.

# Build the data treatments and the treated training/test frames.
#
# mkCrossFrameCExperiment() is given all of dTrain. It returns a treatment
# plan ($treatments) and a cross frame ($crossFrame); the cross frame is used
# as the treated training data, and the treatment plan is applied to dTest.

set.seed(239525)

# Worker cluster handed to vtreat; stays NULL (no cluster) in debug mode.
cl <- NULL
if (!debug) {
  ncore <- parallel::detectCores()
  if (is.na(ncore)) {
    ncore <- 1L  # detectCores() returns NA when the core count is unknown
  }
  cl <- parallel::makeCluster(ncore)
}

# Significance threshold for keeping a treated variable.
kddSig <- 0.05

# The finally clause guarantees the cluster is shut down even when one of the
# vtreat calls fails, so worker processes are never left running.
tryCatch({
  crossExpmt <- mkCrossFrameCExperiment(dTrain,
                                        vars, yName, yTarget,
                                        smFactor = 2.0,
                                        parallelCluster = cl)
  treatmentsC <- crossExpmt$treatments
  treatedTrain <- crossExpmt$crossFrame

  # Keep variables whose score-frame significance is at or below kddSig.
  # The is.na() guard keeps NA names out of selvars should a significance be
  # missing (NOTE(review): confirm whether vtreat can report NA here).
  sigs <- treatmentsC$scoreFrame$sig
  selvars <- treatmentsC$scoreFrame$varName[!is.na(sigs) & sigs <= kddSig]
  rm(sigs)

  # Recode the outcome as logical: TRUE where it equals the target class.
  treatedTrain[[yName]] <- treatedTrain[[yName]] == yTarget

  treatedTest <- prepare(treatmentsC,
                         dTest,
                         pruneSig = c(),
                         varRestriction = selvars,
                         parallelCluster = cl)
  treatedTest[[yName]] <- treatedTest[[yName]] == yTarget
}, finally = {
  if (!is.null(cl)) {
    parallel::stopCluster(cl)
    cl <- NULL
  }
})
# Run other models (with proper coding/training separation).
#
# This gets us back to AUC 0.72
# NOTE(review): the AUC figure above is not produced by any code visible in
# this section -- confirm it is still current.

# Show the treated variables that passed the significance filter (selvars);
# these are the inputs used in the model formula. The "##" lines that follow
# are the recorded console output.
print(selvars)
##   [1] "Var2_isBAD"                            
##   [2] "Var3_isBAD"                            
##   [3] "Var4_isBAD"                            
##   [4] "Var5_isBAD"                            
##   [5] "Var6_clean"                            
##   [6] "Var6_isBAD"                            
##   [7] "Var7_clean"                            
##   [8] "Var7_isBAD"                            
##   [9] "Var10_isBAD"                           
##  [10] "Var11_isBAD"                           
##  [11] "Var13_clean"                           
##  [12] "Var13_isBAD"                           
##  [13] "Var14_isBAD"                           
##  [14] "Var16_isBAD"                           
##  [15] "Var17_isBAD"                           
##  [16] "Var18_isBAD"                           
##  [17] "Var19_isBAD"                           
##  [18] "Var21_isBAD"                           
##  [19] "Var22_isBAD"                           
##  [20] "Var23_isBAD"                           
##  [21] "Var24_clean"                           
##  [22] "Var24_isBAD"                           
##  [23] "Var25_clean"                           
##  [24] "Var25_isBAD"                           
##  [25] "Var26_isBAD"                           
##  [26] "Var27_isBAD"                           
##  [27] "Var28_clean"                           
##  [28] "Var28_isBAD"                           
##  [29] "Var33_isBAD"                           
##  [30] "Var34_isBAD"                           
##  [31] "Var35_isBAD"                           
##  [32] "Var36_isBAD"                           
##  [33] "Var37_isBAD"                           
##  [34] "Var38_isBAD"                           
##  [35] "Var40_isBAD"                           
##  [36] "Var43_isBAD"                           
##  [37] "Var44_isBAD"                           
##  [38] "Var46_isBAD"                           
##  [39] "Var49_isBAD"                           
##  [40] "Var51_isBAD"                           
##  [41] "Var54_isBAD"                           
##  [42] "Var56_isBAD"                           
##  [43] "Var59_isBAD"                           
##  [44] "Var60_isBAD"                           
##  [45] "Var61_isBAD"                           
##  [46] "Var65_clean"                           
##  [47] "Var65_isBAD"                           
##  [48] "Var67_isBAD"                           
##  [49] "Var68_isBAD"                           
##  [50] "Var69_isBAD"                           
##  [51] "Var70_isBAD"                           
##  [52] "Var71_isBAD"                           
##  [53] "Var72_clean"                           
##  [54] "Var73_clean"                           
##  [55] "Var74_clean"                           
##  [56] "Var74_isBAD"                           
##  [57] "Var75_isBAD"                           
##  [58] "Var76_isBAD"                           
##  [59] "Var78_isBAD"                           
##  [60] "Var80_isBAD"                           
##  [61] "Var81_clean"                           
##  [62] "Var81_isBAD"                           
##  [63] "Var82_isBAD"                           
##  [64] "Var83_isBAD"                           
##  [65] "Var84_isBAD"                           
##  [66] "Var85_clean"                           
##  [67] "Var85_isBAD"                           
##  [68] "Var88_isBAD"                           
##  [69] "Var89_isBAD"                           
##  [70] "Var91_isBAD"                           
##  [71] "Var93_isBAD"                           
##  [72] "Var94_clean"                           
##  [73] "Var95_isBAD"                           
##  [74] "Var96_isBAD"                           
##  [75] "Var97_isBAD"                           
##  [76] "Var99_isBAD"                           
##  [77] "Var101_isBAD"                          
##  [78] "Var102_isBAD"                          
##  [79] "Var103_isBAD"                          
##  [80] "Var104_isBAD"                          
##  [81] "Var105_isBAD"                          
##  [82] "Var106_isBAD"                          
##  [83] "Var107_isBAD"                          
##  [84] "Var109_isBAD"                          
##  [85] "Var111_isBAD"                          
##  [86] "Var112_isBAD"                          
##  [87] "Var113_clean"                          
##  [88] "Var114_isBAD"                          
##  [89] "Var115_isBAD"                          
##  [90] "Var117_isBAD"                          
##  [91] "Var119_clean"                          
##  [92] "Var119_isBAD"                          
##  [93] "Var120_isBAD"                          
##  [94] "Var122_isBAD"                          
##  [95] "Var123_isBAD"                          
##  [96] "Var124_isBAD"                          
##  [97] "Var125_clean"                          
##  [98] "Var125_isBAD"                          
##  [99] "Var126_clean"                          
## [100] "Var126_isBAD"                          
## [101] "Var127_isBAD"                          
## [102] "Var128_isBAD"                          
## [103] "Var130_isBAD"                          
## [104] "Var132_isBAD"                          
## [105] "Var133_isBAD"                          
## [106] "Var134_isBAD"                          
## [107] "Var135_isBAD"                          
## [108] "Var138_isBAD"                          
## [109] "Var139_clean"                          
## [110] "Var139_isBAD"                          
## [111] "Var140_clean"                          
## [112] "Var140_isBAD"                          
## [113] "Var143_clean"                          
## [114] "Var143_isBAD"                          
## [115] "Var144_clean"                          
## [116] "Var144_isBAD"                          
## [117] "Var145_isBAD"                          
## [118] "Var146_isBAD"                          
## [119] "Var147_isBAD"                          
## [120] "Var148_isBAD"                          
## [121] "Var149_clean"                          
## [122] "Var149_isBAD"                          
## [123] "Var150_isBAD"                          
## [124] "Var151_isBAD"                          
## [125] "Var152_isBAD"                          
## [126] "Var153_isBAD"                          
## [127] "Var155_isBAD"                          
## [128] "Var157_isBAD"                          
## [129] "Var158_isBAD"                          
## [130] "Var159_isBAD"                          
## [131] "Var160_clean"                          
## [132] "Var160_isBAD"                          
## [133] "Var161_isBAD"                          
## [134] "Var162_isBAD"                          
## [135] "Var163_isBAD"                          
## [136] "Var164_isBAD"                          
## [137] "Var165_isBAD"                          
## [138] "Var166_isBAD"                          
## [139] "Var168_clean"                          
## [140] "Var170_isBAD"                          
## [141] "Var171_isBAD"                          
## [142] "Var172_isBAD"                          
## [143] "Var173_clean"                          
## [144] "Var173_isBAD"                          
## [145] "Var174_isBAD"                          
## [146] "Var176_isBAD"                          
## [147] "Var177_isBAD"                          
## [148] "Var178_isBAD"                          
## [149] "Var179_isBAD"                          
## [150] "Var181_isBAD"                          
## [151] "Var182_isBAD"                          
## [152] "Var183_isBAD"                          
## [153] "Var184_isBAD"                          
## [154] "Var188_isBAD"                          
## [155] "Var189_clean"                          
## [156] "Var191_lev_NA"                         
## [157] "Var191_lev_x.r__I"                     
## [158] "Var192_catP"                           
## [159] "Var192_catB"                           
## [160] "Var193_lev_x.2Knk1KF"                  
## [161] "Var193_lev_x.AERks4l"                  
## [162] "Var193_lev_x.RO12"                     
## [163] "Var193_catP"                           
## [164] "Var193_catB"                           
## [165] "Var194_lev_NA"                         
## [166] "Var194_lev_x.SEuy"                     
## [167] "Var194_catP"                           
## [168] "Var194_catB"                           
## [169] "Var195_lev_x.taul"                     
## [170] "Var195_catP"                           
## [171] "Var195_catB"                           
## [172] "Var197_lev_x.0Xwj"                     
## [173] "Var197_lev_x.ssAy"                     
## [174] "Var197_lev_x.TyGl"                     
## [175] "Var197_catP"                           
## [176] "Var197_catB"                           
## [177] "Var198_lev_x.fhk21Ss"                  
## [178] "Var198_lev_x.PHNvXy8"                  
## [179] "Var198_catP"                           
## [180] "Var198_catB"                           
## [181] "Var199_catP"                           
## [182] "Var199_catB"                           
## [183] "Var200_lev_NA"                         
## [184] "Var200_catP"                           
## [185] "Var200_catB"                           
## [186] "Var201_lev_NA"                         
## [187] "Var201_lev_x.smXZ"                     
## [188] "Var201_catP"                           
## [189] "Var201_catB"                           
## [190] "Var202_catP"                           
## [191] "Var202_catB"                           
## [192] "Var203_lev_x.9_Y1"                     
## [193] "Var203_catP"                           
## [194] "Var204_lev_x.RcM7"                     
## [195] "Var204_lev_x.RVjC"                     
## [196] "Var204_lev_x.z5Ry"                     
## [197] "Var204_catP"                           
## [198] "Var204_catB"                           
## [199] "Var205_lev_x.09_Q"                     
## [200] "Var205_lev_x.sJzTlal"                  
## [201] "Var205_lev_x.VpdQ"                     
## [202] "Var205_catP"                           
## [203] "Var205_catB"                           
## [204] "Var206_lev_NA"                         
## [205] "Var206_lev_x.6JmL"                     
## [206] "Var206_lev_x.hAFG"                     
## [207] "Var206_lev_x.haYg"                     
## [208] "Var206_lev_x.IYzP"                     
## [209] "Var206_lev_x.kxE9"                     
## [210] "Var206_lev_x.y6dw"                     
## [211] "Var206_lev_x.zm5i"                     
## [212] "Var206_catP"                           
## [213] "Var206_catB"                           
## [214] "Var207_lev_x.7M47J5GA0pTYIFxg5uy"      
## [215] "Var207_lev_x.DHn_WUyBhW_whjA88g9bvA64_"
## [216] "Var207_lev_x.me75fM6ugJ"               
## [217] "Var207_lev_x.NKv3VA1BpP"               
## [218] "Var207_catP"                           
## [219] "Var207_catB"                           
## [220] "Var210_lev_x.g5HH"                     
## [221] "Var210_lev_x.uKAI"                     
## [222] "Var210_catP"                           
## [223] "Var210_catB"                           
## [224] "Var211_lev_x.L84s"                     
## [225] "Var211_lev_x.Mtgm"                     
## [226] "Var212_lev_x.4kVnq_T26xq1p"            
## [227] "Var212_lev_x.CrNX"                     
## [228] "Var212_lev_x.Ie_5MZs"                  
## [229] "Var212_lev_x.NhsEn4L"                  
## [230] "Var212_lev_x.XfqtO3UdzaXh_"            
## [231] "Var212_catP"                           
## [232] "Var212_catB"                           
## [233] "Var213_lev_NA"                         
## [234] "Var213_lev_x.KdSa"                     
## [235] "Var214_lev_NA"                         
## [236] "Var214_catP"                           
## [237] "Var214_catB"                           
## [238] "Var216_lev_x.11p4mKe"                  
## [239] "Var216_lev_x.kZJtVhC"                  
## [240] "Var216_lev_x.kZJyVg2"                  
## [241] "Var216_lev_x.mAja5EA"                  
## [242] "Var216_lev_x.NGZxnJM"                  
## [243] "Var216_lev_x.XTbPUYD"                  
## [244] "Var216_catB"                           
## [245] "Var217_catP"                           
## [246] "Var218_lev_x.cJvF"                     
## [247] "Var218_lev_x.UYBR"                     
## [248] "Var218_catP"                           
## [249] "Var218_catB"                           
## [250] "Var219_lev_x.AU8pNoi"                  
## [251] "Var219_lev_x.FzaX"                     
## [252] "Var219_catP"                           
## [253] "Var220_lev_x.4UxGlow"                  
## [254] "Var220_lev_x.UF16siJ"                  
## [255] "Var220_catP"                           
## [256] "Var220_catB"                           
## [257] "Var221_lev_x.d0EEeJi"                  
## [258] "Var221_lev_x.oslk"                     
## [259] "Var221_lev_x.QKW8DRm"                  
## [260] "Var221_lev_x.zCkv"                     
## [261] "Var221_catP"                           
## [262] "Var221_catB"                           
## [263] "Var222_lev_x.APgdzOv"                  
## [264] "Var222_lev_x.catzS2D"                  
## [265] "Var222_catP"                           
## [266] "Var222_catB"                           
## [267] "Var223_catB"                           
## [268] "Var224_lev_NA"                         
## [269] "Var225_lev_NA"                         
## [270] "Var225_lev_x.ELof"                     
## [271] "Var225_lev_x.kG3k"                     
## [272] "Var225_lev_x.xG3x"                     
## [273] "Var225_catP"                           
## [274] "Var225_catB"                           
## [275] "Var226_lev_x.7P5s"                     
## [276] "Var226_lev_x.FSa2"                     
## [277] "Var226_lev_x.me1d"                     
## [278] "Var226_lev_x.Qu4f"                     
## [279] "Var226_lev_x.szEZ"                     
## [280] "Var226_lev_x.uWr3"                     
## [281] "Var226_lev_x.Xa3G"                     
## [282] "Var226_catP"                           
## [283] "Var226_catB"                           
## [284] "Var227_lev_x.02N6s8f"                  
## [285] "Var227_lev_x.nIGXDli"                  
## [286] "Var227_lev_x.RAYp"                     
## [287] "Var227_lev_x.ZI9m"                     
## [288] "Var227_catP"                           
## [289] "Var227_catB"                           
## [290] "Var228_lev_x.55YFVY9"                  
## [291] "Var228_lev_x.F2FyR07IdsN7I"            
## [292] "Var228_lev_x.ib5G6X1eUxUn6"            
## [293] "Var228_lev_x.iyHGyLCEkQ"               
## [294] "Var228_lev_x.R4y5gQQWY8OodqDV"         
## [295] "Var228_lev_x.TCU50_Yjmm6GIBZ0lL_"      
## [296] "Var228_catP"                           
## [297] "Var228_catB"                           
## [298] "Var229_lev_NA"                         
## [299] "Var229_lev_x.am7c"                     
## [300] "Var229_lev_x.mj86"                     
## [301] "Var229_catP"                           
## [302] "Var229_catB"
# Plotting frames: one-column data frames holding only the outcome; model
# predictions are added to them later as further columns.
treatedTrainP <- treatedTrain[yName]
treatedTestP <- treatedTest[yName]

GBM

# Fit a gradient boosted model on the treated training frame, using the
# selected variables as predictors of the outcome.
formulaS <- paste0(yName, ' ~ ', paste(selvars, collapse = ' + '))
mname <- 'gbm'
print(date())
## [1] "Sun Oct 16 10:43:39 2016"
print(paste(mname, length(selvars)))
## [1] "gbm 302"
modelGBMs <- gbm(as.formula(formulaS),
                 data = treatedTrain,
                 distribution = 'bernoulli',
                 n.trees = 1000,
                 interaction.depth = 3,
                 cv.folds = 5,
                 keep.data = FALSE)
#print(modelGBMs)
#print(summary(modelGBMs))
# Number of trees picked by gbm.perf(); the recorded output below shows the
# cross-validation method was used.
nTrees <- gbm.perf(modelGBMs)
## Using cv method...

# Score both frames on the response scale using nTrees trees; predictions are
# stored in a column named after the model.
treatedTrainP[[mname]] <- predict(modelGBMs,
                                  newdata = treatedTrain,
                                  n.trees = nTrees,
                                  type = 'response')
treatedTestP[[mname]] <- predict(modelGBMs,
                                 newdata = treatedTest,
                                 n.trees = nTrees,
                                 type = 'response')

# Plot the model's predictions on the held-out test frame.
t2 <- paste(mname, 'test data')

# Distribution of the prediction, split by the outcome.
print(DoubleDensityPlot(treatedTestP, mname, yName, title = t2))

# ROC curve of the prediction against the outcome.
# NOTE(review): the outcome column was recoded to logical earlier while
# yTarget is 1 -- presumably fine because TRUE == 1 in R; confirm against the
# installed WVPlots version.
print(ROCPlot(treatedTestP, mname, yName, yTarget, title = t2))