KDD2009 example: prepare the churn data with vtreat, then fit glmnet, gbm, and xgboost models. The KDD Cup 2009 contest winners achieved a hold-out AUC of 0.7611 on the churn task; see the KDD Cup 2009 results for more details.
# load some libraries
# http://www.win-vector.com/blog/2014/08/vtreat-designing-a-package-for-variable-treatment/
library('vtreat')
packageVersion('vtreat')
## [1] '0.5.26'
# devtools::install_github("WinVector/WVPlots")
library('WVPlots')
## Loading required package: ggplot2
## Loading required package: grid
## Loading required package: gridExtra
## Loading required package: reshape2
## Loading required package: ROCR
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
## Loading required package: plyr
## Loading required package: stringr
## Loading required package: mgcv
## Loading required package: nlme
## This is mgcv 1.8-12. For overview type 'help("mgcv-package")'.
library('parallel')
library('gbm')
## Loading required package: survival
## Loading required package: lattice
## Loading required package: splines
## Loaded gbm 2.1.1
#library('class')
library('ggplot2')
library('glmnet')
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-5
library('xgboost')
# load the data as in the book
# change this path to match your directory structure
dir = '~/Documents/work/PracticalDataScienceWithR/zmPDSwR/KDD2009/'
d = read.table(paste(dir,'orange_small_train.data.gz',sep=''),
               header=TRUE,sep='\t',na.strings=c('NA',''),
               stringsAsFactors=FALSE)
churn = read.table(paste(dir,'orange_small_train_churn.labels.txt',sep=''),
                   header=FALSE,sep='\t')
d$churn = churn$V1
appetency = read.table(paste(dir,'orange_small_train_appetency.labels.txt',sep=''),
                       header=FALSE,sep='\t')
d$appetency = appetency$V1
upselling = read.table(paste(dir,'orange_small_train_upselling.labels.txt',sep=''),
                       header=FALSE,sep='\t')
d$upselling = upselling$V1
set.seed(729375)
rgroup = runif(dim(d)[[1]])
dTrain = d[rgroup<=0.9,] # set for building models
dTest = d[rgroup>0.9,] # set for evaluation
debug = FALSE
if(debug) {
  dTrain <- dTrain[sample.int(nrow(dTrain),100),]
  dTest <- dTest[sample.int(nrow(dTest),100),]
}
rm(list=c('d','churn','appetency','upselling','dir'))
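# optional sanity check (a sketch, not part of the original run): the
# orange_small data set has 50,000 rows, so the 90/10 split should leave
# roughly 45,000 rows for training and 5,000 for test
# print(dim(dTrain))
# print(dim(dTest))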
outcomes = c('churn','appetency','upselling')
nonvars <- c(outcomes,'rgroup')
vars = setdiff(colnames(dTrain),
               nonvars)
yName = 'churn'
yTarget = 1
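# KDD2009 labels are coded +1 (churn) / -1 (no churn); yTarget picks the
# positive class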
# build data treatments
set.seed(239525)
cl <- c()
if(!debug) {
  ncore <- parallel::detectCores()
  cl <- parallel::makeCluster(ncore)
}
# build treatments
trainPlan = mkCrossFrameCExperiment(dTrain,
                                    vars,yName,yTarget,
                                    smFactor=2.0,
                                    parallelCluster=cl)
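# mkCrossFrameCExperiment both designs the variable treatments and builds a
# cross-validated "cross frame" of the treated training data, so the models
# below are never fit on the same rows used to estimate the categorical
# effect codes (avoiding nested-model bias)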
print(trainPlan$method)
## [1] "kwaycrossystratified"
treatmentsC = trainPlan$treatments
treatedTrainM = trainPlan$crossFrame
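# prune at significance 1/(number of candidate variables): under the null
# hypothesis of no signal we would expect about one noise variable to
# survive this filter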
kddSig = 1/nrow(treatmentsC$scoreFrame)
print(kddSig)
## [1] 0.001828154
selvars = treatmentsC$scoreFrame$varName[treatmentsC$scoreFrame$sig<kddSig]
treatedTrainM[[yName]] = treatedTrainM[[yName]]==yTarget
treatedTest = prepare(treatmentsC,
                      dTest,
                      pruneSig=kddSig,
                      parallelCluster=cl)
treatedTest[[yName]] = treatedTest[[yName]]==yTarget
if(!is.null(cl)) {
  parallel::stopCluster(cl)
  cl = NULL
}
# Run several models (with proper separation between variable coding and
# model training).
#
# This gets us back to a test AUC of about 0.72 on churn.
print(selvars)
## [1] "Var2_isBAD"
## [2] "Var3_isBAD"
## [3] "Var4_isBAD"
## [4] "Var5_isBAD"
## [5] "Var6_clean"
## [6] "Var6_isBAD"
## [7] "Var7_clean"
## [8] "Var7_isBAD"
## [9] "Var10_isBAD"
## [10] "Var11_isBAD"
## [11] "Var13_clean"
## [12] "Var13_isBAD"
## [13] "Var14_isBAD"
## [14] "Var16_isBAD"
## [15] "Var17_isBAD"
## [16] "Var18_isBAD"
## [17] "Var19_isBAD"
## [18] "Var21_isBAD"
## [19] "Var22_isBAD"
## [20] "Var23_isBAD"
## [21] "Var25_isBAD"
## [22] "Var26_isBAD"
## [23] "Var27_isBAD"
## [24] "Var28_isBAD"
## [25] "Var33_isBAD"
## [26] "Var34_isBAD"
## [27] "Var35_isBAD"
## [28] "Var36_isBAD"
## [29] "Var37_isBAD"
## [30] "Var38_isBAD"
## [31] "Var40_isBAD"
## [32] "Var43_isBAD"
## [33] "Var44_isBAD"
## [34] "Var46_isBAD"
## [35] "Var49_isBAD"
## [36] "Var51_isBAD"
## [37] "Var54_isBAD"
## [38] "Var60_isBAD"
## [39] "Var61_isBAD"
## [40] "Var65_clean"
## [41] "Var65_isBAD"
## [42] "Var67_isBAD"
## [43] "Var68_isBAD"
## [44] "Var69_isBAD"
## [45] "Var70_isBAD"
## [46] "Var71_isBAD"
## [47] "Var72_clean"
## [48] "Var73_clean"
## [49] "Var74_clean"
## [50] "Var74_isBAD"
## [51] "Var75_isBAD"
## [52] "Var76_isBAD"
## [53] "Var78_isBAD"
## [54] "Var80_isBAD"
## [55] "Var81_clean"
## [56] "Var81_isBAD"
## [57] "Var82_isBAD"
## [58] "Var83_isBAD"
## [59] "Var84_isBAD"
## [60] "Var85_isBAD"
## [61] "Var88_isBAD"
## [62] "Var91_isBAD"
## [63] "Var93_isBAD"
## [64] "Var95_isBAD"
## [65] "Var96_isBAD"
## [66] "Var97_isBAD"
## [67] "Var99_isBAD"
## [68] "Var101_isBAD"
## [69] "Var103_isBAD"
## [70] "Var106_isBAD"
## [71] "Var107_isBAD"
## [72] "Var111_isBAD"
## [73] "Var112_isBAD"
## [74] "Var113_clean"
## [75] "Var114_isBAD"
## [76] "Var117_isBAD"
## [77] "Var119_isBAD"
## [78] "Var120_isBAD"
## [79] "Var122_isBAD"
## [80] "Var123_isBAD"
## [81] "Var124_isBAD"
## [82] "Var125_clean"
## [83] "Var125_isBAD"
## [84] "Var126_clean"
## [85] "Var126_isBAD"
## [86] "Var127_isBAD"
## [87] "Var128_isBAD"
## [88] "Var130_isBAD"
## [89] "Var132_isBAD"
## [90] "Var133_isBAD"
## [91] "Var134_isBAD"
## [92] "Var135_isBAD"
## [93] "Var138_isBAD"
## [94] "Var139_isBAD"
## [95] "Var140_clean"
## [96] "Var140_isBAD"
## [97] "Var143_isBAD"
## [98] "Var144_clean"
## [99] "Var144_isBAD"
## [100] "Var145_isBAD"
## [101] "Var146_isBAD"
## [102] "Var147_isBAD"
## [103] "Var148_isBAD"
## [104] "Var150_isBAD"
## [105] "Var151_isBAD"
## [106] "Var152_isBAD"
## [107] "Var153_isBAD"
## [108] "Var155_isBAD"
## [109] "Var157_isBAD"
## [110] "Var158_isBAD"
## [111] "Var159_isBAD"
## [112] "Var160_isBAD"
## [113] "Var161_isBAD"
## [114] "Var162_isBAD"
## [115] "Var163_isBAD"
## [116] "Var164_isBAD"
## [117] "Var165_isBAD"
## [118] "Var166_isBAD"
## [119] "Var170_isBAD"
## [120] "Var171_isBAD"
## [121] "Var172_isBAD"
## [122] "Var173_isBAD"
## [123] "Var174_isBAD"
## [124] "Var176_isBAD"
## [125] "Var177_isBAD"
## [126] "Var179_isBAD"
## [127] "Var181_isBAD"
## [128] "Var182_isBAD"
## [129] "Var183_isBAD"
## [130] "Var184_isBAD"
## [131] "Var188_isBAD"
## [132] "Var189_clean"
## [133] "Var191_lev_NA"
## [134] "Var191_lev_x.r__I"
## [135] "Var192_catP"
## [136] "Var192_catB"
## [137] "Var193_lev_x.2Knk1KF"
## [138] "Var193_lev_x.AERks4l"
## [139] "Var193_lev_x.RO12"
## [140] "Var193_catP"
## [141] "Var193_catB"
## [142] "Var194_lev_NA"
## [143] "Var194_lev_x.SEuy"
## [144] "Var194_catP"
## [145] "Var194_catB"
## [146] "Var195_lev_x.taul"
## [147] "Var195_catP"
## [148] "Var195_catB"
## [149] "Var197_catP"
## [150] "Var197_catB"
## [151] "Var198_lev_x.fhk21Ss"
## [152] "Var198_catP"
## [153] "Var198_catB"
## [154] "Var199_catP"
## [155] "Var199_catB"
## [156] "Var200_lev_NA"
## [157] "Var200_catP"
## [158] "Var200_catB"
## [159] "Var201_lev_NA"
## [160] "Var201_lev_x.smXZ"
## [161] "Var201_catP"
## [162] "Var201_catB"
## [163] "Var202_catP"
## [164] "Var202_catB"
## [165] "Var204_catP"
## [166] "Var204_catB"
## [167] "Var205_lev_x.sJzTlal"
## [168] "Var205_lev_x.VpdQ"
## [169] "Var205_catP"
## [170] "Var205_catB"
## [171] "Var206_lev_NA"
## [172] "Var206_lev_x.hAFG"
## [173] "Var206_lev_x.haYg"
## [174] "Var206_lev_x.IYzP"
## [175] "Var206_lev_x.kxE9"
## [176] "Var206_lev_x.y6dw"
## [177] "Var206_lev_x.zm5i"
## [178] "Var206_catP"
## [179] "Var206_catB"
## [180] "Var207_lev_x.7M47J5GA0pTYIFxg5uy"
## [181] "Var207_lev_x.DHn_WUyBhW_whjA88g9bvA64_"
## [182] "Var207_lev_x.me75fM6ugJ"
## [183] "Var207_catP"
## [184] "Var207_catB"
## [185] "Var210_lev_x.g5HH"
## [186] "Var210_lev_x.uKAI"
## [187] "Var210_catP"
## [188] "Var210_catB"
## [189] "Var211_lev_x.L84s"
## [190] "Var211_lev_x.Mtgm"
## [191] "Var212_lev_x.4kVnq_T26xq1p"
## [192] "Var212_lev_x.CrNX"
## [193] "Var212_lev_x.NhsEn4L"
## [194] "Var212_lev_x.XfqtO3UdzaXh_"
## [195] "Var212_catP"
## [196] "Var212_catB"
## [197] "Var213_lev_NA"
## [198] "Var213_lev_x.KdSa"
## [199] "Var214_lev_NA"
## [200] "Var214_catP"
## [201] "Var214_catB"
## [202] "Var216_lev_x.kZJyVg2"
## [203] "Var216_lev_x.XTbPUYD"
## [204] "Var216_catB"
## [205] "Var217_catP"
## [206] "Var217_catB"
## [207] "Var218_lev_x.cJvF"
## [208] "Var218_lev_x.UYBR"
## [209] "Var218_catP"
## [210] "Var218_catB"
## [211] "Var220_lev_x.4UxGlow"
## [212] "Var220_catP"
## [213] "Var220_catB"
## [214] "Var221_lev_x.d0EEeJi"
## [215] "Var221_lev_x.oslk"
## [216] "Var221_lev_x.QKW8DRm"
## [217] "Var221_lev_x.zCkv"
## [218] "Var221_catP"
## [219] "Var221_catB"
## [220] "Var222_lev_x.catzS2D"
## [221] "Var222_catP"
## [222] "Var222_catB"
## [223] "Var225_lev_NA"
## [224] "Var225_lev_x.ELof"
## [225] "Var225_catP"
## [226] "Var225_catB"
## [227] "Var226_lev_x.7P5s"
## [228] "Var226_lev_x.FSa2"
## [229] "Var226_lev_x.szEZ"
## [230] "Var226_catP"
## [231] "Var226_catB"
## [232] "Var227_lev_x.nIGXDli"
## [233] "Var227_lev_x.RAYp"
## [234] "Var227_lev_x.ZI9m"
## [235] "Var227_catP"
## [236] "Var227_catB"
## [237] "Var228_lev_x.55YFVY9"
## [238] "Var228_lev_x.F2FyR07IdsN7I"
## [239] "Var228_lev_x.ib5G6X1eUxUn6"
## [240] "Var228_lev_x.iyHGyLCEkQ"
## [241] "Var228_lev_x.R4y5gQQWY8OodqDV"
## [242] "Var228_lev_x.TCU50_Yjmm6GIBZ0lL_"
## [243] "Var228_catP"
## [244] "Var228_catB"
## [245] "Var229_lev_NA"
## [246] "Var229_lev_x.am7c"
## [247] "Var229_lev_x.mj86"
## [248] "Var229_catP"
## [249] "Var229_catB"
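# a guide to the derived variable names above (vtreat's coding conventions):
#   _clean : numeric variable with missing values imputed
#   _isBAD : indicator that the original numeric value was missing/bad
#   _lev_x : indicator for one frequent categorical level
#   _catP  : prevalence of the categorical level
#   _catB  : Bayesian/logistic impact code of the categorical variable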
# prepare plotting frames
treatedTrainP = treatedTrainM[, yName, drop=FALSE]
treatedTestP = treatedTest[, yName, drop=FALSE]
formulaS = paste(yName,paste(selvars,collapse=' + '),sep=' ~ ')
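# fit three model families on the treated data: an elastic-net logistic
# regression (glmnet), a gradient boosted machine (gbm), and xgboost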
for(mname in c('glmPred','gbmPred','xgboost')) {
  print("*****************************")
  print(date())
  print(paste(mname,length(selvars)))
  if(mname=='gbmPred') {
    modelGBMs = gbm(as.formula(formulaS),
                    data=treatedTrainM,
                    distribution='bernoulli',
                    n.trees=1000,
                    interaction.depth=3,
                    keep.data=FALSE,
                    cv.folds=5)
    #print(modelGBMs)
    #print(summary(modelGBMs))
    nTrees = gbm.perf(modelGBMs)
    treatedTrainP[[mname]] = predict(modelGBMs,
                                     newdata=treatedTrainM,type='response',
                                     n.trees=nTrees)
    treatedTestP[[mname]] = predict(modelGBMs,
                                    newdata=treatedTest,type='response',
                                    n.trees=nTrees)
  } else if(mname=='glmPred') {
    modelglms = cv.glmnet(x = as.matrix(treatedTrainM[,selvars,drop=FALSE]),
                          y = treatedTrainM[[yName]],
                          alpha=0.5,
                          family='binomial')
    #print(summary(modelglms))
    treatedTrainP[[mname]] = as.numeric(predict(modelglms,
                                                newx=as.matrix(treatedTrainM[,selvars,drop=FALSE]),
                                                type='response'))
    treatedTestP[[mname]] = as.numeric(predict(modelglms,
                                               newx=as.matrix(treatedTest[,selvars,drop=FALSE]),
                                               type='response'))
  } else if(mname=='xgboost') {
    modelxg = xgboost(data=xgb.DMatrix(as.matrix(treatedTrainM[,selvars,drop=FALSE]),
                                       label=treatedTrainM[[yName]]),
                      objective='binary:logistic',
                      nrounds=100,
                      nthread=parallel::detectCores())
    treatedTrainP[[mname]] = as.numeric(predict(modelxg,
                                                as.matrix(treatedTrainM[,selvars,drop=FALSE])))
    treatedTestP[[mname]] = as.numeric(predict(modelxg,
                                               as.matrix(treatedTest[,selvars,drop=FALSE])))
  }
  t1 = paste(mname,'train data')
  print(DoubleDensityPlot(treatedTrainP, mname, yName,
                          title=t1))
  print(ROCPlot(treatedTrainP, mname, yName,
                title=t1))
  t2 = paste(mname,'test data')
  print(DoubleDensityPlot(treatedTestP, mname, yName,
                          title=t2))
  print(ROCPlot(treatedTestP, mname, yName,
                title=t2))
  print(date())
  print("*****************************")
}
## [1] "*****************************"
## [1] "Mon Jul 18 16:53:51 2016"
## [1] "glmPred 249"
## [1] "Mon Jul 18 16:59:21 2016"
## [1] "*****************************"
## [1] "*****************************"
## [1] "Mon Jul 18 16:59:21 2016"
## [1] "gbmPred 249"
## Using cv method...
## [1] "Mon Jul 18 17:23:09 2016"
## [1] "*****************************"
## [1] "*****************************"
## [1] "Mon Jul 18 17:23:09 2016"
## [1] "xgboost 249"
## [0] train-error:0.071800
## [1] train-error:0.071866
## [2] train-error:0.071422
## [3] train-error:0.071844
## [4] train-error:0.071822
## [5] train-error:0.071711
## [6] train-error:0.071378
## [7] train-error:0.071111
## [8] train-error:0.070645
## [9] train-error:0.070556
## [10] train-error:0.070223
## [11] train-error:0.069979
## [12] train-error:0.069801
## [13] train-error:0.069712
## [14] train-error:0.068935
## [15] train-error:0.068824
## [16] train-error:0.068668
## [17] train-error:0.068602
## [18] train-error:0.068202
## [19] train-error:0.068135
## [20] train-error:0.067847
## [21] train-error:0.067447
## [22] train-error:0.067203
## [23] train-error:0.066781
## [24] train-error:0.066470
## [25] train-error:0.066203
## [26] train-error:0.066003
## [27] train-error:0.065870
## [28] train-error:0.065848
## [29] train-error:0.065737
## [30] train-error:0.065515
## [31] train-error:0.065404
## [32] train-error:0.064960
## [33] train-error:0.064804
## [34] train-error:0.064626
## [35] train-error:0.064493
## [36] train-error:0.064204
## [37] train-error:0.064005
## [38] train-error:0.063938
## [39] train-error:0.063827
## [40] train-error:0.063805
## [41] train-error:0.063405
## [42] train-error:0.063405
## [43] train-error:0.063383
## [44] train-error:0.063338
## [45] train-error:0.063205
## [46] train-error:0.062961
## [47] train-error:0.062761
## [48] train-error:0.062494
## [49] train-error:0.062317
## [50] train-error:0.062095
## [51] train-error:0.061939
## [52] train-error:0.061717
## [53] train-error:0.061451
## [54] train-error:0.061473
## [55] train-error:0.061406
## [56] train-error:0.061051
## [57] train-error:0.060807
## [58] train-error:0.060696
## [59] train-error:0.060385
## [60] train-error:0.060074
## [61] train-error:0.059718
## [62] train-error:0.059519
## [63] train-error:0.059185
## [64] train-error:0.059052
## [65] train-error:0.058897
## [66] train-error:0.058586
## [67] train-error:0.058275
## [68] train-error:0.058119
## [69] train-error:0.058031
## [70] train-error:0.057764
## [71] train-error:0.057231
## [72] train-error:0.057120
## [73] train-error:0.056742
## [74] train-error:0.056565
## [75] train-error:0.056409
## [76] train-error:0.056432
## [77] train-error:0.056321
## [78] train-error:0.055965
## [79] train-error:0.055565
## [80] train-error:0.054921
## [81] train-error:0.054544
## [82] train-error:0.054211
## [83] train-error:0.053922
## [84] train-error:0.053123
## [85] train-error:0.052967
## [86] train-error:0.052612
## [87] train-error:0.052456
## [88] train-error:0.052190
## [89] train-error:0.051901
## [90] train-error:0.051457
## [91] train-error:0.050680
## [92] train-error:0.050191
## [93] train-error:0.049791
## [94] train-error:0.049436
## [95] train-error:0.049014
## [96] train-error:0.048681
## [97] train-error:0.048525
## [98] train-error:0.048548
## [99] train-error:0.047970
## [1] "Mon Jul 18 17:23:56 2016"
## [1] "*****************************"
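The ROC plots above render separately; to pull out a single-number test AUC per model for side-by-side comparison, one could add a small helper. This is a sketch using ROCR (which WVPlots already loads); `calcAUC` is our name for the helper, not part of the original script.
calcAUC <- function(pred, truth) {
  # area under the ROC curve via ROCR
  ROCR::performance(ROCR::prediction(pred, truth), 'auc')@y.values[[1]]
}
for(mname in c('glmPred','gbmPred','xgboost')) {
  print(paste(mname, 'test AUC:',
              format(calcAUC(treatedTestP[[mname]], treatedTestP[[yName]]),
                     digits=3)))
}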