KDD2009 example. The winners had a hold-out AUC of 0.7611 on churn. See here for more details.
#load some libraries
# http://www.win-vector.com/blog/2014/08/vtreat-designing-a-package-for-variable-treatment/
library('vtreat')
# devtools::install_github("WinVector/WVPlots")
library('WVPlots')
library('parallel')
#library('class')
library('ggplot2')
library('glmnet')
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-13
source("xgboost.R")
## Loading required package: wrapr
# load the data as in the book
# change this path to match your directory structure
# (the path must end in '/' because file names are appended directly)
dir <- '../../PracticalDataScienceWithR/zmPDSwR/KDD2009/'

# Explanatory variables: tab-separated with a header row; both 'NA' and
# the empty string are read as missing, and strings stay character.
d <- read.table(paste0(dir, 'orange_small_train.data.gz'),
                header = TRUE, sep = '\t', na.strings = c('NA', ''),
                stringsAsFactors = FALSE)

# Each outcome lives in its own header-less, single-column label file.
# The first column (V1) is attached to d under the outcome's name.
churn <- read.table(paste0(dir, 'orange_small_train_churn.labels.txt'),
                    header = FALSE, sep = '\t')
d$churn <- churn$V1
appetency <- read.table(paste0(dir, 'orange_small_train_appetency.labels.txt'),
                        header = FALSE, sep = '\t')
d$appetency <- appetency$V1
upselling <- read.table(paste0(dir, 'orange_small_train_upselling.labels.txt'),
                        header = FALSE, sep = '\t')
d$upselling <- upselling$V1
# Reproducible random train/test split: each row gets a uniform draw in
# rgroup; rows with rgroup <= 0.9 go to training, the rest to test.
set.seed(729375)
d$rgroup <- runif(dim(d)[[1]])
dTrain <- subset(d, rgroup <= 0.9) # set for building models
dTest <- subset(d, rgroup > 0.9) # set for evaluation

# Set debug to TRUE to work on a 100-row sample of each split
# (debug also disables the parallel cluster further down).
debug <- FALSE
if (debug) {
  # sample from dTrain's own row count (the previous code referenced an
  # undefined object, dTrainM, and failed whenever debug was TRUE)
  dTrain <- dTrain[sample.int(nrow(dTrain), 100), ]
  dTest <- dTest[sample.int(nrow(dTest), 100), ]
}
# The full data and the raw label frames are no longer needed once the
# train/test split exists, so drop them (along with the path prefix).
rm(list = c('d', 'churn', 'appetency', 'upselling', 'dir'))

# Outcome to model, and the outcome value treated as the target.
yName <- 'churn'
yTarget <- 1

# Candidate input variables: every training column that is neither an
# outcome nor the random split column.
outcomes <- c('churn', 'appetency', 'upselling')
nonvars <- c(outcomes, 'rgroup')
vars <- setdiff(colnames(dTrain), nonvars)
# build data treatments
set.seed(239525)

# Parallel cluster handed to the treatment steps below.
# In debug mode no cluster is started and cl stays NULL.
if (debug) {
  cl <- NULL
} else {
  ncore <- parallel::detectCores()
  cl <- parallel::makeCluster(ncore)
}

# build treatments
# kddSig is the significance cutoff used below for variable selection
# and for pruning the prepared test frame.
kddSig <- 0.05
base::date()
## [1] "Fri Feb 16 15:07:35 2018"
# Design the variable treatments on dTrain for the outcome
# yName == yTarget. The result carries both the treatment plan
# ($treatments) and the cross-frame of treated training rows
# ($crossFrame), which are unpacked right after this call.
# This is the slow step (about 37 minutes in the recorded run).
trainPlan <- mkCrossFrameCExperiment(
  dTrain, vars, yName, yTarget,
  smFactor = 2.0,
  rareCount = 2,
  rareSig = 0.5,
  parallelCluster = cl
)
base::date()
## [1] "Fri Feb 16 15:44:27 2018"
# Unpack the treatment plan and the cross-frame of treated training data.
treatmentsC <- trainPlan$treatments
treatedTrainM <- trainPlan$crossFrame

# Keep only the derived variables whose significance is strictly below
# the kddSig cutoff, then show every variable with its significance.
selvars <- with(treatmentsC$scoreFrame, varName[sig < kddSig])
print(treatmentsC$scoreFrame[, c('varName', 'sig')])
## varName sig
## 1 Var1_clean 9.492776e-01
## 2 Var1_isBAD 4.529984e-01
## 3 Var2_clean 8.605947e-01
## 4 Var2_isBAD 4.708216e-06
## 5 Var3_clean 7.676960e-01
## 6 Var3_isBAD 4.885833e-06
## 7 Var4_clean 4.660491e-01
## 8 Var4_isBAD 1.862012e-06
## 9 Var5_clean 8.234268e-01
## 10 Var5_isBAD 1.419659e-06
## 11 Var6_clean 6.820836e-05
## 12 Var6_isBAD 1.822398e-15
## 13 Var7_clean 2.243008e-50
## 14 Var7_isBAD 4.938685e-12
## 15 Var9_clean 7.172173e-01
## 16 Var9_isBAD 4.529984e-01
## 17 Var10_clean 4.154370e-01
## 18 Var10_isBAD 1.419659e-06
## 19 Var11_clean 9.166634e-01
## 20 Var11_isBAD 4.885833e-06
## 21 Var12_clean 3.775127e-01
## 22 Var12_isBAD 7.958694e-01
## 23 Var13_clean 3.243381e-29
## 24 Var13_isBAD 4.938685e-12
## 25 Var14_clean 2.849611e-01
## 26 Var14_isBAD 4.885833e-06
## 27 Var16_clean 6.766542e-01
## 28 Var16_isBAD 1.419659e-06
## 29 Var17_clean 8.319031e-01
## 30 Var17_isBAD 1.862012e-06
## 31 Var18_clean 8.868140e-01
## 32 Var18_isBAD 1.862012e-06
## 33 Var19_clean 2.251994e-01
## 34 Var19_isBAD 1.862012e-06
## 35 Var21_clean 8.586691e-02
## 36 Var21_isBAD 1.822398e-15
## 37 Var22_clean 9.137265e-02
## 38 Var22_isBAD 6.650215e-16
## 39 Var23_clean 7.450751e-01
## 40 Var23_isBAD 1.419659e-06
## 41 Var24_clean 1.751853e-02
## 42 Var24_isBAD 3.283203e-02
## 43 Var25_clean 1.871793e-02
## 44 Var25_isBAD 6.650215e-16
## 45 Var26_clean 8.131591e-01
## 46 Var26_isBAD 1.419659e-06
## 47 Var27_clean 4.453701e-01
## 48 Var27_isBAD 1.419659e-06
## 49 Var28_clean 2.576883e-03
## 50 Var28_isBAD 6.216094e-16
## 51 Var29_clean 3.997658e-01
## 52 Var29_isBAD 4.529984e-01
## 53 Var30_clean 8.526755e-01
## 54 Var30_isBAD 4.529984e-01
## 55 Var33_clean 8.218358e-01
## 56 Var33_isBAD 1.069230e-04
## 57 Var34_clean 2.481331e-01
## 58 Var34_isBAD 4.708216e-06
## 59 Var35_clean 6.384715e-02
## 60 Var35_isBAD 6.650215e-16
## 61 Var36_clean 8.308712e-01
## 62 Var36_isBAD 4.708216e-06
## 63 Var37_clean 7.051880e-01
## 64 Var37_isBAD 1.862012e-06
## 65 Var38_clean 1.377868e-01
## 66 Var38_isBAD 6.650215e-16
## 67 Var40_clean 6.865043e-01
## 68 Var40_isBAD 4.708216e-06
## 69 Var41_clean 8.896375e-01
## 70 Var41_isBAD 4.529984e-01
## 71 Var43_clean 9.368772e-01
## 72 Var43_isBAD 4.708216e-06
## 73 Var44_clean 6.515621e-01
## 74 Var44_isBAD 6.650215e-16
## 75 Var45_clean 3.862460e-01
## 76 Var45_isBAD 2.037797e-01
## 77 Var46_clean 6.375689e-01
## 78 Var46_isBAD 4.708216e-06
## 79 Var47_clean 6.770957e-01
## 80 Var47_isBAD 4.529984e-01
## 81 Var49_clean 5.199929e-01
## 82 Var49_isBAD 4.708216e-06
## 83 Var50_clean 5.624658e-01
## 84 Var50_isBAD 4.529984e-01
## 85 Var51_clean 1.451012e-01
## 86 Var51_isBAD 5.895954e-05
## 87 Var53_clean 1.419659e-01
## 88 Var53_isBAD 4.529984e-01
## 89 Var54_clean 8.323607e-01
## 90 Var54_isBAD 4.708216e-06
## 91 Var56_clean 8.017284e-01
## 92 Var56_isBAD 4.294514e-02
## 93 Var57_clean 1.196233e-01
## 94 Var58_clean 9.500667e-01
## 95 Var58_isBAD 4.529984e-01
## 96 Var59_clean 4.834119e-01
## 97 Var59_isBAD 2.014521e-02
## 98 Var60_clean 8.494907e-01
## 99 Var60_isBAD 1.419659e-06
## 100 Var61_clean 9.396293e-01
## 101 Var61_isBAD 1.069230e-04
## 102 Var62_clean 5.246557e-01
## 103 Var62_isBAD 7.958694e-01
## 104 Var63_clean 7.695258e-01
## 105 Var63_isBAD 5.016146e-01
## 106 Var64_clean 9.402750e-01
## 107 Var64_isBAD 1.933946e-01
## 108 Var65_clean 4.650316e-17
## 109 Var65_isBAD 4.938685e-12
## 110 Var66_clean 7.232170e-01
## 111 Var66_isBAD 5.016146e-01
## 112 Var67_clean 6.842271e-01
## 113 Var67_isBAD 1.419659e-06
## 114 Var68_clean 7.448982e-01
## 115 Var68_isBAD 4.708216e-06
## 116 Var69_clean 3.134760e-01
## 117 Var69_isBAD 1.419659e-06
## 118 Var70_clean 7.319862e-01
## 119 Var70_isBAD 1.419659e-06
## 120 Var71_clean 9.406609e-01
## 121 Var71_isBAD 8.505976e-06
## 122 Var72_clean 5.281298e-12
## 123 Var72_isBAD 1.082957e-01
## 124 Var73_clean 4.221817e-50
## 125 Var74_clean 1.142333e-34
## 126 Var74_isBAD 4.938685e-12
## 127 Var75_clean 5.766790e-01
## 128 Var75_isBAD 4.708216e-06
## 129 Var76_clean 1.258218e-01
## 130 Var76_isBAD 6.650215e-16
## 131 Var77_clean 3.869888e-01
## 132 Var77_isBAD 4.529984e-01
## 133 Var78_clean 5.539030e-01
## 134 Var78_isBAD 6.650215e-16
## 135 Var80_clean 8.666502e-01
## 136 Var80_isBAD 1.419659e-06
## 137 Var81_clean 1.954891e-12
## 138 Var81_isBAD 1.822398e-15
## 139 Var82_clean 6.062823e-02
## 140 Var82_isBAD 1.862012e-06
## 141 Var83_clean 8.666888e-01
## 142 Var83_isBAD 6.650215e-16
## 143 Var84_clean 6.994175e-01
## 144 Var84_isBAD 4.885833e-06
## 145 Var85_clean 1.604340e-02
## 146 Var85_isBAD 6.650215e-16
## 147 Var86_clean 6.330919e-01
## 148 Var86_isBAD 4.529984e-01
## 149 Var87_clean 5.909534e-01
## 150 Var87_isBAD 4.529984e-01
## 151 Var88_clean 8.070649e-01
## 152 Var88_isBAD 2.035268e-04
## 153 Var89_clean 5.913166e-01
## 154 Var89_isBAD 4.294514e-02
## 155 Var90_clean 6.519917e-01
## 156 Var90_isBAD 4.529984e-01
## 157 Var91_clean 9.406609e-01
## 158 Var91_isBAD 8.505976e-06
## 159 Var92_clean 5.102136e-01
## 160 Var92_isBAD 9.131691e-01
## 161 Var93_clean 8.799233e-01
## 162 Var93_isBAD 1.419659e-06
## 163 Var94_clean 3.040503e-03
## 164 Var94_isBAD 1.082957e-01
## 165 Var95_clean 8.740925e-01
## 166 Var95_isBAD 4.708216e-06
## 167 Var96_clean 9.791331e-01
## 168 Var96_isBAD 4.708216e-06
## 169 Var97_clean 5.104204e-01
## 170 Var97_isBAD 1.419659e-06
## 171 Var98_clean 5.196590e-01
## 172 Var98_isBAD 7.958694e-01
## 173 Var99_clean 8.074618e-01
## 174 Var99_isBAD 1.862012e-06
## 175 Var100_clean 3.622525e-01
## 176 Var100_isBAD 4.529984e-01
## 177 Var101_clean 9.132568e-01
## 178 Var101_isBAD 1.609703e-04
## 179 Var102_clean 8.995868e-01
## 180 Var102_isBAD 4.212975e-02
## 181 Var103_clean 9.833747e-01
## 182 Var103_isBAD 1.419659e-06
## 183 Var104_clean 9.008945e-01
## 184 Var104_isBAD 2.014521e-02
## 185 Var105_clean 9.008945e-01
## 186 Var105_isBAD 2.014521e-02
## 187 Var106_clean 9.832242e-01
## 188 Var106_isBAD 1.862012e-06
## 189 Var107_clean 9.052218e-01
## 190 Var107_isBAD 1.419659e-06
## 191 Var108_clean 4.496980e-01
## 192 Var108_isBAD 4.529984e-01
## 193 Var109_clean 1.821215e-01
## 194 Var109_isBAD 3.283203e-02
## 195 Var110_clean 1.182896e-01
## 196 Var110_isBAD 4.529984e-01
## 197 Var111_clean 5.100830e-01
## 198 Var111_isBAD 8.505976e-06
## 199 Var112_clean 1.461138e-01
## 200 Var112_isBAD 6.650215e-16
## 201 Var113_clean 3.397145e-35
## 202 Var114_clean 1.801976e-01
## 203 Var114_isBAD 4.708216e-06
## 204 Var115_clean 5.050064e-01
## 205 Var115_isBAD 2.014521e-02
## 206 Var116_clean 8.816889e-01
## 207 Var116_isBAD 4.529984e-01
## 208 Var117_clean 5.088379e-01
## 209 Var117_isBAD 1.862012e-06
## 210 Var118_isBAD 9.131691e-01
## 211 Var119_clean 2.755317e-02
## 212 Var119_isBAD 1.822398e-15
## 213 Var120_clean 9.554935e-01
## 214 Var120_isBAD 1.419659e-06
## 215 Var121_clean 9.468560e-01
## 216 Var121_isBAD 4.529984e-01
## 217 Var122_clean 5.330364e-01
## 218 Var122_isBAD 4.708216e-06
## 219 Var123_clean 8.512207e-01
## 220 Var123_isBAD 6.650215e-16
## 221 Var124_clean 8.426307e-01
## 222 Var124_isBAD 1.862012e-06
## 223 Var125_clean 1.240769e-06
## 224 Var125_isBAD 4.938685e-12
## 225 Var126_clean 1.016548e-18
## 226 Var126_isBAD 3.089900e-73
## 227 Var127_clean 9.526229e-01
## 228 Var127_isBAD 2.035268e-04
## 229 Var128_clean 8.070649e-01
## 230 Var128_isBAD 2.035268e-04
## 231 Var129_clean 8.423695e-01
## 232 Var129_isBAD 4.529984e-01
## 233 Var130_clean 6.175541e-02
## 234 Var130_isBAD 4.885833e-06
## 235 Var131_clean 1.474015e-01
## 236 Var131_isBAD 4.529984e-01
## 237 Var132_clean 1.146808e-01
## 238 Var132_isBAD 6.650215e-16
## 239 Var133_clean 9.205745e-01
## 240 Var133_isBAD 6.650215e-16
## 241 Var134_clean 2.869840e-01
## 242 Var134_isBAD 6.650215e-16
## 243 Var135_clean 9.381591e-01
## 244 Var135_isBAD 1.862012e-06
## 245 Var136_clean 2.068155e-01
## 246 Var136_isBAD 5.016146e-01
## 247 Var137_clean 6.449236e-01
## 248 Var137_isBAD 4.529984e-01
## 249 Var138_clean 8.520043e-01
## 250 Var138_isBAD 1.862012e-06
## 251 Var139_clean 3.492143e-02
## 252 Var139_isBAD 1.419659e-06
## 253 Var140_clean 4.744656e-17
## 254 Var140_isBAD 4.938685e-12
## 255 Var142_clean 1.548562e-01
## 256 Var142_isBAD 4.529984e-01
## 257 Var143_clean 2.680228e-02
## 258 Var143_isBAD 6.650215e-16
## 259 Var144_clean 8.900620e-30
## 260 Var144_isBAD 1.822398e-15
## 261 Var145_clean 8.188110e-01
## 262 Var145_isBAD 1.862012e-06
## 263 Var146_clean 8.568031e-01
## 264 Var146_isBAD 1.419659e-06
## 265 Var147_clean 2.938475e-01
## 266 Var147_isBAD 1.419659e-06
## 267 Var148_clean 9.401887e-01
## 268 Var148_isBAD 1.419659e-06
## 269 Var149_clean 7.314142e-03
## 270 Var149_isBAD 3.283203e-02
## 271 Var150_clean 9.022465e-01
## 272 Var150_isBAD 1.862012e-06
## 273 Var151_clean 9.380727e-01
## 274 Var151_isBAD 1.069230e-04
## 275 Var152_clean 8.730887e-01
## 276 Var152_isBAD 1.862012e-06
## 277 Var153_clean 9.003705e-02
## 278 Var153_isBAD 6.650215e-16
## 279 Var154_clean 5.678110e-01
## 280 Var154_isBAD 4.529984e-01
## 281 Var155_clean 9.893070e-01
## 282 Var155_isBAD 1.862012e-06
## 283 Var156_clean 7.232170e-01
## 284 Var156_isBAD 5.016146e-01
## 285 Var157_clean 8.228076e-01
## 286 Var157_isBAD 8.505976e-06
## 287 Var158_clean 9.410387e-01
## 288 Var158_isBAD 1.609703e-04
## 289 Var159_clean 9.823880e-01
## 290 Var159_isBAD 4.708216e-06
## 291 Var160_clean 5.977602e-03
## 292 Var160_isBAD 6.650215e-16
## 293 Var161_clean 3.160734e-01
## 294 Var161_isBAD 1.862012e-06
## 295 Var162_clean 5.769463e-01
## 296 Var162_isBAD 4.708216e-06
## 297 Var163_clean 4.873034e-01
## 298 Var163_isBAD 6.650215e-16
## 299 Var164_clean 7.113866e-01
## 300 Var164_isBAD 1.862012e-06
## 301 Var165_clean 8.730784e-01
## 302 Var165_isBAD 1.609703e-04
## 303 Var166_clean 9.690297e-01
## 304 Var166_isBAD 1.419659e-06
## 305 Var168_clean 1.250505e-02
## 306 Var168_isBAD 4.529984e-01
## 307 Var170_clean 5.107866e-01
## 308 Var170_isBAD 4.708216e-06
## 309 Var171_clean 8.303872e-01
## 310 Var171_isBAD 2.035268e-04
## 311 Var172_clean 4.570958e-01
## 312 Var172_isBAD 1.419659e-06
## 313 Var173_clean 2.412398e-02
## 314 Var173_isBAD 6.650215e-16
## 315 Var174_clean 6.481659e-01
## 316 Var174_isBAD 1.862012e-06
## 317 Var176_clean 5.014708e-01
## 318 Var176_isBAD 4.885833e-06
## 319 Var177_clean 1.286309e-01
## 320 Var177_isBAD 4.708216e-06
## 321 Var178_clean 5.505067e-01
## 322 Var178_isBAD 4.294514e-02
## 323 Var179_clean 8.971566e-01
## 324 Var179_isBAD 1.862012e-06
## 325 Var180_clean 8.457465e-01
## 326 Var180_isBAD 4.529984e-01
## 327 Var181_clean 8.072579e-01
## 328 Var181_isBAD 6.650215e-16
## 329 Var182_clean 9.562394e-01
## 330 Var182_isBAD 1.862012e-06
## 331 Var183_clean 7.838545e-01
## 332 Var183_isBAD 4.708216e-06
## 333 Var184_clean 7.155066e-01
## 334 Var184_isBAD 4.708216e-06
## 335 Var186_clean 8.510361e-01
## 336 Var186_isBAD 4.529984e-01
## 337 Var187_clean 6.159631e-01
## 338 Var187_isBAD 4.529984e-01
## 339 Var188_clean 5.453843e-02
## 340 Var188_isBAD 4.708216e-06
## 341 Var189_clean 1.291969e-64
## 342 Var189_isBAD 1.970811e-01
## 343 Var190_clean 5.455177e-01
## 344 Var190_isBAD 1.930716e-01
## 345 Var192_catP 2.082577e-10
## 346 Var192_catB 5.533664e-40
## 347 Var193_catP 1.701853e-39
## 348 Var193_catB 6.236651e-32
## 349 Var194_catP 6.096128e-05
## 350 Var194_catB 1.291723e-03
## 351 Var195_catP 6.138027e-06
## 352 Var195_catB 1.929683e-05
## 353 Var196_catP 9.404524e-02
## 354 Var196_catB 1.000194e-01
## 355 Var197_catP 1.416153e-06
## 356 Var197_catB 6.309090e-04
## 357 Var198_catP 9.978122e-23
## 358 Var198_catB 7.173122e-20
## 359 Var199_catP 1.470377e-36
## 360 Var199_catB 7.717977e-62
## 361 Var200_catP 9.623338e-29
## 362 Var200_catB 1.718490e-16
## 363 Var201_catP 6.754469e-05
## 364 Var201_catB 1.024274e-04
## 365 Var202_catP 2.682362e-18
## 366 Var202_catB 2.582971e-06
## 367 Var203_catP 2.512826e-02
## 368 Var203_catB 5.392982e-01
## 369 Var204_catP 9.407627e-04
## 370 Var204_catB 2.171628e-11
## 371 Var205_catP 2.599738e-17
## 372 Var205_catB 3.967139e-40
## 373 Var206_catP 4.392588e-30
## 374 Var206_catB 5.187663e-68
## 375 Var207_catP 2.546858e-25
## 376 Var207_catB 2.217665e-32
## 377 Var208_catP 1.530395e-01
## 378 Var208_catB 7.321903e-01
## 379 Var210_catP 9.785209e-17
## 380 Var210_catB 3.809536e-22
## 381 Var212_catP 6.314134e-53
## 382 Var212_catB 3.411730e-51
## 383 Var214_catP 9.623338e-29
## 384 Var214_catB 1.718490e-16
## 385 Var215_catP 5.019360e-01
## 386 Var216_catP 2.257811e-01
## 387 Var216_catB 2.563654e-60
## 388 Var217_catP 4.250294e-63
## 389 Var217_catB 5.031841e-25
## 390 Var218_catP 2.418266e-55
## 391 Var218_catB 3.577217e-65
## 392 Var219_catP 8.744379e-03
## 393 Var219_catB 4.589762e-02
## 394 Var220_catP 9.978122e-23
## 395 Var220_catB 7.173122e-20
## 396 Var221_catP 1.571828e-20
## 397 Var221_catB 1.473679e-22
## 398 Var222_catP 9.978122e-23
## 399 Var222_catB 7.173122e-20
## 400 Var223_catP 9.237290e-01
## 401 Var223_catB 4.394179e-02
## 402 Var225_catP 4.506149e-28
## 403 Var225_catB 2.921872e-31
## 404 Var226_catP 4.519958e-06
## 405 Var226_catB 3.080169e-15
## 406 Var227_catP 6.712252e-26
## 407 Var227_catB 1.253216e-31
## 408 Var228_catP 1.055705e-41
## 409 Var228_catB 2.522467e-43
## 410 Var229_catP 4.670404e-38
## 411 Var229_catB 2.832005e-36
## 412 Var191_lev_NA 2.035268e-04
## 413 Var191_lev_x.r__I 2.035268e-04
## 414 Var193_lev_x.2Knk1KF 4.037365e-22
## 415 Var193_lev_x.AERks4l 4.739646e-04
## 416 Var193_lev_x.RO12 1.876226e-40
## 417 Var194_lev_NA 6.161267e-05
## 418 Var194_lev_x.SEuy 8.513717e-05
## 419 Var195_lev_x.taul 5.902705e-06
## 420 Var196_lev_x.1K8T 9.389575e-02
## 421 Var197_lev_x.0Xwj 1.280985e-02
## 422 Var197_lev_x.487l 6.156225e-02
## 423 Var197_lev_x.JLbT 2.873384e-01
## 424 Var197_lev_x.ssAy 3.997540e-03
## 425 Var197_lev_x.TyGl 4.804164e-02
## 426 Var198_lev_rare 3.059497e-06
## 427 Var198_lev_x.fhk21Ss 1.586589e-20
## 428 Var198_lev_x.PHNvXy8 1.178434e-02
## 429 Var199_lev_rare 2.388672e-18
## 430 Var200_lev_NA 9.625925e-29
## 431 Var200_lev_rare 3.419952e-22
## 432 Var201_lev_NA 6.239981e-05
## 433 Var201_lev_x.smXZ 5.556270e-05
## 434 Var202_lev_rare 1.717677e-01
## 435 Var203_lev_x.9_Y1 2.457481e-02
## 436 Var203_lev_x.F3hy 5.199863e-02
## 437 Var203_lev_x.HLqf 1.011029e-01
## 438 Var204_lev_x.15m3 1.902608e-01
## 439 Var204_lev_x.m_h1 2.657181e-01
## 440 Var204_lev_x.RcM7 2.412547e-03
## 441 Var204_lev_x.rGJy 3.201938e-01
## 442 Var204_lev_x.RVjC 6.707372e-03
## 443 Var204_lev_x.SkZj 1.120507e-01
## 444 Var204_lev_x.z5Ry 3.144715e-03
## 445 Var205_lev_NA 3.520832e-01
## 446 Var205_lev_x.09_Q 2.208737e-03
## 447 Var205_lev_x.sJzTlal 4.297525e-43
## 448 Var205_lev_x.VpdQ 4.100066e-11
## 449 Var206_lev_NA 1.822398e-15
## 450 Var206_lev_x.43pnToF 1.959477e-01
## 451 Var206_lev_x.6JmL 1.236047e-02
## 452 Var206_lev_x.hAFG 5.468636e-08
## 453 Var206_lev_x.haYg 8.746431e-12
## 454 Var206_lev_x.IYzP 3.285526e-23
## 455 Var206_lev_x.kxE9 8.274335e-06
## 456 Var206_lev_x.sYC_ 1.925194e-01
## 457 Var206_lev_x.wMei 2.194114e-01
## 458 Var206_lev_x.y6dw 7.699583e-14
## 459 Var206_lev_x.zm5i 5.300895e-14
## 460 Var207_lev_x.7M47J5GA0pTYIFxg5uy 5.394318e-24
## 461 Var207_lev_x.DHn_WUyBhW_whjA88g9bvA64_ 2.898618e-08
## 462 Var207_lev_x.me75fM6ugJ 1.389566e-27
## 463 Var207_lev_x.NKv3VA1BpP 2.653449e-02
## 464 Var208_lev_x.kIsH 1.448640e-01
## 465 Var208_lev_x.sBgB 8.562110e-02
## 466 Var210_lev_x.g5HH 2.862852e-16
## 467 Var210_lev_x.uKAI 7.611924e-17
## 468 Var211_lev_x.L84s 7.064698e-12
## 469 Var211_lev_x.Mtgm 7.064698e-12
## 470 Var212_lev_x.4kVnq_T26xq1p 1.021609e-06
## 471 Var212_lev_x.CrNX 8.613110e-08
## 472 Var212_lev_x.Ie_5MZs 1.096606e-02
## 473 Var212_lev_x.NhsEn4L 1.170222e-54
## 474 Var212_lev_x.XfqtO3UdzaXh_ 1.074954e-17
## 475 Var213_lev_NA 8.505976e-06
## 476 Var213_lev_x.KdSa 8.505976e-06
## 477 Var214_lev_NA 9.625925e-29
## 478 Var214_lev_rare 3.419952e-22
## 479 Var216_lev_rare 2.882467e-03
## 480 Var216_lev_x.11p4mKe 3.024677e-02
## 481 Var216_lev_x.kZJtVhC 8.818825e-03
## 482 Var216_lev_x.kZJyVg2 1.515818e-03
## 483 Var216_lev_x.mAja5EA 1.151189e-02
## 484 Var216_lev_x.mAjbk_S 3.523563e-01
## 485 Var216_lev_x.NGZxnJM 3.831746e-02
## 486 Var216_lev_x.XTbPUYD 5.534288e-15
## 487 Var217_lev_rare 3.764304e-16
## 488 Var218_lev_x.cJvF 1.064714e-28
## 489 Var218_lev_x.UYBR 1.028586e-11
## 490 Var219_lev_NA 3.455312e-01
## 491 Var219_lev_x.AU8pNoi 3.901751e-03
## 492 Var219_lev_x.FzaX 1.787239e-02
## 493 Var220_lev_rare 3.059497e-06
## 494 Var220_lev_x.4UxGlow 1.586589e-20
## 495 Var220_lev_x.UF16siJ 1.178434e-02
## 496 Var221_lev_x.d0EEeJi 2.329916e-08
## 497 Var221_lev_x.oslk 1.188016e-20
## 498 Var221_lev_x.QKW8DRm 1.699119e-06
## 499 Var221_lev_x.zCkv 1.187931e-08
## 500 Var222_lev_rare 3.059497e-06
## 501 Var222_lev_x.APgdzOv 1.178434e-02
## 502 Var222_lev_x.catzS2D 1.586589e-20
## 503 Var223_lev_NA 3.455312e-01
## 504 Var224_lev_NA 2.014521e-02
## 505 Var225_lev_NA 2.137222e-29
## 506 Var225_lev_x.ELof 4.193752e-22
## 507 Var225_lev_x.kG3k 4.614202e-03
## 508 Var225_lev_x.xG3x 4.125449e-03
## 509 Var226_lev_x.5Acm 4.555151e-01
## 510 Var226_lev_x.7aLG 2.861099e-01
## 511 Var226_lev_x.7P5s 4.617143e-07
## 512 Var226_lev_x.Aoh3 1.826908e-01
## 513 Var226_lev_x.FSa2 1.918030e-10
## 514 Var226_lev_x.kwS7 1.599428e-01
## 515 Var226_lev_x.me1d 3.786301e-03
## 516 Var226_lev_x.PM2D 6.991124e-02
## 517 Var226_lev_x.Qcbd 1.312678e-01
## 518 Var226_lev_x.Qu4f 3.602696e-02
## 519 Var226_lev_x.rgKb 3.396179e-01
## 520 Var226_lev_x.szEZ 2.278658e-04
## 521 Var226_lev_x.TNEC 3.447014e-01
## 522 Var226_lev_x.uWr3 1.991424e-02
## 523 Var226_lev_x.Xa3G 1.390213e-02
## 524 Var227_lev_x.02N6s8f 2.410784e-03
## 525 Var227_lev_x.6fzt 2.948771e-01
## 526 Var227_lev_x.nIGXDli 4.602414e-04
## 527 Var227_lev_x.RAYp 2.927722e-27
## 528 Var227_lev_x.ZI9m 1.825722e-24
## 529 Var228_lev_x.55YFVY9 9.148369e-10
## 530 Var228_lev_x.F2FyR07IdsN7I 2.261755e-42
## 531 Var228_lev_x.ib5G6X1eUxUn6 8.780686e-13
## 532 Var228_lev_x.iyHGyLCEkQ 2.923118e-09
## 533 Var228_lev_x.R4y5gQQWY8OodqDV 5.551966e-05
## 534 Var228_lev_x.TCU50_Yjmm6GIBZ0lL_ 4.796431e-08
## 535 Var229_lev_NA 3.635024e-38
## 536 Var229_lev_x.am7c 4.398543e-16
## 537 Var229_lev_x.mj86 2.190401e-14
# Recode the training outcome as logical: TRUE where it equals yTarget.
treatedTrainM[[yName]] <- (treatedTrainM[[yName]] == yTarget)

# Apply the treatment plan to the held-out rows, pruning at the same
# kddSig cutoff, and recode the outcome the same way.
treatedTest <- prepare(
  treatmentsC, dTest,
  pruneSig = kddSig,
  parallelCluster = cl
)
treatedTest[[yName]] <- (treatedTest[[yName]] == yTarget)

# The cluster is not needed past this point; shut it down if one was
# started and clear the handle.
if (!is.null(cl)) {
  parallel::stopCluster(cl)
  cl <- NULL
}
# Run other models (with proper coding/training separation).
#
# This gets us back to AUC 0.72
#print(selvars)
# prepare plotting frames
# (one-column frames holding just the outcome; each model's test
# predictions are added to treatedTestP as a column named after the model)
treatedTrainP <- treatedTrainM[, yName, drop = FALSE]
treatedTestP <- treatedTest[, yName, drop = FALSE]
# NOTE(review): formulaS is not used by the models below; kept in case
# later code relies on it.
formulaS <- paste(yName, paste(selvars, collapse = ' + '), sep = ' ~ ')

for (mname in c('glmPred', 'xgboost')) {
  print("*****************************")
  print(date())
  print(paste(mname, length(selvars)))
  if (mname == 'xgboost') {
    # mkXGBoostModelC returns a function that scores a treated frame
    m <- mkXGBoostModelC(treatedTrainM, selvars, yName)
    treatedTestP[[mname]] <- m(treatedTest)
  } else {
    # cross-validated glmnet (alpha = 0.5, binomial family) on the
    # selected treated variables
    modelglms <- cv.glmnet(x = as.matrix(treatedTrainM[, selvars, drop = FALSE]),
                           y = treatedTrainM[[yName]],
                           alpha = 0.5,
                           family = 'binomial')
    #print(summary(modelglms))
    treatedTestP[[mname]] <- as.numeric(predict(modelglms,
                                                newx = as.matrix(treatedTest[, selvars, drop = FALSE]),
                                                type = 'response'))
  }
  # Log when fitting/scoring finished. Values are not auto-printed inside
  # a for loop, so the timestamp has to be printed explicitly.
  print(base::date())
  t2 <- paste(mname, 'test data')
  # ggplot objects are likewise only rendered inside a loop when printed
  print(DoubleDensityPlot(treatedTestP, mname, yName,
                          title = t2))
  print(ROCPlot(treatedTestP, mname, yName, yTarget,
                title = t2))
  print(DoubleHistogramPlot(treatedTestP, mname, yName,
                            title = t2))
  print(date())
  print("*****************************")
}
## [1] "*****************************"
## [1] "Fri Feb 16 15:44:28 2018"
## [1] "glmPred 312"
## [1] "Fri Feb 16 15:51:48 2018"
## [1] "*****************************"
## [1] "*****************************"
## [1] "Fri Feb 16 15:51:48 2018"
## [1] "xgboost 312"
## [1] "start mkXGBoostModel Fri Feb 16 15:51:48 2018"
## [1] " start mkXGBoostModel xgb.cv Fri Feb 16 15:51:48 2018"
## [1] "xgboost: ntrees 61"
## [1] " start mkXGBoostModel xgboost Fri Feb 16 16:12:10 2018"
## [1] "done mkXGBoostModel Fri Feb 16 16:12:29 2018"
## [1] "Fri Feb 16 16:12:30 2018"
## [1] "*****************************"