knitr::opts_chunk$set(echo = TRUE)
library('vtreat')
library('WVPlots') # see: https://github.com/WinVector/WVPlots
## Loading required package: ggplot2
## Loading required package: grid
## Loading required package: gridExtra
## Loading required package: reshape2
## Loading required package: ROCR
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
## Loading required package: plyr
## Loading required package: stringr
## Loading required package: mgcv
## Loading required package: nlme
## This is mgcv 1.8-12. For overview type 'help("mgcv-package")'.
library('rpart')
library('caret')
## Loading required package: lattice
#library('doMC')
# Size the worker pool from the detected core count. detectCores() is
# documented to return NA when the count cannot be determined, and
# makeCluster() rejects a worker count that is NA or below 1, so fall back
# to a single worker in that case.
ncore <- parallel::detectCores()
if (is.na(ncore) || ncore < 1L) {
  ncore <- 1L
}
#registerDoMC(cores = ncore)
# `cl` is the cluster handle handed to the vtreat calls later in the script
# (as parallelCluster) and shut down at the very end.
cl <- parallel::makeCluster(ncore)
# see: https://github.com/WinVector/PreparingDataWorkshop/tree/master/KDD2009
# Load the predictor table: tab-separated with a header row. Both the literal
# string NA and empty fields are read as missing; strings stay character.
d <- read.table('orange_small_train.data.gz',
                header = TRUE, sep = '\t', na.strings = c('NA', ''),
                strip.white = TRUE,
                stringsAsFactors = FALSE)
# Load the churn labels: a headerless single column, which read.table
# names V1.
churn <- read.table('orange_small_train_churn.labels.txt',
                    header = FALSE, sep = '\t',
                    strip.white = TRUE,
                    stringsAsFactors = FALSE)
# Labels are attached purely by row position, so the two files must line up
# one-to-one. Check explicitly: `$<-` on a data frame recycles a shorter
# vector whose length divides the row count instead of raising an error.
stopifnot(nrow(churn) == nrow(d))
d$churn <- churn$V1
# Reproducible random split: one uniform draw per row; rows with a draw of at
# most 0.9 go to training, the rest to test.
set.seed(729375)
rgroup <- runif(nrow(d))
dTrain <- d[rgroup <= 0.9, ] # set for building models
dTest <- d[rgroup > 0.9, ] # set for evaluation
# The combined table and the raw labels are not used after the split.
rm(list = c('d', 'churn'))
# Column names treated as outcomes rather than inputs. Only 'churn' is
# attached to the data in this script; the other two are excluded in case
# they are present.
outcomes <- c('churn', 'appetency', 'upselling')
# Names that must never be offered to the model as inputs.
nonvars <- c(outcomes, 'rgroup')
# Candidate inputs: every column of the training frame not excluded above.
vars <- setdiff(names(dTrain), nonvars)
# Outcome column to model, and the value of it that counts as the positive
# class.
yTarget <- 1
yName <- 'churn'
# build data treatments
# Seed immediately before the call -- presumably so that any randomized
# splitting done while building the cross-frame is repeatable.
set.seed(239525)
# build treatments
trainPlan <- mkCrossFrameCExperiment(
  dTrain,
  vars, yName, yTarget,
  smFactor = 2.0,
  parallelCluster = cl
)
# Report which splitting method was used for the cross-frame.
print(trainPlan[['method']])
## [1] "kwaycrossystratified"
# The plan supplies two things used below: the treatment plan (applied to
# new data via prepare()) and the cross-frame (the treated training data).
treatmentsC <- trainPlan[['treatments']]
treatedTrainM <- trainPlan[['crossFrame']]
#kddSig = 1/nrow(treatmentsC$scoreFrame)
# Treated variable names: every cross-frame column that is not an outcome.
selvars <- setdiff(names(treatedTrainM), outcomes)
# Recode the training outcome as logical: TRUE where it equals yTarget.
treatedTrainM[[yName]] <- treatedTrainM[[yName]] == yTarget
# Apply the treatment plan to the held-out rows, restricted to the same
# treated variables, with no significance-based pruning (pruneSig = NULL).
treatedTest <- prepare(
  treatmentsC,
  dTest,
  varRestriction = selvars,
  pruneSig = NULL,
  parallelCluster = cl
)
# Same logical recoding of the outcome for the test frame.
treatedTest[[yName]] <- treatedTest[[yName]] == yTarget
# Variable selection: keep treated variables whose significance is below
# 1 / (number of scored variables).
scoreFrame <- treatmentsC$scoreFrame
sigThreshold <- 1 / nrow(scoreFrame)
# which() drops comparisons that evaluate to NA. Indexing with the raw
# logical vector would instead insert an NA entry into goodvars for every
# variable whose significance is NA, and that NA would end up as a literal
# term in the formula string below.
goodvars <- scoreFrame$varName[which(scoreFrame$sig < sigThreshold)]
# With nothing selected the formula string would be "<outcome> ~ " and the
# fit would fail with an obscure parse error; fail here with a clear message.
if (length(goodvars) == 0) {
  stop("no treated variable passed the significance filter; cannot build the model formula",
       call. = FALSE)
}
# `form` is kept as a character string of the shape "y ~ v1 + v2 + ...".
form <- paste(yName, paste(goodvars, collapse = ' + '), sep = ' ~ ')
# cp = 0.001 is the complexity parameter that controls how far the tree is grown.
rcontrol <- rpart.control(cp = 0.001)
# No method is given, so rpart picks one from the outcome type; the recorded
# summary output (node means and MSE) shows a regression-style tree on the
# logical outcome.
m <- rpart(form, treatedTrainM, control = rcontrol)
summary(m)
## Call:
## rpart(formula = form, data = treatedTrainM, control = rcontrol)
## n= 45028
##
## CP nsplit rel error xerror xstd
## 1 0.012698340 0 1.0000000 1.0000249 0.01537689
## 2 0.008453711 1 0.9873017 0.9873581 0.01499669
## 3 0.007171726 2 0.9788479 0.9806515 0.01489179
## 4 0.007012624 3 0.9716762 0.9729558 0.01468386
## 5 0.003488916 4 0.9646636 0.9661061 0.01454178
## 6 0.002403099 5 0.9611747 0.9620480 0.01446954
## 7 0.002251124 6 0.9587716 0.9611113 0.01444146
## 8 0.002237793 7 0.9565205 0.9585800 0.01438728
## 9 0.001895567 10 0.9498071 0.9553720 0.01433074
## 10 0.001525045 11 0.9479115 0.9531197 0.01430570
## 11 0.001199865 12 0.9463865 0.9528562 0.01430973
## 12 0.001175852 13 0.9451866 0.9535656 0.01432270
## 13 0.001016774 14 0.9440108 0.9532200 0.01431974
## 14 0.001000000 15 0.9429940 0.9542217 0.01433326
##
## Variable importance
## Var126_clean Var217_catP
## 15 6
## Var218_catB Var218_catP
## 6 6
## Var126_isBAD Var74_clean
## 6 5
## Var13_clean Var211_lev_x.L84s
## 5 5
## Var211_lev_x.Mtgm Var140_clean
## 5 4
## Var7_clean Var73_clean
## 4 3
## Var125_clean Var212_catB
## 3 2
## Var212_catP Var212_lev_x.NhsEn4L
## 2 2
## Var192_catB Var6_clean
## 2 2
## Var189_clean Var192_catP
## 2 2
## Var199_catP Var228_catB
## 1 1
## Var193_catB Var207_catB
## 1 1
## Var207_catP Var207_lev_x.me75fM6ugJ
## 1 1
## Var193_catP Var193_lev_x.RO12
## 1 1
## Var228_lev_x.F2FyR07IdsN7I Var217_catB
## 1 1
## Var113_clean Var216_catB
## 1 1
##
## Node number 1: 45028 observations, complexity param=0.01269834
## mean=0.07370969, MSE=0.06827657
## left son=2 (12046 obs) right son=3 (32982 obs)
## Primary splits:
## Var126_clean < -15 to the left, improve=0.012698340, (0 missing)
## Var126_isBAD < 0.5 to the left, improve=0.007833296, (0 missing)
## Var74_clean < 3.5 to the right, improve=0.007767768, (0 missing)
## Var13_clean < 30 to the right, improve=0.007342036, (0 missing)
## Var218_catP < 0.2466773 to the right, improve=0.007320165, (0 missing)
## Surrogate splits:
## Var211_lev_x.L84s < 0.5 to the left, agree=0.848, adj=0.433, (0 split)
## Var211_lev_x.Mtgm < 0.5 to the right, agree=0.848, adj=0.433, (0 split)
## Var73_clean < 9 to the left, agree=0.760, adj=0.104, (0 split)
## Var113_clean < -1253786 to the left, agree=0.754, adj=0.080, (0 split)
## Var22_isBAD < 0.5 to the right, agree=0.747, adj=0.055, (0 split)
##
## Node number 2: 12046 observations, complexity param=0.001175852
## mean=0.02498755, MSE=0.02436317
## left son=4 (12030 obs) right son=5 (16 obs)
## Primary splits:
## Var192_catB < 0.9740576 to the left, improve=0.012317730, (0 missing)
## Var210_catB < 0.4563415 to the left, improve=0.005462331, (0 missing)
## Var210_lev_x.g5HH < 0.5 to the left, improve=0.005145479, (0 missing)
## Var216_catB < 0.5860013 to the left, improve=0.004648987, (0 missing)
## Var210_catP < 0.4910221 to the right, improve=0.004361510, (0 missing)
##
## Node number 3: 32982 observations, complexity param=0.008453711
## mean=0.09150446, MSE=0.08313139
## left son=6 (32530 obs) right son=7 (452 obs)
## Primary splits:
## Var218_catP < 0.2466773 to the right, improve=0.009478939, (0 missing)
## Var218_catB < 0.6405356 to the left, improve=0.009478939, (0 missing)
## Var217_catP < 0.01014358 to the left, improve=0.009478939, (0 missing)
## Var74_clean < 3.5 to the right, improve=0.008521106, (0 missing)
## Var216_catB < 0.6173914 to the left, improve=0.007839353, (0 missing)
## Surrogate splits:
## Var217_catP < 0.01014358 to the left, agree=1, adj=1, (0 split)
## Var218_catB < 0.6405356 to the left, agree=1, adj=1, (0 split)
##
## Node number 4: 12030 observations
## mean=0.02435578, MSE=0.02376257
##
## Node number 5: 16 observations
## mean=0.5, MSE=0.25
##
## Node number 6: 32530 observations, complexity param=0.007171726
## mean=0.08819551, MSE=0.08041706
## left son=12 (18225 obs) right son=13 (14305 obs)
## Primary splits:
## Var74_clean < 3.5 to the right, improve=0.008428412, (0 missing)
## Var126_clean < 1 to the right, improve=0.007885540, (0 missing)
## Var216_catB < 0.6173914 to the left, improve=0.007657706, (0 missing)
## Var212_catB < 0.04062418 to the left, improve=0.007642976, (0 missing)
## Var212_lev_x.NhsEn4L < 0.5 to the left, improve=0.007367498, (0 missing)
## Surrogate splits:
## Var13_clean < 66 to the right, agree=0.852, adj=0.664, (0 split)
## Var140_clean < 47.5 to the right, agree=0.850, adj=0.658, (0 split)
## Var7_clean < 3.375388 to the right, agree=0.828, adj=0.609, (0 split)
## Var125_clean < 859.5 to the right, agree=0.785, adj=0.511, (0 split)
## Var6_clean < 584.5 to the right, agree=0.717, adj=0.356, (0 split)
##
## Node number 7: 452 observations, complexity param=0.001895567
## mean=0.329646, MSE=0.2209795
## left son=14 (330 obs) right son=15 (122 obs)
## Primary splits:
## Var13_clean < 6 to the right, improve=0.05834493, (0 missing)
## Var229_catP < 0.4003503 to the left, improve=0.04925234, (0 missing)
## Var229_catB < -0.03183241 to the left, improve=0.04925234, (0 missing)
## Var229_lev_NA < 0.5 to the left, improve=0.04925234, (0 missing)
## Var73_clean < 55 to the right, improve=0.03842976, (0 missing)
## Surrogate splits:
## Var125_clean < 265.5 to the right, agree=0.927, adj=0.730, (0 split)
## Var140_clean < 7.5 to the right, agree=0.927, adj=0.730, (0 split)
## Var7_clean < 3.375388 to the right, agree=0.923, adj=0.713, (0 split)
## Var73_clean < 29 to the right, agree=0.885, adj=0.574, (0 split)
## Var74_clean < 3.5 to the right, agree=0.869, adj=0.516, (0 split)
##
## Node number 12: 18225 observations, complexity param=0.002237793
## mean=0.06513032, MSE=0.06088836
## left son=24 (9414 obs) right son=25 (8811 obs)
## Primary splits:
## Var126_clean < 1 to the right, improve=0.005664251, (0 missing)
## Var189_clean < 249 to the right, improve=0.004819273, (0 missing)
## Var192_catP < 6.662558e-05 to the right, improve=0.004685729, (0 missing)
## Var73_clean < 117 to the right, improve=0.004235697, (0 missing)
## Var199_catB < -0.1350012 to the left, improve=0.004014248, (0 missing)
## Surrogate splits:
## Var126_isBAD < 0.5 to the left, agree=0.966, adj=0.931, (0 split)
## Var73_clean < 83 to the left, agree=0.590, adj=0.152, (0 split)
## Var13_clean < 1850 to the left, agree=0.569, adj=0.108, (0 split)
## Var228_catB < -0.1341969 to the right, agree=0.565, adj=0.101, (0 split)
## Var193_catB < -0.2354326 to the right, agree=0.565, adj=0.101, (0 split)
##
## Node number 13: 14305 observations, complexity param=0.007012624
## mean=0.1175813, MSE=0.1037559
## left son=26 (8550 obs) right son=27 (5755 obs)
## Primary splits:
## Var126_clean < 1 to the right, improve=0.014525600, (0 missing)
## Var216_catB < 0.6173914 to the left, improve=0.009869755, (0 missing)
## Var189_clean < 177 to the right, improve=0.008510919, (0 missing)
## Var217_catP < 0.0003164712 to the left, improve=0.008258016, (0 missing)
## Var218_catP < 0.4929545 to the right, improve=0.008059713, (0 missing)
## Surrogate splits:
## Var126_isBAD < 0.5 to the left, agree=0.885, adj=0.714, (0 split)
## Var211_lev_x.L84s < 0.5 to the right, agree=0.674, adj=0.191, (0 split)
## Var211_lev_x.Mtgm < 0.5 to the left, agree=0.674, adj=0.191, (0 split)
## Var216_catB < 0.5182151 to the left, agree=0.650, adj=0.131, (0 split)
## Var197_catP < 0.0907274 to the left, agree=0.614, adj=0.040, (0 split)
##
## Node number 14: 330 observations
## mean=0.2606061, MSE=0.1926905
##
## Node number 15: 122 observations, complexity param=0.001525045
## mean=0.5163934, MSE=0.2497313
## left son=30 (56 obs) right son=31 (66 obs)
## Primary splits:
## Var126_isBAD < 0.5 to the right, improve=0.15388780, (0 missing)
## Var226_catP < 0.1598181 to the left, improve=0.10573430, (0 missing)
## Var226_lev_x.FSa2 < 0.5 to the left, improve=0.09560748, (0 missing)
## Var226_catB < 0.001250243 to the left, improve=0.07842171, (0 missing)
## Var126_clean < 1.757099 to the left, improve=0.07788108, (0 missing)
## Surrogate splits:
## Var126_clean < 1.757099 to the left, agree=0.836, adj=0.643, (0 split)
## Var73_clean < 17 to the right, agree=0.639, adj=0.214, (0 split)
## Var197_catP < 0.08643007 to the right, agree=0.639, adj=0.214, (0 split)
## Var6_clean < 108.5 to the right, agree=0.631, adj=0.196, (0 split)
## Var189_clean < 276.5438 to the right, agree=0.607, adj=0.143, (0 split)
##
## Node number 24: 9414 observations
## mean=0.0471638, MSE=0.04493937
##
## Node number 25: 8811 observations, complexity param=0.002237793
## mean=0.08432641, MSE=0.07721547
## left son=50 (4050 obs) right son=51 (4761 obs)
## Primary splits:
## Var73_clean < 101 to the right, improve=0.010082550, (0 missing)
## Var228_catB < -0.1341969 to the left, improve=0.008291896, (0 missing)
## Var193_catB < -0.05596828 to the left, improve=0.007391730, (0 missing)
## Var228_lev_x.F2FyR07IdsN7I < 0.5 to the left, improve=0.007259880, (0 missing)
## Var228_catP < 0.3707485 to the left, improve=0.007259880, (0 missing)
## Surrogate splits:
## Var228_catB < -0.1341969 to the left, agree=0.938, adj=0.866, (0 split)
## Var193_catB < -0.05596828 to the left, agree=0.909, adj=0.803, (0 split)
## Var193_lev_x.RO12 < 0.5 to the left, agree=0.905, adj=0.793, (0 split)
## Var193_catP < 0.432626 to the left, agree=0.905, adj=0.793, (0 split)
## Var228_lev_x.F2FyR07IdsN7I < 0.5 to the left, agree=0.883, adj=0.745, (0 split)
##
## Node number 26: 8550 observations, complexity param=0.002403099
## mean=0.08573099, MSE=0.07838119
## left son=52 (7895 obs) right son=53 (655 obs)
## Primary splits:
## Var189_clean < 177 to the right, improve=0.011024230, (0 missing)
## Var205_catB < 0.3210198 to the left, improve=0.006174657, (0 missing)
## Var205_lev_x.sJzTlal < 0.5 to the left, improve=0.006174657, (0 missing)
## Var205_catP < 0.1612161 to the right, improve=0.005643660, (0 missing)
## Var199_catB < -0.0595945 to the left, improve=0.005333791, (0 missing)
##
## Node number 27: 5755 observations, complexity param=0.003488916
## mean=0.1649001, MSE=0.137708
## left son=54 (1498 obs) right son=55 (4257 obs)
## Primary splits:
## Var212_catB < 0.07801134 to the left, improve=0.013534440, (0 missing)
## Var212_lev_x.NhsEn4L < 0.5 to the left, improve=0.012331740, (0 missing)
## Var212_catP < 0.3564742 to the left, improve=0.012331740, (0 missing)
## Var206_catP < 0.1198596 to the left, improve=0.009973120, (0 missing)
## Var206_catB < 0.08504809 to the left, improve=0.009397475, (0 missing)
## Surrogate splits:
## Var212_lev_x.NhsEn4L < 0.5 to the left, agree=0.984, adj=0.939, (0 split)
## Var212_catP < 0.3564742 to the left, agree=0.984, adj=0.939, (0 split)
## Var207_catB < 0.08104058 to the left, agree=0.880, adj=0.537, (0 split)
## Var207_lev_x.me75fM6ugJ < 0.5 to the left, agree=0.876, adj=0.523, (0 split)
## Var207_catP < 0.4197841 to the left, agree=0.876, adj=0.523, (0 split)
##
## Node number 30: 56 observations
## mean=0.3035714, MSE=0.2114158
##
## Node number 31: 66 observations
## mean=0.6969697, MSE=0.2112029
##
## Node number 50: 4050 observations
## mean=0.05407407, MSE=0.05115007
##
## Node number 51: 4761 observations, complexity param=0.002237793
## mean=0.1100609, MSE=0.09794751
## left son=102 (4742 obs) right son=103 (19 obs)
## Primary splits:
## Var192_catP < 6.662558e-05 to the right, improve=0.016070540, (0 missing)
## Var192_catB < 1.358903 to the left, improve=0.008390386, (0 missing)
## Var113_clean < 357699.2 to the left, improve=0.005234391, (0 missing)
## Var199_catB < -0.1350012 to the left, improve=0.004526121, (0 missing)
## Var199_catP < 0.001082665 to the right, improve=0.004431511, (0 missing)
## Surrogate splits:
## Var192_catB < 1.325518 to the left, agree=0.997, adj=0.316, (0 split)
##
## Node number 52: 7895 observations, complexity param=0.001016774
## mean=0.07726409, MSE=0.07129435
## left son=104 (7880 obs) right son=105 (15 obs)
## Primary splits:
## Var192_catB < 1.044599 to the left, improve=0.005553560, (0 missing)
## Var216_catB < 0.8251976 to the left, improve=0.004793560, (0 missing)
## Var192_catP < 0.0001165967 to the right, improve=0.004758200, (0 missing)
## Var113_clean < 618298 to the left, improve=0.004714472, (0 missing)
## Var205_catB < 0.3210198 to the left, improve=0.004279377, (0 missing)
##
## Node number 53: 655 observations
## mean=0.1877863, MSE=0.1525226
##
## Node number 54: 1498 observations
## mean=0.09212283, MSE=0.08363621
##
## Node number 55: 4257 observations, complexity param=0.002251124
## mean=0.1905097, MSE=0.1542158
## left son=110 (1635 obs) right son=111 (2622 obs)
## Primary splits:
## Var199_catP < 0.002981494 to the right, improve=0.010541960, (0 missing)
## Var217_catB < 0.07179702 to the left, improve=0.008902083, (0 missing)
## Var205_lev_x.sJzTlal < 0.5 to the left, improve=0.007522448, (0 missing)
## Var205_catB < 0.3210198 to the left, improve=0.007522448, (0 missing)
## Var202_catP < 0.0006162863 to the left, improve=0.007520035, (0 missing)
## Surrogate splits:
## Var199_catB < -0.002611826 to the left, agree=0.657, adj=0.108, (0 split)
## Var229_catP < 0.233023 to the left, agree=0.635, adj=0.050, (0 split)
## Var229_lev_NA < 0.5 to the left, agree=0.634, adj=0.048, (0 split)
## Var229_catB < -0.03183241 to the left, agree=0.634, adj=0.048, (0 split)
## Var229_lev_x.am7c < 0.5 to the right, agree=0.627, adj=0.029, (0 split)
##
## Node number 102: 4742 observations
## mean=0.1075496, MSE=0.09598265
##
## Node number 103: 19 observations
## mean=0.7368421, MSE=0.1939058
##
## Node number 104: 7880 observations
## mean=0.07639594, MSE=0.0705596
##
## Node number 105: 15 observations
## mean=0.5333333, MSE=0.2488889
##
## Node number 110: 1635 observations
## mean=0.1394495, MSE=0.1200034
##
## Node number 111: 2622 observations, complexity param=0.001199865
## mean=0.2223494, MSE=0.1729101
## left son=222 (1935 obs) right son=223 (687 obs)
## Primary splits:
## Var217_catB < 0.3930525 to the left, improve=0.008136426, (0 missing)
## Var192_catB < 0.6988594 to the left, improve=0.007572952, (0 missing)
## Var202_catP < 0.0006162863 to the left, improve=0.007505118, (0 missing)
## Var216_catB < 0.6436846 to the left, improve=0.006971343, (0 missing)
## Var217_catP < 0.000349784 to the left, improve=0.006667231, (0 missing)
## Surrogate splits:
## Var202_catB < 0.4247921 to the left, agree=0.876, adj=0.528, (0 split)
## Var217_catP < 0.005180146 to the left, agree=0.750, adj=0.045, (0 split)
## Var200_catB < 0.4094226 to the left, agree=0.741, adj=0.013, (0 split)
## Var214_catB < 0.4094226 to the left, agree=0.741, adj=0.013, (0 split)
## Var198_catB < 1.115213 to the left, agree=0.740, adj=0.007, (0 split)
##
## Node number 222: 1935 observations
## mean=0.2, MSE=0.16
##
## Node number 223: 687 observations
## mean=0.2852984, MSE=0.2039032
# Score the training frame. treatedTrainM is the frame the tree was fit on,
# so this ROC plot is an in-sample view.
pTrain <- predict(m, newdata = treatedTrainM)
treatedTrainM[['pred']] <- as.numeric(pTrain)
WVPlots::ROCPlot(treatedTrainM, 'pred', yName, 'prediction on train')
# Score the held-out test frame and draw the matching ROC plot.
pTest <- predict(m, newdata = treatedTest)
treatedTest[['pred']] <- as.numeric(pTest)
WVPlots::ROCPlot(treatedTest, 'pred', yName, 'prediction on test')
# Shut down the worker cluster, then clear the handle so that running this
# chunk again is a no-op.
if (!is.null(cl)) {
  parallel::stopCluster(cl)
  cl <- NULL
}