# Set the knitr chunk option `echo` to TRUE for the chunks of this document.
knitr::opts_chunk$set(echo = TRUE)
library('vtreat')
library('WVPlots') # see: https://github.com/WinVector/WVPlots
## Loading required package: ggplot2
## Loading required package: grid
## Loading required package: gridExtra
## Loading required package: reshape2
## Loading required package: ROCR
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
## Loading required package: plyr
## Loading required package: stringr
## Loading required package: mgcv
## Loading required package: nlme
## This is mgcv 1.8-12. For overview type 'help("mgcv-package")'.
library('rpart')
library('caret')
## Loading required package: lattice
#library('doMC')
# Number of worker processes for the parallel cluster.
# detectCores() is documented to return NA when the core count cannot be
# determined; makeCluster() rejects NA, so fall back to a single worker.
ncore <- parallel::detectCores()
if (is.na(ncore) || ncore < 1) {
  ncore <- 1L
}
#registerDoMC(cores = ncore)
# Cluster handle passed to the vtreat calls below via `parallelCluster`.
cl <- parallel::makeCluster(ncore)
# see: https://github.com/WinVector/PreparingDataWorkshop/tree/master/KDD2009
# Read the tab-separated predictor table. Both 'NA' and empty fields are
# treated as missing, and strings are kept as character (not factor).
d <- read.table('orange_small_train.data.gz',
                header = TRUE, sep = '\t', na.strings = c('NA', ''),
                strip.white = TRUE,
                stringsAsFactors = FALSE)
# Read the headerless churn label file; its first column is named V1.
churn <- read.table('orange_small_train_churn.labels.txt',
                    header = FALSE, sep = '\t',
                    strip.white = TRUE,
                    stringsAsFactors = FALSE)
# Labels are attached to rows purely by position, so the two files must have
# the same number of rows; otherwise a shorter label vector whose length
# divides nrow(d) would be recycled silently.
stopifnot(nrow(churn) == nrow(d))
d$churn <- churn$V1
# Reproducible random split: one uniform draw per row, roughly 90% of rows go
# to training and the remainder to test.
set.seed(729375)
rgroup <- runif(nrow(d))
dTrain <- d[rgroup <= 0.9, ]  # set for building models
dTest <- d[rgroup > 0.9, ]    # set for evaluation
# Drop the full table and the raw labels; only the two splits are used below.
rm(list = c('d', 'churn'))
# Outcome column to model and the value of it that counts as the target class.
yName <- 'churn'
yTarget <- 1
# Column names that must not be used as input variables: the three outcome
# names plus 'rgroup'.
outcomes <- c('churn', 'appetency', 'upselling')
nonvars <- c(outcomes, 'rgroup')
# Input variables are every remaining column of the training frame.
vars <- setdiff(colnames(dTrain), nonvars)
# Design the variable treatments with a fixed seed so the run is repeatable.
set.seed(239525)

# mkCrossFrameCExperiment returns a list; the elements used here are the
# treatment plan (`treatments`), the treated training frame (`crossFrame`)
# and the name of the method it used (`method`).
trainPlan <- mkCrossFrameCExperiment(
  dTrain, vars, yName, yTarget,
  smFactor = 2.0,
  parallelCluster = cl
)
print(trainPlan$method)
## [1] "kwaycrossystratified"
treatmentsC <- trainPlan[["treatments"]]
treatedTrainM <- trainPlan[["crossFrame"]]

# Significance cutoff for keeping a treated variable: one over the number of
# rows in the treatment score frame.
kddSig <- 1 / nrow(treatmentsC$scoreFrame)
# All treated columns except the outcome names are requested for the test frame.
selvars <- setdiff(colnames(treatedTrainM), outcomes)
# Recode the outcome as a logical: TRUE where it equals the target value.
treatedTrainM[[yName]] <- treatedTrainM[[yName]] == yTarget

# Apply the same treatment plan to the held-out rows, with no significance
# pruning at this stage (pruneSig = NULL).
treatedTest <- prepare(treatmentsC,
                       dTest,
                       varRestriction = selvars,
                       pruneSig = NULL,
                       parallelCluster = cl)
treatedTest[[yName]] <- treatedTest[[yName]] == yTarget
# Keep variables whose significance is below the cutoff. which() drops rows
# whose `sig` is NA; plain logical indexing would instead yield NA entries,
# which would end up as a literal `NA` term in the model formula.
goodvars <- treatmentsC$scoreFrame$varName[
  which(treatmentsC$scoreFrame$sig < kddSig)
]

# Model formula as text: "<outcome> ~ <var1> + <var2> + ...".
rhs <- paste(goodvars, collapse = " + ")
form <- paste0(yName, " ~ ", rhs)
# Fit an rpart tree on the treated training frame with complexity
# parameter cp = 0.001.
rcontrol <- rpart.control(cp = 0.001)
m <- rpart(form, data = treatedTrainM, control = rcontrol)

summary(m)
## Call:
## rpart(formula = form, data = treatedTrainM, control = rcontrol)
##   n= 45028 
## 
##             CP nsplit rel error    xerror       xstd
## 1  0.012698340      0 1.0000000 1.0000249 0.01537689
## 2  0.008453711      1 0.9873017 0.9873581 0.01499669
## 3  0.007171726      2 0.9788479 0.9806515 0.01489179
## 4  0.007012624      3 0.9716762 0.9729558 0.01468386
## 5  0.003488916      4 0.9646636 0.9661061 0.01454178
## 6  0.002403099      5 0.9611747 0.9620480 0.01446954
## 7  0.002251124      6 0.9587716 0.9611113 0.01444146
## 8  0.002237793      7 0.9565205 0.9585800 0.01438728
## 9  0.001895567     10 0.9498071 0.9553720 0.01433074
## 10 0.001525045     11 0.9479115 0.9531197 0.01430570
## 11 0.001199865     12 0.9463865 0.9528562 0.01430973
## 12 0.001175852     13 0.9451866 0.9535656 0.01432270
## 13 0.001016774     14 0.9440108 0.9532200 0.01431974
## 14 0.001000000     15 0.9429940 0.9542217 0.01433326
## 
## Variable importance
##               Var126_clean                Var217_catP 
##                         15                          6 
##                Var218_catB                Var218_catP 
##                          6                          6 
##               Var126_isBAD                Var74_clean 
##                          6                          5 
##                Var13_clean          Var211_lev_x.L84s 
##                          5                          5 
##          Var211_lev_x.Mtgm               Var140_clean 
##                          5                          4 
##                 Var7_clean                Var73_clean 
##                          4                          3 
##               Var125_clean                Var212_catB 
##                          3                          2 
##                Var212_catP       Var212_lev_x.NhsEn4L 
##                          2                          2 
##                Var192_catB                 Var6_clean 
##                          2                          2 
##               Var189_clean                Var192_catP 
##                          2                          2 
##                Var199_catP                Var228_catB 
##                          1                          1 
##                Var193_catB                Var207_catB 
##                          1                          1 
##                Var207_catP    Var207_lev_x.me75fM6ugJ 
##                          1                          1 
##                Var193_catP          Var193_lev_x.RO12 
##                          1                          1 
## Var228_lev_x.F2FyR07IdsN7I                Var217_catB 
##                          1                          1 
##               Var113_clean                Var216_catB 
##                          1                          1 
## 
## Node number 1: 45028 observations,    complexity param=0.01269834
##   mean=0.07370969, MSE=0.06827657 
##   left son=2 (12046 obs) right son=3 (32982 obs)
##   Primary splits:
##       Var126_clean < -15          to the left,  improve=0.012698340, (0 missing)
##       Var126_isBAD < 0.5          to the left,  improve=0.007833296, (0 missing)
##       Var74_clean  < 3.5          to the right, improve=0.007767768, (0 missing)
##       Var13_clean  < 30           to the right, improve=0.007342036, (0 missing)
##       Var218_catP  < 0.2466773    to the right, improve=0.007320165, (0 missing)
##   Surrogate splits:
##       Var211_lev_x.L84s < 0.5          to the left,  agree=0.848, adj=0.433, (0 split)
##       Var211_lev_x.Mtgm < 0.5          to the right, agree=0.848, adj=0.433, (0 split)
##       Var73_clean       < 9            to the left,  agree=0.760, adj=0.104, (0 split)
##       Var113_clean      < -1253786     to the left,  agree=0.754, adj=0.080, (0 split)
##       Var22_isBAD       < 0.5          to the right, agree=0.747, adj=0.055, (0 split)
## 
## Node number 2: 12046 observations,    complexity param=0.001175852
##   mean=0.02498755, MSE=0.02436317 
##   left son=4 (12030 obs) right son=5 (16 obs)
##   Primary splits:
##       Var192_catB       < 0.9740576    to the left,  improve=0.012317730, (0 missing)
##       Var210_catB       < 0.4563415    to the left,  improve=0.005462331, (0 missing)
##       Var210_lev_x.g5HH < 0.5          to the left,  improve=0.005145479, (0 missing)
##       Var216_catB       < 0.5860013    to the left,  improve=0.004648987, (0 missing)
##       Var210_catP       < 0.4910221    to the right, improve=0.004361510, (0 missing)
## 
## Node number 3: 32982 observations,    complexity param=0.008453711
##   mean=0.09150446, MSE=0.08313139 
##   left son=6 (32530 obs) right son=7 (452 obs)
##   Primary splits:
##       Var218_catP < 0.2466773    to the right, improve=0.009478939, (0 missing)
##       Var218_catB < 0.6405356    to the left,  improve=0.009478939, (0 missing)
##       Var217_catP < 0.01014358   to the left,  improve=0.009478939, (0 missing)
##       Var74_clean < 3.5          to the right, improve=0.008521106, (0 missing)
##       Var216_catB < 0.6173914    to the left,  improve=0.007839353, (0 missing)
##   Surrogate splits:
##       Var217_catP < 0.01014358   to the left,  agree=1, adj=1, (0 split)
##       Var218_catB < 0.6405356    to the left,  agree=1, adj=1, (0 split)
## 
## Node number 4: 12030 observations
##   mean=0.02435578, MSE=0.02376257 
## 
## Node number 5: 16 observations
##   mean=0.5, MSE=0.25 
## 
## Node number 6: 32530 observations,    complexity param=0.007171726
##   mean=0.08819551, MSE=0.08041706 
##   left son=12 (18225 obs) right son=13 (14305 obs)
##   Primary splits:
##       Var74_clean          < 3.5          to the right, improve=0.008428412, (0 missing)
##       Var126_clean         < 1            to the right, improve=0.007885540, (0 missing)
##       Var216_catB          < 0.6173914    to the left,  improve=0.007657706, (0 missing)
##       Var212_catB          < 0.04062418   to the left,  improve=0.007642976, (0 missing)
##       Var212_lev_x.NhsEn4L < 0.5          to the left,  improve=0.007367498, (0 missing)
##   Surrogate splits:
##       Var13_clean  < 66           to the right, agree=0.852, adj=0.664, (0 split)
##       Var140_clean < 47.5         to the right, agree=0.850, adj=0.658, (0 split)
##       Var7_clean   < 3.375388     to the right, agree=0.828, adj=0.609, (0 split)
##       Var125_clean < 859.5        to the right, agree=0.785, adj=0.511, (0 split)
##       Var6_clean   < 584.5        to the right, agree=0.717, adj=0.356, (0 split)
## 
## Node number 7: 452 observations,    complexity param=0.001895567
##   mean=0.329646, MSE=0.2209795 
##   left son=14 (330 obs) right son=15 (122 obs)
##   Primary splits:
##       Var13_clean   < 6            to the right, improve=0.05834493, (0 missing)
##       Var229_catP   < 0.4003503    to the left,  improve=0.04925234, (0 missing)
##       Var229_catB   < -0.03183241  to the left,  improve=0.04925234, (0 missing)
##       Var229_lev_NA < 0.5          to the left,  improve=0.04925234, (0 missing)
##       Var73_clean   < 55           to the right, improve=0.03842976, (0 missing)
##   Surrogate splits:
##       Var125_clean < 265.5        to the right, agree=0.927, adj=0.730, (0 split)
##       Var140_clean < 7.5          to the right, agree=0.927, adj=0.730, (0 split)
##       Var7_clean   < 3.375388     to the right, agree=0.923, adj=0.713, (0 split)
##       Var73_clean  < 29           to the right, agree=0.885, adj=0.574, (0 split)
##       Var74_clean  < 3.5          to the right, agree=0.869, adj=0.516, (0 split)
## 
## Node number 12: 18225 observations,    complexity param=0.002237793
##   mean=0.06513032, MSE=0.06088836 
##   left son=24 (9414 obs) right son=25 (8811 obs)
##   Primary splits:
##       Var126_clean < 1            to the right, improve=0.005664251, (0 missing)
##       Var189_clean < 249          to the right, improve=0.004819273, (0 missing)
##       Var192_catP  < 6.662558e-05 to the right, improve=0.004685729, (0 missing)
##       Var73_clean  < 117          to the right, improve=0.004235697, (0 missing)
##       Var199_catB  < -0.1350012   to the left,  improve=0.004014248, (0 missing)
##   Surrogate splits:
##       Var126_isBAD < 0.5          to the left,  agree=0.966, adj=0.931, (0 split)
##       Var73_clean  < 83           to the left,  agree=0.590, adj=0.152, (0 split)
##       Var13_clean  < 1850         to the left,  agree=0.569, adj=0.108, (0 split)
##       Var228_catB  < -0.1341969   to the right, agree=0.565, adj=0.101, (0 split)
##       Var193_catB  < -0.2354326   to the right, agree=0.565, adj=0.101, (0 split)
## 
## Node number 13: 14305 observations,    complexity param=0.007012624
##   mean=0.1175813, MSE=0.1037559 
##   left son=26 (8550 obs) right son=27 (5755 obs)
##   Primary splits:
##       Var126_clean < 1            to the right, improve=0.014525600, (0 missing)
##       Var216_catB  < 0.6173914    to the left,  improve=0.009869755, (0 missing)
##       Var189_clean < 177          to the right, improve=0.008510919, (0 missing)
##       Var217_catP  < 0.0003164712 to the left,  improve=0.008258016, (0 missing)
##       Var218_catP  < 0.4929545    to the right, improve=0.008059713, (0 missing)
##   Surrogate splits:
##       Var126_isBAD      < 0.5          to the left,  agree=0.885, adj=0.714, (0 split)
##       Var211_lev_x.L84s < 0.5          to the right, agree=0.674, adj=0.191, (0 split)
##       Var211_lev_x.Mtgm < 0.5          to the left,  agree=0.674, adj=0.191, (0 split)
##       Var216_catB       < 0.5182151    to the left,  agree=0.650, adj=0.131, (0 split)
##       Var197_catP       < 0.0907274    to the left,  agree=0.614, adj=0.040, (0 split)
## 
## Node number 14: 330 observations
##   mean=0.2606061, MSE=0.1926905 
## 
## Node number 15: 122 observations,    complexity param=0.001525045
##   mean=0.5163934, MSE=0.2497313 
##   left son=30 (56 obs) right son=31 (66 obs)
##   Primary splits:
##       Var126_isBAD      < 0.5          to the right, improve=0.15388780, (0 missing)
##       Var226_catP       < 0.1598181    to the left,  improve=0.10573430, (0 missing)
##       Var226_lev_x.FSa2 < 0.5          to the left,  improve=0.09560748, (0 missing)
##       Var226_catB       < 0.001250243  to the left,  improve=0.07842171, (0 missing)
##       Var126_clean      < 1.757099     to the left,  improve=0.07788108, (0 missing)
##   Surrogate splits:
##       Var126_clean < 1.757099     to the left,  agree=0.836, adj=0.643, (0 split)
##       Var73_clean  < 17           to the right, agree=0.639, adj=0.214, (0 split)
##       Var197_catP  < 0.08643007   to the right, agree=0.639, adj=0.214, (0 split)
##       Var6_clean   < 108.5        to the right, agree=0.631, adj=0.196, (0 split)
##       Var189_clean < 276.5438     to the right, agree=0.607, adj=0.143, (0 split)
## 
## Node number 24: 9414 observations
##   mean=0.0471638, MSE=0.04493937 
## 
## Node number 25: 8811 observations,    complexity param=0.002237793
##   mean=0.08432641, MSE=0.07721547 
##   left son=50 (4050 obs) right son=51 (4761 obs)
##   Primary splits:
##       Var73_clean                < 101          to the right, improve=0.010082550, (0 missing)
##       Var228_catB                < -0.1341969   to the left,  improve=0.008291896, (0 missing)
##       Var193_catB                < -0.05596828  to the left,  improve=0.007391730, (0 missing)
##       Var228_lev_x.F2FyR07IdsN7I < 0.5          to the left,  improve=0.007259880, (0 missing)
##       Var228_catP                < 0.3707485    to the left,  improve=0.007259880, (0 missing)
##   Surrogate splits:
##       Var228_catB                < -0.1341969   to the left,  agree=0.938, adj=0.866, (0 split)
##       Var193_catB                < -0.05596828  to the left,  agree=0.909, adj=0.803, (0 split)
##       Var193_lev_x.RO12          < 0.5          to the left,  agree=0.905, adj=0.793, (0 split)
##       Var193_catP                < 0.432626     to the left,  agree=0.905, adj=0.793, (0 split)
##       Var228_lev_x.F2FyR07IdsN7I < 0.5          to the left,  agree=0.883, adj=0.745, (0 split)
## 
## Node number 26: 8550 observations,    complexity param=0.002403099
##   mean=0.08573099, MSE=0.07838119 
##   left son=52 (7895 obs) right son=53 (655 obs)
##   Primary splits:
##       Var189_clean         < 177          to the right, improve=0.011024230, (0 missing)
##       Var205_catB          < 0.3210198    to the left,  improve=0.006174657, (0 missing)
##       Var205_lev_x.sJzTlal < 0.5          to the left,  improve=0.006174657, (0 missing)
##       Var205_catP          < 0.1612161    to the right, improve=0.005643660, (0 missing)
##       Var199_catB          < -0.0595945   to the left,  improve=0.005333791, (0 missing)
## 
## Node number 27: 5755 observations,    complexity param=0.003488916
##   mean=0.1649001, MSE=0.137708 
##   left son=54 (1498 obs) right son=55 (4257 obs)
##   Primary splits:
##       Var212_catB          < 0.07801134   to the left,  improve=0.013534440, (0 missing)
##       Var212_lev_x.NhsEn4L < 0.5          to the left,  improve=0.012331740, (0 missing)
##       Var212_catP          < 0.3564742    to the left,  improve=0.012331740, (0 missing)
##       Var206_catP          < 0.1198596    to the left,  improve=0.009973120, (0 missing)
##       Var206_catB          < 0.08504809   to the left,  improve=0.009397475, (0 missing)
##   Surrogate splits:
##       Var212_lev_x.NhsEn4L    < 0.5          to the left,  agree=0.984, adj=0.939, (0 split)
##       Var212_catP             < 0.3564742    to the left,  agree=0.984, adj=0.939, (0 split)
##       Var207_catB             < 0.08104058   to the left,  agree=0.880, adj=0.537, (0 split)
##       Var207_lev_x.me75fM6ugJ < 0.5          to the left,  agree=0.876, adj=0.523, (0 split)
##       Var207_catP             < 0.4197841    to the left,  agree=0.876, adj=0.523, (0 split)
## 
## Node number 30: 56 observations
##   mean=0.3035714, MSE=0.2114158 
## 
## Node number 31: 66 observations
##   mean=0.6969697, MSE=0.2112029 
## 
## Node number 50: 4050 observations
##   mean=0.05407407, MSE=0.05115007 
## 
## Node number 51: 4761 observations,    complexity param=0.002237793
##   mean=0.1100609, MSE=0.09794751 
##   left son=102 (4742 obs) right son=103 (19 obs)
##   Primary splits:
##       Var192_catP  < 6.662558e-05 to the right, improve=0.016070540, (0 missing)
##       Var192_catB  < 1.358903     to the left,  improve=0.008390386, (0 missing)
##       Var113_clean < 357699.2     to the left,  improve=0.005234391, (0 missing)
##       Var199_catB  < -0.1350012   to the left,  improve=0.004526121, (0 missing)
##       Var199_catP  < 0.001082665  to the right, improve=0.004431511, (0 missing)
##   Surrogate splits:
##       Var192_catB < 1.325518     to the left,  agree=0.997, adj=0.316, (0 split)
## 
## Node number 52: 7895 observations,    complexity param=0.001016774
##   mean=0.07726409, MSE=0.07129435 
##   left son=104 (7880 obs) right son=105 (15 obs)
##   Primary splits:
##       Var192_catB  < 1.044599     to the left,  improve=0.005553560, (0 missing)
##       Var216_catB  < 0.8251976    to the left,  improve=0.004793560, (0 missing)
##       Var192_catP  < 0.0001165967 to the right, improve=0.004758200, (0 missing)
##       Var113_clean < 618298       to the left,  improve=0.004714472, (0 missing)
##       Var205_catB  < 0.3210198    to the left,  improve=0.004279377, (0 missing)
## 
## Node number 53: 655 observations
##   mean=0.1877863, MSE=0.1525226 
## 
## Node number 54: 1498 observations
##   mean=0.09212283, MSE=0.08363621 
## 
## Node number 55: 4257 observations,    complexity param=0.002251124
##   mean=0.1905097, MSE=0.1542158 
##   left son=110 (1635 obs) right son=111 (2622 obs)
##   Primary splits:
##       Var199_catP          < 0.002981494  to the right, improve=0.010541960, (0 missing)
##       Var217_catB          < 0.07179702   to the left,  improve=0.008902083, (0 missing)
##       Var205_lev_x.sJzTlal < 0.5          to the left,  improve=0.007522448, (0 missing)
##       Var205_catB          < 0.3210198    to the left,  improve=0.007522448, (0 missing)
##       Var202_catP          < 0.0006162863 to the left,  improve=0.007520035, (0 missing)
##   Surrogate splits:
##       Var199_catB       < -0.002611826 to the left,  agree=0.657, adj=0.108, (0 split)
##       Var229_catP       < 0.233023     to the left,  agree=0.635, adj=0.050, (0 split)
##       Var229_lev_NA     < 0.5          to the left,  agree=0.634, adj=0.048, (0 split)
##       Var229_catB       < -0.03183241  to the left,  agree=0.634, adj=0.048, (0 split)
##       Var229_lev_x.am7c < 0.5          to the right, agree=0.627, adj=0.029, (0 split)
## 
## Node number 102: 4742 observations
##   mean=0.1075496, MSE=0.09598265 
## 
## Node number 103: 19 observations
##   mean=0.7368421, MSE=0.1939058 
## 
## Node number 104: 7880 observations
##   mean=0.07639594, MSE=0.0705596 
## 
## Node number 105: 15 observations
##   mean=0.5333333, MSE=0.2488889 
## 
## Node number 110: 1635 observations
##   mean=0.1394495, MSE=0.1200034 
## 
## Node number 111: 2622 observations,    complexity param=0.001199865
##   mean=0.2223494, MSE=0.1729101 
##   left son=222 (1935 obs) right son=223 (687 obs)
##   Primary splits:
##       Var217_catB < 0.3930525    to the left,  improve=0.008136426, (0 missing)
##       Var192_catB < 0.6988594    to the left,  improve=0.007572952, (0 missing)
##       Var202_catP < 0.0006162863 to the left,  improve=0.007505118, (0 missing)
##       Var216_catB < 0.6436846    to the left,  improve=0.006971343, (0 missing)
##       Var217_catP < 0.000349784  to the left,  improve=0.006667231, (0 missing)
##   Surrogate splits:
##       Var202_catB < 0.4247921    to the left,  agree=0.876, adj=0.528, (0 split)
##       Var217_catP < 0.005180146  to the left,  agree=0.750, adj=0.045, (0 split)
##       Var200_catB < 0.4094226    to the left,  agree=0.741, adj=0.013, (0 split)
##       Var214_catB < 0.4094226    to the left,  agree=0.741, adj=0.013, (0 split)
##       Var198_catB < 1.115213     to the left,  agree=0.740, adj=0.007, (0 split)
## 
## Node number 222: 1935 observations
##   mean=0.2, MSE=0.16 
## 
## Node number 223: 687 observations
##   mean=0.2852984, MSE=0.2039032
# Score the treated training frame with the fitted tree and plot its ROC curve.
pTrain <- predict(m, newdata = treatedTrainM)
treatedTrainM[["pred"]] <- as.numeric(pTrain)
WVPlots::ROCPlot(treatedTrainM, 'pred', yName, 'prediction on train')

# Same for the treated test frame.
pTest <- predict(m, newdata = treatedTest)
treatedTest[["pred"]] <- as.numeric(pTest)
WVPlots::ROCPlot(treatedTest, 'pred', yName, 'prediction on test')

# Shut down the parallel cluster if one is still held, then clear the handle
# so that re-running this step does not try to stop it a second time.
if (!is.null(cl)) {
  parallel::stopCluster(cl)
  cl <- NULL
}