knitr::opts_chunk$set(echo = TRUE)
library('vtreat')
library('WVPlots') # see: https://github.com/WinVector/WVPlots
## Loading required package: ggplot2
## Loading required package: grid
## Loading required package: gridExtra
## Loading required package: reshape2
## Loading required package: ROCR
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
## Loading required package: plyr
## Loading required package: stringr
## Loading required package: mgcv
## Loading required package: nlme
## This is mgcv 1.8-12. For overview type 'help("mgcv-package")'.
# See installH2O.R for how to install h2o
# From: http://learn.h2o.ai/content/tutorials/deeplearning/
# See also: http://www.r-bloggers.com/things-to-try-after-user-part-1-deep-learning-with-h2o/
library('h2o')
## Loading required package: statmod
##
## ----------------------------------------------------------------------
##
## Your next step is to start H2O:
## > h2o.init()
##
## For H2O package documentation, ask for help:
## > ??h2o
##
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit http://docs.h2o.ai
##
## ----------------------------------------------------------------------
##
## Attaching package: 'h2o'
## The following objects are masked from 'package:stats':
##
## cor, sd, var
## The following objects are masked from 'package:base':
##
## &&, %*%, %in%, ||, apply, as.factor, as.numeric, colnames,
## colnames<-, ifelse, is.character, is.factor, is.numeric, log,
## log10, log1p, log2, round, signif, trunc
h2o.init(nthreads=-1, max_mem_size="2G")
##
## H2O is not running yet, starting it now...
##
## Note: In case of errors look at the following log files:
## /var/folders/7q/h_jp2vj131g5799gfnpzhdp80000gn/T//RtmpBXDypq/h2o_johnmount_started_from_r.out
## /var/folders/7q/h_jp2vj131g5799gfnpzhdp80000gn/T//RtmpBXDypq/h2o_johnmount_started_from_r.err
##
##
## Starting H2O JVM and connecting: .. Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 2 seconds 114 milliseconds
## H2O cluster version: 3.8.3.4
## H2O cluster name: H2O_started_from_R_johnmount_rnv147
## H2O cluster total nodes: 1
## H2O cluster total memory: 1.78 GB
## H2O cluster total cores: 4
## H2O cluster allowed cores: 4
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## R Version: R version 3.3.1 (2016-06-21)
h2o.removeAll() ## clean slate - just in case the cluster was already running
## [1] 0
ncore <- parallel::detectCores()
cl <- parallel::makeCluster(ncore)
# see: https://github.com/WinVector/PreparingDataWorkshop/tree/master/KDD2009
d = read.table('orange_small_train.data.gz',
               header=TRUE, sep='\t',
               na.strings=c('NA',''),
               strip.white=TRUE,
               stringsAsFactors=FALSE)
churn = read.table('orange_small_train_churn.labels.txt',
                   header=FALSE, sep='\t',
                   strip.white=TRUE,
                   stringsAsFactors=FALSE)
d$churn = churn$V1
set.seed(729375)
rgroup = runif(dim(d)[[1]])
dTrain = d[rgroup<=0.9,] # set for building models
dTest = d[rgroup>0.9,] # set for evaluation
rm(list=c('d','churn'))
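# Quick size check (a sketch, not in the original run): the split should be
# roughly 90/10 of the ~50,000 KDD2009 rows.
dim(dTrain); dim(dTest)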
outcomes = c('churn','appetency','upselling')
nonvars <- c(outcomes,'rgroup')
vars = setdiff(colnames(dTrain), nonvars)
yName = 'churn'
yTarget = 1
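# Base-rate check (a sketch, not in the original run): churn is rare here,
# so ranking metrics such as AUC are more informative than raw accuracy.
# KDD2009 codes churn as -1/1, so this is the fraction of positive examples.
mean(dTrain[[yName]] == yTarget)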
# build data treatments
set.seed(239525)
# build treatments
trainPlan = mkCrossFrameCExperiment(dTrain,
                                    vars, yName, yTarget,
                                    smFactor=2.0,
                                    parallelCluster=cl)
print(trainPlan$method)
## [1] "kwaycrossystratified"
treatmentsC = trainPlan$treatments
treatedTrainM = trainPlan$crossFrame
#kddSig = 1/nrow(treatmentsC$scoreFrame)
selvars <- setdiff(colnames(treatedTrainM),outcomes)
treatedTrainM[[yName]] = treatedTrainM[[yName]]==yTarget
treatedTest = prepare(treatmentsC,
                      dTest,
                      varRestriction=selvars,
                      pruneSig=NULL,
                      parallelCluster=cl)
treatedTest[[yName]] = treatedTest[[yName]]==yTarget
# hold out roughly 10% of the treated training data as an h2o validation frame
# (a simple default; a production model would require a wider hyperparameter search)
vrsel <- runif(nrow(treatedTrainM)) <= 0.1
trainSet <- as.h2o(treatedTrainM[!vrsel,])
valSet <- as.h2o(treatedTrainM[vrsel,])
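# vtreat's scoreFrame records a significance estimate for each derived
# variable; the filter below keeps variables whose significance beats a
# 1/nVariables threshold. A quick look at the columns involved
# (a sketch, not in the original run):
head(treatmentsC$scoreFrame[, c('varName', 'sig')])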
# keep only variables whose vtreat significance beats a 1/nVariables threshold
goodvars <- treatmentsC$scoreFrame$varName[
  treatmentsC$scoreFrame$sig < 1/nrow(treatmentsC$scoreFrame)]
hyper_params <- list(
  hidden=list(c(32,32,32), c(64,64)),
  input_dropout_ratio=c(0, 0.05),
  rate=c(0.01, 0.02),
  rate_annealing=c(1e-8, 1e-7, 1e-6)
)
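# The grid is the cross product of the hyper-parameter lists:
# 2 x 2 x 2 x 3 = 24 candidate models (a quick check, not in the original run):
prod(lengths(hyper_params))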
print(date())
## [1] "Mon Jul 18 15:06:17 2016"
g <- h2o.grid(
  algorithm="deeplearning",
  grid_id="dl_grid",
  training_frame=trainSet,
  validation_frame=valSet,
  x=goodvars,
  y=yName,
  epochs=100,
  stopping_metric="misclassification",
  stopping_tolerance=1e-2,        ## stop when misclassification does not improve by >=1% for 20 scoring events
  stopping_rounds=20,
  score_validation_samples=10000, ## downsample validation set for faster scoring
  score_duty_cycle=0.025,         ## don't score more than 2.5% of the wall time
  adaptive_rate=FALSE,            ## manually tuned learning rate
  momentum_start=0.5,             ## manually tuned momentum
  momentum_stable=0.9,
  momentum_ramp=1e7,
  l1=1e-5,
  l2=1e-5,
  activation=c("Rectifier"),
  max_w2=10,                      ## can help improve stability for Rectifier
  hyper_params=hyper_params
)
print(date())
## [1] "Mon Jul 18 15:51:24 2016"
print(g@summary_table[1,])
## Hyper-Parameter Search Summary: ordered by increasing logloss
## hidden input_dropout_ratio rate rate_annealing model_ids
## 1 [I@321a9955 0.05 0.01 1.0E-6 dl_grid_model_18
## logloss
## 1 0.25280926530079145
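# g@model_ids is ordered by the search metric (logloss here), so the first id
# is the best model. An equivalent explicit re-sort (a sketch, not part of the
# original run; assumes h2o.getGrid's sort_by argument is available in this
# h2o version):
sortedGrid <- h2o.getGrid("dl_grid", sort_by="logloss", decreasing=FALSE)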
m <- h2o.getModel(g@model_ids[[1]])
summary(m)
## Model Details:
## ==============
##
## H2OBinomialModel: deeplearning
## Model Key: dl_grid_model_18
## Status of Neuron Layers: predicting churn, 2-class classification, bernoulli distribution, CrossEntropy loss, 10,178 weights/biases, 139.0 KB, 4,042,200 training samples, mini-batch size 1
## layer units type dropout l1 l2 mean_rate rate_RMS
## 1 1 249 Input 5.00 %
## 2 2 32 Rectifier 0.00 % 0.000010 0.000010 0.001983 0.000000
## 3 3 32 Rectifier 0.00 % 0.000010 0.000010 0.001983 0.000000
## 4 4 32 Rectifier 0.00 % 0.000010 0.000010 0.001983 0.000000
## 5 5 2 Softmax 0.000010 0.000010 0.001983 0.000000
## momentum mean_weight weight_RMS mean_bias bias_RMS
## 1
## 2 0.661688 -0.001950 0.097793 0.398993 0.077247
## 3 0.661688 -0.044393 0.187270 0.918026 0.043695
## 4 0.661688 -0.062796 0.183808 0.884139 0.051241
## 5 0.661688 0.020776 0.329298 0.008407 0.925762
##
## H2OBinomialMetrics: deeplearning
## ** Reported on training data. **
## Description: Metrics reported on temporary training frame with 10153 samples
##
## MSE: 0.06425998
## R^2: 0.06529115
## LogLoss: 0.2360321
## Mean Per-Class Error: 0.331089
## AUC: 0.7583378
## Gini: 0.5166755
##
## Confusion Matrix for F1-optimal threshold:
## FALSE TRUE Error Rate
## FALSE 8124 1275 0.135653 =1275/9399
## TRUE 397 357 0.526525 =397/754
## Totals 8521 1632 0.164680 =1672/10153
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.132762 0.299246 129
## 2 max f2 0.110446 0.410013 170
## 3 max f0point5 0.175470 0.306220 63
## 4 max accuracy 0.260613 0.926032 7
## 5 max precision 0.270213 0.666667 5
## 6 max recall 0.013713 1.000000 378
## 7 max specificity 0.289274 0.999894 0
## 8 max absolute_MCC 0.132762 0.241162 129
## 9 max min_per_class_accuracy 0.089669 0.684350 207
## 10 max mean_per_class_accuracy 0.102372 0.693200 184
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## H2OBinomialMetrics: deeplearning
## ** Reported on validation data. **
## Description: Metrics reported on full validation frame
##
## MSE: 0.06810449
## R^2: 0.0425764
## LogLoss: 0.2528093
## Mean Per-Class Error: 0.385003
## AUC: 0.7011861
## Gini: 0.4023723
##
## Confusion Matrix for F1-optimal threshold:
## FALSE TRUE Error Rate
## FALSE 3672 579 0.136203 =579/4251
## TRUE 225 130 0.633803 =225/355
## Totals 3897 709 0.174555 =804/4606
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.132774 0.244361 115
## 2 max f2 0.065734 0.371461 244
## 3 max f0point5 0.189017 0.244444 36
## 4 max accuracy 0.293939 0.923144 0
## 5 max precision 0.293939 1.000000 0
## 6 max recall 0.010462 1.000000 382
## 7 max specificity 0.293939 1.000000 0
## 8 max absolute_MCC 0.132774 0.169976 115
## 9 max min_per_class_accuracy 0.079685 0.645070 212
## 10 max mean_per_class_accuracy 0.075585 0.652419 221
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
##
##
## Scoring History:
## timestamp duration training_speed epochs
## 1 2016-07-18 15:39:17 0.000 sec 0.00000
## 2 2016-07-18 15:39:18 33 min 1.091 sec 33882 rows/sec 1.00000
## 3 2016-07-18 15:39:25 33 min 7.785 sec 36566 rows/sec 7.00000
## 4 2016-07-18 15:39:32 33 min 14.431 sec 36899 rows/sec 13.00000
## 5 2016-07-18 15:39:38 33 min 20.716 sec 37677 rows/sec 19.00000
## 6 2016-07-18 15:39:45 33 min 27.325 sec 37621 rows/sec 25.00000
## 7 2016-07-18 15:39:50 33 min 33.042 sec 38628 rows/sec 31.00000
## 8 2016-07-18 15:39:56 33 min 38.794 sec 39298 rows/sec 37.00000
## 9 2016-07-18 15:40:02 33 min 44.310 sec 40021 rows/sec 43.00000
## 10 2016-07-18 15:40:08 33 min 51.110 sec 40357 rows/sec 50.00000
## 11 2016-07-18 15:40:15 33 min 57.342 sec 41011 rows/sec 57.00000
## 12 2016-07-18 15:40:20 34 min 2.719 sec 41457 rows/sec 63.00000
## 13 2016-07-18 15:40:25 34 min 8.092 sec 41836 rows/sec 69.00000
## 14 2016-07-18 15:40:32 34 min 14.279 sec 42245 rows/sec 76.00000
## 15 2016-07-18 15:40:38 34 min 21.038 sec 41798 rows/sec 82.00000
## 16 2016-07-18 15:40:46 34 min 28.786 sec 41856 rows/sec 90.00000
## 17 2016-07-18 15:40:52 34 min 34.624 sec 41902 rows/sec 96.00000
## 18 2016-07-18 15:40:56 34 min 38.299 sec 42040 rows/sec 100.00000
## 19 2016-07-18 15:40:56 34 min 38.444 sec 42039 rows/sec 100.00000
## iterations samples training_MSE training_r2 training_logloss
## 1 0 0.000000
## 2 1 40422.000000 0.06569 0.04445 0.24766
## 3 7 282954.000000 0.06426 0.06529 0.23603
## 4 13 525486.000000 0.06278 0.08678 0.22795
## 5 19 768018.000000 0.06093 0.11372 0.21681
## 6 25 1010550.000000 0.06080 0.11562 0.21528
## 7 31 1253082.000000 0.05794 0.15719 0.20214
## 8 37 1495614.000000 0.05576 0.18896 0.19557
## 9 43 1738146.000000 0.05494 0.20085 0.19170
## 10 50 2021100.000000 0.05210 0.24214 0.18246
## 11 57 2304054.000000 0.05126 0.25436 0.17987
## 12 63 2546586.000000 0.05053 0.26498 0.17471
## 13 69 2789118.000000 0.04988 0.27453 0.17072
## 14 76 3072072.000000 0.04858 0.29336 0.16829
## 15 82 3314604.000000 0.04906 0.28638 0.16638
## 16 90 3637980.000000 0.04683 0.31883 0.16086
## 17 96 3880512.000000 0.04612 0.32918 0.15929
## 18 100 4042200.000000 0.04503 0.34498 0.15527
## 19 100 4042200.000000 0.06426 0.06529 0.23603
## training_AUC training_lift training_classification_error validation_MSE
## 1
## 2 0.70834 4.22448 0.15710 0.06822
## 3 0.75834 5.41261 0.16468 0.06810
## 4 0.78294 7.26082 0.11327 0.06860
## 5 0.81457 7.39283 0.12381 0.06951
## 6 0.82018 7.26082 0.10273 0.06966
## 7 0.85132 8.84500 0.09101 0.07011
## 8 0.86176 10.16515 0.09830 0.07130
## 9 0.87139 10.16515 0.08677 0.07085
## 10 0.88441 10.16515 0.07978 0.07260
## 11 0.89192 10.95723 0.06471 0.07262
## 12 0.89880 11.74932 0.06796 0.07305
## 13 0.90783 11.74932 0.06776 0.07490
## 14 0.90636 11.88134 0.06826 0.07593
## 15 0.91538 11.35328 0.06786 0.07455
## 16 0.91733 12.27738 0.06757 0.07611
## 17 0.91944 12.40940 0.06284 0.07798
## 18 0.92272 12.40940 0.06215 0.07794
## 19 0.75834 5.41261 0.16468 0.06810
## validation_r2 validation_logloss validation_AUC validation_lift
## 1
## 2 0.04089 0.25638 0.68741 6.07324
## 3 0.04258 0.25281 0.70119 6.07324
## 4 0.03558 0.25903 0.68377 4.69296
## 5 0.02277 0.27108 0.67638 4.41690
## 6 0.02074 0.27229 0.67084 3.31268
## 7 0.01441 0.29167 0.67970 3.58873
## 8 0.29029 0.66654 3.03662
## 9 0.00398 0.28776 0.67610 3.31268
## 10 0.29726 0.67496 3.86479
## 11 0.29745 0.67289 2.76056
## 12 0.31515 0.67075 3.31268
## 13 0.34036 0.65333 3.03662
## 14 0.33879 0.65439 3.58873
## 15 0.33393 0.65362 4.14085
## 16 0.36217 0.65190 3.03662
## 17 0.35514 0.64155 2.48451
## 18 0.36718 0.65519 3.03662
## 19 0.04258 0.25281 0.70119 6.07324
## validation_classification_error
## 1
## 2 0.18541
## 3 0.17455
## 4 0.18498
## 5 0.17716
## 6 0.21038
## 7 0.19062
## 8 0.22384
## 9 0.24620
## 10 0.23990
## 11 0.23838
## 12 0.24229
## 13 0.23860
## 14 0.21906
## 15 0.19496
## 16 0.19019
## 17 0.24989
## 18 0.28702
## 19 0.17455
plot(m)  # plot the selected model's scoring history
pTrain <- predict(m,newdata=as.h2o(treatedTrainM))
# extract the predicted probability of the positive class
treatedTrainM$pred <- as.data.frame(pTrain[,'TRUE'])[[1]]
WVPlots::ROCPlot(treatedTrainM,'pred',yName,'prediction on train')
pTest <- predict(m,newdata=as.h2o(treatedTest))
treatedTest$pred <- as.data.frame(pTest[,'TRUE'])[[1]]
WVPlots::ROCPlot(treatedTest,'pred',yName,'prediction on test')
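# Cross-check of the test ROC plot using h2o's own metrics (a sketch, not in
# the original run; must happen before the cluster is shut down):
perfTest <- h2o.performance(m, newdata=as.h2o(treatedTest))
print(h2o.auc(perfTest))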
h2o.shutdown(prompt=FALSE)
## [1] TRUE
if(!is.null(cl)) {
  parallel::stopCluster(cl)
  cl = NULL
}