#load some libraries
# https://github.com/WinVector/vtreat
library('vtreat')
# devtools::install_github("WinVector/WVPlots")
library('WVPlots')
## Loading required package: ggplot2
## Loading required package: grid
## Loading required package: gridExtra
## Loading required package: reshape2
## Loading required package: ROCR
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
## Loading required package: plyr
## Loading required package: stringr
## Loading required package: mgcv
## Loading required package: nlme
## This is mgcv 1.8-14. For overview type 'help("mgcv-package")'.
library('parallel')
library('randomForest')
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:gridExtra':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
# load the data as in the book
# change this path to match your directory structure
dir = '~/Documents/work/PracticalDataScienceWithR/zmPDSwR/KDD2009/' 

d = read.table(paste(dir,'orange_small_train.data.gz',sep=''),
                header=T,sep='\t',na.strings=c('NA',''), 
               stringsAsFactors=FALSE)
churn = read.table(paste(dir,'orange_small_train_churn.labels.txt',sep=''),
                    header=F,sep='\t')
d$churn = churn$V1
appetency = read.table(paste(dir,'orange_small_train_appetency.labels.txt',sep=''),
                        header=F,sep='\t')
d$appetency = appetency$V1
upselling = read.table(paste(dir,'orange_small_train_upselling.labels.txt',sep=''),
                        header=F,sep='\t')
d$upselling = upselling$V1
set.seed(729375)
d$rgroup = runif(dim(d)[[1]])
dTrainM = subset(d,rgroup<=0.5)  # set for building models
dTrainC = subset(d,(rgroup>0.5) & (rgroup<=0.9)) # set for impact coding
dTest = subset(d,rgroup>0.9) # set for evaluation
rm(list=c('d','churn','appetency','upselling','dir'))
outcomes = c('churn','appetency','upselling')
vars = setdiff(colnames(dTrainM),
                c(outcomes,'rgroup'))
yName = 'churn'
yTarget = 1
# build data treatments

set.seed(239525)

cl = parallel::makeCluster(4)

# build treatments on just the coding data
treatmentsC = designTreatmentsC(dTrainC,
                                vars,yName,yTarget,
                                smFactor=0.5,rareCount=2,rareSig=0.2,
                                parallelCluster=cl)
## [1] "desigining treatments Mon Sep 26 14:51:04 2016"
## [1] "designing treatments Mon Sep 26 14:51:04 2016"
## [1] " have level statistics Mon Sep 26 14:54:59 2016"
## [1] " scoring treatments Mon Sep 26 14:55:11 2016"
## [1] "have treatment plan Mon Sep 26 14:55:38 2016"
## [1] "rescoring complex variables Mon Sep 26 14:55:38 2016"
## [1] "done rescoring complex variables Mon Sep 26 15:01:38 2016"
library('ggplot2')
kddSig=0.05
ggplot(data=treatmentsC$scoreFrame,aes(x=csig)) + 
  geom_density(adjust=0.2) + geom_vline(xintercept=kddSig)

ggplot(data=treatmentsC$scoreFrame,aes(x=csig)) + 
  geom_density(adjust=0.2) + geom_vline(xintercept=kddSig) +
  scale_x_log10()

length(treatmentsC$scoreFrame$csig)
## [1] 506
sum(treatmentsC$scoreFrame$csig<kddSig)
## [1] 285
trainP = prepare(treatmentsC,
                 dTrainM,
                 scale=TRUE,
                 pruneSig=kddSig,
                 parallelCluster=cl)
selvars = setdiff(colnames(trainP),yName)

testP = prepare(treatmentsC,
                dTest,
                scale=TRUE,
                pruneSig=kddSig,
                parallelCluster=cl)

if(!is.null(cl)) {
    parallel::stopCluster(cl)
    cl = NULL
}
model <- randomForest(x=trainP[,selvars,drop=FALSE],
                      y=as.factor(as.character(trainP[[yName]])))
print(model)
## 
## Call:
##  randomForest(x = trainP[, selvars, drop = FALSE], y = as.factor(as.character(trainP[[yName]]))) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 16
## 
##         OOB estimate of  error rate: 7.21%
## Confusion matrix:
##       -1 1  class.error
## -1 23206 1 4.309045e-05
## 1   1801 1 9.994451e-01
testP$pred <- predict(model,newdata=testP[,selvars,drop=FALSE],
                type='prob')[,as.character(yTarget),drop=TRUE]
testP[[yName]] = testP[[yName]]==yTarget
ti = 'RF prediction on test'
print(DoubleDensityPlot(testP, 'pred', yName, 
                               title=ti))

print(ROCPlot(testP, 'pred', yName, yTarget,
                     title=ti))