#load some libraries
# https://github.com/WinVector/vtreat
library('vtreat')
packageVersion("vtreat")
## [1] '1.0.3'
# devtools::install_github("WinVector/WVPlots")
library('WVPlots')
library('parallel')
library('randomForest')
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
library('ggplot2')
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
# load the data as in the book
# change this path to match your directory structure
dir <- '~/Documents/work/PracticalDataScienceWithR/zmPDSwR/KDD2009/'
# main feature table: tab-separated, blank cells and 'NA' both treated as missing
d <- read.table(paste0(dir, 'orange_small_train.data.gz'),
                header = TRUE, sep = '\t', na.strings = c('NA', ''),
                stringsAsFactors = FALSE)
# each label file is a single headerless column (V1) aligned row-for-row with d
churn <- read.table(paste0(dir, 'orange_small_train_churn.labels.txt'),
                    header = FALSE, sep = '\t')
d$churn <- churn$V1
appetency <- read.table(paste0(dir, 'orange_small_train_appetency.labels.txt'),
                        header = FALSE, sep = '\t')
d$appetency <- appetency$V1
upselling <- read.table(paste0(dir, 'orange_small_train_upselling.labels.txt'),
                        header = FALSE, sep = '\t')
d$upselling <- upselling$V1
# reproducible three-way random split of the rows
set.seed(729375)
d$rgroup <- runif(nrow(d))  # nrow() is clearer than dim(d)[[1]]
dTrainM <- subset(d, rgroup <= 0.5)                     # set for building models
dTrainC <- subset(d, (rgroup > 0.5) & (rgroup <= 0.9))  # set for impact coding
dTest <- subset(d, rgroup > 0.9)                        # set for evaluation
# drop the full frame and raw label vectors to free memory
rm(list = c('d', 'churn', 'appetency', 'upselling', 'dir'))
outcomes <- c('churn', 'appetency', 'upselling')
# candidate input variables: every column except the outcomes and the split key
vars <- setdiff(colnames(dTrainM),
                c(outcomes, 'rgroup'))
yName <- 'churn'  # this script models churn only
yTarget <- 1      # label value treated as the positive class
# design the variable treatments (impact coding etc.)
set.seed(239525)
cl <- parallel::makeCluster(4)
# fit the treatment plan on the coding set only, so the modeling set
# never sees its own outcomes through the impact codes
treatmentsC <- designTreatmentsC(
  dTrainC,
  vars, yName, yTarget,
  smFactor = 0.5, rareCount = 2, rareSig = 0.2,
  parallelCluster = cl)
## [1] "vtreat 1.0.3 inspecting inputs Sat Feb 17 12:17:00 2018"
## [1] "designing treatments Sat Feb 17 12:17:00 2018"
## [1] " have initial level statistics Sat Feb 17 12:19:46 2018"
## [1] " scoring treatments Sat Feb 17 12:19:55 2018"
## [1] "have treatment plan Sat Feb 17 12:20:23 2018"
## [1] "rescoring complex variables Sat Feb 17 12:20:23 2018"
## [1] "done rescoring complex variables Sat Feb 17 12:23:35 2018"
# significance cut-off: 1/length(vars) — roughly one false positive
# expected among this many candidate variables
kddSig <- 1 / length(vars)
# distribution of per-variable significances, with the cut-off marked
ggplot(data = treatmentsC$scoreFrame, aes(x = sig)) +
  geom_density(adjust = 0.2) +
  geom_vline(xintercept = kddSig) +
  ggtitle("distribution of variable significances")
# same picture on a log x-axis
ggplot(data = treatmentsC$scoreFrame, aes(x = sig)) +
  geom_density(adjust = 0.2) +
  geom_vline(xintercept = kddSig) +
  scale_x_log10() +
  ggtitle("distribution of variable significances")
length(treatmentsC$scoreFrame$sig)
## [1] 505
# keep only treated variables that beat the cut-off
selvars <- treatmentsC$scoreFrame$varName[treatmentsC$scoreFrame$sig < kddSig]
length(selvars)
## [1] 236
# apply the treatment plan to the modeling and test sets,
# scaled and restricted to the selected variables
trainP <- prepare(treatmentsC, dTrainM,
                  scale = TRUE,
                  varRestriction = selvars,
                  parallelCluster = cl)
testP <- prepare(treatmentsC, dTest,
                 scale = TRUE,
                 varRestriction = selvars,
                 parallelCluster = cl)
# parallel work is finished; release the worker processes
if (!is.null(cl)) {
  parallel::stopCluster(cl)
  cl <- NULL
}
# fit a default random forest; the outcome is re-encoded as a factor
# so randomForest does classification rather than regression
model <- randomForest(
  x = trainP[, selvars, drop = FALSE],
  y = as.factor(as.character(trainP[[yName]])))
print(model)
##
## Call:
## randomForest(x = trainP[, selvars, drop = FALSE], y = as.factor(as.character(trainP[[yName]])))
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 15
##
## OOB estimate of error rate: 7.2%
## Confusion matrix:
## -1 1 class.error
## -1 23202 5 0.0002154522
## 1 1796 6 0.9966703663
# score the test set: probability column for the positive class
testP$pred <- predict(model, newdata = testP[, selvars, drop = FALSE],
                      type = 'prob')[, as.character(yTarget), drop = TRUE]
# recode the outcome as logical (TRUE = positive class) for the plots
testP[[yName]] <- testP[[yName]] == yTarget
ti <- 'RF prediction on test'
print(DoubleDensityPlot(testP, 'pred', yName,
                        title = ti))
print(ROCPlot(testP, 'pred', yName, yTarget,
              title = ti))