# Prediction based only on the pattern of missing values. Not a high-quality
# prediction, but it reminds us that the pattern of missingness can itself be
# informative.
# Attach the packages used below.
# Note: attach order matters for name masking, so keep it as-is.
# devtools::install_github("WinVector/WVPlots")
library("WVPlots")
library("parallel")
library("randomForest")
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
library("plotly")
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library("ggplot2")
# Load the KDD2009 data as in the book: features plus the three
# outcome label files (churn, appetency, upselling), then split into
# model-building, impact-coding, and evaluation sets.
# Change this path to match your directory structure.
data_dir <- '~/Documents/work/PracticalDataScienceWithR/zmPDSwR/KDD2009/'
d <- read.table(paste0(data_dir, 'orange_small_train.data.gz'),
                header = TRUE, sep = '\t', na.strings = c('NA', ''),
                stringsAsFactors = FALSE)
churn <- read.table(paste0(data_dir, 'orange_small_train_churn.labels.txt'),
                    header = FALSE, sep = '\t')
d$churn <- churn$V1
appetency <- read.table(paste0(data_dir, 'orange_small_train_appetency.labels.txt'),
                        header = FALSE, sep = '\t')
d$appetency <- appetency$V1
upselling <- read.table(paste0(data_dir, 'orange_small_train_upselling.labels.txt'),
                        header = FALSE, sep = '\t')
d$upselling <- upselling$V1
# Deterministic 50/40/10 random split.
set.seed(729375)
d$rgroup <- runif(nrow(d))
dTrainM <- subset(d, rgroup <= 0.5)                  # set for building models
dTrainC <- subset(d, (rgroup > 0.5) & (rgroup <= 0.9)) # set for impact coding
dTest <- subset(d, rgroup > 0.9)                     # set for evaluation
rm(list = c('d', 'churn', 'appetency', 'upselling', 'data_dir'))
outcomes <- c('churn', 'appetency', 'upselling')
vars <- setdiff(colnames(dTrainM),
                c(outcomes, 'rgroup'))
yName <- 'churn'
yTarget <- 1
# Build data treatments.
set.seed(239525)
# Replace every input column with its missingness indicator:
# TRUE where the original value was NA.
for (vi in vars) {
  dTrainC[[vi]] <- is.na(dTrainC[[vi]])
  dTest[[vi]] <- is.na(dTest[[vi]])
}
# Keep only columns that actually vary on the impact-coding set;
# constant indicators carry no signal.
varMoves <- vapply(vars,
                   function(column) {
                     length(unique(dTrainC[[column]])) > 1
                   }, logical(1))
vars <- vars[varMoves]
# Fit a random forest on the missingness indicators alone, then score
# the held-out test set and plot score distributions and the ROC curve.
model <- randomForest(x=dTrainC[,vars,drop=FALSE],
                      y=as.factor(as.character(dTrainC[[yName]])))
print(model)
##
## Call:
## randomForest(x = dTrainC[, vars, drop = FALSE], y = as.factor(as.character(dTrainC[[yName]])))
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 13
##
## OOB estimate of error rate: 7.58%
## Confusion matrix:
## -1 1 class.error
## -1 18502 0 0
## 1 1517 0 1
# Probability of the target class for each test row.
dTest$pred <- predict(model, newdata=dTest[,vars,drop=FALSE],
                      type='prob')[,as.character(yTarget),drop=TRUE]
# Recode the outcome as a logical for the plotting helpers.
dTest[[yName]] <- dTest[[yName]]==yTarget
ti <- 'RF prediction on test'
print(DoubleDensityPlot(dTest, 'pred', yName,
                        title=ti))
print(ROCPlot(dTest, 'pred', yName, yTarget,
              title=ti))
# Interactive ROC curve.
#
# predictions: numeric vector of model scores.
# target: logical vector, TRUE for positive-class rows.
# title: character plot title.
# Returns a plotly htmlwidget.
#
# The hover text shows the threshold ('model' column of graphROC's
# pointGraph frame — presumably the score cutoff; confirm against WVPlots)
# and the corresponding ROC coordinates.
plotlyROC <- function(predictions, target, title) {
  rocFrame <- WVPlots::graphROC(predictions, target)
  plot_ly(rocFrame$pointGraph, x = ~FalsePositiveRate, y = ~TruePositiveRate,
          type='scatter', mode='lines+markers', hoverinfo= 'text',
          # fixed: '</br>' is not a valid HTML tag; plotly hover text uses '<br>'
          text= ~ paste('threshold:', model,
                        '<br>FalsePositiveRate:', FalsePositiveRate,
                        '<br>TruePositiveRate:', TruePositiveRate)) %>%
    layout(title= title)
}
# fixed: the original passed `title`, an undefined variable that silently
# resolved to the graphics::title function; the intended title string is `ti`.
plotlyROC(dTest$pred, dTest[[yName]]==yTarget, ti)