# Prediction based only on the pattern of missing values. Not a high-quality
# prediction, but it reminds us that the pattern of missingness can itself be
# informative.
# Attach the packages used below.
# Note: attach order matters for name masking, so keep it as-is.
# devtools::install_github("WinVector/WVPlots")
library("WVPlots")
library("parallel")
library("randomForest")
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
library("plotly")
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library("ggplot2")
# Load the KDD2009 data as in the book: features plus the three
# outcome label files (churn, appetency, upselling), then split into
# model-building, impact-coding, and evaluation sets.
# Change this path to match your directory structure.
data_dir <- '~/Documents/work/PracticalDataScienceWithR/zmPDSwR/KDD2009/'
d <- read.table(paste0(data_dir, 'orange_small_train.data.gz'),
                header = TRUE, sep = '\t', na.strings = c('NA', ''),
                stringsAsFactors = FALSE)
churn <- read.table(paste0(data_dir, 'orange_small_train_churn.labels.txt'),
                    header = FALSE, sep = '\t')
d$churn <- churn$V1
appetency <- read.table(paste0(data_dir, 'orange_small_train_appetency.labels.txt'),
                        header = FALSE, sep = '\t')
d$appetency <- appetency$V1
upselling <- read.table(paste0(data_dir, 'orange_small_train_upselling.labels.txt'),
                        header = FALSE, sep = '\t')
d$upselling <- upselling$V1
# Deterministic 50/40/10 random split.
set.seed(729375)
d$rgroup <- runif(nrow(d))
dTrainM <- subset(d, rgroup <= 0.5)                  # set for building models
dTrainC <- subset(d, (rgroup > 0.5) & (rgroup <= 0.9)) # set for impact coding
dTest <- subset(d, rgroup > 0.9)                     # set for evaluation
rm(list = c('d', 'churn', 'appetency', 'upselling', 'data_dir'))
outcomes <- c('churn', 'appetency', 'upselling')
vars <- setdiff(colnames(dTrainM),
                c(outcomes, 'rgroup'))
yName <- 'churn'
yTarget <- 1
# Build data treatments.
set.seed(239525)
# Replace every input column with its missingness indicator:
# TRUE where the original value was NA.
for (vi in vars) {
  dTrainC[[vi]] <- is.na(dTrainC[[vi]])
  dTest[[vi]] <- is.na(dTest[[vi]])
}
# Keep only columns that actually vary on the impact-coding set;
# constant indicators carry no signal.
varMoves <- vapply(vars,
                   function(column) {
                     length(unique(dTrainC[[column]])) > 1
                   }, logical(1))
vars <- vars[varMoves]
# Fit a random forest on the missingness indicators alone, then score
# the held-out test set and plot score distributions and the ROC curve.
model <- randomForest(x=dTrainC[,vars,drop=FALSE],
                      y=as.factor(as.character(dTrainC[[yName]])))
print(model)
##
## Call:
## randomForest(x = dTrainC[, vars, drop = FALSE], y = as.factor(as.character(dTrainC[[yName]])))
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 13
##
## OOB estimate of error rate: 7.58%
## Confusion matrix:
## -1 1 class.error
## -1 18502 0 0
## 1 1517 0 1
# Probability of the target class for each test row.
dTest$pred <- predict(model, newdata=dTest[,vars,drop=FALSE],
                      type='prob')[,as.character(yTarget),drop=TRUE]
# Recode the outcome as a logical for the plotting helpers.
dTest[[yName]] <- dTest[[yName]]==yTarget
ti <- 'RF prediction on test'
print(DoubleDensityPlot(dTest, 'pred', yName,
                        title=ti))
print(ROCPlot(dTest, 'pred', yName, yTarget,
              title=ti))
# Interactive ROC curve.
#
# predictions: numeric vector of model scores.
# target: logical vector, TRUE for positive-class rows.
# title: character plot title.
# Returns a plotly htmlwidget.
#
# The hover text shows the threshold ('model' column of graphROC's
# pointGraph frame — presumably the score cutoff; confirm against WVPlots)
# and the corresponding ROC coordinates.
plotlyROC <- function(predictions, target, title) {
  rocFrame <- WVPlots::graphROC(predictions, target)
  plot_ly(rocFrame$pointGraph, x = ~FalsePositiveRate, y = ~TruePositiveRate,
          type='scatter', mode='lines+markers', hoverinfo= 'text',
          # fixed: '</br>' is not a valid HTML tag; plotly hover text uses '<br>'
          text= ~ paste('threshold:', model,
                        '<br>FalsePositiveRate:', FalsePositiveRate,
                        '<br>TruePositiveRate:', TruePositiveRate)) %>%
    layout(title= title)
}
# fixed: the original passed `title`, an undefined variable that silently
# resolved to the graphics::title function; the intended title string is `ti`.
plotlyROC(dTest$pred, dTest[[yName]]==yTarget, ti)