# devtools::install_github("WinVector/vtreat")
library('vtreat')  # This library isn't public yet; install instructions: http://www.win-vector.com/blog/2014/08/vtreat-designing-a-package-for-variable-treatment/
library('parallel')
# devtools::install_github("WinVector/WCPlots")
library('WVPlots')
## Loading required package: ggplot2
## Loading required package: grid
## Loading required package: gridExtra
## Loading required package: reshape2
## Loading required package: ROCR
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## 
## The following object is masked from 'package:stats':
## 
##     lowess
## 
## Loading required package: plyr
## Loading required package: stringr

Read in the data

library('gbm')
## Loading required package: survival
## Loading required package: lattice
## Loading required package: splines
## Loaded gbm 2.1.1
# To make the html: echo "library(knitr); knit('KDD2009example.Rmd')" | R --vanilla ; pandoc KDD2009example.md -o KDD2009example.html
# Example of working with the KDD2009 data (just to show the library at work).
# For vtreat details see: http://www.win-vector.com/blog/2014/08/vtreat-designing-a-package-for-variable-treatment/
# and Chapter 6 of Practical Data Science with R: http://www.amazon.com/Practical-Data-Science/dp/1617291560
# For details on data see: https://github.com/WinVector/zmPDSwR/tree/master/KDD2009

# load the data as in the book
dir = '~/Documents/Projects/DataScienceBook/zmPDSwR/KDD2009/' # change this path to match your directory structure

d = read.table(paste(dir,'orange_small_train.data.gz',sep=''),
                header=T,sep='\t',na.strings=c('NA',''), stringsAsFactors=FALSE)
churn = read.table(paste(dir,'orange_small_train_churn.labels.txt',sep=''),
                    header=F,sep='\t')
d$churn = churn$V1
appetency = read.table(paste(dir,'orange_small_train_appetency.labels.txt',sep=''),
                        header=F,sep='\t')
d$appetency = appetency$V1
upselling = read.table(paste(dir,'orange_small_train_upselling.labels.txt',sep=''),
                        header=F,sep='\t')
d$upselling = upselling$V1
set.seed(729375)
d$rgroup = runif(dim(d)[[1]])

# Make the Training (modeling), Calibration (impact coding) and Test sets
dTrainM = subset(d,rgroup<=0.5)  # set for building models
dTrainC = subset(d,(rgroup>0.5) & (rgroup<=0.9)) # set for impact coding
dTest = subset(d,rgroup>0.9) # set for evaluation
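
# quick check of the split sizes (expect roughly a 50/40/10 split)
print(c(nrow(dTrainM), nrow(dTrainC), nrow(dTest)))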

# clean up
rm(list=c('d','churn','appetency','upselling','dir'))

# get variable and outcome columns
outcomes = c('churn','appetency','upselling')
vars = setdiff(colnames(dTrainM),
                c(outcomes,'rgroup'))
yName = 'churn'
yTarget = 1

vtreat: automatic variable treatment

# get the standard degrees of freedom estimate for each column:
# 1 for a (non-constant) numeric variable, and number of distinct
# levels minus 1 for a categorical variable
estDF = function(d) {
  vapply(d,function(v) {
    df = length(unique(v)) - 1
    if (!is.character(v)) {
      df = min(1,df)
    }
    df
  },numeric(1))
}

# clean out the variables that are constants
dfe = estDF(dTrainM[,vars])
vars = names(dfe)[dfe>0]


# try the automatic variable treatment

set.seed(239525)

cl = parallel::makeCluster(4)

# build the data treatments on calibration data
treatments = designTreatmentsC(dTrainC,
    vars,yName,yTarget,
    smFactor=2.0,
    parallelCluster=cl)
## [1] "desigining treatments Sat Aug 22 15:33:40 2015"
## [1] "scoring treatments Sat Aug 22 15:33:45 2015"
## [1] "have treatment plan Sat Aug 22 15:35:44 2015"
if(!is.null(cl)) {
    parallel::stopCluster(cl)
    cl = NULL
}

# prepare the training and test sets.
# we don't need to prepare the calibration set;
# we only used it to fit the data treatment parameters
# and the impact-coded models
treatedTrainM = prepare(treatments,dTrainM,
                        pruneSig=c())
treatedTest = prepare(treatments,dTest,
                       pruneSig=c())
varnames = treatments$vars

# remove the catN impact-coded variables; we'll use only the catB impact-coded variables
# catB: Bayesian impact code (each level encoded as an estimated log-odds delta)
# catN: linear regression model against the 0/1 outcome (deprecated, but still present)
isCatN = grepl("catN", varnames, fixed=TRUE)
print(paste("Number of catN variables: ", sum(isCatN)))
## [1] "Number of catN variables:  0"
varnames = varnames[!isCatN]
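
# peek at one of the surviving impact-coded variables: catB encodes
# each categorical level as an estimated log-odds delta for the outcome
# (summary output not shown)
summary(treatedTrainM[['Var218_catB']])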

# yName is the y column
# convert the outcome to TRUE/FALSE
treatedTrainM[[yName]] = treatedTrainM[,yName]==yTarget
treatedTest[[yName]] = treatedTest[,yName]==yTarget

Load in the scoring functions

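# get the "utility" of a glm model: the reduction in deviance
# from the null model (larger means more signal explained)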
get_utility = function(model) {
   model$null.deviance - model$deviance
}

# get the chi-squared significance of a glm model (wrt deviance)
get_significance = function(model, df=NULL) {
  delta_deviance = model$null.deviance - model$deviance
  if(is.null(df)) {
    df = model$df.null - model$df.residual
  }
  pchisq(delta_deviance, df, lower.tail=FALSE)
}
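
# quick sanity check on synthetic data (illustrative, not part of the
# original analysis): for a pure-noise predictor the reported
# significance is a p-value, approximately uniform on [0,1], so it
# should only rarely fall below a strict threshold
set.seed(2015)
dNoise = data.frame(x=rnorm(1000), y=rbinom(1000,1,0.5))
mNoise = glm(y~x, data=dNoise, family=binomial(link='logit'))
print(get_significance(mNoise))
rm(list=c('dNoise','mNoise'))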

# get the signal scores (one-variable glm significances) for the
# variables in a data set; assumes the outcome is the binary variable
# named by yName, and scores the global `varnames` vector defined above
get_chiscores = function(dframe, yName, guessDF) {
  nvar = length(varnames)
  utils = numeric(nvar)
  scores = numeric(nvar)
  for(i in seq_len(nvar)) {
    model = glm(paste(yName,"~",varnames[i]), dframe,
                family=binomial(link="logit"))
    
    # this is a hack, added to try to adjust significance estimate for catB vars
    df = 1
    if((guessDF) && grepl("catB", varnames[i], fixed=TRUE)) {
      df = length(unique(dframe[[ varnames[i] ]]))-1
    }
    utils[[i]] = get_utility(model)
    scores[[i]] = get_significance(model, df)
  }
  
  sframe = data.frame(var=varnames,
                      scores=scores,
                      utilities=utils,
                      stringsAsFactors=FALSE)
  sframe
}

#
# Plot the scores of each variable
# frm has columns var and scores
# (output of get_chiscores)
#
scoreplot = function(frm, threshold, sort=1) {
  n = dim(frm)[1]
  frm$var = reorder(frm$var, frm$scores*sort, FUN=sum)
  frm$goodvar = frm$scores < threshold

  ggplot(frm, aes(x=var, y=scores, ymin=0, ymax=scores, color=goodvar)) +
    geom_pointrange() +
    geom_hline(yintercept=threshold, color="red", linetype=2) +
    scale_color_manual(values=c("TRUE"="darkgreen", "FALSE"="darkgray")) +
    theme(legend.position="none")
}

Prepare plotting frames

# prepare plotting frames
treatedTrainP = treatedTrainM[, yName, drop=FALSE]
treatedTestP = treatedTest[, yName, drop=FALSE]

Do the variable selection, and fit gbm and glm models for comparison

# Use training data to score variables.
vScoresI = get_chiscores(treatedTrainM,yName,FALSE) 

breaks=c(1e-6, 1e-5, 1e-4, 0.001, 0.01, 0.1, 1)
ggplot(vScoresI, aes(x=scores+1e-6)) + geom_density(adjust=0.5) + scale_x_log10("scores", breaks=breaks)

# threshold the significance scores
threshold = 1e-4
selVarsI = vScoresI$var[vScoresI$scores < threshold]
print(paste("Number of candidate variables:", nrow(vScoresI)))
## [1] "Number of candidate variables: 448"
print(paste("Number of variables selected:", length(selVarsI)))
## [1] "Number of variables selected: 87"
print(selVarsI)
##  [1] "Var6_isBAD"                            
##  [2] "Var7_clean"                            
##  [3] "Var7_isBAD"                            
##  [4] "Var13_clean"                           
##  [5] "Var13_isBAD"                           
##  [6] "Var21_isBAD"                           
##  [7] "Var22_isBAD"                           
##  [8] "Var25_isBAD"                           
##  [9] "Var28_isBAD"                           
## [10] "Var35_isBAD"                           
## [11] "Var38_isBAD"                           
## [12] "Var44_isBAD"                           
## [13] "Var65_clean"                           
## [14] "Var65_isBAD"                           
## [15] "Var72_clean"                           
## [16] "Var73_clean"                           
## [17] "Var74_clean"                           
## [18] "Var74_isBAD"                           
## [19] "Var76_isBAD"                           
## [20] "Var78_isBAD"                           
## [21] "Var81_clean"                           
## [22] "Var81_isBAD"                           
## [23] "Var83_isBAD"                           
## [24] "Var85_isBAD"                           
## [25] "Var112_isBAD"                          
## [26] "Var113_clean"                          
## [27] "Var119_isBAD"                          
## [28] "Var123_isBAD"                          
## [29] "Var125_isBAD"                          
## [30] "Var126_clean"                          
## [31] "Var126_isBAD"                          
## [32] "Var132_isBAD"                          
## [33] "Var133_isBAD"                          
## [34] "Var134_isBAD"                          
## [35] "Var140_clean"                          
## [36] "Var140_isBAD"                          
## [37] "Var143_isBAD"                          
## [38] "Var144_clean"                          
## [39] "Var144_isBAD"                          
## [40] "Var153_isBAD"                          
## [41] "Var160_isBAD"                          
## [42] "Var163_isBAD"                          
## [43] "Var173_isBAD"                          
## [44] "Var181_isBAD"                          
## [45] "Var189_clean"                          
## [46] "Var192_catB"                           
## [47] "Var193_catB"                           
## [48] "Var199_catB"                           
## [49] "Var200_catB"                           
## [50] "Var204_catB"                           
## [51] "Var205_lev_x.sJzTlal"                  
## [52] "Var205_lev_x.VpdQ"                     
## [53] "Var205_catB"                           
## [54] "Var206_catB"                           
## [55] "Var207_lev_x.7M47J5GA0pTYIFxg5uy"      
## [56] "Var207_lev_x.DHn_WUyBhW_whjA88g9bvA64_"
## [57] "Var207_lev_x.me75fM6ugJ"               
## [58] "Var207_catB"                           
## [59] "Var210_lev_x.g5HH"                     
## [60] "Var210_lev_x.uKAI"                     
## [61] "Var210_catB"                           
## [62] "Var211_lev_x.L84s"                     
## [63] "Var211_lev_x.Mtgm"                     
## [64] "Var212_catB"                           
## [65] "Var214_catB"                           
## [66] "Var216_catB"                           
## [67] "Var217_catB"                           
## [68] "Var218_lev_x.cJvF"                     
## [69] "Var218_lev_x.UYBR"                     
## [70] "Var218_catB"                           
## [71] "Var221_lev_x.d0EEeJi"                  
## [72] "Var221_lev_x.oslk"                     
## [73] "Var221_lev_x.zCkv"                     
## [74] "Var221_catB"                           
## [75] "Var225_lev_NA"                         
## [76] "Var225_lev_x.ELof"                     
## [77] "Var225_catB"                           
## [78] "Var226_lev_x.FSa2"                     
## [79] "Var226_catB"                           
## [80] "Var227_lev_x.RAYp"                     
## [81] "Var227_lev_x.ZI9m"                     
## [82] "Var227_catB"                           
## [83] "Var228_catB"                           
## [84] "Var229_lev_NA"                         
## [85] "Var229_lev_x.am7c"                     
## [86] "Var229_lev_x.mj86"                     
## [87] "Var229_catB"
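# visualize the significances of all candidate variables against the
# selection threshold, using the scoreplot helper defined above
# (plot not shown)
print(scoreplot(vScoresI, threshold))
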
# Two cases: model with allvars, model with selected vars
strats = list('allvars'=vScoresI$var,
              'selected_vars'=selVarsI)

for(strat in names(strats)) {
  selVars = strats[[strat]]
  formulaS = paste(yName,paste(selVars,collapse=' + '),sep=' ~ ')
  for(mname in c('gbmPred','glmPred')) {
    print("*****************************")
    print(date())
    print(paste(mname,strat,length(selVars)))
    if(mname=='gbmPred') {
      start = Sys.time()
      modelGBMs = gbm(as.formula(formulaS),
                      data=treatedTrainM,
                      distribution='bernoulli',
                      n.trees=500,
                      interaction.depth=2,
                      keep.data=FALSE,
                      cv.folds=5)
      end = Sys.time()
      diff = end-start
      print(paste(strat, mname, "time to fit:", diff))
      nTrees = gbm.perf(modelGBMs)
      treatedTrainP[[mname]] = predict(modelGBMs,newdata=treatedTrainM,type='response',
                                       n.trees=nTrees) 
      treatedTestP[[mname]] = predict(modelGBMs,newdata=treatedTest,type='response',
                                      n.trees=nTrees)
    } else {
      modelglms = glm(as.formula(formulaS),
                      data=treatedTrainM,
                      family=binomial(link='logit')
      )
      treatedTrainP[[mname]] = predict(modelglms,newdata=treatedTrainM,type='response')
      treatedTestP[[mname]] = predict(modelglms,newdata=treatedTest,type='response')
    }
    
    t1 = paste(mname,'training data',strat)
    print(DoubleDensityPlot(treatedTrainP, mname, yName, 
                            title=t1))
    print(ROCPlot(treatedTrainP, mname, yName, 
                  title=t1))
    
    t3 = paste(mname,'test data',strat)
    print(DoubleDensityPlot(treatedTestP, mname, yName, 
                            title=t3))
    print(ROCPlot(treatedTestP, mname, yName, 
                  title=t3))
    print(date())
    print("*****************************")
  }
}
## [1] "*****************************"
## [1] "Sat Aug 22 15:36:43 2015"
## [1] "gbmPred allvars 448"
## [1] "allvars gbmPred time to fit: 7.00861508448919"
## Using cv method...

## [1] "Sat Aug 22 15:43:46 2015"
## [1] "*****************************"
## [1] "*****************************"
## [1] "Sat Aug 22 15:43:46 2015"
## [1] "glmPred allvars 448"

## [1] "Sat Aug 22 15:47:01 2015"
## [1] "*****************************"
## [1] "*****************************"
## [1] "Sat Aug 22 15:47:01 2015"
## [1] "gbmPred selected_vars 87"
## [1] "selected_vars gbmPred time to fit: 1.55934676726659"
## Using cv method...

## [1] "Sat Aug 22 15:48:37 2015"
## [1] "*****************************"
## [1] "*****************************"
## [1] "Sat Aug 22 15:48:37 2015"
## [1] "glmPred selected_vars 87"

## [1] "Sat Aug 22 15:48:44 2015"
## [1] "*****************************"