# devtools::install_github("WinVector/vtreat")
library('vtreat') # This library isn't public yet, intall instructions: http://www.win-vector.com/blog/2014/08/vtreat-designing-a-package-for-variable-treatment/
library('parallel')
# devtools::install_github("WinVector/WCPlots")
library('WVPlots')
## Loading required package: ggplot2
## Loading required package: grid
## Loading required package: gridExtra
## Loading required package: reshape2
## Loading required package: ROCR
## Loading required package: gplots
##
## Attaching package: 'gplots'
##
## The following object is masked from 'package:stats':
##
## lowess
##
## Loading required package: plyr
## Loading required package: stringr
Read in the data
library('gbm')
## Loading required package: survival
## Loading required package: lattice
## Loading required package: splines
## Loaded gbm 2.1.1
# To make the html: echo "library(knitr); knit('KDD2009example.Rmd')" | R --vanilla ; pandoc KDD2009example.md -o KDD2009example.html
# Example of working with KDD2009 data (just to show library at work).
# For vtreat details see: http://www.win-vector.com/blog/2014/08/vtreat-designing-a-package-for-variable-treatment/
# and Chapter 6 of Practical Data Science with R: http://www.amazon.com/Practical-Data-Science/dp/1617291560
# For details on data see: https://github.com/WinVector/zmPDSwR/tree/master/KDD2009
# load the data as in the book
dir = '~/Documents/Projects/DataScienceBook/zmPDSwR/KDD2009/' # change this path to match your directory structure
d = read.table(paste(dir,'orange_small_train.data.gz',sep=''),
header=T,sep='\t',na.strings=c('NA',''), stringsAsFactors=FALSE)
churn = read.table(paste(dir,'orange_small_train_churn.labels.txt',sep=''),
header=F,sep='\t')
d$churn = churn$V1
appetency = read.table(paste(dir,'orange_small_train_appetency.labels.txt',sep=''),
header=F,sep='\t')
d$appetency = appetency$V1
upselling = read.table(paste(dir,'orange_small_train_upselling.labels.txt',sep=''),
header=F,sep='\t')
d$upselling = upselling$V1
set.seed(729375)
d$rgroup = runif(dim(d)[[1]])
# Make the Training (modeling), Calibration (impact coding) and Test sets
dTrainM = subset(d,rgroup<=0.5) # set for building models
dTrainC = subset(d,(rgroup>0.5) & (rgroup<=0.9)) # set for impact coding
dTest = subset(d,rgroup>0.9) # set for evaluation
# clean up
rm(list=c('d','churn','appetency','upselling','dir'))
# get variable and outcome columns
outcomes = c('churn','appetency','upselling')
vars = setdiff(colnames(dTrainM),
c(outcomes,'rgroup'))
yName = 'churn'
yTarget = 1
Vtreat: automatic variable treatment
# get the standard degrees of freedom estimates
estDF = function(d) {
vapply(d,function(v) {
df = length(unique(v)) - 1
if (is.character(v)) {
df = min(1,df)
}
df
},numeric(1))
}
# clean out the variables that are constants
dfe = estDF(dTrainM[,vars])
vars = names(dfe)[dfe>0]
# try the automatic variable treatment
set.seed(239525)
cl = parallel::makeCluster(4)
# build the data treatments on calibration data
treatments = designTreatmentsC(dTrainC,
vars,yName,yTarget,
smFactor=2.0,
parallelCluster=cl)
## [1] "desigining treatments Sat Aug 22 15:33:40 2015"
## [1] "scoring treatments Sat Aug 22 15:33:45 2015"
## [1] "have treatment plan Sat Aug 22 15:35:44 2015"
if(!is.null(cl)) {
parallel::stopCluster(cl)
cl = NULL
}
# prepare the training and test sets
# don't need to prepare the calibration set,
# we only used to to fit data treatment parameters
# and impact coded models
treatedTrainM = prepare(treatments,dTrainM,
pruneSig=c())
treatedTest = prepare(treatments,dTest,
pruneSig=c())
varnames = treatments$vars
# remove the catN impact coded variables; we'll use only the catB impact coded variable
# catB: bayesian models
# catN: linear regression model against 0/1 outcome (deprecated, but still there)
isCatN = grepl("catN", varnames, fixed=TRUE)
print(paste("Number of catN variables: ", sum(isCatN)))
## [1] "Number of catN variables: 0"
varnames = varnames[!isCatN]
# yName is the y column
# convert the outcome to TRUE/FALSE
treatedTrainM[[yName]] = treatedTrainM[,yName]==yTarget
treatedTest[[yName]] = treatedTest[,yName]==yTarget
Load in the scoring functions
get_utility = function(model) {
model$null.deviance - model$deviance
}
# get the chi-squared significance of a glm model (wrt deviance)
get_significance = function(model, df=NULL) {
delta_deviance = model$null.deviance - model$deviance
if(is.null(df)) {
df = model$df.null - model$df.residual
}
pchisq(delta_deviance, df, lower.tail=FALSE)
}
# get the signal scores for the variables in a data set
# assume output is a binary variable named y
get_chiscores = function(dframe, yName, guessDF) {
nvar = length(varnames)
utils = numeric(nvar)
scores = numeric(nvar)
for(i in seq_len(nvar)) {
model = glm(paste(yName,"~",varnames[i]), dframe,
family=binomial(link="logit"))
# this is a hack, added to try to adjust significance estimate for catB vars
df = 1
if((guessDF) && grepl("catB", varnames[i], fixed=TRUE)) {
df = length(unique(dframe[[ varnames[i] ]]))-1
}
utils[[i]] = get_utility(model)
scores[[i]] = get_significance(model, df)
}
sframe = data.frame(var=varnames,
scores=scores,
utilities=utils,
stringsAsFactors=FALSE)
sframe
}
#
# Plot the scores of each variable
# frm has columns var and scores
# (output of get_chiscores)
#
scoreplot = function(frm, threshold, sort=1) {
n = dim(frm)[1]
frm$var = reorder(frm$var, frm$scores*sort, FUN=sum)
frm$goodvar = frm$scores < threshold
ggplot(frm, aes(x=var, y=scores, ymin=0, ymax=scores, color=goodvar)) +
geom_pointrange() +
geom_hline(yintercept=threshold, color="red", linetype=2) +
scale_color_manual(values=c("TRUE"="darkgreen", "FALSE"="darkgray")) +
theme(legend.position="none")
}
Prepare plotting frames
# prepare plotting frames
treatedTrainP = treatedTrainM[, yName, drop=FALSE]
treatedTestP = treatedTest[, yName, drop=FALSE]
Do the variable selection, and fit gbm and glm models for comparison
# Use training data to score variables.
vScoresI = get_chiscores(treatedTrainM,yName,FALSE)
breaks=c(1e-6, 1e-5, 1e-4, 0.001, 0.01, 0.1, 1)
ggplot(vScoresI, aes(x=scores+1e-6)) + geom_density(adjust=0.5) + scale_x_log10("scores", breaks=breaks)
# threshold the significance scores
threshold = 1e-4
selVarsI = vScoresI$var[vScoresI$scores < threshold]
print(paste("Number of candidate variables:", nrow(vScoresI)))
## [1] "Number of candidate variables: 448"
print(paste("Number of variables selected:", length(selVarsI)))
## [1] "Number of variables selected: 87"
print(selVarsI)
## [1] "Var6_isBAD"
## [2] "Var7_clean"
## [3] "Var7_isBAD"
## [4] "Var13_clean"
## [5] "Var13_isBAD"
## [6] "Var21_isBAD"
## [7] "Var22_isBAD"
## [8] "Var25_isBAD"
## [9] "Var28_isBAD"
## [10] "Var35_isBAD"
## [11] "Var38_isBAD"
## [12] "Var44_isBAD"
## [13] "Var65_clean"
## [14] "Var65_isBAD"
## [15] "Var72_clean"
## [16] "Var73_clean"
## [17] "Var74_clean"
## [18] "Var74_isBAD"
## [19] "Var76_isBAD"
## [20] "Var78_isBAD"
## [21] "Var81_clean"
## [22] "Var81_isBAD"
## [23] "Var83_isBAD"
## [24] "Var85_isBAD"
## [25] "Var112_isBAD"
## [26] "Var113_clean"
## [27] "Var119_isBAD"
## [28] "Var123_isBAD"
## [29] "Var125_isBAD"
## [30] "Var126_clean"
## [31] "Var126_isBAD"
## [32] "Var132_isBAD"
## [33] "Var133_isBAD"
## [34] "Var134_isBAD"
## [35] "Var140_clean"
## [36] "Var140_isBAD"
## [37] "Var143_isBAD"
## [38] "Var144_clean"
## [39] "Var144_isBAD"
## [40] "Var153_isBAD"
## [41] "Var160_isBAD"
## [42] "Var163_isBAD"
## [43] "Var173_isBAD"
## [44] "Var181_isBAD"
## [45] "Var189_clean"
## [46] "Var192_catB"
## [47] "Var193_catB"
## [48] "Var199_catB"
## [49] "Var200_catB"
## [50] "Var204_catB"
## [51] "Var205_lev_x.sJzTlal"
## [52] "Var205_lev_x.VpdQ"
## [53] "Var205_catB"
## [54] "Var206_catB"
## [55] "Var207_lev_x.7M47J5GA0pTYIFxg5uy"
## [56] "Var207_lev_x.DHn_WUyBhW_whjA88g9bvA64_"
## [57] "Var207_lev_x.me75fM6ugJ"
## [58] "Var207_catB"
## [59] "Var210_lev_x.g5HH"
## [60] "Var210_lev_x.uKAI"
## [61] "Var210_catB"
## [62] "Var211_lev_x.L84s"
## [63] "Var211_lev_x.Mtgm"
## [64] "Var212_catB"
## [65] "Var214_catB"
## [66] "Var216_catB"
## [67] "Var217_catB"
## [68] "Var218_lev_x.cJvF"
## [69] "Var218_lev_x.UYBR"
## [70] "Var218_catB"
## [71] "Var221_lev_x.d0EEeJi"
## [72] "Var221_lev_x.oslk"
## [73] "Var221_lev_x.zCkv"
## [74] "Var221_catB"
## [75] "Var225_lev_NA"
## [76] "Var225_lev_x.ELof"
## [77] "Var225_catB"
## [78] "Var226_lev_x.FSa2"
## [79] "Var226_catB"
## [80] "Var227_lev_x.RAYp"
## [81] "Var227_lev_x.ZI9m"
## [82] "Var227_catB"
## [83] "Var228_catB"
## [84] "Var229_lev_NA"
## [85] "Var229_lev_x.am7c"
## [86] "Var229_lev_x.mj86"
## [87] "Var229_catB"
# Two cases: model with allvars, model with selected vars
strats = list('allvars'=vScoresI$var,
'selected_vars'=selVarsI)
for(strat in names(strats)) {
selVars = strats[[strat]]
formulaS = paste(yName,paste(selVars,collapse=' + '),sep=' ~ ')
for(mname in c('gbmPred','glmPred')) {
print("*****************************")
print(date())
print(paste(mname,strat,length(selVars)))
if(mname=='gbmPred') {
start = Sys.time()
modelGBMs = gbm(as.formula(formulaS),
data=treatedTrainM,
distribution='bernoulli',
n.trees=500,
interaction.depth=2,
keep.data=FALSE,
cv.folds=5)
end = Sys.time()
diff = end-start
print(paste(strat, mname, "time to fit:", diff))
nTrees = gbm.perf(modelGBMs)
treatedTrainP[[mname]] = predict(modelGBMs,newdata=treatedTrainM,type='response',
n.trees=nTrees)
treatedTestP[[mname]] = predict(modelGBMs,newdata=treatedTest,type='response',
n.trees=nTrees)
} else {
modelglms = glm(as.formula(formulaS),
data=treatedTrainM,
family=binomial(link='logit')
)
treatedTrainP[[mname]] = predict(modelglms,newdata=treatedTrainM,type='response')
treatedTestP[[mname]] = predict(modelglms,newdata=treatedTest,type='response')
}
t1 = paste(mname,'trainingT data',strat)
print(DoubleDensityPlot(treatedTrainP, mname, yName,
title=t1))
print(ROCPlot(treatedTrainP, mname, yName,
title=t1))
t3 = paste(mname,'test data',strat)
print(DoubleDensityPlot(treatedTestP, mname, yName,
title=t3))
print(ROCPlot(treatedTestP, mname, yName,
title=t3))
print(date())
print("*****************************")
}
}
## [1] "*****************************"
## [1] "Sat Aug 22 15:36:43 2015"
## [1] "gbmPred allvars 448"
## [1] "allvars gbmPred time to fit: 7.00861508448919"
## Using cv method...
## [1] "Sat Aug 22 15:43:46 2015"
## [1] "*****************************"
## [1] "*****************************"
## [1] "Sat Aug 22 15:43:46 2015"
## [1] "glmPred allvars 448"
## [1] "Sat Aug 22 15:47:01 2015"
## [1] "*****************************"
## [1] "*****************************"
## [1] "Sat Aug 22 15:47:01 2015"
## [1] "gbmPred selected_vars 87"
## [1] "selected_vars gbmPred time to fit: 1.55934676726659"
## Using cv method...
## [1] "Sat Aug 22 15:48:37 2015"
## [1] "*****************************"
## [1] "*****************************"
## [1] "Sat Aug 22 15:48:37 2015"
## [1] "glmPred selected_vars 87"
## [1] "Sat Aug 22 15:48:44 2015"
## [1] "*****************************"