[KDD2009 example](http://www.sigkdd.org/kdd-cup-2009-customer-relationship-prediction). Winners had hold-out AUC of 0.7611 on churn. See [here](https://github.com/WinVector/zmPDSwR/tree/master/KDD2009) for more details.

```{r kddexlibs, tidy=FALSE}
# Setup chunk: attach the packages used by this document, read the
# orange_small_train table and its three label files from `dir`, split the
# rows into dTrain/dTest, and define the modeling variables and outcome.

# load some libraries
# http://www.win-vector.com/blog/2014/08/vtreat-designing-a-package-for-variable-treatment/
library('vtreat')
packageVersion('vtreat')
# devtools::install_github("WinVector/WVPlots")
library('WVPlots')
library('parallel')
library('gbm')
#library('class')
library('ggplot2')
library('glmnet')
library('xgboost')

# load the data as in the book
# change this path to match your directory structure
# (keep the trailing '/': file names are appended directly with paste0)
dir <- '~/Documents/work/PracticalDataScienceWithR/zmPDSwR/KDD2009/'

# Empty strings are read as NA; strings stay character (no factors).
d <- read.table(paste0(dir, 'orange_small_train.data.gz'),
                header = TRUE, sep = '\t', na.strings = c('NA', ''),
                stringsAsFactors = FALSE)

# Each label file has no header row; its single column (V1) is attached
# to `d` under the outcome's name.
churn <- read.table(paste0(dir, 'orange_small_train_churn.labels.txt'),
                    header = FALSE, sep = '\t')
d$churn <- churn$V1
appetency <- read.table(paste0(dir, 'orange_small_train_appetency.labels.txt'),
                        header = FALSE, sep = '\t')
d$appetency <- appetency$V1
upselling <- read.table(paste0(dir, 'orange_small_train_upselling.labels.txt'),
                        header = FALSE, sep = '\t')
d$upselling <- upselling$V1

# Reproducible random split: one uniform draw per row, rows with a draw
# <= 0.9 go to training and the rest to test.
set.seed(729375)
rgroup <- runif(nrow(d))
dTrain <- d[rgroup <= 0.9, ]  # set for building models
dTest <- d[rgroup > 0.9, ]    # set for evaluation

# Flip to TRUE to work on 100-row samples of each set for quick runs.
debug <- FALSE
if (debug) {
  dTrain <- dTrain[sample.int(nrow(dTrain), 100), ]
  dTest <- dTest[sample.int(nrow(dTest), 100), ]
}

# Drop the large intermediates now that the split frames exist.
rm(list = c('d', 'churn', 'appetency', 'upselling', 'dir'))

outcomes <- c('churn', 'appetency', 'upselling')
# 'rgroup' is never attached to `d` in this chunk; listing it here only
# guards against it being treated as an input variable.
nonvars <- c(outcomes, 'rgroup')
# Candidate input variables: every column that is not an outcome.
vars <- setdiff(colnames(dTrain), nonvars)
yName <- 'churn'  # outcome column modeled in the rest of the document
yTarget <- 1      # outcome value passed to vtreat as the target level
```

```{r kdddesign, tidy=FALSE}
# build data treatments

set.seed(239525)

cl <- c()
if(!debug) {
  ncore <- parallel::detectCores()
  cl <- parallel::makeCluster(ncore)
}

# build treatments
trainPlan = mkCrossFrameCExperiment(dTrain,
    vars,yName,yTarget,
    smFactor=2.0,
    parallelCluster=cl)
print(trainPlan$method) treatmentsC = trainPlan$treatments treatedTrainM = trainPlan$crossFrame kddSig = 1/nrow(treatmentsC$scoreFrame) print(kddSig) selvars = treatmentsC$scoreFrame$varName[treatmentsC$scoreFrame$sig