R version 3.2.1 (2015-06-18) -- "World-Famous Astronaut"
Copyright (C) 2015 The R Foundation for Statistical Computing
Platform: x86_64-pc-linux-gnu (64-bit)

R is free software and comes with ABSOLUTELY NO WARRANTY.
You are welcome to redistribute it under certain conditions.
Type 'license()' or 'licence()' for distribution details.

  Natural language support but running in an English locale

R is a collaborative project with many contributors.
Type 'contributors()' for more information and
'citation()' on how to cite R or R packages in publications.

Type 'demo()' for some demos, 'help()' for on-line help, or
'help.start()' for an HTML browser interface to help.
Type 'q()' to quit R.

> 
> source('Lfns.R')
> source('Afns.R')
Loading required package: ggplot2
Loading required package: grid
Loading required package: gridExtra
Loading required package: reshape2
Loading required package: ROCR
Loading required package: gplots

Attaching package: ‘gplots’

The following object is masked from ‘package:stats’:

    lowess

Loading required package: plyr
Loading required package: stringr
Loading required package: survival
Loading required package: lattice
Loading required package: splines
Loaded gbm 2.1.1
randomForest 4.6-10
Type rfNews() to see new features/changes/bug fixes.

Attaching package: ‘dplyr’

The following object is masked from ‘package:randomForest’:

    combine

The following objects are masked from ‘package:data.table’:

    between, last

The following objects are masked from ‘package:plyr’:

    arrange, count, desc, failwith, id, mutate, rename, summarise, summarize

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union

> 
> # load the data as in the book
> # change this path to match your directory structure
> #dir = '~/Documents/work/PracticalDataScienceWithR/zmPDSwR/KDD2009/'
> dir = './'
> 
> d = read.table(paste(dir,'orange_small_train.data.gz',sep=''),
+                header=T,sep='\t',na.strings=c('NA',''),
+                stringsAsFactors=FALSE)
> churn = read.table(paste(dir,'orange_small_train_churn.labels.txt',sep=''),
+                    header=F,sep='\t')
> d$churn = churn$V1
> appetency = read.table(paste(dir,'orange_small_train_appetency.labels.txt',sep=''),
+                        header=F,sep='\t')
> d$appetency = appetency$V1
> upselling = read.table(paste(dir,'orange_small_train_upselling.labels.txt',sep=''),
+                        header=F,sep='\t')
> d$upselling = upselling$V1
> set.seed(729375)
> d$rgroup = runif(dim(d)[[1]])
> dTrainM = subset(d,rgroup<=0.7)  # set for building models
> dTrainC = subset(d,(rgroup>0.7) & (rgroup<=0.9))  # set for impact coding
> dTest = subset(d,rgroup>0.9)  # set for evaluation
> rm(list=c('d','churn','appetency','upselling','dir'))
> outcomes = c('churn','appetency','upselling')
> vars = setdiff(colnames(dTrainM),
+                c(outcomes,'rgroup'))
> yName = 'churn'
> yTarget = 1
> 
> set.seed(239525)
> nCoreEstimate <- parallel::detectCores()
> print(paste('core estimate',nCoreEstimate))
[1] "core estimate 32"
> parallelCluster = parallel::makeCluster(nCoreEstimate)
> 
> # build treatments on just the coding data
> treatmentsC = designTreatmentsC(dTrainC,
+                                 vars,yName,yTarget,
+                                 smFactor=2.0,
+                                 parallelCluster=parallelCluster,
+                                 scoreVars=TRUE)
[1] "desigining treatments Sun Aug 16 19:33:14 2015"
[1] "scoring columns Sun Aug 16 19:33:16 2015"
[1] "have treatment plan Sun Aug 16 19:33:27 2015"
> 
> 
> # prepare data
> treatedTrainM = prepare(treatmentsC,
+                         dTrainM,
+                         pruneLevel=c())
> varSet = setdiff(colnames(treatedTrainM),yName)
> treatedTrainM[[yName]] = treatedTrainM[[yName]]==yTarget
> print(summary(treatedTrainM[[yName]]))
   Mode   FALSE    TRUE    NA's 
logical   32449    2572       0 
> 
> treatedTest = prepare(treatmentsC,
+                       dTest,
+                       pruneLevel=c())
> treatedTest[[yName]] = treatedTest[[yName]]==yTarget
> print(summary(treatedTest[[yName]]))
   Mode   FALSE    TRUE    NA's 
logical    4619     353       0 
> 
> 
> chosenVars <- names(treatmentsC$varScores)[treatmentsC$varScores<1]
> 
> 
> # debug
> # allFitters <- list('logistic regression'=doFitApplyLR,
> #                    'null model'=doFitApplyNullModel)
> 
> 
> # get performance on train and test
> scoreFList <- list(train=treatedTrainM,test=treatedTest)
> mkWorkerF1 <- function(allFitters,yName,chosenVars,treatedTrainM,scoreFList) {
+   force(allFitters)
+   force(yName)
+   force(chosenVars)
+   force(treatedTrainM)
+   force(scoreFList)
+   function(modelTitle) {
+     source('Lfns.R')
+     source('Afns.R')
+     fitter <- allFitters[[modelTitle]]
+     # not sure it is safe to pass parallel cluster to workers
+     bootScores <- fitter(yName,chosenVars,treatedTrainM,scoreFList,
+                          bootScore=TRUE,parallelCluster=c())
+     cbind(data.frame(model=modelTitle,
+                      stringsAsFactors=FALSE),
+           data.frame(as.list(unlist(bootScores))))
+   }
+ }
> w1 <- mkWorkerF1(allFitters,yName,chosenVars,
+                  treatedTrainM,scoreFList)
> resList <- parallel::parLapply(parallelCluster,names(allFitters),w1)
> if(length(resList)!=length(allFitters)) {
+   stop("not all results came back from parLapply")
+ }
> 
> 
> 
> # get permuted training performance
> n <- nrow(treatedTrainM)
> yPerms <- lapply(1:10,function(i) {
+   list(repNum=i,
+        yP=treatedTrainM[[yName]][sample.int(n,n,replace=FALSE)])})
> tasks <- list()
> for(yP in yPerms) {
+   for(modelTitle in names(allFitters)) {
+     tasks[[1+length(tasks)]] <- list(repNum=yP$repNum,yP=yP$yP,modelTitle=modelTitle)
+   }
+ }
> mkWorkerF2 <- function(allFitters,yName,chosenVars,treatedTrainM) {
+   force(allFitters)
+   force(yName)
+   force(chosenVars)
+   force(treatedTrainM)
+   function(task) {
+     source('Lfns.R')
+     source('Afns.R')
+     modelTitle <- task$modelTitle
+     yP <- task$yP
+     repNum <- task$repNum
+     fitter <- allFitters[[modelTitle]]
+     treatedTrainP <- treatedTrainM
+     treatedTrainP[[yName]] <- yP
+     pScores <- fitter(yName,chosenVars,treatedTrainP,
+                       list(xptrain=treatedTrainP))
+     di <- cbind(data.frame(model=modelTitle,
+                            repNum=repNum,
+                            stringsAsFactors=FALSE),
+                 data.frame(as.list(unlist(pScores))))
+   }
+ }
> w2 <- mkWorkerF2(allFitters,yName,chosenVars,treatedTrainM)
> resListP <- parallel::parLapply(parallelCluster,tasks,w2)
> if(length(resListP)!=length(tasks)) {
+   stop("not all results came back from parLapply")
+ }
> 
> 
> 
> # shutdown, clean up
> if(!is.null(parallelCluster)) {
+   parallel::stopCluster(parallelCluster)
+   parallelCluster = NULL
+ }
> 
> save(list=ls(),file="bsteps.RData")
> 
> proc.time()
    user   system  elapsed 
  46.032    2.893 1163.241 
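
Note: the fitters referenced above (allFitters, doFitApplyLR, doFitApplyNullModel) are defined in Afns.R, which is not part of this transcript. The sketch below is a minimal, self-contained illustration of the worker-factory pattern used in mkWorkerF1/mkWorkerF2: build a closure with force() so the captured arguments are concrete values that ship to the parallel::parLapply workers along with the function. The toy fitter is a made-up stand-in for illustration only, not the actual Afns.R code.

# Minimal sketch of the closure/force() pattern used with parLapply above.
# toyFitter and mkToyWorker are hypothetical stand-ins; the real fitters
# and the allFitters list live in Afns.R (not shown in this transcript).
library(parallel)

toyFitter <- function(yName, d) {
  # toy "score": accuracy of always predicting the majority class
  max(table(d[[yName]]))/nrow(d)
}

mkToyWorker <- function(fitter, yName, d) {
  # force() the arguments so the closure captures evaluated values
  # (not unevaluated promises) before it is serialized to the workers
  force(fitter)
  force(yName)
  force(d)
  function(modelTitle) {
    data.frame(model=modelTitle,
               score=fitter(yName, d),
               stringsAsFactors=FALSE)
  }
}

dToy <- data.frame(y=rbinom(100, 1, 0.3))
cl <- makeCluster(2)
resToy <- parLapply(cl, c('toy model A', 'toy model B'),
                    mkToyWorker(toyFitter, 'y', dToy))
stopCluster(cl)
print(do.call(rbind, resToy))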