# load libraries
library('ggplot2')
## Warning: package 'ggplot2' was built under R version 3.2.4
library('reshape2')
library('rpart')
library('ROCR')
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## 
## The following object is masked from 'package:stats':
## 
##     lowess
library('glmnet')
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-2
library('gdata')
## gdata: read.xls support for 'XLS' (Excel 97-2004) files ENABLED.
## 
## gdata: read.xls support for 'XLSX' (Excel 2007+) files ENABLED.
## 
## Attaching package: 'gdata'
## 
## The following object is masked from 'package:stats':
## 
##     nobs
## 
## The following object is masked from 'package:utils':
## 
##     object.size
# load problem
d = read.xls("CTG.xls",
             sheet=2,header=TRUE,skip=1,
             stringsAsFactors=FALSE,blank.lines.skip=TRUE)
vars <- colnames(d)[11:31]
d <- d[,c(vars,'NSP'),drop=FALSE]
d <- d[complete.cases(d),,drop=FALSE]
d[,'NSP'] <- c('Normal','Suspect','Pathological')[d[,'NSP']]
yColumn <- 'NSPbad'
d[,yColumn] <- d[,'NSP']!='Normal'

set.seed(2352)
d$isTest <- runif(nrow(d))<0.25
d$dataLabel <- ifelse(d$isTest,"test data","train data")
formula <- paste(yColumn,paste(vars,collapse=' + '),sep=' ~ ')

print(dim(d))
## [1] 2126   25
print(str(d))
## 'data.frame':    2126 obs. of  25 variables:
##  $ LB       : int  120 132 133 134 132 134 134 122 122 122 ...
##  $ AC.1     : num  0 0.006 0.003 0.003 0.007 0.001 0.001 0 0 0 ...
##  $ FM.1     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ UC.1     : num  0 0.006 0.008 0.008 0.008 0.01 0.013 0 0.002 0.003 ...
##  $ DL.1     : num  0 0.003 0.003 0.003 0 0.009 0.008 0 0 0 ...
##  $ DS.1     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ DP.1     : num  0 0 0 0 0 0.002 0.003 0 0 0 ...
##  $ ASTV     : num  73 17 16 16 16 26 29 83 84 86 ...
##  $ MSTV     : num  0.5 2.1 2.1 2.4 2.4 5.9 6.3 0.5 0.5 0.3 ...
##  $ ALTV     : num  43 0 0 0 0 0 0 6 5 6 ...
##  $ MLTV     : num  2.4 10.4 13.4 23 19.9 0 0 15.6 13.6 10.6 ...
##  $ Width    : int  64 130 130 117 117 150 150 68 68 68 ...
##  $ Min      : int  62 68 68 53 53 50 50 62 62 62 ...
##  $ Max      : int  126 198 198 170 170 200 200 130 130 130 ...
##  $ Nmax     : int  2 6 5 11 9 5 6 0 0 1 ...
##  $ Nzeros   : int  0 1 1 0 0 3 3 0 0 0 ...
##  $ Mode     : int  120 141 141 137 137 76 71 122 122 122 ...
##  $ Mean     : int  137 136 135 134 136 107 107 122 122 122 ...
##  $ Median   : int  121 140 138 137 138 107 106 123 123 123 ...
##  $ Variance : int  73 12 13 13 11 170 215 3 3 1 ...
##  $ Tendency : int  1 0 0 1 1 0 0 1 1 1 ...
##  $ NSP      : chr  "Suspect" "Normal" "Normal" "Normal" ...
##  $ NSPbad   : logi  TRUE FALSE FALSE FALSE FALSE TRUE ...
##  $ isTest   : logi  FALSE FALSE FALSE TRUE TRUE TRUE ...
##  $ dataLabel: chr  "train data" "train data" "train data" "test data" ...
## NULL
print(head(d))
##    LB  AC.1 FM.1  UC.1  DL.1 DS.1  DP.1 ASTV MSTV ALTV MLTV Width Min Max
## 1 120 0.000    0 0.000 0.000    0 0.000   73  0.5   43  2.4    64  62 126
## 2 132 0.006    0 0.006 0.003    0 0.000   17  2.1    0 10.4   130  68 198
## 3 133 0.003    0 0.008 0.003    0 0.000   16  2.1    0 13.4   130  68 198
## 4 134 0.003    0 0.008 0.003    0 0.000   16  2.4    0 23.0   117  53 170
## 5 132 0.007    0 0.008 0.000    0 0.000   16  2.4    0 19.9   117  53 170
## 6 134 0.001    0 0.010 0.009    0 0.002   26  5.9    0  0.0   150  50 200
##   Nmax Nzeros Mode Mean Median Variance Tendency          NSP NSPbad
## 1    2      0  120  137    121       73        1      Suspect   TRUE
## 2    6      1  141  136    140       12        0       Normal  FALSE
## 3    5      1  141  135    138       13        0       Normal  FALSE
## 4   11      0  137  134    137       13        1       Normal  FALSE
## 5    9      0  137  136    138       11        1       Normal  FALSE
## 6    5      3   76  107    107      170        0 Pathological   TRUE
##   isTest  dataLabel
## 1  FALSE train data
## 2  FALSE train data
## 3  FALSE train data
## 4   TRUE  test data
## 5   TRUE  test data
## 6   TRUE  test data
print(summary(d))
##        LB             AC.1               FM.1               UC.1         
##  Min.   :106.0   Min.   :0.000000   Min.   :0.000000   Min.   :0.000000  
##  1st Qu.:126.0   1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.002000  
##  Median :133.0   Median :0.002000   Median :0.000000   Median :0.004000  
##  Mean   :133.3   Mean   :0.003178   Mean   :0.009481   Mean   :0.004366  
##  3rd Qu.:140.0   3rd Qu.:0.006000   3rd Qu.:0.003000   3rd Qu.:0.007000  
##  Max.   :160.0   Max.   :0.019000   Max.   :0.481000   Max.   :0.015000  
##       DL.1               DS.1                DP.1          
##  Min.   :0.000000   Min.   :0.000e+00   Min.   :0.0000000  
##  1st Qu.:0.000000   1st Qu.:0.000e+00   1st Qu.:0.0000000  
##  Median :0.000000   Median :0.000e+00   Median :0.0000000  
##  Mean   :0.001889   Mean   :3.293e-06   Mean   :0.0001585  
##  3rd Qu.:0.003000   3rd Qu.:0.000e+00   3rd Qu.:0.0000000  
##  Max.   :0.015000   Max.   :1.000e-03   Max.   :0.0050000  
##       ASTV            MSTV            ALTV             MLTV       
##  Min.   :12.00   Min.   :0.200   Min.   : 0.000   Min.   : 0.000  
##  1st Qu.:32.00   1st Qu.:0.700   1st Qu.: 0.000   1st Qu.: 4.600  
##  Median :49.00   Median :1.200   Median : 0.000   Median : 7.400  
##  Mean   :46.99   Mean   :1.333   Mean   : 9.847   Mean   : 8.188  
##  3rd Qu.:61.00   3rd Qu.:1.700   3rd Qu.:11.000   3rd Qu.:10.800  
##  Max.   :87.00   Max.   :7.000   Max.   :91.000   Max.   :50.700  
##      Width             Min              Max           Nmax       
##  Min.   :  3.00   Min.   : 50.00   Min.   :122   Min.   : 0.000  
##  1st Qu.: 37.00   1st Qu.: 67.00   1st Qu.:152   1st Qu.: 2.000  
##  Median : 67.50   Median : 93.00   Median :162   Median : 3.000  
##  Mean   : 70.45   Mean   : 93.58   Mean   :164   Mean   : 4.068  
##  3rd Qu.:100.00   3rd Qu.:120.00   3rd Qu.:174   3rd Qu.: 6.000  
##  Max.   :180.00   Max.   :159.00   Max.   :238   Max.   :18.000  
##      Nzeros             Mode            Mean           Median     
##  Min.   : 0.0000   Min.   : 60.0   Min.   : 73.0   Min.   : 77.0  
##  1st Qu.: 0.0000   1st Qu.:129.0   1st Qu.:125.0   1st Qu.:129.0  
##  Median : 0.0000   Median :139.0   Median :136.0   Median :139.0  
##  Mean   : 0.3236   Mean   :137.5   Mean   :134.6   Mean   :138.1  
##  3rd Qu.: 0.0000   3rd Qu.:148.0   3rd Qu.:145.0   3rd Qu.:148.0  
##  Max.   :10.0000   Max.   :187.0   Max.   :182.0   Max.   :186.0  
##     Variance         Tendency           NSP              NSPbad       
##  Min.   :  0.00   Min.   :-1.0000   Length:2126        Mode :logical  
##  1st Qu.:  2.00   1st Qu.: 0.0000   Class :character   FALSE:1655     
##  Median :  7.00   Median : 0.0000   Mode  :character   TRUE :471      
##  Mean   : 18.81   Mean   : 0.3203                      NA's :0        
##  3rd Qu.: 24.00   3rd Qu.: 1.0000                                     
##  Max.   :269.00   Max.   : 1.0000                                     
##    isTest         dataLabel        
##  Mode :logical   Length:2126       
##  FALSE:1602      Class :character  
##  TRUE :524       Mode  :character  
##  NA's :0                           
##                                    
## 
print(summary(as.factor(d$NSP)))
##       Normal Pathological      Suspect 
##         1655          176          295
# load some conveince functions

# calulcate area under the curve of numeric vectors x,y
# length(x)==length(y)
# y>=0, 0<=x<=1 and x increasing
areaCalc <- function(x,y) {
   # append extra points to get rid of degenerate cases
   x <- c(0,x,1)
   y <- c(0,y,1)
   n <- length(x)
   sum(0.5*(y[-1]+y[-n])*(x[-1]-x[-n]))
}


gainCurve = function(truthcol, predcol, title) {
  truthcol <- as.numeric(truthcol)
  # data frame of pred and truth, sorted in order of the predictions
  d = data.frame(predcol=predcol,truthcol=truthcol)
  predord = order(d[['predcol']], decreasing=TRUE) # reorder, with highest first
  wizard = order(d[['truthcol']], decreasing=TRUE)
  npop = dim(d)[1]
  
  # data frame the cumulative prediction/truth as a function
  # of the fraction of the population we're considering, highest first
  results = data.frame(pctpop= (1:npop)/npop,
                       model = cumsum(d[predord,'truthcol'])/sum(d[['truthcol']]),
                       wizard = cumsum(d[wizard, 'truthcol'])/sum(d[['truthcol']]))
  
  # calculate the areas under each curve
  # gini score is 2* (area - 0.5)
  idealArea = areaCalc(results$pctpop,results$wizard) - 0.5
  modelArea = areaCalc(results$pctpop,results$model) - 0.5
  giniScore = modelArea/idealArea # actually, normalized gini score
  
  # melt the frame into the tall form, for plotting
  results = melt(results, id.vars="pctpop", measure.vars=c("model", "wizard"),
                 variable.name="sort_criterion", value.name="pct_outcome")
  
  gplot = ggplot(data=results, aes(x=pctpop, y=pct_outcome, color=sort_criterion)) + 
    geom_point() + geom_line() + 
    geom_abline(color="gray") +
    ggtitle(paste("Gain curve,", title, '\n', 
       'relative Gini score', format(giniScore,digits=2))) +
       xlab("% items in score order") + ylab("% total category") +
       scale_x_continuous(breaks=seq(0,1,0.1)) +
       scale_y_continuous(breaks=seq(0,1,0.1)) +
    scale_color_manual(values=c('model'='darkblue', 'wizard'='darkgreen'))
  gplot
}


plotROC <- function(title,outcol,predcol) {
  pred <- prediction(predcol,outcol)
  perf <- performance(pred,'tpr','fpr')
  auc <- as.numeric(performance(pred,'auc')@y.values)
  pf <- data.frame(
    FalsePositiveRate=perf@x.values[[1]],
    TruePositiveRate=perf@y.values[[1]])
  plot=ggplot() +
    geom_ribbon(data=pf,aes(x=FalsePositiveRate,ymax=TruePositiveRate,ymin=0),
      fill='blue',alpha=0.3) +
      geom_point(data=pf,aes(x=FalsePositiveRate,y=TruePositiveRate)) +
      geom_line(aes(x=c(0,1),y=c(0,1))) + coord_fixed() +
      ggtitle(paste(title,'\nAUC:',format(auc,digits=2)))
  list(pf=pf,plot=plot)
}


deviance <- function(truth,pred,epsilon=0) {
  pred = pmax(pred, epsilon)
  pred = pmin(pred, 1-epsilon)
  S = 0.0 # assumed log-likelihood of saturated model
  -2*(sum(ifelse(truth,log(pred),log(1-pred)))-S)
}


reportStats <- function(d,test,modelName,title,epsilon) {
  dSub <- d[d$isTest==test,,drop=FALSE]
  tab <- table(truth=dSub[,yColumn],pred=dSub[,modelName]>0.5)
  accuracy <- (tab[1,1] + tab[2,2])/sum(tab)
  note = ifelse(test,'test','train')
  print(paste('\t',note,'accuracy',modelName,format(accuracy,digits=2)))
  residual.deviance <- deviance(dSub[,yColumn],dSub[,modelName],epsilon)
  #print(paste('\tresidual.deviance',residual.deviance))
  null.deviance <- deviance(dSub[,yColumn],mean(dSub[,yColumn]),epsilon)
  #print(paste('\tnull.deviance',null.deviance))
  print(paste("\tmodel explained a",
              format((1-residual.deviance/null.deviance),digits=2),
            "fraction of the variation on",note))  
}

report <- function(d,modelName,title,epsilon=1.0e-2) {
  print("***********")
  print(paste("model",modelName,title))
  reportStats(d,FALSE,modelName,title,epsilon)
  reportStats(d,TRUE,modelName,title,epsilon)
  print(ggplot(data=d,aes_string(x=modelName,color=yColumn)) + 
    geom_density() + facet_wrap(~dataLabel,ncol=1,scales='free_y') +
    ggtitle(title))
  print(plotROC(paste(title,'train'),
                d[d$isTest==FALSE,yColumn],
                d[d$isTest==FALSE,modelName])$plot)
  print(plotROC(paste(title,'test'),
                d[d$isTest==TRUE,yColumn],
                d[d$isTest==TRUE,modelName])$plot)
  print(gainCurve(d[d$isTest==FALSE,yColumn],
                d[d$isTest==FALSE,modelName],
                paste(title,'train')))
  print(gainCurve(d[d$isTest==TRUE,yColumn],
                d[d$isTest==TRUE,modelName],
                paste(title,'test')))
  print("***********")
}
# build a decision tree
treeModelPred <- function(d,vars) {
  formula <- paste('as.factor(',yColumn,') ~ ',paste(vars,collapse=' + '),sep='')
  modelTree <- rpart(formula,data=d)
  predict(modelTree,newdata=d,type='prob')[,'TRUE',drop=TRUE]
}

d$modelTree <- treeModelPred(d,vars)
report(d,'modelTree',"decision tree (with some missing values)")
## [1] "***********"
## [1] "model modelTree decision tree (with some missing values)"
## [1] "\t train accuracy modelTree 0.95"
## [1] "\tmodel explained a 0.62 fraction of the variation on train"
## [1] "\t test accuracy modelTree 0.94"
## [1] "\tmodel explained a 0.63 fraction of the variation on test"

## [1] "***********"
# build a GLMNET model
glmNetModelPred <- function(d,vars) {
  # Nota bene: only save to all as.matrix() on a dataframe that is all numeric
  # is it is in this example.
  cv <- cv.glmnet(x=as.matrix(d[!d$isTest,vars]),y=d[!d$isTest,yColumn,drop=TRUE],
                family='binomial')
  predict(cv,newx=as.matrix(d[,vars]),type='response')[,1]
}

d$modelEGLM <- glmNetModelPred(d,vars)
report(d,'modelEGLM',"glmnet")
## [1] "***********"
## [1] "model modelEGLM glmnet"
## [1] "\t train accuracy modelEGLM 0.91"
## [1] "\tmodel explained a 0.6 fraction of the variation on train"
## [1] "\t test accuracy modelEGLM 0.92"
## [1] "\tmodel explained a 0.61 fraction of the variation on test"

## [1] "***********"
# force some values to missing at random (unifromative)
d2 <- d
nMissing <- 1000
set.seed(235)
for(v in sample(vars,nMissing,replace=TRUE)) {
  i = sample.int(nrow(d2),1)
  d2[i,v] <- NA
}
# try to re-build a decision tree model
d$modelTreeE <- treeModelPred(d2,vars)
report(d,'modelTreeE',"decision tree")
## [1] "***********"
## [1] "model modelTreeE decision tree"
## [1] "\t train accuracy modelTreeE 0.94"
## [1] "\tmodel explained a 0.58 fraction of the variation on train"
## [1] "\t test accuracy modelTreeE 0.94"
## [1] "\tmodel explained a 0.59 fraction of the variation on test"

## [1] "***********"
# try to re-build a GLMNET model
tryCatch(
  glmNetModelPred(d2,vars),
  warning = function(w) {print(paste('warning',w)); c()},
  error = function(e) {print(paste('error',e)) ; c()})
## [1] "error Error in lognet(x, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : NA/NaN/Inf in foreign function call (arg 5)\n"
## NULL
# Install a library try to work around problem
# install.packages('vtreat')
library('vtreat') # https://github.com/WinVector/vtreat
# try a fix for the missing values
treatmentsC <- designTreatmentsC(d2[!d2$isTest,],vars,yColumn,TRUE,verbose=FALSE)
dTreated <- prepare(treatmentsC,d2,pruneSig=c())
treatedVars <- setdiff(colnames(dTreated),c(yColumn,'isTest'))
dTreated$isTest <- d2$isTest
print(treatedVars)
##  [1] "LB_clean"       "LB_isBAD"       "AC.1_clean"     "AC.1_isBAD"    
##  [5] "FM.1_clean"     "FM.1_isBAD"     "UC.1_clean"     "UC.1_isBAD"    
##  [9] "DL.1_clean"     "DL.1_isBAD"     "DS.1_clean"     "DS.1_isBAD"    
## [13] "DP.1_clean"     "DP.1_isBAD"     "ASTV_clean"     "ASTV_isBAD"    
## [17] "MSTV_clean"     "MSTV_isBAD"     "ALTV_clean"     "ALTV_isBAD"    
## [21] "MLTV_clean"     "MLTV_isBAD"     "Width_clean"    "Width_isBAD"   
## [25] "Min_clean"      "Min_isBAD"      "Max_clean"      "Max_isBAD"     
## [29] "Nmax_clean"     "Nmax_isBAD"     "Nzeros_clean"   "Nzeros_isBAD"  
## [33] "Mode_clean"     "Mode_isBAD"     "Mean_clean"     "Mean_isBAD"    
## [37] "Median_clean"   "Median_isBAD"   "Variance_clean" "Variance_isBAD"
## [41] "Tendency_clean" "Tendency_isBAD"
print(head(dTreated))
##   LB_clean LB_isBAD AC.1_clean AC.1_isBAD FM.1_clean FM.1_isBAD UC.1_clean
## 1      120        0      0.000          0          0          0      0.000
## 2      132        0      0.006          0          0          0      0.006
## 3      133        0      0.003          0          0          0      0.008
## 4      134        0      0.003          0          0          0      0.008
## 5      132        0      0.007          0          0          0      0.008
## 6      134        0      0.001          0          0          0      0.010
##   UC.1_isBAD DL.1_clean DL.1_isBAD   DS.1_clean DS.1_isBAD DP.1_clean
## 1          0      0.000          0 0.000000e+00          0      0.000
## 2          0      0.003          0 0.000000e+00          0      0.000
## 3          0      0.003          0 0.000000e+00          0      0.000
## 4          0      0.003          0 1.918159e-06          1      0.000
## 5          0      0.000          0 0.000000e+00          0      0.000
## 6          0      0.009          0 0.000000e+00          0      0.002
##   DP.1_isBAD ASTV_clean ASTV_isBAD MSTV_clean MSTV_isBAD ALTV_clean
## 1          0         73          0        0.5          0         43
## 2          0         17          0        2.1          0          0
## 3          0         16          0        2.1          0          0
## 4          0         16          0        2.4          0          0
## 5          0         16          0        2.4          0          0
## 6          0         26          0        5.9          0          0
##   ALTV_isBAD MLTV_clean MLTV_isBAD Width_clean Width_isBAD Min_clean
## 1          0        2.4          0          64           0        62
## 2          0       10.4          0         130           0        68
## 3          0       13.4          0         130           0        68
## 4          0       23.0          0         117           0        53
## 5          0       19.9          0         117           0        53
## 6          0        0.0          0         150           0        50
##   Min_isBAD Max_clean Max_isBAD Nmax_clean Nmax_isBAD Nzeros_clean
## 1         0  126.0000         0          2          0            0
## 2         0  198.0000         0          6          0            1
## 3         0  198.0000         0          5          0            1
## 4         0  170.0000         0         11          0            0
## 5         0  164.1275         1          9          0            0
## 6         0  200.0000         0          5          0            3
##   Nzeros_isBAD Mode_clean Mode_isBAD Mean_clean Mean_isBAD Median_clean
## 1            0        120          0        137          0          121
## 2            0        141          0        136          0          140
## 3            0        141          0        135          0          138
## 4            0        137          0        134          0          137
## 5            0        137          0        136          0          138
## 6            0         76          0        107          0          107
##   Median_isBAD Variance_clean Variance_isBAD Tendency_clean Tendency_isBAD
## 1            0             73              0              1              0
## 2            0             12              0              0              0
## 3            0             13              0              0              0
## 4            0             13              0              1              0
## 5            0             11              0              1              0
## 6            0            170              0              0              0
##   NSPbad isTest
## 1   TRUE  FALSE
## 2  FALSE  FALSE
## 3  FALSE  FALSE
## 4  FALSE   TRUE
## 5  FALSE   TRUE
## 6   TRUE   TRUE
d$modelEGLME <- glmNetModelPred(dTreated,treatedVars)
report(d,'modelEGLME',"glmnet (with missing data)")
## [1] "***********"
## [1] "model modelEGLME glmnet (with missing data)"
## [1] "\t train accuracy modelEGLME 0.91"
## [1] "\tmodel explained a 0.58 fraction of the variation on train"
## [1] "\t test accuracy modelEGLME 0.9"
## [1] "\tmodel explained a 0.58 fraction of the variation on test"

## [1] "***********"
# show what informative data loss can look like
d2 <- d
nMissing <- 1000
set.seed(235)
for(v in sample(vars,nMissing,replace=TRUE)) {
  i = sample.int(nrow(d2),1)
  # suppose data is only lost on negative examples (extreme form of informative!)
  if(!d2[i,yColumn]) {
    d2[i,v] <- NA
  }
}
treatmentsC <- designTreatmentsC(d2[!d2$isTest,],vars,yColumn,TRUE,verbose=FALSE)
dTreated <- prepare(treatmentsC,d2,pruneSig=c())
treatedVars <- setdiff(colnames(dTreated),c(yColumn,'isTest'))
dTreated$isTest <- d2$isTest
print(treatedVars)
##  [1] "LB_clean"       "LB_isBAD"       "AC.1_clean"     "AC.1_isBAD"    
##  [5] "FM.1_clean"     "FM.1_isBAD"     "UC.1_clean"     "UC.1_isBAD"    
##  [9] "DL.1_clean"     "DL.1_isBAD"     "DS.1_clean"     "DS.1_isBAD"    
## [13] "DP.1_clean"     "DP.1_isBAD"     "ASTV_clean"     "ASTV_isBAD"    
## [17] "MSTV_clean"     "MSTV_isBAD"     "ALTV_clean"     "ALTV_isBAD"    
## [21] "MLTV_clean"     "MLTV_isBAD"     "Width_clean"    "Width_isBAD"   
## [25] "Min_clean"      "Min_isBAD"      "Max_clean"      "Max_isBAD"     
## [29] "Nmax_clean"     "Nmax_isBAD"     "Nzeros_clean"   "Nzeros_isBAD"  
## [33] "Mode_clean"     "Mode_isBAD"     "Mean_clean"     "Mean_isBAD"    
## [37] "Median_clean"   "Median_isBAD"   "Variance_clean" "Variance_isBAD"
## [41] "Tendency_clean" "Tendency_isBAD"
print(head(dTreated))
##   LB_clean LB_isBAD AC.1_clean AC.1_isBAD FM.1_clean FM.1_isBAD UC.1_clean
## 1      120        0      0.000          0          0          0      0.000
## 2      132        0      0.006          0          0          0      0.006
## 3      133        0      0.003          0          0          0      0.008
## 4      134        0      0.003          0          0          0      0.008
## 5      132        0      0.007          0          0          0      0.008
## 6      134        0      0.001          0          0          0      0.010
##   UC.1_isBAD DL.1_clean DL.1_isBAD   DS.1_clean DS.1_isBAD DP.1_clean
## 1          0      0.000          0 0.000000e+00          0      0.000
## 2          0      0.003          0 0.000000e+00          0      0.000
## 3          0      0.003          0 0.000000e+00          0      0.000
## 4          0      0.003          0 1.907184e-06          1      0.000
## 5          0      0.000          0 0.000000e+00          0      0.000
## 6          0      0.009          0 0.000000e+00          0      0.002
##   DP.1_isBAD ASTV_clean ASTV_isBAD MSTV_clean MSTV_isBAD ALTV_clean
## 1          0         73          0        0.5          0         43
## 2          0         17          0        2.1          0          0
## 3          0         16          0        2.1          0          0
## 4          0         16          0        2.4          0          0
## 5          0         16          0        2.4          0          0
## 6          0         26          0        5.9          0          0
##   ALTV_isBAD MLTV_clean MLTV_isBAD Width_clean Width_isBAD Min_clean
## 1          0        2.4          0          64           0        62
## 2          0       10.4          0         130           0        68
## 3          0       13.4          0         130           0        68
## 4          0       23.0          0         117           0        53
## 5          0       19.9          0         117           0        53
## 6          0        0.0          0         150           0        50
##   Min_isBAD Max_clean Max_isBAD Nmax_clean Nmax_isBAD Nzeros_clean
## 1         0   126.000         0          2          0            0
## 2         0   198.000         0          6          0            1
## 3         0   198.000         0          5          0            1
## 4         0   170.000         0         11          0            0
## 5         0   164.075         1          9          0            0
## 6         0   200.000         0          5          0            3
##   Nzeros_isBAD Mode_clean Mode_isBAD Mean_clean Mean_isBAD Median_clean
## 1            0        120          0        137          0          121
## 2            0        141          0        136          0          140
## 3            0        141          0        135          0          138
## 4            0        137          0        134          0          137
## 5            0        137          0        136          0          138
## 6            0         76          0        107          0          107
##   Median_isBAD Variance_clean Variance_isBAD Tendency_clean Tendency_isBAD
## 1            0             73              0              1              0
## 2            0             12              0              0              0
## 3            0             13              0              0              0
## 4            0             13              0              1              0
## 5            0             11              0              1              0
## 6            0            170              0              0              0
##   NSPbad isTest
## 1   TRUE  FALSE
## 2  FALSE  FALSE
## 3  FALSE  FALSE
## 4  FALSE   TRUE
## 5  FALSE   TRUE
## 6   TRUE   TRUE
d$modelEGLMEI <- glmNetModelPred(dTreated,treatedVars)
report(d,'modelEGLMEI',"glmnet (with informative missing data)")
## [1] "***********"
## [1] "model modelEGLMEI glmnet (with informative missing data)"
## [1] "\t train accuracy modelEGLMEI 0.93"
## [1] "\tmodel explained a 0.69 fraction of the variation on train"
## [1] "\t test accuracy modelEGLMEI 0.94"
## [1] "\tmodel explained a 0.7 fraction of the variation on test"

## [1] "***********"