# load libraries
library('ggplot2')
## Warning: package 'ggplot2' was built under R version 3.2.4
library('reshape2')
library('rpart')
library('ROCR')
## Loading required package: gplots
##
## Attaching package: 'gplots'
##
## The following object is masked from 'package:stats':
##
## lowess
library('glmnet')
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-2
library('gdata')
## gdata: read.xls support for 'XLS' (Excel 97-2004) files ENABLED.
##
## gdata: read.xls support for 'XLSX' (Excel 2007+) files ENABLED.
##
## Attaching package: 'gdata'
##
## The following object is masked from 'package:stats':
##
## nobs
##
## The following object is masked from 'package:utils':
##
## object.size
# load problem
d = read.xls("CTG.xls",
sheet=2,header=TRUE,skip=1,
stringsAsFactors=FALSE,blank.lines.skip=TRUE)
vars <- colnames(d)[11:31]
d <- d[,c(vars,'NSP'),drop=FALSE]
d <- d[complete.cases(d),,drop=FALSE]
d[,'NSP'] <- c('Normal','Suspect','Pathological')[d[,'NSP']]
yColumn <- 'NSPbad'
d[,yColumn] <- d[,'NSP']!='Normal'
set.seed(2352)
d$isTest <- runif(nrow(d))<0.25
d$dataLabel <- ifelse(d$isTest,"test data","train data")
formula <- paste(yColumn,paste(vars,collapse=' + '),sep=' ~ ')
print(dim(d))
## [1] 2126 25
print(str(d))
## 'data.frame': 2126 obs. of 25 variables:
## $ LB : int 120 132 133 134 132 134 134 122 122 122 ...
## $ AC.1 : num 0 0.006 0.003 0.003 0.007 0.001 0.001 0 0 0 ...
## $ FM.1 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ UC.1 : num 0 0.006 0.008 0.008 0.008 0.01 0.013 0 0.002 0.003 ...
## $ DL.1 : num 0 0.003 0.003 0.003 0 0.009 0.008 0 0 0 ...
## $ DS.1 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ DP.1 : num 0 0 0 0 0 0.002 0.003 0 0 0 ...
## $ ASTV : num 73 17 16 16 16 26 29 83 84 86 ...
## $ MSTV : num 0.5 2.1 2.1 2.4 2.4 5.9 6.3 0.5 0.5 0.3 ...
## $ ALTV : num 43 0 0 0 0 0 0 6 5 6 ...
## $ MLTV : num 2.4 10.4 13.4 23 19.9 0 0 15.6 13.6 10.6 ...
## $ Width : int 64 130 130 117 117 150 150 68 68 68 ...
## $ Min : int 62 68 68 53 53 50 50 62 62 62 ...
## $ Max : int 126 198 198 170 170 200 200 130 130 130 ...
## $ Nmax : int 2 6 5 11 9 5 6 0 0 1 ...
## $ Nzeros : int 0 1 1 0 0 3 3 0 0 0 ...
## $ Mode : int 120 141 141 137 137 76 71 122 122 122 ...
## $ Mean : int 137 136 135 134 136 107 107 122 122 122 ...
## $ Median : int 121 140 138 137 138 107 106 123 123 123 ...
## $ Variance : int 73 12 13 13 11 170 215 3 3 1 ...
## $ Tendency : int 1 0 0 1 1 0 0 1 1 1 ...
## $ NSP : chr "Suspect" "Normal" "Normal" "Normal" ...
## $ NSPbad : logi TRUE FALSE FALSE FALSE FALSE TRUE ...
## $ isTest : logi FALSE FALSE FALSE TRUE TRUE TRUE ...
## $ dataLabel: chr "train data" "train data" "train data" "test data" ...
## NULL
print(head(d))
## LB AC.1 FM.1 UC.1 DL.1 DS.1 DP.1 ASTV MSTV ALTV MLTV Width Min Max
## 1 120 0.000 0 0.000 0.000 0 0.000 73 0.5 43 2.4 64 62 126
## 2 132 0.006 0 0.006 0.003 0 0.000 17 2.1 0 10.4 130 68 198
## 3 133 0.003 0 0.008 0.003 0 0.000 16 2.1 0 13.4 130 68 198
## 4 134 0.003 0 0.008 0.003 0 0.000 16 2.4 0 23.0 117 53 170
## 5 132 0.007 0 0.008 0.000 0 0.000 16 2.4 0 19.9 117 53 170
## 6 134 0.001 0 0.010 0.009 0 0.002 26 5.9 0 0.0 150 50 200
## Nmax Nzeros Mode Mean Median Variance Tendency NSP NSPbad
## 1 2 0 120 137 121 73 1 Suspect TRUE
## 2 6 1 141 136 140 12 0 Normal FALSE
## 3 5 1 141 135 138 13 0 Normal FALSE
## 4 11 0 137 134 137 13 1 Normal FALSE
## 5 9 0 137 136 138 11 1 Normal FALSE
## 6 5 3 76 107 107 170 0 Pathological TRUE
## isTest dataLabel
## 1 FALSE train data
## 2 FALSE train data
## 3 FALSE train data
## 4 TRUE test data
## 5 TRUE test data
## 6 TRUE test data
print(summary(d))
## LB AC.1 FM.1 UC.1
## Min. :106.0 Min. :0.000000 Min. :0.000000 Min. :0.000000
## 1st Qu.:126.0 1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.002000
## Median :133.0 Median :0.002000 Median :0.000000 Median :0.004000
## Mean :133.3 Mean :0.003178 Mean :0.009481 Mean :0.004366
## 3rd Qu.:140.0 3rd Qu.:0.006000 3rd Qu.:0.003000 3rd Qu.:0.007000
## Max. :160.0 Max. :0.019000 Max. :0.481000 Max. :0.015000
## DL.1 DS.1 DP.1
## Min. :0.000000 Min. :0.000e+00 Min. :0.0000000
## 1st Qu.:0.000000 1st Qu.:0.000e+00 1st Qu.:0.0000000
## Median :0.000000 Median :0.000e+00 Median :0.0000000
## Mean :0.001889 Mean :3.293e-06 Mean :0.0001585
## 3rd Qu.:0.003000 3rd Qu.:0.000e+00 3rd Qu.:0.0000000
## Max. :0.015000 Max. :1.000e-03 Max. :0.0050000
## ASTV MSTV ALTV MLTV
## Min. :12.00 Min. :0.200 Min. : 0.000 Min. : 0.000
## 1st Qu.:32.00 1st Qu.:0.700 1st Qu.: 0.000 1st Qu.: 4.600
## Median :49.00 Median :1.200 Median : 0.000 Median : 7.400
## Mean :46.99 Mean :1.333 Mean : 9.847 Mean : 8.188
## 3rd Qu.:61.00 3rd Qu.:1.700 3rd Qu.:11.000 3rd Qu.:10.800
## Max. :87.00 Max. :7.000 Max. :91.000 Max. :50.700
## Width Min Max Nmax
## Min. : 3.00 Min. : 50.00 Min. :122 Min. : 0.000
## 1st Qu.: 37.00 1st Qu.: 67.00 1st Qu.:152 1st Qu.: 2.000
## Median : 67.50 Median : 93.00 Median :162 Median : 3.000
## Mean : 70.45 Mean : 93.58 Mean :164 Mean : 4.068
## 3rd Qu.:100.00 3rd Qu.:120.00 3rd Qu.:174 3rd Qu.: 6.000
## Max. :180.00 Max. :159.00 Max. :238 Max. :18.000
## Nzeros Mode Mean Median
## Min. : 0.0000 Min. : 60.0 Min. : 73.0 Min. : 77.0
## 1st Qu.: 0.0000 1st Qu.:129.0 1st Qu.:125.0 1st Qu.:129.0
## Median : 0.0000 Median :139.0 Median :136.0 Median :139.0
## Mean : 0.3236 Mean :137.5 Mean :134.6 Mean :138.1
## 3rd Qu.: 0.0000 3rd Qu.:148.0 3rd Qu.:145.0 3rd Qu.:148.0
## Max. :10.0000 Max. :187.0 Max. :182.0 Max. :186.0
## Variance Tendency NSP NSPbad
## Min. : 0.00 Min. :-1.0000 Length:2126 Mode :logical
## 1st Qu.: 2.00 1st Qu.: 0.0000 Class :character FALSE:1655
## Median : 7.00 Median : 0.0000 Mode :character TRUE :471
## Mean : 18.81 Mean : 0.3203 NA's :0
## 3rd Qu.: 24.00 3rd Qu.: 1.0000
## Max. :269.00 Max. : 1.0000
## isTest dataLabel
## Mode :logical Length:2126
## FALSE:1602 Class :character
## TRUE :524 Mode :character
## NA's :0
##
##
print(summary(as.factor(d$NSP)))
## Normal Pathological Suspect
## 1655 176 295
# load some conveince functions
# calulcate area under the curve of numeric vectors x,y
# length(x)==length(y)
# y>=0, 0<=x<=1 and x increasing
areaCalc <- function(x,y) {
# append extra points to get rid of degenerate cases
x <- c(0,x,1)
y <- c(0,y,1)
n <- length(x)
sum(0.5*(y[-1]+y[-n])*(x[-1]-x[-n]))
}
gainCurve = function(truthcol, predcol, title) {
truthcol <- as.numeric(truthcol)
# data frame of pred and truth, sorted in order of the predictions
d = data.frame(predcol=predcol,truthcol=truthcol)
predord = order(d[['predcol']], decreasing=TRUE) # reorder, with highest first
wizard = order(d[['truthcol']], decreasing=TRUE)
npop = dim(d)[1]
# data frame the cumulative prediction/truth as a function
# of the fraction of the population we're considering, highest first
results = data.frame(pctpop= (1:npop)/npop,
model = cumsum(d[predord,'truthcol'])/sum(d[['truthcol']]),
wizard = cumsum(d[wizard, 'truthcol'])/sum(d[['truthcol']]))
# calculate the areas under each curve
# gini score is 2* (area - 0.5)
idealArea = areaCalc(results$pctpop,results$wizard) - 0.5
modelArea = areaCalc(results$pctpop,results$model) - 0.5
giniScore = modelArea/idealArea # actually, normalized gini score
# melt the frame into the tall form, for plotting
results = melt(results, id.vars="pctpop", measure.vars=c("model", "wizard"),
variable.name="sort_criterion", value.name="pct_outcome")
gplot = ggplot(data=results, aes(x=pctpop, y=pct_outcome, color=sort_criterion)) +
geom_point() + geom_line() +
geom_abline(color="gray") +
ggtitle(paste("Gain curve,", title, '\n',
'relative Gini score', format(giniScore,digits=2))) +
xlab("% items in score order") + ylab("% total category") +
scale_x_continuous(breaks=seq(0,1,0.1)) +
scale_y_continuous(breaks=seq(0,1,0.1)) +
scale_color_manual(values=c('model'='darkblue', 'wizard'='darkgreen'))
gplot
}
plotROC <- function(title,outcol,predcol) {
pred <- prediction(predcol,outcol)
perf <- performance(pred,'tpr','fpr')
auc <- as.numeric(performance(pred,'auc')@y.values)
pf <- data.frame(
FalsePositiveRate=perf@x.values[[1]],
TruePositiveRate=perf@y.values[[1]])
plot=ggplot() +
geom_ribbon(data=pf,aes(x=FalsePositiveRate,ymax=TruePositiveRate,ymin=0),
fill='blue',alpha=0.3) +
geom_point(data=pf,aes(x=FalsePositiveRate,y=TruePositiveRate)) +
geom_line(aes(x=c(0,1),y=c(0,1))) + coord_fixed() +
ggtitle(paste(title,'\nAUC:',format(auc,digits=2)))
list(pf=pf,plot=plot)
}
deviance <- function(truth,pred,epsilon=0) {
pred = pmax(pred, epsilon)
pred = pmin(pred, 1-epsilon)
S = 0.0 # assumed log-likelihood of saturated model
-2*(sum(ifelse(truth,log(pred),log(1-pred)))-S)
}
reportStats <- function(d,test,modelName,title,epsilon) {
dSub <- d[d$isTest==test,,drop=FALSE]
tab <- table(truth=dSub[,yColumn],pred=dSub[,modelName]>0.5)
accuracy <- (tab[1,1] + tab[2,2])/sum(tab)
note = ifelse(test,'test','train')
print(paste('\t',note,'accuracy',modelName,format(accuracy,digits=2)))
residual.deviance <- deviance(dSub[,yColumn],dSub[,modelName],epsilon)
#print(paste('\tresidual.deviance',residual.deviance))
null.deviance <- deviance(dSub[,yColumn],mean(dSub[,yColumn]),epsilon)
#print(paste('\tnull.deviance',null.deviance))
print(paste("\tmodel explained a",
format((1-residual.deviance/null.deviance),digits=2),
"fraction of the variation on",note))
}
report <- function(d,modelName,title,epsilon=1.0e-2) {
print("***********")
print(paste("model",modelName,title))
reportStats(d,FALSE,modelName,title,epsilon)
reportStats(d,TRUE,modelName,title,epsilon)
print(ggplot(data=d,aes_string(x=modelName,color=yColumn)) +
geom_density() + facet_wrap(~dataLabel,ncol=1,scales='free_y') +
ggtitle(title))
print(plotROC(paste(title,'train'),
d[d$isTest==FALSE,yColumn],
d[d$isTest==FALSE,modelName])$plot)
print(plotROC(paste(title,'test'),
d[d$isTest==TRUE,yColumn],
d[d$isTest==TRUE,modelName])$plot)
print(gainCurve(d[d$isTest==FALSE,yColumn],
d[d$isTest==FALSE,modelName],
paste(title,'train')))
print(gainCurve(d[d$isTest==TRUE,yColumn],
d[d$isTest==TRUE,modelName],
paste(title,'test')))
print("***********")
}
# build a decision tree
treeModelPred <- function(d,vars) {
formula <- paste('as.factor(',yColumn,') ~ ',paste(vars,collapse=' + '),sep='')
modelTree <- rpart(formula,data=d)
predict(modelTree,newdata=d,type='prob')[,'TRUE',drop=TRUE]
}
d$modelTree <- treeModelPred(d,vars)
report(d,'modelTree',"decision tree (with some missing values)")
## [1] "***********"
## [1] "model modelTree decision tree (with some missing values)"
## [1] "\t train accuracy modelTree 0.95"
## [1] "\tmodel explained a 0.62 fraction of the variation on train"
## [1] "\t test accuracy modelTree 0.94"
## [1] "\tmodel explained a 0.63 fraction of the variation on test"
## [1] "***********"
# build a GLMNET model
glmNetModelPred <- function(d,vars) {
# Nota bene: only save to all as.matrix() on a dataframe that is all numeric
# is it is in this example.
cv <- cv.glmnet(x=as.matrix(d[!d$isTest,vars]),y=d[!d$isTest,yColumn,drop=TRUE],
family='binomial')
predict(cv,newx=as.matrix(d[,vars]),type='response')[,1]
}
d$modelEGLM <- glmNetModelPred(d,vars)
report(d,'modelEGLM',"glmnet")
## [1] "***********"
## [1] "model modelEGLM glmnet"
## [1] "\t train accuracy modelEGLM 0.91"
## [1] "\tmodel explained a 0.6 fraction of the variation on train"
## [1] "\t test accuracy modelEGLM 0.92"
## [1] "\tmodel explained a 0.61 fraction of the variation on test"
## [1] "***********"
# force some values to missing at random (unifromative)
d2 <- d
nMissing <- 1000
set.seed(235)
for(v in sample(vars,nMissing,replace=TRUE)) {
i = sample.int(nrow(d2),1)
d2[i,v] <- NA
}
# try to re-build a decision tree model
d$modelTreeE <- treeModelPred(d2,vars)
report(d,'modelTreeE',"decision tree")
## [1] "***********"
## [1] "model modelTreeE decision tree"
## [1] "\t train accuracy modelTreeE 0.94"
## [1] "\tmodel explained a 0.58 fraction of the variation on train"
## [1] "\t test accuracy modelTreeE 0.94"
## [1] "\tmodel explained a 0.59 fraction of the variation on test"
## [1] "***********"
# try to re-build a GLMNET model
tryCatch(
glmNetModelPred(d2,vars),
warning = function(w) {print(paste('warning',w)); c()},
error = function(e) {print(paste('error',e)) ; c()})
## [1] "error Error in lognet(x, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : NA/NaN/Inf in foreign function call (arg 5)\n"
## NULL
# Install a library try to work around problem
# install.packages('vtreat')
library('vtreat') # https://github.com/WinVector/vtreat
# try a fix for the missing values
treatmentsC <- designTreatmentsC(d2[!d2$isTest,],vars,yColumn,TRUE,verbose=FALSE)
dTreated <- prepare(treatmentsC,d2,pruneSig=c())
treatedVars <- setdiff(colnames(dTreated),c(yColumn,'isTest'))
dTreated$isTest <- d2$isTest
print(treatedVars)
## [1] "LB_clean" "LB_isBAD" "AC.1_clean" "AC.1_isBAD"
## [5] "FM.1_clean" "FM.1_isBAD" "UC.1_clean" "UC.1_isBAD"
## [9] "DL.1_clean" "DL.1_isBAD" "DS.1_clean" "DS.1_isBAD"
## [13] "DP.1_clean" "DP.1_isBAD" "ASTV_clean" "ASTV_isBAD"
## [17] "MSTV_clean" "MSTV_isBAD" "ALTV_clean" "ALTV_isBAD"
## [21] "MLTV_clean" "MLTV_isBAD" "Width_clean" "Width_isBAD"
## [25] "Min_clean" "Min_isBAD" "Max_clean" "Max_isBAD"
## [29] "Nmax_clean" "Nmax_isBAD" "Nzeros_clean" "Nzeros_isBAD"
## [33] "Mode_clean" "Mode_isBAD" "Mean_clean" "Mean_isBAD"
## [37] "Median_clean" "Median_isBAD" "Variance_clean" "Variance_isBAD"
## [41] "Tendency_clean" "Tendency_isBAD"
print(head(dTreated))
## LB_clean LB_isBAD AC.1_clean AC.1_isBAD FM.1_clean FM.1_isBAD UC.1_clean
## 1 120 0 0.000 0 0 0 0.000
## 2 132 0 0.006 0 0 0 0.006
## 3 133 0 0.003 0 0 0 0.008
## 4 134 0 0.003 0 0 0 0.008
## 5 132 0 0.007 0 0 0 0.008
## 6 134 0 0.001 0 0 0 0.010
## UC.1_isBAD DL.1_clean DL.1_isBAD DS.1_clean DS.1_isBAD DP.1_clean
## 1 0 0.000 0 0.000000e+00 0 0.000
## 2 0 0.003 0 0.000000e+00 0 0.000
## 3 0 0.003 0 0.000000e+00 0 0.000
## 4 0 0.003 0 1.918159e-06 1 0.000
## 5 0 0.000 0 0.000000e+00 0 0.000
## 6 0 0.009 0 0.000000e+00 0 0.002
## DP.1_isBAD ASTV_clean ASTV_isBAD MSTV_clean MSTV_isBAD ALTV_clean
## 1 0 73 0 0.5 0 43
## 2 0 17 0 2.1 0 0
## 3 0 16 0 2.1 0 0
## 4 0 16 0 2.4 0 0
## 5 0 16 0 2.4 0 0
## 6 0 26 0 5.9 0 0
## ALTV_isBAD MLTV_clean MLTV_isBAD Width_clean Width_isBAD Min_clean
## 1 0 2.4 0 64 0 62
## 2 0 10.4 0 130 0 68
## 3 0 13.4 0 130 0 68
## 4 0 23.0 0 117 0 53
## 5 0 19.9 0 117 0 53
## 6 0 0.0 0 150 0 50
## Min_isBAD Max_clean Max_isBAD Nmax_clean Nmax_isBAD Nzeros_clean
## 1 0 126.0000 0 2 0 0
## 2 0 198.0000 0 6 0 1
## 3 0 198.0000 0 5 0 1
## 4 0 170.0000 0 11 0 0
## 5 0 164.1275 1 9 0 0
## 6 0 200.0000 0 5 0 3
## Nzeros_isBAD Mode_clean Mode_isBAD Mean_clean Mean_isBAD Median_clean
## 1 0 120 0 137 0 121
## 2 0 141 0 136 0 140
## 3 0 141 0 135 0 138
## 4 0 137 0 134 0 137
## 5 0 137 0 136 0 138
## 6 0 76 0 107 0 107
## Median_isBAD Variance_clean Variance_isBAD Tendency_clean Tendency_isBAD
## 1 0 73 0 1 0
## 2 0 12 0 0 0
## 3 0 13 0 0 0
## 4 0 13 0 1 0
## 5 0 11 0 1 0
## 6 0 170 0 0 0
## NSPbad isTest
## 1 TRUE FALSE
## 2 FALSE FALSE
## 3 FALSE FALSE
## 4 FALSE TRUE
## 5 FALSE TRUE
## 6 TRUE TRUE
d$modelEGLME <- glmNetModelPred(dTreated,treatedVars)
report(d,'modelEGLME',"glmnet (with missing data)")
## [1] "***********"
## [1] "model modelEGLME glmnet (with missing data)"
## [1] "\t train accuracy modelEGLME 0.91"
## [1] "\tmodel explained a 0.58 fraction of the variation on train"
## [1] "\t test accuracy modelEGLME 0.9"
## [1] "\tmodel explained a 0.58 fraction of the variation on test"
## [1] "***********"
# show what informative data loss can look like
d2 <- d
nMissing <- 1000
set.seed(235)
for(v in sample(vars,nMissing,replace=TRUE)) {
i = sample.int(nrow(d2),1)
# suppose data is only lost on negative examples (extreme form of informative!)
if(!d2[i,yColumn]) {
d2[i,v] <- NA
}
}
treatmentsC <- designTreatmentsC(d2[!d2$isTest,],vars,yColumn,TRUE,verbose=FALSE)
dTreated <- prepare(treatmentsC,d2,pruneSig=c())
treatedVars <- setdiff(colnames(dTreated),c(yColumn,'isTest'))
dTreated$isTest <- d2$isTest
print(treatedVars)
## [1] "LB_clean" "LB_isBAD" "AC.1_clean" "AC.1_isBAD"
## [5] "FM.1_clean" "FM.1_isBAD" "UC.1_clean" "UC.1_isBAD"
## [9] "DL.1_clean" "DL.1_isBAD" "DS.1_clean" "DS.1_isBAD"
## [13] "DP.1_clean" "DP.1_isBAD" "ASTV_clean" "ASTV_isBAD"
## [17] "MSTV_clean" "MSTV_isBAD" "ALTV_clean" "ALTV_isBAD"
## [21] "MLTV_clean" "MLTV_isBAD" "Width_clean" "Width_isBAD"
## [25] "Min_clean" "Min_isBAD" "Max_clean" "Max_isBAD"
## [29] "Nmax_clean" "Nmax_isBAD" "Nzeros_clean" "Nzeros_isBAD"
## [33] "Mode_clean" "Mode_isBAD" "Mean_clean" "Mean_isBAD"
## [37] "Median_clean" "Median_isBAD" "Variance_clean" "Variance_isBAD"
## [41] "Tendency_clean" "Tendency_isBAD"
print(head(dTreated))
## LB_clean LB_isBAD AC.1_clean AC.1_isBAD FM.1_clean FM.1_isBAD UC.1_clean
## 1 120 0 0.000 0 0 0 0.000
## 2 132 0 0.006 0 0 0 0.006
## 3 133 0 0.003 0 0 0 0.008
## 4 134 0 0.003 0 0 0 0.008
## 5 132 0 0.007 0 0 0 0.008
## 6 134 0 0.001 0 0 0 0.010
## UC.1_isBAD DL.1_clean DL.1_isBAD DS.1_clean DS.1_isBAD DP.1_clean
## 1 0 0.000 0 0.000000e+00 0 0.000
## 2 0 0.003 0 0.000000e+00 0 0.000
## 3 0 0.003 0 0.000000e+00 0 0.000
## 4 0 0.003 0 1.907184e-06 1 0.000
## 5 0 0.000 0 0.000000e+00 0 0.000
## 6 0 0.009 0 0.000000e+00 0 0.002
## DP.1_isBAD ASTV_clean ASTV_isBAD MSTV_clean MSTV_isBAD ALTV_clean
## 1 0 73 0 0.5 0 43
## 2 0 17 0 2.1 0 0
## 3 0 16 0 2.1 0 0
## 4 0 16 0 2.4 0 0
## 5 0 16 0 2.4 0 0
## 6 0 26 0 5.9 0 0
## ALTV_isBAD MLTV_clean MLTV_isBAD Width_clean Width_isBAD Min_clean
## 1 0 2.4 0 64 0 62
## 2 0 10.4 0 130 0 68
## 3 0 13.4 0 130 0 68
## 4 0 23.0 0 117 0 53
## 5 0 19.9 0 117 0 53
## 6 0 0.0 0 150 0 50
## Min_isBAD Max_clean Max_isBAD Nmax_clean Nmax_isBAD Nzeros_clean
## 1 0 126.000 0 2 0 0
## 2 0 198.000 0 6 0 1
## 3 0 198.000 0 5 0 1
## 4 0 170.000 0 11 0 0
## 5 0 164.075 1 9 0 0
## 6 0 200.000 0 5 0 3
## Nzeros_isBAD Mode_clean Mode_isBAD Mean_clean Mean_isBAD Median_clean
## 1 0 120 0 137 0 121
## 2 0 141 0 136 0 140
## 3 0 141 0 135 0 138
## 4 0 137 0 134 0 137
## 5 0 137 0 136 0 138
## 6 0 76 0 107 0 107
## Median_isBAD Variance_clean Variance_isBAD Tendency_clean Tendency_isBAD
## 1 0 73 0 1 0
## 2 0 12 0 0 0
## 3 0 13 0 0 0
## 4 0 13 0 1 0
## 5 0 11 0 1 0
## 6 0 170 0 0 0
## NSPbad isTest
## 1 TRUE FALSE
## 2 FALSE FALSE
## 3 FALSE FALSE
## 4 FALSE TRUE
## 5 FALSE TRUE
## 6 TRUE TRUE
d$modelEGLMEI <- glmNetModelPred(dTreated,treatedVars)
report(d,'modelEGLMEI',"glmnet (with informative missing data)")
## [1] "***********"
## [1] "model modelEGLMEI glmnet (with informative missing data)"
## [1] "\t train accuracy modelEGLMEI 0.93"
## [1] "\tmodel explained a 0.69 fraction of the variation on train"
## [1] "\t test accuracy modelEGLMEI 0.94"
## [1] "\tmodel explained a 0.7 fraction of the variation on test"
## [1] "***********"