Demonstration code for model evaluation

First, load all libraries needed for the examples

#
# This is the vtreat library that will be discussed in the 
# Missing Values lesson. Uncomment the next line
# to install the package from CRAN
#
# install.packages('vtreat')
library(vtreat)

library(ggplot2) # for plotting
## Warning: package 'ggplot2' was built under R version 3.2.4
library(reshape2) # for the melt command
library(ROCR)  # for ROC plots
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## 
## The following object is masked from 'package:stats':
## 
##     lowess
# set the random number generator seed, so the random assignments are the same every time
set.seed(45433622) 

Regression

We will use the Baseball salaries data.

# load data
salaryData = readRDS("salaryData.rds")

# set the outcome variable, and the input variables
outcome = "logSalary"
vars = setdiff(colnames(salaryData), c("Salary", "Player", "logSalary"))

Split the data into a training set and a test set.

nr = nrow(salaryData)
# make the train/test assignments (set aside 25% of the data for test)
isTest = runif(nr)<=0.25

# split the data
test = salaryData[isTest,]
train = salaryData[!isTest, ]

salaryData$isTest = isTest  # put the test marker back in the data, for reproducibility

Train a linear regression model on the training set

fmla = paste(outcome, "~", paste(vars, collapse="+")) # set up the model formula
model1 = lm(fmla, data=train)
summary(model1)
## 
## Call:
## lm(formula = fmla, data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.09000 -0.12836 -0.01524  0.14299  0.55490 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              2.2631291  0.1210689  18.693  < 2e-16 ***
## batting_average          0.8327396  0.9743267   0.855  0.39357    
## OBP                     -1.1435733  0.8556458  -1.337  0.18263    
## runs                    -0.0010018  0.0020816  -0.481  0.63075    
## hits                     0.0035977  0.0012131   2.966  0.00332 ** 
## doubles                 -0.0031576  0.0032206  -0.980  0.32784    
## triples                 -0.0122794  0.0082382  -1.491  0.13737    
## homeruns                 0.0054996  0.0047118   1.167  0.24427    
## RBI                      0.0025508  0.0019029   1.340  0.18135    
## walks                    0.0033186  0.0016985   1.954  0.05186 .  
## strikeouts              -0.0018785  0.0007916  -2.373  0.01842 *  
## stolenbases              0.0029310  0.0017296   1.695  0.09142 .  
## errors                  -0.0049083  0.0028484  -1.723  0.08612 .  
## free_agency_eligibility  0.6521692  0.0413078  15.788  < 2e-16 ***
## free_agent_in_1991_2    -0.0704410  0.0498863  -1.412  0.15921    
## arbitration_eligibility  0.5667131  0.0460688  12.301  < 2e-16 ***
## arbitration_in_1991_2   -0.0542029  0.0914759  -0.593  0.55404    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2287 on 244 degrees of freedom
## Multiple R-squared:  0.8147, Adjusted R-squared:  0.8025 
## F-statistic: 67.04 on 16 and 244 DF,  p-value: < 2.2e-16
# the model summary will report training set statistics
summ1 = summary(model1)
err = summ1$residuals # response - prediction
sqrt(mean(err^2))  # the root mean squared error  (0.2211)
## [1] 0.2211685
# the residual standard error in the summary report is the square root of
# the sum of squared errors divided by the residual degrees of freedom
sqrt(sum(err^2)/model1$df.residual)
## [1] 0.2287434
# r-squared and adjusted r-squared
summ1$r.squared
## [1] 0.8146898
summ1$adj.r.squared
## [1] 0.8025383

Evaluate the model on test data

Root mean squared error and R-squared

#
# first, wrap the evaluations in functions
#

# root mean squared error
rmse = function(truth, pred) {
  err = truth-pred
  sqrt(mean(err^2))
}

# R-squared 
# We won't look at adjusted R2, because it's really
# just an estimate of R-squared on holdout data, which
# we can calculate directly
R2 = function(truth, pred) {
  err = truth-pred
  meany = mean(truth)
  null_err = truth-meany
  
  1 - (sum(err^2)/sum(null_err^2))
}
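
# As an aside, adjusted R-squared can be computed directly from R-squared,
# by penalizing for the number of predictors p fit on n training rows
# (a quick illustrative check; n and p are defined here, not above)
n = nrow(train)   # number of training rows
p = length(vars)  # number of predictors
1 - (1 - summ1$r.squared)*(n - 1)/(n - p - 1)  # should match summ1$adj.r.squared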

# make the predictions on the salaryData frame
salPred = predict(model1, newdata=salaryData)

# set up a frame with the outcomes
perf = data.frame(logSalary = salaryData[[outcome]], 
                  pred = salPred, isTest=salaryData$isTest)

perfTrain = perf[!perf$isTest,]
perfTest = perf[perf$isTest,]

# check the training stats came out as reported
rmse(perfTrain$logSalary, perfTrain$pred)
## [1] 0.2211685
R2(perfTrain$logSalary, perfTrain$pred)
## [1] 0.8146898
# check test set performance
rmse(perfTest$logSalary, perfTest$pred)
## [1] 0.2571019
R2(perfTest$logSalary, perfTest$pred)
## [1] 0.7309676

Plots

Plot predictions

ggplot(perf, aes(x=pred, y=logSalary, color=isTest)) + 
  geom_point(aes(shape=isTest)) +  
  geom_abline(slope=1) + 
  scale_color_manual(values = c("FALSE" = "darkgray", "TRUE" = "darkblue")) +
  coord_fixed()

Plot residuals

ggplot(perf, aes(x=pred, y=logSalary-pred, color=isTest)) + 
  geom_point(aes(shape=isTest)) +  
  geom_abline(slope=0) + 
  scale_color_manual(values = c("FALSE" = "darkgray", "TRUE" = "darkblue")) +
  coord_fixed()

The Gain Curve

#
# Let's make this a function, since we'll use it again
#

# calculate area under the curve of numeric vectors x,y
# length(x)==length(y)
# y>=0, 0<=x<=1 and x increasing
areaCalc = function(x,y) {
   # anchor the curve at (0,0) and (1,1), to avoid degenerate cases
   x = c(0,x,1)
   y = c(0,y,1)
   n = length(x)
   sum(0.5*(y[-1]+y[-n])*(x[-1]-x[-n]))
}
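
# quick sanity check (illustrative): for the diagonal y = x,
# areaCalc should return the area under the line, 0.5
areaCalc(c(0.25, 0.5, 0.75), c(0.25, 0.5, 0.75))  # 0.5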


gainCurve = function(truthcol, predcol, title) {
  # put predictions and truth in a data frame, and get the orderings (highest first)
  d = data.frame(predcol=predcol,truthcol=truthcol)
  predord = order(d[['predcol']], decreasing=TRUE) # reorder, with highest first
  wizard = order(d[['truthcol']], decreasing=TRUE)
  npop = dim(d)[1]
  
  # data frame with the cumulative prediction/truth as a function
  # of the fraction of the population we're considering, highest first
  results = data.frame(pctpop= (1:npop)/npop,
                       model = cumsum(d[predord,'truthcol'])/sum(d[['truthcol']]),
                       wizard = cumsum(d[wizard, 'truthcol'])/sum(d[['truthcol']]))
  
  # calculate the areas under each curve
  # gini score is 2* (area - 0.5)
  idealArea = areaCalc(results$pctpop,results$wizard) - 0.5
  modelArea = areaCalc(results$pctpop,results$model) - 0.5
  giniScore = modelArea/idealArea # actually, normalized gini score
  
  # melt the frame into the tall form, for plotting
  results = melt(results, id.vars="pctpop", measure.vars=c("model", "wizard"),
                 variable.name="sort_criterion", value.name="pct_outcome")
  
  gplot = ggplot(data=results, aes(x=pctpop, y=pct_outcome, color=sort_criterion)) + 
    geom_point() + geom_line() + 
    geom_abline(color="gray") +
    ggtitle(paste("Gain curve,", title, '\n', 
       'relative Gini score', format(giniScore,digits=2))) +
       xlab("% items in score order") + ylab("% total category") +
       scale_x_continuous(breaks=seq(0,1,0.1)) +
       scale_y_continuous(breaks=seq(0,1,0.1)) +
    scale_color_manual(values=c('model'='darkblue', 'wizard'='darkgreen'))
  gplot
}

Now we can plot the gain curves, one for training and one for test. We’ll plot salary, not log salary.

gainCurve(10^perfTrain$logSalary, 10^perfTrain$pred, "Baseball salaries, training:")

gainCurve(10^perfTest$logSalary, 10^perfTest$pred, "Baseball salaries, test:")
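
As a quick sanity check (illustrative), a perfect model that predicts the true salaries exactly traces the wizard curve, giving a relative Gini score of 1:

gainCurve(10^perfTest$logSalary, 10^perfTest$logSalary, "Baseball salaries, perfect model:")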

Classification (Class probabilities)

Task: Predict the onset of diabetes within 5 years

Load the data

d = read.table("pima-indians-diabetes.data.txt", header=FALSE, sep=",", stringsAsFactors=FALSE)

# 1. Number of times pregnant
# 2. Plasma glucose concentration at 2 hours in an oral glucose tolerance test
# 3. Diastolic blood pressure (mm Hg)
# 4. Triceps skin fold thickness (mm)
# 5. 2-Hour serum insulin (mu U/ml)
# 6. Body mass index (weight in kg/(height in m)^2)
# 7. Diabetes pedigree function
# 8. Age (years)
# 9. Class variable (0 or 1)

colnames(d) = c("npregnant",
             "glucose",
             "blood_pressure",
             "tricep_skin_fold_thickness",
             "insulin",
             "bmi",
             "diabetes_pedigree",
             "age",
             "diabetic")  # 1=diagnosed as diabetic within 5 years/0=not diagnosed after 5 years
d$diabetic = d$diabetic>0.5 # switch outcome to logical

Find missing data. Note: although the dataset donors claim there are no missing values, some of the zeros are clearly actually missing values. I am going to treat 0 as ‘missing’ for glucose, blood_pressure, skin fold thickness, and bmi.

zero_as_missing = c("glucose",
             "blood_pressure",
             "tricep_skin_fold_thickness",
             "bmi")
leave = setdiff(colnames(d), zero_as_missing)

d0 = as.data.frame(lapply(d[,zero_as_missing], 
                          FUN=function(x) ifelse(x==0, NA, x)))

d = cbind(d[,leave], d0)

Set the outcome and input variables, and split into training and test sets.

yColumn = 'diabetic'
vars = setdiff(colnames(d),c(yColumn, "isTest", "dataLabel"))
d$isTest = runif(nrow(d))<0.25
d$dataLabel = ifelse(d$isTest,"test data","train data")
# a nicety, so that train plots above test when we facet wrap below
d$dataLabel = reorder(d$dataLabel, ifelse(d$dataLabel=='train data', 1, 2), FUN=mean)

Now treat the missing values by substituting the mean value for them, and adding indicator columns that mark which values were missing. We will use the vtreat library to do this.

dtrain = d[!d$isTest, ]
treatPlan = designTreatmentsC(dtrain, vars, yColumn, TRUE, verbose=FALSE)

dtrainTreat = prepare(treatPlan, dtrain, pruneSig=NULL, doCollar=FALSE)
# the treated data has all NAs replaced by the mean value of the variable,
# and additional columns to mark which values were formerly NAs
head(dtrainTreat)
##   npregnant_clean insulin_clean diabetes_pedigree_clean age_clean
## 1               1             0                   0.351        31
## 2               8             0                   0.672        32
## 3               1            94                   0.167        21
## 4               5             0                   0.201        30
## 5               4             0                   0.191        30
## 6              10             0                   1.441        57
##   glucose_clean glucose_isBAD blood_pressure_clean blood_pressure_isBAD
## 1            85             0                   66                    0
## 2           183             0                   64                    0
## 3            89             0                   66                    0
## 4           116             0                   74                    0
## 5           110             0                   92                    0
## 6           139             0                   80                    0
##   tricep_skin_fold_thickness_clean tricep_skin_fold_thickness_isBAD
## 1                         29.00000                                0
## 2                         29.59799                                1
## 3                         23.00000                                0
## 4                         29.59799                                1
## 5                         29.59799                                1
## 6                         29.59799                                1
##   bmi_clean bmi_isBAD diabetic
## 1      26.6         0    FALSE
## 2      23.3         0     TRUE
## 3      28.1         0    FALSE
## 4      25.6         0    FALSE
## 5      37.6         0    FALSE
## 6      27.1         0    FALSE
# get the new variable names
newvars = setdiff(colnames(dtrainTreat), yColumn)
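
For intuition, here is roughly what the treatment does for a single column (a hand-rolled sketch with hypothetical names; vtreat computes the fill-in means on the training data and applies them consistently to new data):

# illustrative sketch of the treatment for one variable
fillin = mean(dtrain$glucose, na.rm=TRUE)            # mean of the observed values
glucose_isBAD = ifelse(is.na(dtrain$glucose), 1, 0)  # mark formerly-missing values
glucose_clean = ifelse(is.na(dtrain$glucose), fillin, dtrain$glucose)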

Train the model (glm)

# make the formula
fmla = paste(yColumn, "~", paste(newvars, collapse="+"))
model2 = glm(fmla, data=dtrainTreat, family=binomial(link="logit"))
summary(model2)
## 
## Call:
## glm(formula = fmla, family = binomial(link = "logit"), data = dtrainTreat)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7303  -0.7183  -0.3683   0.7064   2.3521  
## 
## Coefficients:
##                                    Estimate Std. Error z value Pr(>|z|)
## (Intercept)                      -9.2605136  0.9768067  -9.480  < 2e-16
## npregnant_clean                   0.1481717  0.0382212   3.877 0.000106
## insulin_clean                    -0.0002292  0.0011059  -0.207 0.835784
## diabetes_pedigree_clean           1.0824446  0.3574341   3.028 0.002459
## age_clean                         0.0111342  0.0107686   1.034 0.301157
## glucose_clean                     0.0391050  0.0045513   8.592  < 2e-16
## glucose_isBAD                     1.5511821  1.4489791   1.071 0.284379
## blood_pressure_clean             -0.0101702  0.0100806  -1.009 0.313027
## blood_pressure_isBAD              0.9678432  0.6013335   1.609 0.107508
## tricep_skin_fold_thickness_clean  0.0124684  0.0158142   0.788 0.430447
## tricep_skin_fold_thickness_isBAD  0.2911630  0.2884797   1.009 0.312830
## bmi_clean                         0.0767548  0.0204467   3.754 0.000174
## bmi_isBAD                        -1.7652892  1.1667321  -1.513 0.130275
##                                     
## (Intercept)                      ***
## npregnant_clean                  ***
## insulin_clean                       
## diabetes_pedigree_clean          ** 
## age_clean                           
## glucose_clean                    ***
## glucose_isBAD                       
## blood_pressure_clean                
## blood_pressure_isBAD                
## tricep_skin_fold_thickness_clean    
## tricep_skin_fold_thickness_isBAD    
## bmi_clean                        ***
## bmi_isBAD                           
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 738.38  on 562  degrees of freedom
## Residual deviance: 515.16  on 550  degrees of freedom
## AIC: 541.16
## 
## Number of Fisher Scoring iterations: 5
# model reports training data diagnostics
# deviance
model2$deviance
## [1] 515.1598
#pseudo-Rsquared
1 - (model2$deviance/model2$null.deviance)
## [1] 0.302308

Make predictions

# treat all the data first
dTreat = prepare(treatPlan, d, pruneSig=NULL, doCollar=FALSE)

# put the predictions back into the original frame
d$model2 = predict(model2, newdata=dTreat, type='response')

Evaluate the model on training and test data.

Deviance and Pseudo-R-squared

# Convenience functions

# deviance
# epsilon > 0 provides smoothing for models that can return probabilities
# of exactly 0 or 1 (which a properly converged logistic regression model can't)
deviance = function(truth,pred,epsilon=0) {
  pred = pmax(pred, epsilon)
  pred = pmin(pred, 1-epsilon)
  S = 0.0 # assumed log-likelihood of saturated model
  -2*(sum(ifelse(truth,log(pred),log(1-pred)))-S)
}

# pseudo-R-squared
pseudoR2 = function(truth, pred, epsilon=0) {
  dev = deviance(truth, pred, epsilon)
  nulldev = deviance(truth, mean(truth))
  1-(dev/nulldev)
}
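
# illustrative example of why epsilon matters: a model that returns an
# exact 0 or 1 probability on the wrong case makes the deviance infinite
deviance(c(TRUE, FALSE), c(0, 1))                # Inf: log(0) blows up
deviance(c(TRUE, FALSE), c(0, 1), epsilon=1e-6)  # large, but finite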

dtrain = d[!d$isTest,]
dtest = d[d$isTest,]

# training deviance
deviance(dtrain[[yColumn]], dtrain$model2)
## [1] 515.1598
# test deviance
deviance(dtest[[yColumn]], dtest$model2)
## [1] 194.8468
# remember, deviance can't be compared across data sets of different sizes;
# pseudo-R-squared can be compared across data sets

# training
pseudoR2(dtrain[[yColumn]], dtrain$model2)
## [1] 0.302308
# test
pseudoR2(dtest[[yColumn]], dtest$model2)
## [1] 0.2296849

Plots

Double density plot

ggplot(d, aes_string(x='model2', color=yColumn)) + geom_density(adjust=0.5) + 
  facet_wrap(~dataLabel, ncol=1) 

ROC plot

plotROC = function(title,outcol,predcol) {
  #
  # prediction and performance are in the ROCR package
  # 
  pred = prediction(predcol,outcol)
  perf = performance(pred,'tpr','fpr')
  # get the auc
  auc = as.numeric(performance(pred,'auc')@y.values)
  
  # pull the information out of the perf structure so
  # we can plot it with ggplot
  pf = data.frame(
    FalsePositiveRate=perf@x.values[[1]],
    TruePositiveRate=perf@y.values[[1]])
  plot=ggplot() +
    geom_ribbon(data=pf,aes(x=FalsePositiveRate,ymax=TruePositiveRate,ymin=0),
      fill='blue',alpha=0.3) +
      geom_point(data=pf,aes(x=FalsePositiveRate,y=TruePositiveRate)) +
      geom_line(aes(x=c(0,1),y=c(0,1))) + coord_fixed() +
      ggtitle(paste(title,'\nAUC:',format(auc,digits=2)))
  
 plot
}

# plot the ROC for training and test sets
plotROC("Training data", d[!d$isTest, yColumn], d[!d$isTest, "model2"])

plotROC("Test data", d[d$isTest, yColumn], d[d$isTest, "model2"])

Gain Curve

gainCurve(d[!d$isTest, yColumn], d[!d$isTest, "model2"], "Training data")

gainCurve(d[d$isTest, yColumn], d[d$isTest, "model2"], "Test data")

Classification (Class Labels)

If you use a classification model that returns class probabilities, you can turn the probabilities into labels by thresholding. Thresholding at 50% is the most straightforward; you are simply assigning the more probable label.

d$predLabel = d$model2 >=0.5
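
Other thresholds trade false negatives against false positives; for example, a lower threshold flags more subjects as diabetic (an illustrative variation, not used below):

table(diabetic=d[[yColumn]], pred=d$model2 >= 0.25)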

For simplicity, in this section we’ll calculate the performance stats over the whole set.

Confusion matrix

Many classification measures are summarized by the confusion matrix.

# the rows are true outcome, the columns predicted outcome
cmat = table(diabetic=d[[yColumn]], pred=d$predLabel)
cmat
##         pred
## diabetic FALSE TRUE
##    FALSE   434   66
##    TRUE    110  158

Accuracy

Accuracy is the fraction of items classified correctly: the sum of the diagonal of the confusion matrix, divided by the total count.

accuracy = function(cmat) {sum(diag(cmat))/sum(cmat)}
accuracy(cmat)
## [1] 0.7708333

False Positive Rate, False Negative Rate

# false positives: predicted diabetic but really not. Upper right corner
# false positive rate - the fraction of non-diabetics misdiagnosed as diabetic
fpr = function(cmat) {cmat[1, 2]/sum(cmat[1,])}
fpr(cmat)
## [1] 0.132
# false negatives: predicted not diabetic but really are. Lower left corner
# false negative rate - fraction of diabetics misdiagnosed as non-diabetic
fnr = function(cmat) {cmat[2,1]/sum(cmat[2,])}
fnr(cmat)
## [1] 0.4104478

Precision and Recall

# precision - of all the subjects in the population of interest (female Pima Indians)
# who test positive, how many are truly diabetic?
precision = function(cmat) {cmat[2,2]/sum(cmat[,2])}
precision(cmat)
## [1] 0.7053571
# recall - of all the diabetics in the population of interest, how many does this test identify?
recall = function(cmat) {cmat[2,2]/sum(cmat[2,])}
recall(cmat)
## [1] 0.5895522

Sensitivity and Specificity

# sensitivity - the true positive rate, or the rate at which diabetics are correctly diagnosed
# Same as recall
sensitivity = function(cmat){cmat[2,2]/sum(cmat[2,])}
sensitivity(cmat)
## [1] 0.5895522
# specificity - the true negative rate, or the rate at which non-diabetics are correctly diagnosed
specificity = function(cmat) {cmat[1,1]/sum(cmat[1,])}
specificity(cmat)
## [1] 0.868

Note that sensitivity and specificity are independent of the population distribution, but accuracy and precision are not.

# posit a test with a given sensitivity/specificity
# and populations with different prevalences of diabetes
scenario = function(sens, spec, prevalence) {
  npos = prevalence
  nneg = 1-prevalence
  
  tPos = npos*sens  # true positives
  tNeg = nneg*spec  # true negatives
  
  fPos = nneg - tNeg # false positives (negatives mis-diagnosed)
  fNeg = npos - tPos # false negatives (positives mis-diagnosed)
  
  print(paste("accuracy = ", (tPos + tNeg))) # what we got right
  print(paste("precision = ", (tPos/(tPos + fPos))))  # the number predicted positive that really are
}


prev_pima = mean(d$diabetic)  # the prevalence of diabetes in this population
prev_pima
## [1] 0.3489583
scenario(sensitivity(cmat), specificity(cmat), prev_pima)
## [1] "accuracy =  0.770833333333333"
## [1] "precision =  0.705357142857143"
scenario(sensitivity(cmat), specificity(cmat), 0.1)
## [1] "accuracy =  0.840155223880597"
## [1] "precision =  0.331665211258145"
scenario(sensitivity(cmat), specificity(cmat), 0.9)
## [1] "accuracy =  0.617397014925373"
## [1] "precision =  0.975726236743298"