Demonstration code for model evaluation
First, load all libraries needed for the examples
#
# This is the vtreat library that will be discussed in the
# Missing Values lesson. Uncomment the next line
# to install the package from CRAN
#
# install.packages('vtreat')
library(vtreat)
library(ggplot2) # for plotting
## Warning: package 'ggplot2' was built under R version 3.2.4
library(reshape2) # for the melt command
library(ROCR) # for ROC plots
## Loading required package: gplots
##
## Attaching package: 'gplots'
##
## The following object is masked from 'package:stats':
##
## lowess
# set the random number generator seed, so the random assignments are the same every time
set.seed(45433622)
We will use the Baseball salaries data.
# load data
salaryData = readRDS("salaryData.rds")
# set the outcome variable, and the input variables
outcome = "logSalary"
vars = setdiff(colnames(salaryData), c("Salary", "Player", "logSalary"))
Split the data into a training set and a test set.
nr = nrow(salaryData)
# make the train/test assignments (set aside 25% of the data for test)
isTest = runif(nr)<=0.25
# split the data
test = salaryData[isTest,]
train = salaryData[!isTest, ]
salaryData$isTest = isTest # put the test marker back in the data, for reproducibility
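Because the assignment is random, the realized test fraction will only be approximately 25%. A quick check of how the split came out (this check is an addition, not part of the original code):
# check the realized train/test split
table(isTest)
mean(isTest) # fraction assigned to test, roughly 0.25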
Train a linear regression model on the training set
fmla = paste(outcome, "~", paste(vars, collapse="+")) # set up the variables
model1 = lm(fmla, data=train)
summary(model1)
##
## Call:
## lm(formula = fmla, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.09000 -0.12836 -0.01524 0.14299 0.55490
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.2631291 0.1210689 18.693 < 2e-16 ***
## batting_average 0.8327396 0.9743267 0.855 0.39357
## OBP -1.1435733 0.8556458 -1.337 0.18263
## runs -0.0010018 0.0020816 -0.481 0.63075
## hits 0.0035977 0.0012131 2.966 0.00332 **
## doubles -0.0031576 0.0032206 -0.980 0.32784
## triples -0.0122794 0.0082382 -1.491 0.13737
## homeruns 0.0054996 0.0047118 1.167 0.24427
## RBI 0.0025508 0.0019029 1.340 0.18135
## walks 0.0033186 0.0016985 1.954 0.05186 .
## strikeouts -0.0018785 0.0007916 -2.373 0.01842 *
## stolenbases 0.0029310 0.0017296 1.695 0.09142 .
## errors -0.0049083 0.0028484 -1.723 0.08612 .
## free_agency_eligibility 0.6521692 0.0413078 15.788 < 2e-16 ***
## free_agent_in_1991_2 -0.0704410 0.0498863 -1.412 0.15921
## arbitration_eligibility 0.5667131 0.0460688 12.301 < 2e-16 ***
## arbitration_in_1991_2 -0.0542029 0.0914759 -0.593 0.55404
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2287 on 244 degrees of freedom
## Multiple R-squared: 0.8147, Adjusted R-squared: 0.8025
## F-statistic: 67.04 on 16 and 244 DF, p-value: < 2.2e-16
# the model summary will report training set statistics
summ1 = summary(model1)
err = summ1$residuals # response - prediction
sqrt(mean(err^2)) # the root mean squared error (0.2211)
## [1] 0.2211685
# the residual standard error in the summary report is the square root of the
# sum of squared errors divided by the residual degrees of freedom
sqrt(sum(err^2)/model1$df.residual)
## [1] 0.2287434
# r-squared and adjusted r-squared
summ1$r.squared
## [1] 0.8146898
summ1$adj.r.squared
## [1] 0.8025383
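As a sanity check, adjusted R-squared can be recomputed from R-squared, the number of training rows, and the residual degrees of freedom (this check is an addition, not part of the original demonstration):
# adjusted R-squared = 1 - (1 - R-squared)*(n - 1)/(residual degrees of freedom)
# this should reproduce summ1$adj.r.squared (about 0.8025)
n = nrow(train)
1 - (1 - summ1$r.squared)*(n - 1)/model1$df.residual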
Evaluate the model on test data
#
# first, wrap the evaluations in functions
#
# root mean squared error
rmse = function(truth, pred) {
err = truth-pred
sqrt(mean(err^2))
}
# R-squared
# We won't look at adjusted R2, because it's really
# just an estimate of R-squared on holdout data, which
# we can calculate directly
R2 = function(truth, pred) {
err = truth-pred
meany = mean(truth)
null_err = truth-meany
1 - (sum(err^2)/sum(null_err^2))
}
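#
# quick sanity checks on R2 (an addition, not in the original):
# a perfect prediction should give R-squared 1, and predicting the mean should give 0
#
ytoy = c(1, 2, 3, 4, 5)
R2(ytoy, ytoy)                          # 1
R2(ytoy, rep(mean(ytoy), length(ytoy))) # 0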
# make the predictions on the salaryData frame
salPred = predict(model1, newdata=salaryData)
# set up a frame with the outcomes
perf = data.frame(logSalary = salaryData[[outcome]],
pred = salPred, isTest=salaryData$isTest)
perfTrain = perf[!isTest,]
perfTest = perf[isTest,]
# check the training stats came out as reported
rmse(perfTrain$logSalary, perfTrain$pred)
## [1] 0.2211685
R2(perfTrain$logSalary, perfTrain$pred)
## [1] 0.8146898
# check test set performance
rmse(perfTest$logSalary, perfTest$pred)
## [1] 0.2571019
R2(perfTest$logSalary, perfTest$pred)
## [1] 0.7309676
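A convenient way to see the train/test comparison at a glance (an added convenience, not part of the original code) is to collect the statistics into one small frame:
# collect the performance statistics into one frame for easy comparison
data.frame(set = c("train", "test"),
           rmse = c(rmse(perfTrain$logSalary, perfTrain$pred),
                    rmse(perfTest$logSalary, perfTest$pred)),
           R2 = c(R2(perfTrain$logSalary, perfTrain$pred),
                  R2(perfTest$logSalary, perfTest$pred)))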
Plot predictions
ggplot(perf, aes(x=pred, y=logSalary, color=isTest)) +
geom_point(aes(shape=isTest)) +
geom_abline(slope=1) +
scale_color_manual(values = c("FALSE" = "darkgray", "TRUE" = "darkblue")) +
coord_fixed()
Plot residuals
ggplot(perf, aes(x=pred, y=logSalary-pred, color=isTest)) +
geom_point(aes(shape=isTest)) +
geom_abline(slope=0) +
scale_color_manual(values = c("FALSE" = "darkgray", "TRUE" = "darkblue")) +
coord_fixed()
The Gain Curve
#
# Let's make this a function, since we'll use it again
#
# calculate area under the curve of numeric vectors x,y
# length(x)==length(y)
# y>=0, 0<=x<=1 and x increasing
areaCalc = function(x,y) {
# append extra points to get rid of degenerate cases
x = c(0,x,1)
y = c(0,y,1)
n = length(x)
sum(0.5*(y[-1]+y[-n])*(x[-1]-x[-n]))
}
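#
# quick sanity check of areaCalc (an addition, not in the original):
# the area under the diagonal y = x on [0,1] should be 0.5
#
areaCalc(seq(0, 1, by=0.1), seq(0, 1, by=0.1)) # 0.5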
gainCurve = function(truthcol, predcol, title) {
# data frame of pred and truth, sorted in order of the predictions
d = data.frame(predcol=predcol,truthcol=truthcol)
predord = order(d[['predcol']], decreasing=TRUE) # reorder, with highest first
wizard = order(d[['truthcol']], decreasing=TRUE)
npop = dim(d)[1]
# data frame with the cumulative prediction/truth as a function
# of the fraction of the population we're considering, highest first
results = data.frame(pctpop= (1:npop)/npop,
model = cumsum(d[predord,'truthcol'])/sum(d[['truthcol']]),
wizard = cumsum(d[wizard, 'truthcol'])/sum(d[['truthcol']]))
# calculate the areas under each curve
# gini score is 2* (area - 0.5)
idealArea = areaCalc(results$pctpop,results$wizard) - 0.5
modelArea = areaCalc(results$pctpop,results$model) - 0.5
giniScore = modelArea/idealArea # actually, normalized gini score
# melt the frame into the tall form, for plotting
results = melt(results, id.vars="pctpop", measure.vars=c("model", "wizard"),
variable.name="sort_criterion", value.name="pct_outcome")
gplot = ggplot(data=results, aes(x=pctpop, y=pct_outcome, color=sort_criterion)) +
geom_point() + geom_line() +
geom_abline(color="gray") +
ggtitle(paste("Gain curve,", title, '\n',
'relative Gini score', format(giniScore,digits=2))) +
xlab("% items in score order") + ylab("% total category") +
scale_x_continuous(breaks=seq(0,1,0.1)) +
scale_y_continuous(breaks=seq(0,1,0.1)) +
scale_color_manual(values=c('model'='darkblue', 'wizard'='darkgreen'))
gplot
}
Now we can plot the gain curves, one for training and one for test. We’ll plot salary, not log salary.
gainCurve(10^perfTrain$logSalary, 10^perfTrain$pred, "Baseball salaries, training:")
gainCurve(10^perfTest$logSalary, 10^perfTest$pred, "Baseball salaries, test:")
Task: Predict the onset of diabetes within 5 years
Load the data
d = read.table("pima-indians-diabetes.data.txt", header=FALSE, sep=",", stringsAsFactors=FALSE)
# 1. Number of times pregnant
# 2. Plasma glucose concentration at 2 hours in an oral glucose tolerance test
# 3. Diastolic blood pressure (mm Hg)
# 4. Triceps skin fold thickness (mm)
# 5. 2-Hour serum insulin (mu U/ml)
# 6. Body mass index (weight in kg/(height in m)^2)
# 7. Diabetes pedigree function
# 8. Age (years)
# 9. Class variable (0 or 1)
colnames(d) = c("npregnant",
"glucose",
"blood_pressure",
"tricep_skin_fold_thickness",
"insulin",
"bmi",
"diabetes_pedigree",
"age",
"diabetic") # 1=diagnosed as diabetic within 5 years/0=not diagnosed after 5 years
d$diabetic = d$diabetic>0.5 # switch outcome to logical
Find missing data. Note: although the dataset donors claim there are no missing values, some of the zeros clearly represent missing values. I am going to treat 0 as ‘missing’ for glucose, blood_pressure, tricep skin fold thickness, and bmi.
zero_as_missing = c("glucose",
"blood_pressure",
"tricep_skin_fold_thickness",
"bmi")
leave = setdiff(colnames(d), zero_as_missing)
d0 = as.data.frame(lapply(d[,zero_as_missing],
FUN=function(x) ifelse(x==0, NA, x)))
d = cbind(d[,leave], d0)
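A quick check of how many values were converted to NA in each column (this check is an addition to the original):
# count the NAs introduced in each column
colSums(is.na(d))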
Set the outcome and input variables, and split into training and test sets.
yColumn = 'diabetic'
vars = setdiff(colnames(d),c(yColumn, "isTest", "dataLabel"))
d$isTest = runif(nrow(d))<0.25
d$dataLabel = ifelse(d$isTest,"test data","train data")
# a nicety, so that train plots above test when we facet wrap below
d$dataLabel = reorder(d$dataLabel, ifelse(d$dataLabel=='train data', 1, 2), FUN=mean)
Now treat the missing values by substituting the mean value for them and adding an additional indicator column that marks which values were missing. We will use the vtreat library to do this.
dtrain = d[!d$isTest, ]
treatPlan = designTreatmentsC(dtrain, vars, yColumn, TRUE, verbose=FALSE)
dtrainTreat = prepare(treatPlan, dtrain, pruneSig=NULL, doCollar=FALSE)
# the treated data has all NAs replaced by the mean value of the variable,
# and additional columns to mark which values were formerly NAs
head(dtrainTreat)
## npregnant_clean insulin_clean diabetes_pedigree_clean age_clean
## 1 1 0 0.351 31
## 2 8 0 0.672 32
## 3 1 94 0.167 21
## 4 5 0 0.201 30
## 5 4 0 0.191 30
## 6 10 0 1.441 57
## glucose_clean glucose_isBAD blood_pressure_clean blood_pressure_isBAD
## 1 85 0 66 0
## 2 183 0 64 0
## 3 89 0 66 0
## 4 116 0 74 0
## 5 110 0 92 0
## 6 139 0 80 0
## tricep_skin_fold_thickness_clean tricep_skin_fold_thickness_isBAD
## 1 29.00000 0
## 2 29.59799 1
## 3 23.00000 0
## 4 29.59799 1
## 5 29.59799 1
## 6 29.59799 1
## bmi_clean bmi_isBAD diabetic
## 1 26.6 0 FALSE
## 2 23.3 0 TRUE
## 3 28.1 0 FALSE
## 4 25.6 0 FALSE
## 5 37.6 0 FALSE
## 6 27.1 0 FALSE
# get the new variable names
newvars = setdiff(colnames(dtrainTreat), yColumn)
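We can verify that the treated frame has no remaining NAs, and that the fill-in value matches the mean of the observed values, per the note above (this verification is an addition; the column names follow vtreat's naming as shown in the head() output):
# no NAs should remain in the treated training data
sum(is.na(dtrainTreat[, newvars]))
# the fill-in value for tricep_skin_fold_thickness should be the mean of its observed values
unique(dtrainTreat$tricep_skin_fold_thickness_clean[dtrainTreat$tricep_skin_fold_thickness_isBAD == 1])
mean(dtrain$tricep_skin_fold_thickness, na.rm=TRUE)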
Train the model (glm)
# make the formula
fmla = paste(yColumn, "~", paste(newvars, collapse="+"))
model2 = glm(fmla, data=dtrainTreat, family=binomial(link="logit"))
summary(model2)
##
## Call:
## glm(formula = fmla, family = binomial(link = "logit"), data = dtrainTreat)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7303 -0.7183 -0.3683 0.7064 2.3521
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -9.2605136 0.9768067 -9.480 < 2e-16
## npregnant_clean 0.1481717 0.0382212 3.877 0.000106
## insulin_clean -0.0002292 0.0011059 -0.207 0.835784
## diabetes_pedigree_clean 1.0824446 0.3574341 3.028 0.002459
## age_clean 0.0111342 0.0107686 1.034 0.301157
## glucose_clean 0.0391050 0.0045513 8.592 < 2e-16
## glucose_isBAD 1.5511821 1.4489791 1.071 0.284379
## blood_pressure_clean -0.0101702 0.0100806 -1.009 0.313027
## blood_pressure_isBAD 0.9678432 0.6013335 1.609 0.107508
## tricep_skin_fold_thickness_clean 0.0124684 0.0158142 0.788 0.430447
## tricep_skin_fold_thickness_isBAD 0.2911630 0.2884797 1.009 0.312830
## bmi_clean 0.0767548 0.0204467 3.754 0.000174
## bmi_isBAD -1.7652892 1.1667321 -1.513 0.130275
##
## (Intercept) ***
## npregnant_clean ***
## insulin_clean
## diabetes_pedigree_clean **
## age_clean
## glucose_clean ***
## glucose_isBAD
## blood_pressure_clean
## blood_pressure_isBAD
## tricep_skin_fold_thickness_clean
## tricep_skin_fold_thickness_isBAD
## bmi_clean ***
## bmi_isBAD
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 738.38 on 562 degrees of freedom
## Residual deviance: 515.16 on 550 degrees of freedom
## AIC: 541.16
##
## Number of Fisher Scoring iterations: 5
# model reports training data diagnostics
# deviance
model2$deviance
## [1] 515.1598
#pseudo-Rsquared
1 - (model2$deviance/model2$null.deviance)
## [1] 0.302308
Make predictions
# treat all the data first
dTreat = prepare(treatPlan, d, pruneSig=NULL, doCollar=FALSE)
# put the predictions back into the original frame
d$model2 = predict(model2, newdata=dTreat, type='response')
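A quick look at how the predicted probabilities separate the two classes (an added check, not part of the original):
# mean predicted probability of diabetes, by true outcome and by train/test split
aggregate(model2 ~ diabetic + dataLabel, data=d, FUN=mean)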
Evaluate the model on training and test data.
# Convenience functions
# deviance
# epsilon > 0 is needed as smoothing for models that can return probabilities of exactly 0 or 1,
# which a properly converged logistic regression model can't
deviance = function(truth,pred,epsilon=0) {
pred = pmax(pred, epsilon)
pred = pmin(pred, 1-epsilon)
S = 0.0 # assumed log-likelihood of saturated model
-2*(sum(ifelse(truth,log(pred),log(1-pred)))-S)
}
# pseudo-R-squared
pseudoR2 = function(truth, pred, epsilon=0) {
dev = deviance(truth, pred, epsilon)
nulldev = deviance(truth, mean(truth))
1-(dev/nulldev)
}
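#
# quick illustration of the epsilon smoothing (an addition, not in the original):
# a predicted probability of exactly 0 or 1 on the wrong outcome gives infinite deviance;
# a small epsilon keeps the result finite
#
deviance(c(TRUE, FALSE), c(0, 1))               # Inf
deviance(c(TRUE, FALSE), c(0, 1), epsilon=1e-6) # finite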
dtrain = d[!d$isTest,]
dtest = d[d$isTest,]
# training deviance
deviance(dtrain[[yColumn]], dtrain$model2)
## [1] 515.1598
# test deviance
deviance(dtest[[yColumn]], dtest$model2)
## [1] 194.8468
# remember, deviance can't be compared across data sets
# pseudo-R-squared can be compared across data sets
# training
pseudoR2(dtrain[[yColumn]], dtrain$model2)
## [1] 0.302308
# test
pseudoR2(dtest[[yColumn]], dtest$model2)
## [1] 0.2296849
Double density plot
ggplot(d, aes_string(x='model2', color=yColumn)) + geom_density(adjust=0.5) +
facet_wrap(~dataLabel, ncol=1)
ROC plot
plotROC = function(title,outcol,predcol) {
#
# prediction and performance are in the ROCR package
#
pred = prediction(predcol,outcol)
perf = performance(pred,'tpr','fpr')
# get the auc
auc = as.numeric(performance(pred,'auc')@y.values)
# pull the information out of the perf structure so
# we can plot it with ggplot
pf = data.frame(
FalsePositiveRate=perf@x.values[[1]],
TruePositiveRate=perf@y.values[[1]])
plot=ggplot() +
geom_ribbon(data=pf,aes(x=FalsePositiveRate,ymax=TruePositiveRate,ymin=0),
fill='blue',alpha=0.3) +
geom_point(data=pf,aes(x=FalsePositiveRate,y=TruePositiveRate)) +
geom_line(aes(x=c(0,1),y=c(0,1))) + coord_fixed() +
ggtitle(paste(title,'\nAUC:',format(auc,digits=2)))
plot
}
# plot the ROC for training and test sets
plotROC("Training data", d[!d$isTest, yColumn], d[!d$isTest, "model2"])
plotROC("Test data", d[d$isTest, yColumn], d[d$isTest, "model2"])
Gain Curve
gainCurve(d[!d$isTest, yColumn], d[!d$isTest, "model2"], "Training data")
gainCurve(d[d$isTest, yColumn], d[d$isTest, "model2"], "Test data")
If you use a classification model that returns class probabilities, you can turn the probabilities into labels by thresholding. Thresholding at 50% is the most straightforward; you are simply assigning the more probable label.
d$predLabel = d$model2 >=0.5
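The 50% threshold is not the only choice: lowering the threshold catches more diabetics at the cost of more false positives. A quick illustration with an alternative threshold (the 0.3 value here is arbitrary, and this comparison is an addition to the original):
# compare confusion matrices at the 50% threshold and at a lower threshold
table(diabetic=d[[yColumn]], pred=d$model2 >= 0.5)
table(diabetic=d[[yColumn]], pred=d$model2 >= 0.3)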
For simplicity, in this section we’ll calculate the performance stats over the whole set.
Many classification measures are summarized by the confusion matrix.
# the rows are true outcome, the columns predicted outcome
cmat = table(diabetic=d[[yColumn]], pred=d$predLabel)
cmat
## pred
## diabetic FALSE TRUE
## FALSE 434 66
## TRUE 110 158
Accuracy
Accuracy is the fraction of predictions the model got right: the sum of the diagonal entries of the confusion matrix, divided by the total count.
accuracy = function(cmat) {sum(diag(cmat))/sum(cmat)}
accuracy(cmat)
## [1] 0.7708333
False Positive Rate, False Negative Rate
# false positives: predicted diabetic but really not. Upper right corner
# false positive rate - the fraction of non-diabetics misdiagnosed as diabetic
fpr = function(cmat) {cmat[1, 2]/sum(cmat[1,])}
fpr(cmat)
## [1] 0.132
# false negatives: predicted not diabetic but really are. Lower left corner
# false negative rate - fraction of diabetics misdiagnosed as non-diabetic
fnr = function(cmat) {cmat[2,1]/sum(cmat[2,])}
fnr(cmat)
## [1] 0.4104478
Precision and Recall
# precision - of all the subjects in the population of interest (female Pima Indians)
# who test positive, how many are true diabetics?
precision = function(cmat) {cmat[2,2]/sum(cmat[,2])}
precision(cmat)
## [1] 0.7053571
# recall - of all the diabetics in the population of interest, how many does this test identify?
recall = function(cmat) {cmat[2,2]/sum(cmat[2,])}
recall(cmat)
## [1] 0.5895522
Sensitivity and Specificity
# sensitivity - the true positive rate, or the rate at which diabetics are correctly diagnosed
# Same as recall
sensitivity = function(cmat){cmat[2,2]/sum(cmat[2,])}
sensitivity(cmat)
## [1] 0.5895522
# specificity - the true negative rate, or the rate at which non-diabetics are correctly diagnosed
specificity = function(cmat) {cmat[1,1]/sum(cmat[1,])}
specificity(cmat)
## [1] 0.868
Note that sensitivity and specificity are independent of the population distribution, but accuracy and precision are not.
# posit a test with a given sensitivity/specificity
# and populations with different prevalences of diabetes
scenario = function(sens, spec, prevalence) {
npos = prevalence
nneg = 1-prevalence
tPos = npos*sens # true positives
tNeg = nneg*spec # true negatives
fPos = nneg - tNeg # false positives (negatives mis-diagnosed)
fNeg = npos - tPos # false negatives (positives mis-diagnosed)
print(paste("accuracy = ", (tPos + tNeg))) # what we got right
print(paste("precision = ", (tPos/(tPos + fPos)))) # the number predicted positive that really are
}
prev_pima = mean(d$diabetic) # prevalence of diabetes in the dataset
prev_pima
## [1] 0.3489583
scenario(sensitivity(cmat), specificity(cmat), prev_pima)
## [1] "accuracy = 0.770833333333333"
## [1] "precision = 0.705357142857143"
scenario(sensitivity(cmat), specificity(cmat), 0.1)
## [1] "accuracy = 0.840155223880597"
## [1] "precision = 0.331665211258145"
scenario(sensitivity(cmat), specificity(cmat), 0.9)
## [1] "accuracy = 0.617397014925373"
## [1] "precision = 0.975726236743298"