Example use of the logistic regression (glm) function in R
Read in the data. We have to add the column names by hand.
library('ggplot2')
# read.table decompresses the .gz file transparently; header=FALSE because the
# raw abalone file has no header row (column names are assigned just below).
# stringsAsFactors=TRUE makes Sex a factor, as glm() expects for a categorical.
d <- read.table('abalone.data.txt.gz',header=FALSE,sep=',',
stringsAsFactors=TRUE)
# add the column names by hand (the data file itself carries none)
colnames(d) <- c('Sex', 'Length', 'Diameter', 'Height', 'WholeWeight',
'ShuckedWeight', 'VisceraWeight', 'ShellWeight', 'Rings')
dim(d)
## [1] 4177 9
summary(d)
## Sex Length Diameter Height
## F:1307 Min. :0.075 Min. :0.0550 Min. :0.0000
## I:1342 1st Qu.:0.450 1st Qu.:0.3500 1st Qu.:0.1150
## M:1528 Median :0.545 Median :0.4250 Median :0.1400
## Mean :0.524 Mean :0.4079 Mean :0.1395
## 3rd Qu.:0.615 3rd Qu.:0.4800 3rd Qu.:0.1650
## Max. :0.815 Max. :0.6500 Max. :1.1300
## WholeWeight ShuckedWeight VisceraWeight ShellWeight
## Min. :0.0020 Min. :0.0010 Min. :0.0005 Min. :0.0015
## 1st Qu.:0.4415 1st Qu.:0.1860 1st Qu.:0.0935 1st Qu.:0.1300
## Median :0.7995 Median :0.3360 Median :0.1710 Median :0.2340
## Mean :0.8287 Mean :0.3594 Mean :0.1806 Mean :0.2388
## 3rd Qu.:1.1530 3rd Qu.:0.5020 3rd Qu.:0.2530 3rd Qu.:0.3290
## Max. :2.8255 Max. :1.4880 Max. :0.7600 Max. :1.0050
## Rings
## Min. : 1.000
## 1st Qu.: 8.000
## Median : 9.000
## Mean : 9.934
## 3rd Qu.:11.000
## Max. :29.000
Define the outcome variable to be whether or not the abalone has ten rings or more.
# define 'old' as 10 or more rings (roughly 11.5+ years old)
d$old <- d$Rings>=10
table(d$old)/nrow(d)
##
## FALSE TRUE
## 0.5017955 0.4982045
Look at the Sex variable
summary(d$Sex)
## F I M
## 1307 1342 1528
Check whether Sex separates the data
# distribution of Sex within each ring count: does Sex separate old vs. young?
tab <- table(truth=d$Rings, sex=d$Sex)
# row totals, reused to normalize each row to a conditional distribution
rs <- rowSums(tab)
tab/rs
## sex
## truth F I M
## 1 0.00000000 1.00000000 0.00000000
## 2 0.00000000 1.00000000 0.00000000
## 3 0.00000000 0.80000000 0.20000000
## 4 0.00000000 0.89473684 0.10526316
## 5 0.03478261 0.86956522 0.09565217
## 6 0.06177606 0.83397683 0.10424710
## 7 0.11253197 0.68286445 0.20460358
## 8 0.21478873 0.48239437 0.30281690
## 9 0.34542816 0.25108853 0.40348331
## 10 0.39116719 0.14511041 0.46372240
## 11 0.41067762 0.12731006 0.46201232
## 12 0.47940075 0.07865169 0.44194757
## 13 0.43349754 0.11822660 0.44827586
## 14 0.44444444 0.11111111 0.44444444
## 15 0.39805825 0.09708738 0.50485437
## 16 0.44776119 0.10447761 0.44776119
## 17 0.44827586 0.12068966 0.43103448
## 18 0.45238095 0.11904762 0.42857143
## 19 0.46875000 0.06250000 0.46875000
## 20 0.46153846 0.07692308 0.46153846
## 21 0.50000000 0.07142857 0.42857143
## 22 0.50000000 0.00000000 0.50000000
## 23 0.66666667 0.00000000 0.33333333
## 24 0.50000000 0.00000000 0.50000000
## 25 1.00000000 0.00000000 0.00000000
## 26 0.00000000 0.00000000 1.00000000
## 27 0.50000000 0.00000000 0.50000000
## 29 1.00000000 0.00000000 0.00000000
Do the test/train split
set.seed(2352) # set random seed for reproducibility
# do the random split (25% held out for test), put the label back into the data frame
d$isTest <- runif(nrow(d))<0.25
# human-readable split label, used later for plot faceting in report()
d$dataLabel <- ifelse(d$isTest,"test data","train data")
Try a non-invasive prediction procedure first. (Only the measurements we can take without dissecting the abalone)
# measurements we can take without dissecting the abalone
nonInvasiveVars <- c('Sex', 'Length', 'Diameter', 'Height', 'WholeWeight')
# build the model formula string: "old ~ var1 + var2 + ..."
fNonInvasive <- sprintf('old ~ %s', paste(nonInvasiveVars, collapse=' + '))
fNonInvasive
## [1] "old ~ Sex + Length + Diameter + Height + WholeWeight"
Run the model.
# fit the logistic regression on the training rows only (test rows held out)
model1 <- glm(fNonInvasive,data=d[!d$isTest,],family=binomial(link='logit'))
summary(model1)
##
## Call:
## glm(formula = fNonInvasive, family = binomial(link = "logit"),
## data = d[!d$isTest, ])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -6.1128 -0.7410 -0.1659 0.7968 2.3648
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.461294 0.511738 -8.718 < 2e-16 ***
## SexI -0.864019 0.127225 -6.791 1.11e-11 ***
## SexM 0.005323 0.102343 0.052 0.958518
## Length -7.830134 2.356678 -3.323 0.000892 ***
## Diameter 14.005609 2.858492 4.900 9.60e-07 ***
## Height 19.006321 2.915010 6.520 7.02e-11 ***
## WholeWeight 0.434641 0.324189 1.341 0.180017
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4359.6 on 3144 degrees of freedom
## Residual deviance: 3122.6 on 3138 degrees of freedom
## AIC: 3136.6
##
## Number of Fisher Scoring iterations: 5
Look at the model’s predictions on the data.
# type='response' returns predicted probabilities (inverse logit of the
# linear predictor); scored on every row, train and test alike
d$model1 <- predict(model1,newdata=d,type='response')
head(d[,c("old","model1")])
## old model1
## 1 TRUE 0.29366948
## 2 FALSE 0.15756601
## 3 FALSE 0.53275208
## 4 TRUE 0.45286105
## 5 FALSE 0.06132856
## 6 FALSE 0.07634611
Calculate pseudo-R-Squared on training data
# training pseudo-R2
1 - model1$deviance/model1$null.deviance
## [1] 0.2837351
Plot the predictions for the training data.
# training-data subset; makes the following calls easier
dtrain <- d[!d$isTest,]
# density of predicted probability, colored by the true class
ggplot(dtrain, aes(x=model1, color=old)) + geom_density()
Create the confusion matrix
# confusion matrix: truth vs. predictions thresholded at 0.5 (training data)
cmat <- table(truth=dtrain$old, pred=dtrain$model1 > 0.5)
cmat
## pred
## truth FALSE TRUE
## FALSE 1127 461
## TRUE 351 1206
Calculate the accuracy (training data)
# accuracy = fraction of correct predictions (diagonal of the confusion matrix)
accuracy <- (cmat[1,1] + cmat[2,2])/sum(cmat)
accuracy
## [1] 0.7418124
Convenience functions to report performance
# Deviance of predicted probabilities against a logical truth vector.
# NOTE: this masks stats::deviance() for the remainder of the script.
#
# truth:   logical vector of observed outcomes
# pred:    predicted probabilities in [0, 1] (recycled against truth if scalar)
# epsilon: clamp predictions into [epsilon, 1-epsilon] to avoid log(0);
#          the default of 0 performs no clamping, so a pred of exactly
#          0 or 1 on the wrong outcome yields Inf
deviance <- function(truth, pred, epsilon = 0) {
  pred <- pmax(pred, epsilon)
  pred <- pmin(pred, 1 - epsilon)
  S <- 0.0 # assumed log-likelihood of saturated model
  -2 * (sum(ifelse(truth, log(pred), log(1 - pred))) - S)
}
deviance(dtrain$old, dtrain$model1)
## [1] 3122.622
model1$deviance
## [1] 3122.622
# Print accuracy and pseudo-R^2 for one model column on either split.
#
# d:         data frame containing 'old', 'isTest', and the model's predictions
# test:      TRUE to evaluate held-out rows, FALSE for training rows
# modelName: name of the column holding predicted probabilities
# title:     unused here; kept for interface compatibility with report()
reportStats <- function(d,test,modelName,title) {
  dSub <- d[d$isTest==test,,drop=FALSE]
  tab <- table(truth=dSub$old,pred=dSub[,modelName]>0.5)
  # correct predictions sit on the diagonal of the confusion matrix
  accuracy <- sum(diag(tab))/sum(tab)
  # scalar condition: plain if/else, not vectorized ifelse()
  note <- if (test) 'test' else 'train'
  print(paste(' ',note,'accuracy',modelName,format(accuracy,digits=2)))
  # pseudo-R^2 = 1 - residual.deviance/null.deviance, using deviance() above
  residual.deviance <- deviance(dSub[,'old'],dSub[,modelName])
  null.deviance <- deviance(dSub[,'old'],mean(dSub[,'old']))
  print(paste(" model explained a",
      format((1-residual.deviance/null.deviance),digits=2),
      "fraction of the variation on",note))
}
# Full performance report for one model: train and test statistics, plus
# density plots of the predicted scores faceted by data split.
#
# d:         data frame with 'old', 'isTest', 'dataLabel', and the model column
# modelName: name of the prediction column to report on
# title:     report/plot title
report <- function(d,modelName,title) {
  print("***********")
  print(paste("model",modelName,title))
  reportStats(d,FALSE,modelName,title)
  reportStats(d,TRUE,modelName,title)
  # aes_string() is deprecated in ggplot2; use the .data pronoun to refer
  # to a column by its string name instead
  print(ggplot(data=d,aes(x=.data[[modelName]],color=old)) +
      geom_density() + facet_wrap(~dataLabel,ncol=1,scales='free_y') +
      ggtitle(title))
  print("***********")
}
Performance report for both the training and test data
# report the performance of the model on hold-out
report(d, "model1", "Logistic Regression: Non-invasive Model")
## [1] "***********"
## [1] "model model1 Logistic Regression: Non-invasive Model"
## [1] " train accuracy model1 0.74"
## [1] " model explained a 0.28 fraction of the variation on train"
## [1] " test accuracy model1 0.73"
## [1] " model explained a 0.26 fraction of the variation on test"
## [1] "***********"
Set up a model that includes invasive measurements
# add the measurements that require dissecting the abalone
invasiveVars <- c('ShuckedWeight', 'VisceraWeight', 'ShellWeight')
# formula using every predictor, non-invasive and invasive alike
fAllVars <- sprintf('old ~ %s',
                    paste(c(nonInvasiveVars, invasiveVars), collapse=' + '))
fAllVars
Fit the model
# fit on the training rows only, then score every row (train and test)
model2 <- glm(fAllVars,data=d[!d$isTest,],family=binomial(link='logit'))
# type='response' returns predicted probabilities
d$model2 <- predict(model2,newdata=d,type='response')
summary(model2)
##
## Call:
## glm(formula = fAllVars, family = binomial(link = "logit"), data = d[!d$isTest,
## ])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.1087 -0.6498 -0.2303 0.6920 2.4051
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.1955 0.5189 -6.159 7.33e-10 ***
## SexI -0.7044 0.1369 -5.145 2.68e-07 ***
## SexM 0.1011 0.1113 0.908 0.36393
## Length -3.7254 2.4690 -1.509 0.13132
## Diameter 5.7295 3.0462 1.881 0.05999 .
## Height 4.0616 2.2038 1.843 0.06533 .
## WholeWeight 9.3729 1.3509 6.938 3.97e-12 ***
## ShuckedWeight -17.1020 1.5189 -11.260 < 2e-16 ***
## VisceraWeight -6.6730 2.0701 -3.224 0.00127 **
## ShellWeight 8.8940 1.9063 4.666 3.08e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4359.6 on 3144 degrees of freedom
## Residual deviance: 2750.1 on 3135 degrees of freedom
## AIC: 2770.1
##
## Number of Fisher Scoring iterations: 5
Report performance statistics
report(d,'model2', "invasive model logistic regression")
## [1] "***********"
## [1] "model model2 invasive model logistic regression"
## [1] " train accuracy model2 0.79"
## [1] " model explained a 0.37 fraction of the variation on train"
## [1] " test accuracy model2 0.77"
## [1] " model explained a 0.34 fraction of the variation on test"
## [1] "***********"