Example use of the logistic regression (glm) function in R
Read in the data. We have to add the column names by hand.
library('ggplot2')
# read.table decompresses the .gz file transparently; header=FALSE because the
# raw abalone file has no header row (column names are assigned just below).
# stringsAsFactors=TRUE makes Sex a factor, as glm() expects for a categorical.
d <- read.table('abalone.data.txt.gz',header=FALSE,sep=',',
stringsAsFactors=TRUE)
# add the column names by hand (the data file itself carries none)
colnames(d) <- c('Sex', 'Length', 'Diameter', 'Height', 'WholeWeight',
'ShuckedWeight', 'VisceraWeight', 'ShellWeight', 'Rings')
dim(d)
## [1] 4177 9
summary(d)
## Sex Length Diameter Height
## F:1307 Min. :0.075 Min. :0.0550 Min. :0.0000
## I:1342 1st Qu.:0.450 1st Qu.:0.3500 1st Qu.:0.1150
## M:1528 Median :0.545 Median :0.4250 Median :0.1400
## Mean :0.524 Mean :0.4079 Mean :0.1395
## 3rd Qu.:0.615 3rd Qu.:0.4800 3rd Qu.:0.1650
## Max. :0.815 Max. :0.6500 Max. :1.1300
## WholeWeight ShuckedWeight VisceraWeight ShellWeight
## Min. :0.0020 Min. :0.0010 Min. :0.0005 Min. :0.0015
## 1st Qu.:0.4415 1st Qu.:0.1860 1st Qu.:0.0935 1st Qu.:0.1300
## Median :0.7995 Median :0.3360 Median :0.1710 Median :0.2340
## Mean :0.8287 Mean :0.3594 Mean :0.1806 Mean :0.2388
## 3rd Qu.:1.1530 3rd Qu.:0.5020 3rd Qu.:0.2530 3rd Qu.:0.3290
## Max. :2.8255 Max. :1.4880 Max. :0.7600 Max. :1.0050
## Rings
## Min. : 1.000
## 1st Qu.: 8.000
## Median : 9.000
## Mean : 9.934
## 3rd Qu.:11.000
## Max. :29.000
Define the outcome variable to be whether or not the abalone has ten rings or more.
# define 'old' as 10 or more rings (roughly 11.5+ years old)
d$old <- d$Rings>=10
table(d$old)/nrow(d)
##
## FALSE TRUE
## 0.5017955 0.4982045
Look at the Sex variable
summary(d$Sex)
## F I M
## 1307 1342 1528
Check whether Sex separates the data
# distribution of Sex within each ring count: does Sex separate old vs. young?
tab <- table(truth=d$Rings, sex=d$Sex)
# row totals, reused to normalize each row to a conditional distribution
rs <- rowSums(tab)
tab/rs
## sex
## truth F I M
## 1 0.00000000 1.00000000 0.00000000
## 2 0.00000000 1.00000000 0.00000000
## 3 0.00000000 0.80000000 0.20000000
## 4 0.00000000 0.89473684 0.10526316
## 5 0.03478261 0.86956522 0.09565217
## 6 0.06177606 0.83397683 0.10424710
## 7 0.11253197 0.68286445 0.20460358
## 8 0.21478873 0.48239437 0.30281690
## 9 0.34542816 0.25108853 0.40348331
## 10 0.39116719 0.14511041 0.46372240
## 11 0.41067762 0.12731006 0.46201232
## 12 0.47940075 0.07865169 0.44194757
## 13 0.43349754 0.11822660 0.44827586
## 14 0.44444444 0.11111111 0.44444444
## 15 0.39805825 0.09708738 0.50485437
## 16 0.44776119 0.10447761 0.44776119
## 17 0.44827586 0.12068966 0.43103448
## 18 0.45238095 0.11904762 0.42857143
## 19 0.46875000 0.06250000 0.46875000
## 20 0.46153846 0.07692308 0.46153846
## 21 0.50000000 0.07142857 0.42857143
## 22 0.50000000 0.00000000 0.50000000
## 23 0.66666667 0.00000000 0.33333333
## 24 0.50000000 0.00000000 0.50000000
## 25 1.00000000 0.00000000 0.00000000
## 26 0.00000000 0.00000000 1.00000000
## 27 0.50000000 0.00000000 0.50000000
## 29 1.00000000 0.00000000 0.00000000
Do the test/train split
set.seed(2352) # set random seed for reproducibility
# do the random split (25% held out for test), put the label back into the data frame
d$isTest <- runif(nrow(d))<0.25
# human-readable split label, used later for plot faceting in report()
d$dataLabel <- ifelse(d$isTest,"test data","train data")
Try a non-invasive prediction procedure first. (Only the measurements we can take without dissecting the abalone)
# measurements we can take without dissecting the abalone
nonInvasiveVars <- c('Sex', 'Length', 'Diameter', 'Height', 'WholeWeight')
# build the model formula string: "old ~ var1 + var2 + ..."
fNonInvasive <- sprintf('old ~ %s', paste(nonInvasiveVars, collapse=' + '))
fNonInvasive
## [1] "old ~ Sex + Length + Diameter + Height + WholeWeight"
Run the model.
# fit the logistic regression on the training rows only (test rows held out)
model1 <- glm(fNonInvasive,data=d[!d$isTest,],family=binomial(link='logit'))
summary(model1)
##
## Call:
## glm(formula = fNonInvasive, family = binomial(link = "logit"),
## data = d[!d$isTest, ])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -6.1128 -0.7410 -0.1659 0.7968 2.3648
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.461294 0.511738 -8.718 < 2e-16 ***
## SexI -0.864019 0.127225 -6.791 1.11e-11 ***
## SexM 0.005323 0.102343 0.052 0.958518
## Length -7.830134 2.356678 -3.323 0.000892 ***
## Diameter 14.005609 2.858492 4.900 9.60e-07 ***
## Height 19.006321 2.915010 6.520 7.02e-11 ***
## WholeWeight 0.434641 0.324189 1.341 0.180017
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4359.6 on 3144 degrees of freedom
## Residual deviance: 3122.6 on 3138 degrees of freedom
## AIC: 3136.6
##
## Number of Fisher Scoring iterations: 5
Look at the model’s predictions on the data.
# type='response' returns predicted probabilities (inverse logit of the
# linear predictor); scored on every row, train and test alike
d$model1 <- predict(model1,newdata=d,type='response')
head(d[,c("old","model1")])
## old model1
## 1 TRUE 0.29366948
## 2 FALSE 0.15756601
## 3 FALSE 0.53275208
## 4 TRUE 0.45286105
## 5 FALSE 0.06132856
## 6 FALSE 0.07634611
Calculate pseudo-R-Squared on training data
# training pseudo-R2
1 - model1$deviance/model1$null.deviance
## [1] 0.2837351
Plot the predictions for the training data.
# training-data subset; makes the following calls easier
dtrain <- d[!d$isTest,]
# density of predicted probability, colored by the true class
ggplot(dtrain, aes(x=model1, color=old)) + geom_density()
Create the confusion matrix
# confusion matrix: truth vs. predictions thresholded at 0.5 (training data)
cmat <- table(truth=dtrain$old, pred=dtrain$model1 > 0.5)
cmat
## pred
## truth FALSE TRUE
## FALSE 1127 461
## TRUE 351 1206
Calculate the accuracy (training data)
# accuracy = fraction of correct predictions (diagonal of the confusion matrix)
accuracy <- (cmat[1,1] + cmat[2,2])/sum(cmat)
accuracy
## [1] 0.7418124
Convenience functions to report performance
# Deviance of predicted probabilities against a logical truth vector.
# NOTE: this masks stats::deviance() for the remainder of the script.
#
# truth:   logical vector of observed outcomes
# pred:    predicted probabilities in [0, 1] (recycled against truth if scalar)
# epsilon: clamp predictions into [epsilon, 1-epsilon] to avoid log(0);
#          the default of 0 performs no clamping, so a pred of exactly
#          0 or 1 on the wrong outcome yields Inf
deviance <- function(truth, pred, epsilon = 0) {
  pred <- pmax(pred, epsilon)
  pred <- pmin(pred, 1 - epsilon)
  S <- 0.0 # assumed log-likelihood of saturated model
  -2 * (sum(ifelse(truth, log(pred), log(1 - pred))) - S)
}
deviance(dtrain$old, dtrain$model1)
## [1] 3122.622
model1$deviance
## [1] 3122.622
# Print accuracy and pseudo-R^2 for one model column on either split.
#
# d:         data frame containing 'old', 'isTest', and the model's predictions
# test:      TRUE to evaluate held-out rows, FALSE for training rows
# modelName: name of the column holding predicted probabilities
# title:     unused here; kept for interface compatibility with report()
reportStats <- function(d,test,modelName,title) {
  dSub <- d[d$isTest==test,,drop=FALSE]
  tab <- table(truth=dSub$old,pred=dSub[,modelName]>0.5)
  # correct predictions sit on the diagonal of the confusion matrix
  accuracy <- sum(diag(tab))/sum(tab)
  # scalar condition: plain if/else, not vectorized ifelse()
  note <- if (test) 'test' else 'train'
  print(paste(' ',note,'accuracy',modelName,format(accuracy,digits=2)))
  # pseudo-R^2 = 1 - residual.deviance/null.deviance, using deviance() above
  residual.deviance <- deviance(dSub[,'old'],dSub[,modelName])
  null.deviance <- deviance(dSub[,'old'],mean(dSub[,'old']))
  print(paste(" model explained a",
      format((1-residual.deviance/null.deviance),digits=2),
      "fraction of the variation on",note))
}
# Full performance report for one model: train and test statistics, plus
# density plots of the predicted scores faceted by data split.
#
# d:         data frame with 'old', 'isTest', 'dataLabel', and the model column
# modelName: name of the prediction column to report on
# title:     report/plot title
report <- function(d,modelName,title) {
  print("***********")
  print(paste("model",modelName,title))
  reportStats(d,FALSE,modelName,title)
  reportStats(d,TRUE,modelName,title)
  # aes_string() is deprecated in ggplot2; use the .data pronoun to refer
  # to a column by its string name instead
  print(ggplot(data=d,aes(x=.data[[modelName]],color=old)) +
      geom_density() + facet_wrap(~dataLabel,ncol=1,scales='free_y') +
      ggtitle(title))
  print("***********")
}
Performance report for both the training and test data
# report the performance of the model on hold-out
report(d, "model1", "Logistic Regression: Non-invasive Model")
## [1] "***********"
## [1] "model model1 Logistic Regression: Non-invasive Model"
## [1] " train accuracy model1 0.74"
## [1] " model explained a 0.28 fraction of the variation on train"
## [1] " test accuracy model1 0.73"
## [1] " model explained a 0.26 fraction of the variation on test"
## [1] "***********"
Set up a model that includes invasive measurements
# add the measurements that require dissecting the abalone
invasiveVars <- c('ShuckedWeight', 'VisceraWeight', 'ShellWeight')
# formula using every predictor, non-invasive and invasive alike
fAllVars <- sprintf('old ~ %s',
                    paste(c(nonInvasiveVars, invasiveVars), collapse=' + '))
fAllVars
Fit the model
# fit on the training rows only, then score every row (train and test)
model2 <- glm(fAllVars,data=d[!d$isTest,],family=binomial(link='logit'))
# type='response' returns predicted probabilities
d$model2 <- predict(model2,newdata=d,type='response')
summary(model2)
##
## Call:
## glm(formula = fAllVars, family = binomial(link = "logit"), data = d[!d$isTest,
## ])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.1087 -0.6498 -0.2303 0.6920 2.4051
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.1955 0.5189 -6.159 7.33e-10 ***
## SexI -0.7044 0.1369 -5.145 2.68e-07 ***
## SexM 0.1011 0.1113 0.908 0.36393
## Length -3.7254 2.4690 -1.509 0.13132
## Diameter 5.7295 3.0462 1.881 0.05999 .
## Height 4.0616 2.2038 1.843 0.06533 .
## WholeWeight 9.3729 1.3509 6.938 3.97e-12 ***
## ShuckedWeight -17.1020 1.5189 -11.260 < 2e-16 ***
## VisceraWeight -6.6730 2.0701 -3.224 0.00127 **
## ShellWeight 8.8940 1.9063 4.666 3.08e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4359.6 on 3144 degrees of freedom
## Residual deviance: 2750.1 on 3135 degrees of freedom
## AIC: 2770.1
##
## Number of Fisher Scoring iterations: 5
Report performance statistics
report(d,'model2', "invasive model logistic regression")
## [1] "***********"
## [1] "model model2 invasive model logistic regression"
## [1] " train accuracy model2 0.79"
## [1] " model explained a 0.37 fraction of the variation on train"
## [1] " test accuracy model2 0.77"
## [1] " model explained a 0.34 fraction of the variation on test"
## [1] "***********"