library('ggplot2')
library('randomForest')
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.

First we load the data. The test and training sets have already been split in this data set, but I’ll put them back together with a test/train marker column, as we usually do.

# Load the pre-split ISOLET data: files 1-4 are the training set, file 5 is
# the test set. Tag each with an isTest flag and a testlabel factor before
# binding them into one frame d.
dTrain <- read.table("isolet1+2+3+4.data.gz",
                     header = FALSE, sep = ',',
                     stringsAsFactors = FALSE, blank.lines.skip = TRUE)
dTrain$isTest <- FALSE
dTrain$testlabel <- "train"

dTest <- read.table("isolet5.data.gz",
                    header = FALSE, sep = ',',
                    stringsAsFactors = FALSE, blank.lines.skip = TRUE)
dTest$isTest <- TRUE
dTest$testlabel <- "test"
d <- rbind(dTrain, dTest)
# make "train" the first level, so it shows up first in the plots
rank <- ifelse(d$isTest, 2, 1)
d$testlabel <- reorder(d$testlabel, rank, FUN = mean)
rm(list = c('dTest', 'dTrain')) # clean up: the combined frame d is all we need

# sanity check: 7797 utterances by 620 columns
# (617 features + the label column V618 + isTest + testlabel)
dim(d)
## [1] 7797  620
# the feature columns arrive with generic names V1..V617
colnames(d)[1:5]
## [1] "V1" "V2" "V3" "V4" "V5"

Make the frame a little more user-friendly

# column 618 designates the letter as a number (1..26)
unique(d$V618)
##  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
## [24] 24 25 26
letters # a predefined (by R) vector of the lowercase alphabet
##  [1] "a" "b" "c" "d" "e" "f" "g" "h" "i" "j" "k" "l" "m" "n" "o" "p" "q"
## [18] "r" "s" "t" "u" "v" "w" "x" "y" "z"
# so let's map the numbers to letters by indexing into `letters`
d$V618 <- letters[d$V618]
summary(as.factor(d$V618))
##   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o   p   q   r 
## 300 300 300 300 300 298 300 300 300 300 300 300 299 300 300 300 300 300 
##   s   t   u   v   w   x   y   z 
## 300 300 300 300 300 300 300 300
# rename the label column to something readable
colnames(d)[618] <- "letter"

Define variables and output

# vars: names of the 617 predictor (spectral feature) columns
# yColumn: name of the logical outcome column we are about to create
vars <- colnames(d)[1:617]
yColumn <- 'isN'

We are just going to work on the letters ‘m’ and ‘n’. ‘n’ is our target class

# Subset to just the utterances 'm' and 'n'
d <- d[d$letter %in% c('m', 'n'), , drop = FALSE]
# 'n' is our target class: outcome is TRUE for 'n', FALSE for 'm'
d[, yColumn] <- d[, 'letter'] == 'n'
# the two classes are nearly balanced
table(d[, yColumn]) / nrow(d)
## 
##     FALSE      TRUE 
## 0.4991653 0.5008347

First, some convenience functions for performance stats

# Binomial deviance of probability predictions `pred` against logical `truth`.
# Predictions are clipped into [epsilon, 1 - epsilon] so log() stays finite
# (with the default epsilon = 0 no clipping occurs). The saturated model's
# log-likelihood S is taken to be zero.
deviance <- function(truth, pred, epsilon = 0) {
  clipped <- pmin(pmax(pred, epsilon), 1 - epsilon)
  log_lik <- sum(ifelse(truth, log(clipped), log(1 - clipped)))
  -2 * log_lik
}

# McFadden-style pseudo R-squared: the fraction of the null deviance
# explained by the model's predictions.
#   truth   - logical vector of observed outcomes
#   pred    - predicted probabilities of TRUE
#   epsilon - clipping bound forwarded to deviance()
# The null model predicts the observed base rate mean(truth) for every row.
# (The original used mean(pred) here, which equals the base rate only when
# the model is perfectly calibrated — a bug.)
pseudo_Rsquared <- function(truth, pred, epsilon) {
  dev <- deviance(truth, pred, epsilon)
  null.dev <- deviance(truth, mean(truth), epsilon)
  1 - (dev / null.dev)
}

# Classification accuracy of probability predictions at the 0.5 threshold:
# the fraction of rows where (pred > 0.5) agrees with the logical truth.
accuracy <- function(truth, pred) {
  confusion <- table(truth, pred > 0.5)  # confusion matrix
  correct <- sum(diag(confusion))
  correct / sum(confusion)
}

# Print accuracy and pseudo-R-squared for one model column on one partition.
#   d         - frame with columns isTest, the outcome column named by the
#               global yColumn, and the prediction column modelName
#   test      - TRUE for the test partition, FALSE for train
#   modelName - name of the column holding predicted probabilities
#   title     - unused; kept for interface compatibility with callers
#   epsilon   - clipping bound passed through to pseudo_Rsquared()
reportStats <- function(d, test, modelName, title, epsilon = 1e-02) {
  dSub <- d[d$isTest == test, , drop = FALSE]
  acc <- accuracy(dSub[, yColumn], dSub[, modelName])
  r2 <- pseudo_Rsquared(dSub[, yColumn], dSub[, modelName], epsilon)
  # scalar condition: use if/else, not vectorized ifelse()
  note <- if (test) 'test' else 'train'
  print(paste('\t', note, 'accuracy', modelName, format(acc, digits = 2)))
  print(paste("\tmodel explained a",
              format(r2, digits = 2),
              "fraction of the variation on", note))
}

# Print a banner-delimited performance report for one model column,
# covering the training partition first, then the test partition.
report <- function(d, modelName, title, epsilon = 1.0e-2) {
  print("***********")
  for (onTest in c(FALSE, TRUE)) {
    reportStats(d, onTest, modelName, title, epsilon)
  }
  print("***********")
}

Run the random forest model

set.seed(23424) # set seed for reproducibility 
                # RF algorithm is randomized
# it's better not to use formulas for random forest. Use x and y
# fit on the training rows only; importance=TRUE records variable
# importances for the inspection steps below
model1 = randomForest(x=d[!d$isTest,vars],
                      y=as.factor(d[!d$isTest,yColumn]),
                      importance=TRUE)
# printing the fit shows the out-of-bag (OOB) error estimate and the
# OOB confusion matrix
model1
## 
## Call:
##  randomForest(x = d[!d$isTest, vars], y = as.factor(d[!d$isTest,      yColumn]), importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 24
## 
##         OOB estimate of  error rate: 6.04%
## Confusion matrix:
##       FALSE TRUE class.error
## FALSE   226   14  0.05833333
## TRUE     15  225  0.06250000

Do the predictions

d$model1 = predict(model1,newdata=d,type='prob')[,'TRUE',drop=TRUE]

Report performance

# double density plot: predicted probability, colored by true class,
# faceted into train/test panels
# NOTE(review): aes_string() is deprecated in current ggplot2; consider
# aes(x = model1, color = .data[[yColumn]]) when upgrading — verify output.
ggplot(d, aes_string(x="model1", color=yColumn)) + geom_density() + facet_wrap(~testlabel, ncol=1)

# perfect train accuracy (1) vs 0.91 test accuracy: random forests tend to
# memorize the training set, so judge performance on the held-out test rows
report(d,'model1',"randomForest")
## [1] "***********"
## [1] "\t train accuracy model1 1"
## [1] "\tmodel explained a 0.88 fraction of the variation on train"
## [1] "\t test accuracy model1 0.91"
## [1] "\tmodel explained a 0.58 fraction of the variation on test"
## [1] "***********"

We can examine the variable importances

# first plot it (top 30 or so)
# type=1 restricts the plot to the mean-decrease-in-accuracy measure
varImpPlot(model1, type=1) # just look at mean decrease accuracy

Get the variable importances

# get the variable importances
# class='TRUE' additionally reports importance with respect to the target
# class; the result is a matrix with one row per variable (see columns below)
vImps = importance(model1, class='TRUE')
head(vImps)
##          FALSE       TRUE MeanDecreaseAccuracy MeanDecreaseGini
## V1  2.53556495 -0.9732944           1.97847625        0.1223433
## V2  0.10457294  0.1951671           0.14140647        0.1432184
## V3 -0.02845957 -0.2103421          -0.05327325        0.1379641
## V4  0.04260867  1.3768126           0.66431383        0.1108353
## V5 -0.07567487 -1.9620488          -1.40990004        0.1269676
## V6  2.19743775  1.6102564           2.52706699        0.2519189

Sort the variable importances

# column 3 of the importance matrix is MeanDecreaseAccuracy
ord <- order(vImps[, 3], decreasing = TRUE) # sort, descending order
vord <- vImps[ord, 3]
head(vord)
##      V297      V298      V106      V107      V138      V105 
## 13.105658 12.036795 11.243596 11.075592 11.050098  9.434104

Get the top 30

# limit to the 30 most important variables (vord is already sorted descending)
topvars <- names(vord[1:30])

Make a reduced model

# refit using only the top 30 variables, then rescore all rows and report
model2 <- randomForest(x = d[!d$isTest, topvars],
                       y = as.factor(d[!d$isTest, yColumn]))
d$model2 <- predict(model2, newdata = d, type = 'prob')[, 'TRUE', drop = TRUE]
report(d, 'model2', "randomForest")
## [1] "***********"
## [1] "\t train accuracy model2 1"
## [1] "\tmodel explained a 0.92 fraction of the variation on train"
## [1] "\t test accuracy model2 0.91"
## [1] "\tmodel explained a 0.67 fraction of the variation on test"
## [1] "***********"

(Interestingly, the reduced model did noticeably better on the deviance-based pseudo-R-squared on test — 0.67 versus 0.58 — even though test accuracy was unchanged at 0.91.)