library('ggplot2')
library('randomForest')
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
First we load the data. The test and training sets have already been split for us in this data set, but I’ll bind them together with a test/train marker, as we usually do.
# test and training sets already given
dTrain = read.table("isolet1+2+3+4.data.gz",
                    header=FALSE, sep=',',
                    stringsAsFactors=FALSE, blank.lines.skip=TRUE)
dTrain$isTest <- FALSE
dTrain$testlabel="train"
dTest = read.table("isolet5.data.gz",
                   header=FALSE, sep=',',
                   stringsAsFactors=FALSE, blank.lines.skip=TRUE)
dTest$isTest = TRUE
dTest$testlabel="test"
d = rbind(dTrain,dTest)
# make "train" the first level, so it shows up first in the plots
rank = ifelse(d$isTest, 2, 1)
d$testlabel = reorder(d$testlabel, rank, FUN=mean)
rm(list=c('dTest','dTrain')) # clean up the environment a little by deleting the smaller sets
dim(d)
## [1] 7797 620
colnames(d)[1:5]
## [1] "V1" "V2" "V3" "V4" "V5"
Make the data frame a little more user-friendly
# column 618 designates the letter as a number
unique(d$V618)
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
## [24] 24 25 26
letters # a predefined (by R) vector
## [1] "a" "b" "c" "d" "e" "f" "g" "h" "i" "j" "k" "l" "m" "n" "o" "p" "q"
## [18] "r" "s" "t" "u" "v" "w" "x" "y" "z"
# so let's map the numbers to letters
d$V618 = letters[d$V618]
summary(as.factor(d$V618))
## a b c d e f g h i j k l m n o p q r
## 300 300 300 300 300 298 300 300 300 300 300 300 299 300 300 300 300 300
## s t u v w x y z
## 300 300 300 300 300 300 300 300
# rename the column
colnames(d)[618] = "letter"
Define the input variables and the outcome column
vars = colnames(d)[1:617]
yColumn = 'isN'
We are just going to restrict the data to the letters ‘m’ and ‘n’; ‘n’ is our target class.
# Subset to just the utterances 'n', and 'm'
d = d[d$letter %in% c('m','n'),,drop=FALSE]
# 'n' is our target class
d[,yColumn] = d[,'letter']=='n'
table(d[,yColumn])/nrow(d)
##
## FALSE TRUE
## 0.4991653 0.5008347
First, some convenience functions for performance stats: deviance, the deviance-based pseudo R-squared (one minus the ratio of model deviance to null deviance), and classification accuracy.
deviance = function(truth, pred, epsilon=0) {
  # clip predictions away from 0 and 1 so the logs stay finite
  pred = pmax(pred, epsilon)
  pred = pmin(pred, 1-epsilon)
  S = 0.0 # assumed log-likelihood of saturated model
  -2*(sum(ifelse(truth, log(pred), log(1-pred))) - S)
}
pseudo_Rsquared = function(truth, pred, epsilon) {
  # one minus the ratio of model deviance to null deviance
  dev = deviance(truth, pred, epsilon)
  null.dev = deviance(truth, mean(pred), epsilon)
  1 - (dev/null.dev)
}
accuracy = function(truth, pred) {
  # confusion matrix
  cmat = table(truth, pred>0.5)
  sum(diag(cmat))/sum(cmat)
}
reportStats = function(d, test, modelName, title, epsilon=1e-02) {
  dSub = d[d$isTest==test,,drop=FALSE]
  acc = accuracy(dSub[,yColumn], dSub[,modelName])
  r2 = pseudo_Rsquared(dSub[,yColumn], dSub[,modelName], epsilon)
  note = ifelse(test,'test','train')
  print(paste('\t',note,'accuracy',modelName,format(acc,digits=2)))
  print(paste("\tmodel explained a",
              format(r2,digits=2),
              "fraction of the variation on",note))
}
report <- function(d, modelName, title, epsilon=1.0e-2) {
  print("***********")
  reportStats(d, FALSE, modelName, title, epsilon)
  reportStats(d, TRUE, modelName, title, epsilon)
  print("***********")
}
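To make the metrics concrete, here is a small sketch on synthetic scores (hand-made values, not from the ISOLET data and not part of the original run) showing how the three helpers behave:
# a small sketch on hand-made predictions (not part of the original analysis)
toyTruth = c(TRUE, TRUE, FALSE, FALSE)
toyPred  = c(0.9, 0.8, 0.2, 0.7)   # the last prediction is on the wrong side of 0.5
deviance(toyTruth, toyPred, epsilon=1e-2)        # smaller is better
pseudo_Rsquared(toyTruth, toyPred, epsilon=1e-2) # closer to 1 is better
accuracy(toyTruth, toyPred)                      # fraction classified correctly at the 0.5 threshold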
Run the random forest model
set.seed(23424) # set seed for reproducibility
# RF algorithm is randomized
# it's better not to use formulas for random forest. Use x and y
model1 = randomForest(x=d[!d$isTest,vars],
                      y=as.factor(d[!d$isTest,yColumn]),
                      importance=TRUE)
model1
##
## Call:
## randomForest(x = d[!d$isTest, vars], y = as.factor(d[!d$isTest, yColumn]), importance = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 24
##
## OOB estimate of error rate: 6.04%
## Confusion matrix:
## FALSE TRUE class.error
## FALSE 226 14 0.05833333
## TRUE 15 225 0.06250000
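The ‘OOB estimate of error rate’ above comes from out-of-bag predictions: each tree only votes on the training rows left out of its bootstrap sample. As a sketch (not part of the original run), predict() with no newdata returns those out-of-bag predictions, so the printed error rate can be recomputed by hand:
# sketch (not in the original run): recompute the OOB error rate
# predict() with no newdata returns the out-of-bag class predictions
oobPred = predict(model1)
yTrain = as.factor(d[!d$isTest, yColumn])
mean(oobPred != yTrain)  # should agree with the 6.04% reported above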
Make predictions on the full data set (train and test)
d$model1 = predict(model1,newdata=d,type='prob')[,'TRUE',drop=TRUE]
Report performance
# double density plot
ggplot(d, aes_string(x="model1", color=yColumn)) + geom_density() + facet_wrap(~testlabel, ncol=1)
report(d,'model1',"randomForest")
## [1] "***********"
## [1] "\t train accuracy model1 1"
## [1] "\tmodel explained a 0.88 fraction of the variation on train"
## [1] "\t test accuracy model1 0.91"
## [1] "\tmodel explained a 0.58 fraction of the variation on test"
## [1] "***********"
We can examine the variable importances
# first plot it (top 30 or so)
varImpPlot(model1, type=1) # just look at mean decrease accuracy
Get the variable importances
# get the variable importances
vImps = importance(model1, class='TRUE')
head(vImps)
## FALSE TRUE MeanDecreaseAccuracy MeanDecreaseGini
## V1 2.53556495 -0.9732944 1.97847625 0.1223433
## V2 0.10457294 0.1951671 0.14140647 0.1432184
## V3 -0.02845957 -0.2103421 -0.05327325 0.1379641
## V4 0.04260867 1.3768126 0.66431383 0.1108353
## V5 -0.07567487 -1.9620488 -1.40990004 0.1269676
## V6 2.19743775 1.6102564 2.52706699 0.2519189
Sort the variable importances
ord = order(vImps[,3], decreasing=TRUE) # sort, descending order
vord = vImps[ord, 3]
head(vord)
## V297 V298 V106 V107 V138 V105
## 13.105658 12.036795 11.243596 11.075592 11.050098 9.434104
Get the top 30
# limit to the top 30
topvars = names(vord[1:30])
Make a reduced model
model2 = randomForest(x=d[!d$isTest,topvars],y=as.factor(d[!d$isTest,yColumn]))
d$model2 = predict(model2,newdata=d,type='prob')[,'TRUE',drop=TRUE]
report(d,'model2',"randomForest")
## [1] "***********"
## [1] "\t train accuracy model2 1"
## [1] "\tmodel explained a 0.92 fraction of the variation on train"
## [1] "\t test accuracy model2 0.91"
## [1] "\tmodel explained a 0.67 fraction of the variation on test"
## [1] "***********"
(Interestingly, the reduced model did better on the deviance-based pseudo R-squared on the test set, 0.67 versus 0.58 for the full model, even though test accuracy stayed at 0.91.)
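To check that claim directly, here is a sketch (not part of the original run) that compares the two models’ test-set deviances using the deviance() helper defined earlier:
# sketch (not in the original run): compare test-set deviances directly
testSet = d[d$isTest,,drop=FALSE]
deviance(testSet[,yColumn], testSet$model1, epsilon=1e-2)  # full model
deviance(testSet[,yColumn], testSet$model2, epsilon=1e-2)  # reduced (top 30 variables) model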