Load libraries and read in data
library('glmnet')
## Loading required package: Matrix
## Loaded glmnet 1.9-8
dTrain <- read.table("isolet1+2+3+4.data.gz",
                     header=FALSE, sep=',',
                     stringsAsFactors=FALSE, blank.lines.skip=TRUE)
dTrain$isTest <- FALSE
dTest <- read.table("isolet5.data.gz",
                    header=FALSE, sep=',',
                    stringsAsFactors=FALSE, blank.lines.skip=TRUE)
dTest$isTest <- TRUE
d <- rbind(dTrain,dTest)
rm(list=c('dTest','dTrain'))
colnames(d)[618] <- 'letters'    # column 618 is the class label, coded 1..26
d$letters <- letters[d$letters]  # recode to 'a'..'z'
# define input variables and target variable
vars <- colnames(d)[1:617]
yColumn <- 'isN'
# restrict to the difficult two-class problem: 'm' versus 'n'
d <- d[d$letters %in% c('m','n'),,drop=FALSE]
d[,yColumn] <- d[,'letters']=='n'  # TRUE when the spoken letter is 'n'
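A quick sanity check (illustrative, not part of the original script) on how the 'm'/'n' rows split across train and holdout:
# illustrative: class counts by train/holdout split
table(d$letters, ifelse(d$isTest, 'test', 'train'))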
Set up the variable formula
formula <- paste(yColumn,paste(vars,collapse=' + '),sep=' ~ ')
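The resulting string is long (617 terms on the right-hand side); a peek at its start, just for illustration:
# illustrative: inspect the start of the generated formula string;
# glm() accepts the string and coerces it to a formula
substr(formula, 1, 60)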
First, some convenience functions for performance stats
deviance <- function(truth, pred, epsilon=0) {
  # clip predictions away from 0 and 1 so the logs stay finite
  pred <- pmax(pred, epsilon)
  pred <- pmin(pred, 1-epsilon)
  S <- 0.0 # assumed log-likelihood of saturated model
  -2*(sum(ifelse(truth, log(pred), log(1-pred))) - S)
}
pseudo_Rsquared <- function(truth, pred, epsilon) {
  dev <- deviance(truth, pred, epsilon)
  # null model: predict the average prediction for every row
  null.dev <- deviance(truth, mean(pred), epsilon)
  1 - (dev/null.dev)
}
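One note on the null model: the code above scores against the mean prediction. A variant you may prefer (my assumption, not what the original computes) scores against the observed base rate:
# illustrative variant: null deviance from the observed base rate
# (mean(truth)) rather than the mean prediction; not used below
pseudo_Rsquared_alt <- function(truth, pred, epsilon) {
  dev <- deviance(truth, pred, epsilon)
  null.dev <- deviance(truth, mean(truth), epsilon)
  1 - (dev/null.dev)
}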
accuracy <- function(truth, pred) {
  # confusion matrix at the 0.5 decision threshold
  cmat <- table(truth, pred>0.5)
  sum(diag(cmat))/sum(cmat)
}
reportStats <- function(d, test, modelName, title, epsilon=1e-02) {
  dSub <- d[d$isTest==test,,drop=FALSE]
  acc <- accuracy(dSub[,yColumn], dSub[,modelName])
  r2 <- pseudo_Rsquared(dSub[,yColumn], dSub[,modelName], epsilon)
  note <- ifelse(test, 'test', 'train')
  print(paste('\t', note, 'accuracy', modelName, format(acc, digits=2)))
  print(paste("\tmodel explained a",
              format(r2, digits=2),
              "fraction of the variation on", note))
}
report <- function(d, modelName, title, epsilon=1.0e-2) {
  print("***********")
  reportStats(d, FALSE, modelName, title, epsilon)
  reportStats(d, TRUE, modelName, title, epsilon)
  print("***********")
}
Try glm first
model1 <- glm(formula,data=d[!d$isTest,],family=binomial(link='logit'))
## Warning: glm.fit: algorithm did not converge
# print out a few of the coefficients
# (named coef1 to avoid shadowing the base coef() function)
coef1 <- summary(model1)$coefficients
coef1[1:10,]
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 53940.69066 1419463923 3.800075e-05 0.9999697
## V1 -143.32916 19421683 -7.379853e-06 0.9999941
## V2 -22.07253 3392768 -6.505757e-06 0.9999948
## V3 309.53685 23582300 1.312581e-05 0.9999895
## V4 -310.01841 22349961 -1.387109e-05 0.9999889
## V5 284.40530 11141885 2.552578e-05 0.9999796
## V6 116.22047 10080466 1.152928e-05 0.9999908
## V7 -81.19816 8485021 -9.569589e-06 0.9999924
## V8 13.85082 5086722 2.722935e-06 0.9999978
## V9 436.10663 19604331 2.224542e-05 0.9999823
d$model1 <- predict(model1,newdata=d,type='response')
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
report(d,'model1',"logistic regression")
## [1] "***********"
## [1] "\t train accuracy model1 1"
## [1] "\tmodel explained a 0.99 fraction of the variation on train"
## [1] "\t test accuracy model1 0.44"
## [1] "\tmodel explained a -2.7 fraction of the variation on test"
## [1] "***********"
dim(d)
## [1] 599 621
length(vars)
## [1] 617
No surprise: with 617 input variables and only 599 rows in total (fewer still in the training subset), the fit is badly over-parameterized. glm can drive the training deviance to essentially zero, which is why the coefficients blow up, the algorithm fails to converge, and the memorized fit collapses on holdout.
Then try glmnet
set.seed(245623) # set seed for reproducibility
# cv.glmnet is randomized
# finds the best lambda parameter by cross validation
# and returns the corresponding model
# glmnet doesn't take formulas
# Nota bene: as.matrix is only safe to call on data frames that are entirely
# numeric, as isolet is.
model2 <- cv.glmnet(x=as.matrix(d[!d$isTest,vars]),
                    y=d[!d$isTest,yColumn,drop=TRUE],
                    family='binomial')
lambda <- model2$lambda.1se # default lambda: the largest lambda whose
                            # CV error is within one standard error of the minimum
lambda
## [1] 0.01746463
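The cross-validation curve and the error-minimizing lambda are also available on the cv.glmnet object (shown as a sketch; this was not part of the original run):
# illustrative: CV error versus log(lambda); the two vertical lines
# mark lambda.min and lambda.1se
plot(model2)
model2$lambda.min # the error-minimizing (less regularized) choice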
Get the non-zero variables
coefs <- as.matrix(coef(model2)) # convert to a matrix (618 by 1)
ix <- which(abs(coefs[,1]) > 0)  # indices of the non-zero coefficients
length(ix)
## [1] 49
coefs[ix,1, drop=FALSE] # note V297 and V298 are on this list
## 1
## (Intercept) 2.4676439670
## V34 -0.0206317457
## V54 0.1489633517
## V73 -0.0606658554
## V106 -1.1177851286
## V107 -0.6166861906
## V112 0.1593158257
## V113 0.0913560199
## V118 0.1706853599
## V133 0.2963202983
## V138 -0.2001166968
## V140 -0.9745713740
## V166 0.9073035891
## V170 -1.0285151544
## V171 -0.0772500119
## V177 0.1182826323
## V178 0.1284279040
## V182 0.1086738925
## V201 -0.2191432077
## V202 -1.0278548669
## V209 0.0949300532
## V233 -0.0004867513
## V236 -0.1839334317
## V263 -0.3159201348
## V267 -0.3071332954
## V292 -0.1071626033
## V294 -1.1969357701
## V295 -0.9652940165
## V297 -2.7027030591
## V298 -0.2084936074
## V303 0.3896651990
## V304 0.3289521112
## V318 0.2296788680
## V319 0.0437763559
## V326 -0.3165024392
## V362 -0.0617198457
## V379 -0.8942052276
## V386 -0.0348311775
## V401 0.3088381947
## V407 0.5665718032
## V419 0.8957275102
## V433 -0.0129395573
## V444 0.0541403114
## V462 0.0687837935
## V474 0.5376256042
## V543 0.0768528818
## V568 0.0453626814
## V590 -0.0758340256
## V613 0.4667893756
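For comparison, the less-regularized solution at lambda.min can be extracted the same way (illustrative; the predictions below use the lambda.1se default):
# illustrative: lambda.min typically retains more non-zero
# coefficients than the more conservative lambda.1se
coefs.min <- as.matrix(coef(model2, s="lambda.min"))
sum(abs(coefs.min[,1]) > 0)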
d$model2 <- predict(model2,newx=as.matrix(d[,vars]),type='response')[,1]
report(d,'model2',"glmnet") # comparable to random forest
## [1] "***********"
## [1] "\t train accuracy model2 0.97"
## [1] "\tmodel explained a 0.82 fraction of the variation on train"
## [1] "\t test accuracy model2 0.9"
## [1] "\tmodel explained a 0.64 fraction of the variation on test"
## [1] "***********"