library('glmnet')
## Loading required package: Matrix
## Loaded glmnet 1.9-8
library('ggplot2')
d <- read.table('car.data',header=FALSE,sep=',',stringsAsFactors=TRUE)
yColumn <- 'rating'
vars <- c('buying', 'maint', 'doors', 'persons',
'lug_boot', 'safety')
colnames(d) <- c(vars,yColumn)
summary(d)
## buying maint doors persons lug_boot safety
## high :432 high :432 2 :432 2 :576 big :576 high:576
## low :432 low :432 3 :432 4 :576 med :576 low :576
## med :432 med :432 4 :432 more:576 small:576 med :576
## vhigh:432 vhigh:432 5more:432
## rating
## acc : 384
## good : 69
## unacc:1210
## vgood: 65
# View(d)
set.seed(53525)
d$isTest <- runif(nrow(d))<0.2
d$want <- d[,yColumn]!='unacc'
f <- paste('want',paste(vars,collapse=' + '),sep=' ~ ')
# try glmnet on original data
tryCatch(
model1 <- cv.glmnet(x=as.matrix(d[!d$isTest,vars,drop=FALSE]),
y=d[!d$isTest,'want',drop=TRUE],
family='binomial'),
warning = function(w) {w},
error = function(e) {e})
## <simpleWarning in lognet(x, is.sparse, ix, jx, y, weights, offset, alpha, nobs, nvars, jd, vp, cl, ne, nx, nlam, flmin, ulam, thresh, isd, intr, vnames, maxit, kopt, family): NAs introduced by coercion>
# errors out due to non-numeric features
# try to fix it the wrong way- convert to numeric
# (either through ordinal levels or through "hashed features")
dN <- d
for(v in vars) {
dN[,v] <- as.numeric(d[,v])
}
model2 <- cv.glmnet(x=as.matrix(dN[!dN$isTest,vars,drop=FALSE]),
y=dN[!dN$isTest,'want',drop=TRUE],
family='binomial')
d$predN <- predict(model2,
newx=as.matrix(dN[,vars,drop=FALSE]),
type='response')[,1]
ggplot(data=d[d$isTest,]) +
geom_density(aes(x=predN,color=want))
print(table(truth=d[d$isTest,'want'],decision=d[d$isTest,'predN']>0.5))
## decision
## truth FALSE TRUE
## FALSE 208 14
## TRUE 77 17
print(table(truth=d[d$isTest,'want'],decision=d[d$isTest,'predN']>median(d[d$isTest,'predN'])))
## decision
## truth FALSE TRUE
## FALSE 151 71
## TRUE 17 77
# fix it the right way: model.matrix()
dM <- model.matrix(as.formula(f),d)
print(head(dM))
## (Intercept) buyinglow buyingmed buyingvhigh maintlow maintmed maintvhigh
## 1 1 0 0 1 0 0 1
## 2 1 0 0 1 0 0 1
## 3 1 0 0 1 0 0 1
## 4 1 0 0 1 0 0 1
## 5 1 0 0 1 0 0 1
## 6 1 0 0 1 0 0 1
## doors3 doors4 doors5more persons4 personsmore lug_bootmed lug_bootsmall
## 1 0 0 0 0 0 0 1
## 2 0 0 0 0 0 0 1
## 3 0 0 0 0 0 0 1
## 4 0 0 0 0 0 1 0
## 5 0 0 0 0 0 1 0
## 6 0 0 0 0 0 1 0
## safetylow safetymed
## 1 1 0
## 2 0 1
## 3 0 0
## 4 1 0
## 5 0 1
## 6 0 0
model3 <- cv.glmnet(x=dM[!d$isTest,,drop=FALSE],
y=d[!d$isTest,'want',drop=TRUE],
family='binomial')
d$predC <- predict(model3,
newx=dM,
type='response')[,1]
ggplot(data=d[d$isTest,]) +
geom_density(aes(x=predC,color=want))
print(table(truth=d[d$isTest,'want'],decision=d[d$isTest,'predC']>0.5))
## decision
## truth FALSE TRUE
## FALSE 211 11
## TRUE 11 83