# load libraries, in particular mgcv which supplies the gam model
library('ggplot2')
library('reshape2')
library('mgcv')
## Loading required package: nlme
## This is mgcv 1.8-3. For overview type 'help("mgcv-package")'.
library('ROCR')
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## 
## The following object is masked from 'package:stats':
## 
##     lowess
# work an artificial example
d <- data.frame(x=1:40)
d$y <- sin(0.3*d$x)
m1 <- lm(y~x,data=d)
d$pred1 <- predict(m1,newdata=d)
ggplot(data=d,aes(x=pred1,y=y)) + 
  geom_point() + geom_abline(slope=1,color='blue')

mGAM <- gam(y~s(x),data=d)
d$predGAM <- predict(mGAM,newdata=d)
ggplot(data=d,aes(x=predGAM,y=y)) +
  geom_point() + geom_abline(slope=1,color='blue')

plot(mGAM)

# load the UCI Abalone data
d <- read.table('abalone.data.txt.gz',header=FALSE,sep=',',
                stringsAsFactors=TRUE)
colnames(d) <- c('Sex', 'Length', 'Diameter', 'Height', 'WholeWeight',
   'ShuckedWeight', 'VisceraWeight', 'ShellWeight', 'Rings')
yColumn <- 'old'
d[,yColumn] <- d$Rings>=10
d$LongDim <- pmax(d$Length,d$Diameter)
set.seed(2352)
d$isTest <- runif(nrow(d))<0.25
d$dataLabel <- ifelse(d$isTest,"test data","train data")

nonInvasiveVars <- c('Sex', 'Length', 'Diameter', 'LongDim', 'Height', 'WholeWeight')
invasiveVars <- c('ShuckedWeight', 'VisceraWeight', 'ShellWeight')

# actual number of rings model
#vars <- c(nonInvasiveVars,invasiveVars)
vars <- nonInvasiveVars

rmseErrors <- function(modelName) {
  print(paste('train RMSE',
              format(sqrt(mean((d[!d$isTest,modelName]-d[!d$isTest,'Rings'])^2)),digits=4)))
  print(paste('test RMSE',
              format(sqrt(mean((d[d$isTest,modelName]-d[d$isTest,'Rings'])^2)),digits=4)))
}
# build a model of ring count (proxy for age) just on longest dimension.
# this is what regulations are written using.
modelR0 <- lm(Rings~0+LongDim,data=d[!d$isTest,])
print(modelR0)
## 
## Call:
## lm(formula = Rings ~ 0 + LongDim, data = d[!d$isTest, ])
## 
## Coefficients:
## LongDim  
##   18.64
modelR0 <- lm(Rings~LongDim,data=d[!d$isTest,])
print(modelR0)
## 
## Call:
## lm(formula = Rings ~ LongDim, data = d[!d$isTest, ])
## 
## Coefficients:
## (Intercept)      LongDim  
##       2.112       14.822
d$modelR0 <- predict(modelR0,newdata=d)
rmseErrors('modelR0')
## [1] "train RMSE 2.621"
## [1] "test RMSE 2.847"
ggplot(data=d[d$isTest,],aes(x=modelR0,y=Rings)) + 
  geom_point(alpha=0.5) + geom_abline(slope=1,linetype=2)

# build a better linear model using the non-invasive vars
fAllVarsR <- paste('Rings',paste(vars,collapse=' + '),sep=' ~ ')
modelR1 <- lm(fAllVarsR,data=d[!d$isTest,])
d$modelR1 <- predict(modelR1,newdata=d)
rmseErrors('modelR1')
## [1] "train RMSE 2.507"
## [1] "test RMSE 2.713"
ggplot(data=d[d$isTest,],aes(x=modelR1,y=Rings)) +
  geom_point(alpha=0.5) + geom_abline(slope=1,color='blue')

# build a better linear model using the non-invasive vars and GAM splines
splineVars <- function(vlist) {
  ifelse(vapply(vlist,function (v) is.numeric(d[,v]),c(FALSE)),
                      paste('s(',vlist,')',sep=''),vlist)
  }
splinedVars <- splineVars(vars)
fAllVarsRS <- paste('Rings',paste(splinedVars,collapse=' + '),sep=' ~ ')
print(fAllVarsRS)
## [1] "Rings ~ Sex + s(Length) + s(Diameter) + s(LongDim) + s(Height) + s(WholeWeight)"
modelR2 <- gam(as.formula(fAllVarsRS),data=d[!d$isTest,])
d$modelR2 <- predict(modelR2,newdata=d,type='response')
rmseErrors('modelR2')
## [1] "train RMSE 2.419"
## [1] "test RMSE 2.654"
ggplot(data=d[d$isTest,],aes(x=modelR2,y=Rings)) +
  geom_point(alpha=0.5) + geom_abline(slope=1,color='blue')

#plot(modelR2)
# build a better linear model using the non-invasive vars and GAM splines plus the invasive vars
splinedVarsA <- splineVars(c(nonInvasiveVars,invasiveVars))
fAllVarsRSA <- paste('Rings',paste(splinedVarsA,collapse=' + '),sep=' ~ ')
print(fAllVarsRSA)
## [1] "Rings ~ Sex + s(Length) + s(Diameter) + s(LongDim) + s(Height) + s(WholeWeight) + s(ShuckedWeight) + s(VisceraWeight) + s(ShellWeight)"
modelR3 <- gam(as.formula(fAllVarsRSA),data=d[!d$isTest,])
d$modelR3 <- predict(modelR3,newdata=d,type='response')
rmseErrors('modelR3')
## [1] "train RMSE 2.04"
## [1] "test RMSE 2.256"
ggplot(data=d[d$isTest,],aes(x=modelR3,y=Rings)) +
  geom_point(alpha=0.5) + geom_abline(slope=1,color='blue')

plot(modelR3)