Linear Regression

# libraries
library('ggplot2')

# load the data
# as with the starting with R lesson
d <- read.table('http://www.amstat.org/publications/jse/datasets/homes76.dat.txt',
                header=TRUE,sep='\t',stringsAsFactor=FALSE)
colnames(d) <- c('id', 'Price', 'Size', 'Lot', 'Bath', 'Bed', 'BathBed', 'Year', 'Age',
   'Agesq', 'Garage', 'Status', 'Active', 'Elem', 'Edison Elementary',
   'Harris Elementary', 'Adams Elementary', 'Crest Elementary', 'Parker Elementary')

# perform some elemenatry data treatment
d$Lot <- as.factor(d$Lot)
# define columns we will be working with
yColumn <- 'Price'
vars <- c("Size", "Lot", "Bath", "Bed", "Year", "Age", 
     "Garage", "Status", "Active", "Elem")

# look at the data
print(dim(d))

## [1] 76 19

print(head(d))

##   id Price  Size Lot Bath Bed BathBed Year  Age Agesq Garage Status Active
## 1  1 388.0 2.180   4    3   4      12 1940 -3.0  9.00      0    sld      0
## 2  2 450.0 2.054   5    3   4      12 1957 -1.3  1.69      2    sld      0
## 3  3 386.0 2.112   5    2   4       8 1955 -1.5  2.25      2    sld      0
## 4  4 350.0 1.442   6    1   2       2 1956 -1.4  1.96      1    act      1
## 5  5 155.5 1.800   1    2   4       8 1994  2.4  5.76      1    sld      0
## 6  6 220.0 1.965   5    2   3       6 1940 -3.0  9.00      1    sld      0
##     Elem Edison Elementary Harris Elementary Adams Elementary
## 1 edison                 1                 0                0
## 2 edison                 1                 0                0
## 3 edison                 1                 0                0
## 4  adams                 0                 0                1
## 5  adams                 0                 0                1
## 6  adams                 0                 0                1
##   Crest Elementary Parker Elementary
## 1                0                 0
## 2                0                 0
## 3                0                 0
## 4                0                 0
## 5                0                 0
## 6                0                 0

print(head(d[,c(vars,yColumn)]))

##    Size Lot Bath Bed Year  Age Garage Status Active   Elem Price
## 1 2.180   4    3   4 1940 -3.0      0    sld      0 edison 388.0
## 2 2.054   5    3   4 1957 -1.3      2    sld      0 edison 450.0
## 3 2.112   5    2   4 1955 -1.5      2    sld      0 edison 386.0
## 4 1.442   6    1   2 1956 -1.4      1    act      1  adams 350.0
## 5 1.800   1    2   4 1994  2.4      1    sld      0  adams 155.5
## 6 1.965   5    2   3 1940 -3.0      1    sld      0  adams 220.0

print(summary(d[,c(vars,yColumn)]))

##       Size           Lot          Bath           Bed            Year     
##  Min.   :1.44   4      :29   Min.   :1.00   Min.   :2.00   Min.   :1905  
##  1st Qu.:1.86   5      :18   1st Qu.:2.00   1st Qu.:3.00   1st Qu.:1958  
##  Median :1.97   3      :12   Median :2.00   Median :3.00   Median :1970  
##  Mean   :1.97   1      : 7   Mean   :2.21   Mean   :3.45   Mean   :1969  
##  3rd Qu.:2.11   2      : 4   3rd Qu.:3.00   3rd Qu.:4.00   3rd Qu.:1980  
##  Max.   :2.90   7      : 3   Max.   :3.10   Max.   :6.00   Max.   :2005  
##                 (Other): 3                                               
##       Age             Garage        Status              Active     
##  Min.   :-6.500   Min.   :0.00   Length:76          Min.   :0.000  
##  1st Qu.:-1.225   1st Qu.:1.00   Class :character   1st Qu.:0.000  
##  Median :-0.050   Median :2.00   Mode  :character   Median :0.000  
##  Mean   :-0.059   Mean   :1.57                      Mean   :0.329  
##  3rd Qu.: 1.000   3rd Qu.:2.00                      3rd Qu.:1.000  
##  Max.   : 3.500   Max.   :3.00                      Max.   :1.000  
##                                                                    
##      Elem               Price    
##  Length:76          Min.   :156  
##  Class :character   1st Qu.:243  
##  Mode  :character   Median :276  
##                     Mean   :286  
##                     3rd Qu.:337  
##                     Max.   :450  
##

# guess at reproducting original analysis
fOrig <- "Price ~ Size + Lot + I(Bath*Bed) + Age + I(Age*Age) + Garage + Status + Elem"
modelOrig <- lm(as.formula(fOrig),data=d)
d$modelOrig <- predict(modelOrig,newdata=d)
sqrt(mean((d$Price-d$modelOrig)^2)) # comes out to 35.0

## [1] 35.04

# data scientists always want to see the fit.  Statisticians are a little more patient
# and look at diagnostics first.
ggplot(data=d,aes_string(x='modelOrig',y=yColumn)) + 
  geom_point() + geom_abline(slope=1)

plot of chunk unnamed-chunk-3

# Look at some of the classic diagnostic
print(summary(modelOrig))

## 
## Call:
## lm(formula = as.formula(fOrig), data = d)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -67.25 -22.55  -0.42  20.63 107.69 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      79.50      59.23    1.34  0.18504    
## Size             48.59      29.68    1.64  0.10725    
## Lot2              6.59      30.21    0.22  0.82821    
## Lot3             -8.89      25.58   -0.35  0.72949    
## Lot4             10.78      22.44    0.48  0.63300    
## Lot5             29.91      23.94    1.25  0.21689    
## Lot6            185.05      56.20    3.29  0.00174 ** 
## Lot7             24.25      31.32    0.77  0.44213    
## Lot8             11.41      49.99    0.23  0.82031    
## Lot11           174.17      48.31    3.61  0.00067 ***
## I(Bath * Bed)     1.17       2.20    0.53  0.59750    
## Age               4.12       3.22    1.28  0.20575    
## I(Age * Age)      1.88       0.80    2.35  0.02214 *  
## Garage           15.12       8.90    1.70  0.09501 .  
## Statuspen       -21.99      17.38   -1.27  0.21107    
## Statussld       -34.16      14.01   -2.44  0.01805 *  
## Elemcrest        66.29      37.90    1.75  0.08581 .  
## Elemedge         63.50      33.56    1.89  0.06376 .  
## Elemedison      125.14      34.32    3.65  0.00059 ***
## Elemharris       94.88      33.00    2.88  0.00573 ** 
## Elemparker       43.29      33.81    1.28  0.20581    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 41.2 on 55 degrees of freedom
## Multiple R-squared:  0.658,  Adjusted R-squared:  0.534 
## F-statistic:  5.3 on 20 and 55 DF,  p-value: 4.51e-07

plot(modelOrig)

## Warning: not plotting observations with leverage one:
##   4, 21, 74

plot of chunk unnamed-chunk-4

## Warning: not plotting observations with leverage one:
##   4, 21, 74

plot of chunk unnamed-chunk-4

# We are going to be comparing different models with different 
# model complexity (in this case numbers of variables and levels).
# So try to add a test/train split for scoring.
set.seed(123)
d$isTest <- runif(nrow(d))<=0.25
modelSplit <- lm(as.formula(fOrig),data=d[!d$isTest,,drop=FALSE])
tryCatch(
   d$modelSplit <- predict(modelSplit,newdata=d),
     warning = function(w) {print(paste('warn:',w))},
     error = function(e) {print(paste('error:',e))}
  )

## [1] "error: Error in model.frame.default(Terms, newdata, na.action = na.action, xlev = object$xlevels): factor Lot has new levels 11\n"

# Find out what failed in the test/train split
# look at the variable types a bit
print(str(d))

## 'data.frame':    76 obs. of  21 variables:
##  $ id               : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Price            : num  388 450 386 350 156 ...
##  $ Size             : num  2.18 2.05 2.11 1.44 1.8 ...
##  $ Lot              : Factor w/ 9 levels "1","2","3","4",..: 4 5 5 6 1 5 4 4 4 5 ...
##  $ Bath             : num  3 3 2 1 2 2 1.1 2 2.1 2.1 ...
##  $ Bed              : int  4 4 4 2 4 3 4 4 4 3 ...
##  $ BathBed          : num  12 12 8 2 8 6 4.4 8 8.4 6.3 ...
##  $ Year             : int  1940 1957 1955 1956 1994 1940 1958 1961 1965 1968 ...
##  $ Age              : num  -3 -1.3 -1.5 -1.4 2.4 -3 -1.2 -0.9 -0.5 -0.2 ...
##  $ Agesq            : num  9 1.69 2.25 1.96 5.76 9 1.44 0.81 0.25 0.04 ...
##  $ Garage           : int  0 2 2 1 1 1 1 2 2 2 ...
##  $ Status           : chr  "sld" "sld" "sld" "act" ...
##  $ Active           : int  0 0 0 1 0 0 1 0 1 0 ...
##  $ Elem             : chr  "edison" "edison" "edison" "adams" ...
##  $ Edison Elementary: int  1 1 1 0 0 0 0 0 0 0 ...
##  $ Harris Elementary: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Adams Elementary : int  0 0 0 1 1 1 0 0 0 0 ...
##  $ Crest Elementary : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Parker Elementary: int  0 0 0 0 0 0 1 1 1 1 ...
##  $ modelOrig        : num  306 342 341 350 178 ...
##  $ isTest           : logi  FALSE FALSE FALSE FALSE FALSE TRUE ...
## NULL

# both Elem and Lot are categorical variables
print(summary(as.factor(d$Elem)))

##  adams  crest   edge edison harris parker 
##      3      6     26     12     14     15

print(summary(d$Lot))

##  1  2  3  4  5  6  7  8 11 
##  7  4 12 29 18  1  3  1  1

# Lot has unique levels : 6, 8, and 11
# This means (since we are not using test/train split)
# the model can use these levels to simulate perfectly predicting the prices
# of these three houses.

# The unique level effect also makes straightforward holdout testing
# harder.
# Build a function to crudely perform 1-way hold-out testing (even in the presence of unique levels)
oneHoldPreds <- function(d,fitFn,varsToCheck) {
  preds <- double(nrow(d))
  for(i in seq_len(nrow(d))) {
    di <- d
    for(v in varsToCheck) {
      if(!(di[i,v] %in% d[-i,v])) {
         # if there is a new level in test knock the variable out 
         # of the scratch frame to prevent it from being in the model
         # this is a hack to avoid having to manipulate the 
         # formula string.
         di[,v] <- 0
      }
    }
    mi <- fitFn(di[-i,,drop=FALSE])
    preds[[i]] <- predict(mi,newdata=di[i,,drop=FALSE])
  }
  preds
}

riskyVars <- c('Lot')

# run the original model in a 1-way hold out fasion.
d$modelOrig <- oneHoldPreds(d,function(df) { lm(fOrig,data=df) },riskyVars)

## Warning: prediction from a rank-deficient fit may be misleading
## Warning: prediction from a rank-deficient fit may be misleading
## Warning: prediction from a rank-deficient fit may be misleading

sqrt(mean((d$Price-d$modelOrig)^2))

## [1] 55.8

# RMS error increases from original 35.0 to 55.8!
ggplot(data=d,aes_string(x='modelOrig',y=yColumn)) + 
  geom_point() + geom_abline(slope=1)

plot of chunk unnamed-chunk-6

# start our own modeling effort modeling
f1 <- paste(yColumn,paste(vars,collapse=' + '),sep=' ~ ')
print(f1)

## [1] "Price ~ Size + Lot + Bath + Bed + Year + Age + Garage + Status + Active + Elem"

model1 <- lm(f1,data=d[!d$isTest,,drop=FALSE])
# lm() has a lot of useful summaries, might as well use them!
# look at model summary
print(summary(model1))

## 
## Call:
## lm(formula = f1, data = d[!d$isTest, , drop = FALSE])
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -79.6  -18.7    0.0   18.0   72.6 
## 
## Coefficients: (2 not defined because of singularities)
##              Estimate Std. Error t value Pr(>|t|)   
## (Intercept) -168.1335   828.3288   -0.20   0.8403   
## Size          86.3639    44.2142    1.95   0.0584 . 
## Lot2          11.8156    37.3481    0.32   0.7535   
## Lot3         -23.2935    29.3248   -0.79   0.4321   
## Lot4          -2.9312    24.8637   -0.12   0.9068   
## Lot5           9.7546    27.0203    0.36   0.7201   
## Lot6         193.7890    66.4969    2.91   0.0060 **
## Lot7          43.3679    36.3048    1.19   0.2399   
## Lot8          30.0212    57.2718    0.52   0.6033   
## Bath          14.2422    13.0673    1.09   0.2828   
## Bed            2.0346    12.7757    0.16   0.8743   
## Year           0.0918     0.4159    0.22   0.8265   
## Age                NA         NA      NA       NA   
## Garage         1.8983    12.3260    0.15   0.8784   
## Statuspen    -20.2186    22.5142   -0.90   0.3750   
## Statussld    -53.4298    15.7659   -3.39   0.0017 **
## Active             NA         NA      NA       NA   
## Elemcrest     88.0788    55.3753    1.59   0.1202   
## Elemedge      81.4649    51.6739    1.58   0.1234   
## Elemedison   177.4645    53.4951    3.32   0.0020 **
## Elemharris   107.2090    49.5866    2.16   0.0372 * 
## Elemparker    48.5333    51.0053    0.95   0.3475   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 40.9 on 37 degrees of freedom
## Multiple R-squared:  0.699,  Adjusted R-squared:  0.544 
## F-statistic: 4.52 on 19 and 37 DF,  p-value: 4.38e-05

# First problem: 

table(d$Status,d$Active)

##      
##        0  1
##   act  0 25
##   pen 13  0
##   sld 38  0

# Active is redundant, implied by Status

checkM <- lm(Age ~ Year,data=d)
print(checkM)

## 
## Call:
## lm(formula = Age ~ Year, data = d)
## 
## Coefficients:
## (Intercept)         Year  
##      -197.0          0.1

print(summary(d$Age - predict(checkM,newdata=d)))

##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -1.42e-13 -1.14e-13 -1.08e-13 -1.07e-13 -9.52e-14 -8.53e-14

# Age = 0.1*(Year - 1970)

# remove these variables
vars <- setdiff(vars,c('Active','Age'))

# re-fit
f2 <- paste(yColumn,paste(vars,collapse=' + '),sep=' ~ ')
d$model2 <- oneHoldPreds(d,function(df) { lm(f2,data=df) },riskyVars)

## Warning: prediction from a rank-deficient fit may be misleading
## Warning: prediction from a rank-deficient fit may be misleading
## Warning: prediction from a rank-deficient fit may be misleading

sqrt(mean((d$Price-d$model2)^2))

## [1] 54.55

# RMS held-out error 54.6 (instead of 55.8), something is supplying
# a small amount of over-fit.
ggplot(data=d,aes_string(x='model2',y=yColumn)) + 
  geom_point() + geom_abline(slope=1)

plot of chunk unnamed-chunk-8

# try picking variables from original model
modelOrig <- lm(as.formula(fOrig),data=d[!d$isTest,,drop=FALSE])
steppedModel <- step(modelOrig)

## Start:  AIC=432.2
## Price ~ Size + Lot + I(Bath * Bed) + Age + I(Age * Age) + Garage + 
##     Status + Elem
## 
##                 Df Sum of Sq    RSS AIC
## - Age            1       284  55774 431
## - Garage         1       661  56152 431
## <none>                        55491 432
## - I(Bath * Bed)  1      4309  59800 434
## - I(Age * Age)   1      6046  61537 436
## - Size           1      6982  62473 437
## - Lot            7     28590  84081 442
## - Status         2     16329  71820 443
## - Elem           5     47861 103352 458
## 
## Step:  AIC=430.5
## Price ~ Size + Lot + I(Bath * Bed) + I(Age * Age) + Garage + 
##     Status + Elem
## 
##                 Df Sum of Sq    RSS AIC
## - Garage         1      1533  57308 430
## <none>                        55774 431
## - I(Bath * Bed)  1      4809  60583 433
## - I(Age * Age)   1      5882  61656 434
## - Size           1      7085  62859 435
## - Lot            7     28308  84082 440
## - Status         2     17102  72876 442
## - Elem           5     47722 103496 456
## 
## Step:  AIC=430.1
## Price ~ Size + Lot + I(Bath * Bed) + I(Age * Age) + Status + 
##     Elem
## 
##                 Df Sum of Sq    RSS AIC
## <none>                        57308 430
## - I(Bath * Bed)  1      3917  61225 432
## - I(Age * Age)   1      4845  62153 433
## - Size           1      8040  65348 436
## - Lot            7     28136  85444 439
## - Status         2     27755  85063 449
## - Elem           5     54652 111960 458

print(steppedModel)

## 
## Call:
## lm(formula = Price ~ Size + Lot + I(Bath * Bed) + I(Age * Age) + 
##     Status + Elem, data = d[!d$isTest, , drop = FALSE])
## 
## Coefficients:
##   (Intercept)           Size           Lot2           Lot3           Lot4  
##         0.769         94.532         35.014        -10.905          8.712  
##          Lot5           Lot6           Lot7           Lot8  I(Bath * Bed)  
##        26.336        201.483         60.332         61.289          3.896  
##  I(Age * Age)      Statuspen      Statussld      Elemcrest       Elemedge  
##         1.858        -35.047        -57.295         78.438         77.299  
##    Elemedison     Elemharris     Elemparker  
##       151.134        102.868         37.356

# try to impose a model structure appropriate for the problem
d$PriceSqFoot <- d$Price/d$Size
d$BathsPerBed <- d$Bath/d$Bed
f4  <- 'log(PriceSqFoot) ~ BathsPerBed + Lot + Age + Garage + Elem + Status'
model4 <- lm(as.formula(f4),data=d[!d$isTest,,drop=FALSE])
d$model4 <- exp(oneHoldPreds(d,function(df) { lm(f4,data=df) },riskyVars))*d$Size

## Warning: prediction from a rank-deficient fit may be misleading
## Warning: prediction from a rank-deficient fit may be misleading
## Warning: prediction from a rank-deficient fit may be misleading

sqrt(mean((d$Price-d$model4)^2))

## [1] 59.16

# RMS error larger 59.2
ggplot(data=d,aes_string(x='model4',y=yColumn)) + 
  geom_point() + geom_abline(slope=1)

plot of chunk unnamed-chunk-10

# look for why our "good idea" did not work
ggplot(data=d,aes(x=Size,y=Price)) + 
  geom_point() + geom_smooth() + 
  scale_x_continuous(breaks=seq(1.5,3.0,0.1))

## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.

plot of chunk unnamed-chunk-11

# At best relation only holds for Size range 1.8 through 2.2
# Notice we looked at all the data to make this observation, so 
# the analyst them selves adds an (undesirable) bias as they
# pick things that look good.  If we had more data we would 
# build a proper test/train split and make modeling decision based
# only on looking at the training portion of the data.
# Also note the analyst introduces a "multiple comparison" problem
# on the training data: they try many variations of method and
# keep a best one, which means they may be scoring their own
# performance higher than is going to be seen on test data or
# later in production.
# The more data and more careful procedures you have, the less
# you see of such problems.

# Try again, combining our ideas with sources ideas
d$collaredSize <- pmax(1.8,pmin(2.2,d$Size))
d$PriceSqFoot <- d$Price/d$collaredSize
f5 <- 'log(PriceSqFoot) ~ Bed*Bath + Lot + Age + Garage + Elem + Status'
d$model5 <- exp(oneHoldPreds(d,function(df) { lm(f5,data=df) },riskyVars))*d$collaredSize

## Warning: prediction from a rank-deficient fit may be misleading
## Warning: prediction from a rank-deficient fit may be misleading
## Warning: prediction from a rank-deficient fit may be misleading

sqrt(mean((d$Price-d$model5)^2))

## [1] 52.82

# RMS error 52.8
ggplot(data=d,aes_string(x='model5',y=yColumn)) + 
  geom_point() + geom_abline(slope=1)

plot of chunk unnamed-chunk-12