# libraries
library('ggplot2')
# load the data
# as with the starting with R lesson
d <- read.table('http://www.amstat.org/publications/jse/datasets/homes76.dat.txt',
                header=TRUE,sep='\t',stringsAsFactors=FALSE)
colnames(d) <- c('id', 'Price', 'Size', 'Lot', 'Bath', 'Bed', 'BathBed', 'Year', 'Age',
                 'Agesq', 'Garage', 'Status', 'Active', 'Elem', 'Edison Elementary',
                 'Harris Elementary', 'Adams Elementary', 'Crest Elementary', 'Parker Elementary')
# perform some elementary data treatment
d$Lot <- as.factor(d$Lot)  # Lot is a coded category, so treat it as a factor
# define columns we will be working with
yColumn <- 'Price'
vars <- c("Size", "Lot", "Bath", "Bed", "Year", "Age",
"Garage", "Status", "Active", "Elem")
# look at the data
print(dim(d))
## [1] 76 19
print(head(d))
## id Price Size Lot Bath Bed BathBed Year Age Agesq Garage Status Active
## 1 1 388.0 2.180 4 3 4 12 1940 -3.0 9.00 0 sld 0
## 2 2 450.0 2.054 5 3 4 12 1957 -1.3 1.69 2 sld 0
## 3 3 386.0 2.112 5 2 4 8 1955 -1.5 2.25 2 sld 0
## 4 4 350.0 1.442 6 1 2 2 1956 -1.4 1.96 1 act 1
## 5 5 155.5 1.800 1 2 4 8 1994 2.4 5.76 1 sld 0
## 6 6 220.0 1.965 5 2 3 6 1940 -3.0 9.00 1 sld 0
## Elem Edison Elementary Harris Elementary Adams Elementary
## 1 edison 1 0 0
## 2 edison 1 0 0
## 3 edison 1 0 0
## 4 adams 0 0 1
## 5 adams 0 0 1
## 6 adams 0 0 1
## Crest Elementary Parker Elementary
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
print(head(d[,c(vars,yColumn)]))
## Size Lot Bath Bed Year Age Garage Status Active Elem Price
## 1 2.180 4 3 4 1940 -3.0 0 sld 0 edison 388.0
## 2 2.054 5 3 4 1957 -1.3 2 sld 0 edison 450.0
## 3 2.112 5 2 4 1955 -1.5 2 sld 0 edison 386.0
## 4 1.442 6 1 2 1956 -1.4 1 act 1 adams 350.0
## 5 1.800 1 2 4 1994 2.4 1 sld 0 adams 155.5
## 6 1.965 5 2 3 1940 -3.0 1 sld 0 adams 220.0
print(summary(d[,c(vars,yColumn)]))
## Size Lot Bath Bed Year
## Min. :1.44 4 :29 Min. :1.00 Min. :2.00 Min. :1905
## 1st Qu.:1.86 5 :18 1st Qu.:2.00 1st Qu.:3.00 1st Qu.:1958
## Median :1.97 3 :12 Median :2.00 Median :3.00 Median :1970
## Mean :1.97 1 : 7 Mean :2.21 Mean :3.45 Mean :1969
## 3rd Qu.:2.11 2 : 4 3rd Qu.:3.00 3rd Qu.:4.00 3rd Qu.:1980
## Max. :2.90 7 : 3 Max. :3.10 Max. :6.00 Max. :2005
## (Other): 3
## Age Garage Status Active
## Min. :-6.500 Min. :0.00 Length:76 Min. :0.000
## 1st Qu.:-1.225 1st Qu.:1.00 Class :character 1st Qu.:0.000
## Median :-0.050 Median :2.00 Mode :character Median :0.000
## Mean :-0.059 Mean :1.57 Mean :0.329
## 3rd Qu.: 1.000 3rd Qu.:2.00 3rd Qu.:1.000
## Max. : 3.500 Max. :3.00 Max. :1.000
##
## Elem Price
## Length:76 Min. :156
## Class :character 1st Qu.:243
## Mode :character Median :276
## Mean :286
## 3rd Qu.:337
## Max. :450
##
# guess at reproducing the original analysis
fOrig <- "Price ~ Size + Lot + I(Bath*Bed) + Age + I(Age*Age) + Garage + Status + Elem"
modelOrig <- lm(as.formula(fOrig),data=d)
d$modelOrig <- predict(modelOrig,newdata=d)
sqrt(mean((d$Price-d$modelOrig)^2)) # comes out to 35.0
## [1] 35.04
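# The RMS-error expression repeats below; a small helper (an equivalent
# convenience sketch, kept unused so the explicit expression stays visible):
rmse <- function(truth,pred) { sqrt(mean((truth-pred)^2)) }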
# Data scientists always want to see the fit; statisticians are a little more
# patient and look at diagnostics first.
ggplot(data=d,aes_string(x='modelOrig',y=yColumn)) +
  geom_point() + geom_abline(slope=1)
# Look at some of the classic diagnostics
print(summary(modelOrig))
##
## Call:
## lm(formula = as.formula(fOrig), data = d)
##
## Residuals:
## Min 1Q Median 3Q Max
## -67.25 -22.55 -0.42 20.63 107.69
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 79.50 59.23 1.34 0.18504
## Size 48.59 29.68 1.64 0.10725
## Lot2 6.59 30.21 0.22 0.82821
## Lot3 -8.89 25.58 -0.35 0.72949
## Lot4 10.78 22.44 0.48 0.63300
## Lot5 29.91 23.94 1.25 0.21689
## Lot6 185.05 56.20 3.29 0.00174 **
## Lot7 24.25 31.32 0.77 0.44213
## Lot8 11.41 49.99 0.23 0.82031
## Lot11 174.17 48.31 3.61 0.00067 ***
## I(Bath * Bed) 1.17 2.20 0.53 0.59750
## Age 4.12 3.22 1.28 0.20575
## I(Age * Age) 1.88 0.80 2.35 0.02214 *
## Garage 15.12 8.90 1.70 0.09501 .
## Statuspen -21.99 17.38 -1.27 0.21107
## Statussld -34.16 14.01 -2.44 0.01805 *
## Elemcrest 66.29 37.90 1.75 0.08581 .
## Elemedge 63.50 33.56 1.89 0.06376 .
## Elemedison 125.14 34.32 3.65 0.00059 ***
## Elemharris 94.88 33.00 2.88 0.00573 **
## Elemparker 43.29 33.81 1.28 0.20581
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 41.2 on 55 degrees of freedom
## Multiple R-squared: 0.658, Adjusted R-squared: 0.534
## F-statistic: 5.3 on 20 and 55 DF, p-value: 4.51e-07
plot(modelOrig)
## Warning: not plotting observations with leverage one:
## 4, 21, 74
## Warning: not plotting observations with leverage one:
## 4, 21, 74
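# A quick look at the flagged leverage-one rows (likely the houses sitting
# alone on a factor level; row 4 is the lone Lot==6 house):
print(d[c(4,21,74),c('Lot','Elem','Price')])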
# We are going to be comparing models of different complexity
# (in this case, numbers of variables and levels),
# so we add a test/train split for scoring.
set.seed(123)
d$isTest <- runif(nrow(d))<=0.25
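# sanity check on the split (roughly a quarter of the 76 rows should be test)
print(table(d$isTest))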
modelSplit <- lm(as.formula(fOrig),data=d[!d$isTest,,drop=FALSE])
tryCatch(
  d$modelSplit <- predict(modelSplit,newdata=d),
  warning = function(w) {print(paste('warn:',w))},
  error = function(e) {print(paste('error:',e))}
)
## [1] "error: Error in model.frame.default(Terms, newdata, na.action = na.action, xlev = object$xlevels): factor Lot has new levels 11\n"
# Find out what failed in the test/train split;
# start by looking at the variable types.
print(str(d))
## 'data.frame': 76 obs. of 21 variables:
## $ id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Price : num 388 450 386 350 156 ...
## $ Size : num 2.18 2.05 2.11 1.44 1.8 ...
## $ Lot : Factor w/ 9 levels "1","2","3","4",..: 4 5 5 6 1 5 4 4 4 5 ...
## $ Bath : num 3 3 2 1 2 2 1.1 2 2.1 2.1 ...
## $ Bed : int 4 4 4 2 4 3 4 4 4 3 ...
## $ BathBed : num 12 12 8 2 8 6 4.4 8 8.4 6.3 ...
## $ Year : int 1940 1957 1955 1956 1994 1940 1958 1961 1965 1968 ...
## $ Age : num -3 -1.3 -1.5 -1.4 2.4 -3 -1.2 -0.9 -0.5 -0.2 ...
## $ Agesq : num 9 1.69 2.25 1.96 5.76 9 1.44 0.81 0.25 0.04 ...
## $ Garage : int 0 2 2 1 1 1 1 2 2 2 ...
## $ Status : chr "sld" "sld" "sld" "act" ...
## $ Active : int 0 0 0 1 0 0 1 0 1 0 ...
## $ Elem : chr "edison" "edison" "edison" "adams" ...
## $ Edison Elementary: int 1 1 1 0 0 0 0 0 0 0 ...
## $ Harris Elementary: int 0 0 0 0 0 0 0 0 0 0 ...
## $ Adams Elementary : int 0 0 0 1 1 1 0 0 0 0 ...
## $ Crest Elementary : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Parker Elementary: int 0 0 0 0 0 0 1 1 1 1 ...
## $ modelOrig : num 306 342 341 350 178 ...
## $ isTest : logi FALSE FALSE FALSE FALSE FALSE TRUE ...
## NULL
# both Elem and Lot are categorical variables
print(summary(as.factor(d$Elem)))
## adams crest edge edison harris parker
## 3 6 26 12 14 15
print(summary(d$Lot))
## 1 2 3 4 5 6 7 8 11
## 7 4 12 29 18 1 3 1 1
# Lot has unique levels: 6, 8, and 11.
# This means (when we are not using a test/train split)
# the model can use these levels to effectively memorize the prices
# of these three houses.
# The unique-level effect also makes straightforward holdout testing
# harder.
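# A programmatic way to find such levels (from the table above this should
# list "6", "8", and "11"):
print(names(which(table(d$Lot)<=1)))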
# Build a function to crudely perform 1-way hold-out testing (even in the presence of unique levels)
oneHoldPreds <- function(d,fitFn,varsToCheck) {
  preds <- double(nrow(d))
  for(i in seq_len(nrow(d))) {
    di <- d
    for(v in varsToCheck) {
      if(!(di[i,v] %in% d[-i,v])) {
        # if the held-out row has a new level, knock the variable out
        # of the scratch frame to prevent it from being in the model;
        # this is a hack to avoid having to manipulate the
        # formula string
        di[,v] <- 0
      }
    }
    mi <- fitFn(di[-i,,drop=FALSE])
    preds[[i]] <- predict(mi,newdata=di[i,,drop=FALSE])
  }
  preds
}
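# Design note: zeroing a column makes it constant, so lm() produces a
# rank-deficient fit; the "rank-deficient fit" warnings below come from
# predicting with those fits, and for this exercise we accept them.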
riskyVars <- c('Lot')
# run the original model in a 1-way hold-out fashion
d$modelOrig <- oneHoldPreds(d,function(df) { lm(fOrig,data=df) },riskyVars)
## Warning: prediction from a rank-deficient fit may be misleading
## Warning: prediction from a rank-deficient fit may be misleading
## Warning: prediction from a rank-deficient fit may be misleading
sqrt(mean((d$Price-d$modelOrig)^2))
## [1] 55.8
# RMS error increases from the in-sample 35.0 to a held-out 55.8!
ggplot(data=d,aes_string(x='modelOrig',y=yColumn)) +
  geom_point() + geom_abline(slope=1)
# start our own modeling effort
f1 <- paste(yColumn,paste(vars,collapse=' + '),sep=' ~ ')
print(f1)
## [1] "Price ~ Size + Lot + Bath + Bed + Year + Age + Garage + Status + Active + Elem"
model1 <- lm(f1,data=d[!d$isTest,,drop=FALSE])
# lm() has a lot of useful summaries, might as well use them!
# look at model summary
print(summary(model1))
##
## Call:
## lm(formula = f1, data = d[!d$isTest, , drop = FALSE])
##
## Residuals:
## Min 1Q Median 3Q Max
## -79.6 -18.7 0.0 18.0 72.6
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -168.1335 828.3288 -0.20 0.8403
## Size 86.3639 44.2142 1.95 0.0584 .
## Lot2 11.8156 37.3481 0.32 0.7535
## Lot3 -23.2935 29.3248 -0.79 0.4321
## Lot4 -2.9312 24.8637 -0.12 0.9068
## Lot5 9.7546 27.0203 0.36 0.7201
## Lot6 193.7890 66.4969 2.91 0.0060 **
## Lot7 43.3679 36.3048 1.19 0.2399
## Lot8 30.0212 57.2718 0.52 0.6033
## Bath 14.2422 13.0673 1.09 0.2828
## Bed 2.0346 12.7757 0.16 0.8743
## Year 0.0918 0.4159 0.22 0.8265
## Age NA NA NA NA
## Garage 1.8983 12.3260 0.15 0.8784
## Statuspen -20.2186 22.5142 -0.90 0.3750
## Statussld -53.4298 15.7659 -3.39 0.0017 **
## Active NA NA NA NA
## Elemcrest 88.0788 55.3753 1.59 0.1202
## Elemedge 81.4649 51.6739 1.58 0.1234
## Elemedison 177.4645 53.4951 3.32 0.0020 **
## Elemharris 107.2090 49.5866 2.16 0.0372 *
## Elemparker 48.5333 51.0053 0.95 0.3475
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 40.9 on 37 degrees of freedom
## Multiple R-squared: 0.699, Adjusted R-squared: 0.544
## F-statistic: 4.52 on 19 and 37 DF, p-value: 4.38e-05
# First problem: the fit returned NA coefficients for Active and Age.
# Check Active against Status:
table(d$Status,d$Active)
##
## 0 1
## act 0 25
## pen 13 0
## sld 38 0
# Active is redundant, implied by Status
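# direct check (a sketch; Active should be exactly the Status=='act' indicator)
print(all(d$Active == as.numeric(d$Status=='act')))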
# Now check whether Age is determined by Year
checkM <- lm(Age ~ Year,data=d)
print(checkM)
##
## Call:
## lm(formula = Age ~ Year, data = d)
##
## Coefficients:
## (Intercept) Year
## -197.0 0.1
print(summary(d$Age - predict(checkM,newdata=d)))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.42e-13 -1.14e-13 -1.08e-13 -1.07e-13 -9.52e-14 -8.53e-14
# Age = 0.1*(Year - 1970)
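# verify the derived relation directly (should be zero up to floating point)
print(max(abs(d$Age - 0.1*(d$Year - 1970))))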
# remove these variables
vars <- setdiff(vars,c('Active','Age'))
# re-fit
f2 <- paste(yColumn,paste(vars,collapse=' + '),sep=' ~ ')
d$model2 <- oneHoldPreds(d,function(df) { lm(f2,data=df) },riskyVars)
## Warning: prediction from a rank-deficient fit may be misleading
## Warning: prediction from a rank-deficient fit may be misleading
## Warning: prediction from a rank-deficient fit may be misleading
sqrt(mean((d$Price-d$model2)^2))
## [1] 54.55
# RMS held-out error drops to 54.6 (from 55.8): the redundant variables
# were supplying a small amount of over-fit.
ggplot(data=d,aes_string(x='model2',y=yColumn)) +
  geom_point() + geom_abline(slope=1)
# try picking variables from the original model
modelOrig <- lm(as.formula(fOrig),data=d[!d$isTest,,drop=FALSE])
steppedModel <- step(modelOrig)
## Start: AIC=432.2
## Price ~ Size + Lot + I(Bath * Bed) + Age + I(Age * Age) + Garage +
## Status + Elem
##
## Df Sum of Sq RSS AIC
## - Age 1 284 55774 431
## - Garage 1 661 56152 431
## <none> 55491 432
## - I(Bath * Bed) 1 4309 59800 434
## - I(Age * Age) 1 6046 61537 436
## - Size 1 6982 62473 437
## - Lot 7 28590 84081 442
## - Status 2 16329 71820 443
## - Elem 5 47861 103352 458
##
## Step: AIC=430.5
## Price ~ Size + Lot + I(Bath * Bed) + I(Age * Age) + Garage +
## Status + Elem
##
## Df Sum of Sq RSS AIC
## - Garage 1 1533 57308 430
## <none> 55774 431
## - I(Bath * Bed) 1 4809 60583 433
## - I(Age * Age) 1 5882 61656 434
## - Size 1 7085 62859 435
## - Lot 7 28308 84082 440
## - Status 2 17102 72876 442
## - Elem 5 47722 103496 456
##
## Step: AIC=430.1
## Price ~ Size + Lot + I(Bath * Bed) + I(Age * Age) + Status +
## Elem
##
## Df Sum of Sq RSS AIC
## <none> 57308 430
## - I(Bath * Bed) 1 3917 61225 432
## - I(Age * Age) 1 4845 62153 433
## - Size 1 8040 65348 436
## - Lot 7 28136 85444 439
## - Status 2 27755 85063 449
## - Elem 5 54652 111960 458
print(steppedModel)
##
## Call:
## lm(formula = Price ~ Size + Lot + I(Bath * Bed) + I(Age * Age) +
## Status + Elem, data = d[!d$isTest, , drop = FALSE])
##
## Coefficients:
## (Intercept) Size Lot2 Lot3 Lot4
## 0.769 94.532 35.014 -10.905 8.712
## Lot5 Lot6 Lot7 Lot8 I(Bath * Bed)
## 26.336 201.483 60.332 61.289 3.896
## I(Age * Age) Statuspen Statussld Elemcrest Elemedge
## 1.858 -35.047 -57.295 78.438 77.299
## Elemedison Elemharris Elemparker
## 151.134 102.868 37.356
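# Score the step-selected formula with the same 1-way hold-out procedure
# (a sketch added here; fStep and modelStep are new names, and this score was
# not reported in the original analysis):
fStep <- 'Price ~ Size + Lot + I(Bath*Bed) + I(Age*Age) + Status + Elem'
d$modelStep <- oneHoldPreds(d,function(df) { lm(fStep,data=df) },riskyVars)
print(sqrt(mean((d$Price-d$modelStep)^2)))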
# try to impose a model structure appropriate for the problem
d$PriceSqFoot <- d$Price/d$Size
d$BathsPerBed <- d$Bath/d$Bed
f4 <- 'log(PriceSqFoot) ~ BathsPerBed + Lot + Age + Garage + Elem + Status'
model4 <- lm(as.formula(f4),data=d[!d$isTest,,drop=FALSE])
d$model4 <- exp(oneHoldPreds(d,function(df) { lm(f4,data=df) },riskyVars))*d$Size
## Warning: prediction from a rank-deficient fit may be misleading
## Warning: prediction from a rank-deficient fit may be misleading
## Warning: prediction from a rank-deficient fit may be misleading
sqrt(mean((d$Price-d$model4)^2))
## [1] 59.16
# RMS error is larger: 59.2
ggplot(data=d,aes_string(x='model4',y=yColumn)) +
  geom_point() + geom_abline(slope=1)
# look for why our "good idea" did not work
ggplot(data=d,aes(x=Size,y=Price)) +
  geom_point() + geom_smooth() +
  scale_x_continuous(breaks=seq(1.5,3.0,0.1))
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
# At best the relation only holds for the Size range 1.8 through 2.2.
# Notice we looked at all the data to make this observation, so
# the analysts themselves add an (undesirable) bias as they
# pick things that look good. If we had more data we would
# build a proper test/train split and make modeling decisions based
# only on the training portion of the data (sketched below).
# Also note the analyst introduces a "multiple comparison" problem
# on the training data: they try many variations of method and
# keep the best one, which means they may be scoring their own
# performance higher than what will be seen on test data or
# later in production.
# The more data and the more careful procedures you have, the less
# you see of such problems.
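# Sketch of the training-only discipline (hypothetical here, with only 76 rows):
ggplot(data=d[!d$isTest,,drop=FALSE],aes(x=Size,y=Price)) +
  geom_point() + geom_smooth()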
# Try again, combining our ideas with the source's ideas
d$collaredSize <- pmax(1.8,pmin(2.2,d$Size))  # clamp Size to [1.8,2.2], where the relation held
d$PriceSqFoot <- d$Price/d$collaredSize
f5 <- 'log(PriceSqFoot) ~ Bed*Bath + Lot + Age + Garage + Elem + Status'
d$model5 <- exp(oneHoldPreds(d,function(df) { lm(f5,data=df) },riskyVars))*d$collaredSize
## Warning: prediction from a rank-deficient fit may be misleading
## Warning: prediction from a rank-deficient fit may be misleading
## Warning: prediction from a rank-deficient fit may be misleading
sqrt(mean((d$Price-d$model5)^2))
## [1] 52.82
# RMS error 52.8, the best held-out result so far
ggplot(data=d,aes_string(x='model5',y=yColumn)) +
  geom_point() + geom_abline(slope=1)
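# Recap of the 1-way hold-out RMS errors seen above: original formula 55.8,
# redundant variables removed 54.6, log price-per-square-foot 59.2, and the
# collared-Size variation 52.8 (versus the original's in-sample 35.0).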