# libraries
library('ggplot2')
# load the data
# as with the starting with R lesson
d <- read.table('http://www.amstat.org/publications/jse/datasets/homes76.dat.txt',
header=TRUE,sep='\t',stringsAsFactor=FALSE)
colnames(d) <- c('id', 'Price', 'Size', 'Lot', 'Bath', 'Bed', 'BathBed', 'Year', 'Age',
'Agesq', 'Garage', 'Status', 'Active', 'Elem', 'Edison Elementary',
'Harris Elementary', 'Adams Elementary', 'Crest Elementary', 'Parker Elementary')
# perform some elemenatry data treatment
d$Lot <- as.factor(d$Lot)
# define columns we will be working with
yColumn <- 'Price'
vars <- c("Size", "Lot", "Bath", "Bed", "Year", "Age",
"Garage", "Status", "Active", "Elem")
# look at the data
print(dim(d))
## [1] 76 19
print(head(d))
## id Price Size Lot Bath Bed BathBed Year Age Agesq Garage Status Active
## 1 1 388.0 2.180 4 3 4 12 1940 -3.0 9.00 0 sld 0
## 2 2 450.0 2.054 5 3 4 12 1957 -1.3 1.69 2 sld 0
## 3 3 386.0 2.112 5 2 4 8 1955 -1.5 2.25 2 sld 0
## 4 4 350.0 1.442 6 1 2 2 1956 -1.4 1.96 1 act 1
## 5 5 155.5 1.800 1 2 4 8 1994 2.4 5.76 1 sld 0
## 6 6 220.0 1.965 5 2 3 6 1940 -3.0 9.00 1 sld 0
## Elem Edison Elementary Harris Elementary Adams Elementary
## 1 edison 1 0 0
## 2 edison 1 0 0
## 3 edison 1 0 0
## 4 adams 0 0 1
## 5 adams 0 0 1
## 6 adams 0 0 1
## Crest Elementary Parker Elementary
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
print(head(d[,c(vars,yColumn)]))
## Size Lot Bath Bed Year Age Garage Status Active Elem Price
## 1 2.180 4 3 4 1940 -3.0 0 sld 0 edison 388.0
## 2 2.054 5 3 4 1957 -1.3 2 sld 0 edison 450.0
## 3 2.112 5 2 4 1955 -1.5 2 sld 0 edison 386.0
## 4 1.442 6 1 2 1956 -1.4 1 act 1 adams 350.0
## 5 1.800 1 2 4 1994 2.4 1 sld 0 adams 155.5
## 6 1.965 5 2 3 1940 -3.0 1 sld 0 adams 220.0
print(summary(d[,c(vars,yColumn)]))
## Size Lot Bath Bed Year
## Min. :1.44 4 :29 Min. :1.00 Min. :2.00 Min. :1905
## 1st Qu.:1.86 5 :18 1st Qu.:2.00 1st Qu.:3.00 1st Qu.:1958
## Median :1.97 3 :12 Median :2.00 Median :3.00 Median :1970
## Mean :1.97 1 : 7 Mean :2.21 Mean :3.45 Mean :1969
## 3rd Qu.:2.11 2 : 4 3rd Qu.:3.00 3rd Qu.:4.00 3rd Qu.:1980
## Max. :2.90 7 : 3 Max. :3.10 Max. :6.00 Max. :2005
## (Other): 3
## Age Garage Status Active
## Min. :-6.500 Min. :0.00 Length:76 Min. :0.000
## 1st Qu.:-1.225 1st Qu.:1.00 Class :character 1st Qu.:0.000
## Median :-0.050 Median :2.00 Mode :character Median :0.000
## Mean :-0.059 Mean :1.57 Mean :0.329
## 3rd Qu.: 1.000 3rd Qu.:2.00 3rd Qu.:1.000
## Max. : 3.500 Max. :3.00 Max. :1.000
##
## Elem Price
## Length:76 Min. :156
## Class :character 1st Qu.:243
## Mode :character Median :276
## Mean :286
## 3rd Qu.:337
## Max. :450
##
# guess at reproducting original analysis
fOrig <- "Price ~ Size + Lot + I(Bath*Bed) + Age + I(Age*Age) + Garage + Status + Elem"
modelOrig <- lm(as.formula(fOrig),data=d)
d$modelOrig <- predict(modelOrig,newdata=d)
sqrt(mean((d$Price-d$modelOrig)^2)) # comes out to 35.0
## [1] 35.04
# data scientists always want to see the fit. Statisticians are a little more patient
# and look at diagnostics first.
ggplot(data=d,aes_string(x='modelOrig',y=yColumn)) +
geom_point() + geom_abline(slope=1)

# Look at some of the classic diagnostic
print(summary(modelOrig))
##
## Call:
## lm(formula = as.formula(fOrig), data = d)
##
## Residuals:
## Min 1Q Median 3Q Max
## -67.25 -22.55 -0.42 20.63 107.69
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 79.50 59.23 1.34 0.18504
## Size 48.59 29.68 1.64 0.10725
## Lot2 6.59 30.21 0.22 0.82821
## Lot3 -8.89 25.58 -0.35 0.72949
## Lot4 10.78 22.44 0.48 0.63300
## Lot5 29.91 23.94 1.25 0.21689
## Lot6 185.05 56.20 3.29 0.00174 **
## Lot7 24.25 31.32 0.77 0.44213
## Lot8 11.41 49.99 0.23 0.82031
## Lot11 174.17 48.31 3.61 0.00067 ***
## I(Bath * Bed) 1.17 2.20 0.53 0.59750
## Age 4.12 3.22 1.28 0.20575
## I(Age * Age) 1.88 0.80 2.35 0.02214 *
## Garage 15.12 8.90 1.70 0.09501 .
## Statuspen -21.99 17.38 -1.27 0.21107
## Statussld -34.16 14.01 -2.44 0.01805 *
## Elemcrest 66.29 37.90 1.75 0.08581 .
## Elemedge 63.50 33.56 1.89 0.06376 .
## Elemedison 125.14 34.32 3.65 0.00059 ***
## Elemharris 94.88 33.00 2.88 0.00573 **
## Elemparker 43.29 33.81 1.28 0.20581
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 41.2 on 55 degrees of freedom
## Multiple R-squared: 0.658, Adjusted R-squared: 0.534
## F-statistic: 5.3 on 20 and 55 DF, p-value: 4.51e-07
plot(modelOrig)
## Warning: not plotting observations with leverage one:
## 4, 21, 74


## Warning: not plotting observations with leverage one:
## 4, 21, 74
