# libraries
library('ggplot2')
# load the data
# as with the starting with R lesson
d <- read.table('http://www.amstat.org/publications/jse/datasets/homes76.dat.txt',
                header=TRUE,sep='\t',stringsAsFactor=FALSE)
colnames(d) <- c('id', 'Price', 'Size', 'Lot', 'Bath', 'Bed', 'BathBed', 'Year', 'Age',
   'Agesq', 'Garage', 'Status', 'Active', 'Elem', 'Edison Elementary',
   'Harris Elementary', 'Adams Elementary', 'Crest Elementary', 'Parker Elementary')

# perform some elemenatry data treatment
d$Lot <- as.factor(d$Lot)
# define columns we will be working with
yColumn <- 'Price'
vars <- c("Size", "Lot", "Bath", "Bed", "Year", "Age", 
     "Garage", "Status", "Active", "Elem")

# look at the data
print(dim(d))
## [1] 76 19
print(head(d))
##   id Price  Size Lot Bath Bed BathBed Year  Age Agesq Garage Status Active
## 1  1 388.0 2.180   4    3   4      12 1940 -3.0  9.00      0    sld      0
## 2  2 450.0 2.054   5    3   4      12 1957 -1.3  1.69      2    sld      0
## 3  3 386.0 2.112   5    2   4       8 1955 -1.5  2.25      2    sld      0
## 4  4 350.0 1.442   6    1   2       2 1956 -1.4  1.96      1    act      1
## 5  5 155.5 1.800   1    2   4       8 1994  2.4  5.76      1    sld      0
## 6  6 220.0 1.965   5    2   3       6 1940 -3.0  9.00      1    sld      0
##     Elem Edison Elementary Harris Elementary Adams Elementary
## 1 edison                 1                 0                0
## 2 edison                 1                 0                0
## 3 edison                 1                 0                0
## 4  adams                 0                 0                1
## 5  adams                 0                 0                1
## 6  adams                 0                 0                1
##   Crest Elementary Parker Elementary
## 1                0                 0
## 2                0                 0
## 3                0                 0
## 4                0                 0
## 5                0                 0
## 6                0                 0
print(head(d[,c(vars,yColumn)]))
##    Size Lot Bath Bed Year  Age Garage Status Active   Elem Price
## 1 2.180   4    3   4 1940 -3.0      0    sld      0 edison 388.0
## 2 2.054   5    3   4 1957 -1.3      2    sld      0 edison 450.0
## 3 2.112   5    2   4 1955 -1.5      2    sld      0 edison 386.0
## 4 1.442   6    1   2 1956 -1.4      1    act      1  adams 350.0
## 5 1.800   1    2   4 1994  2.4      1    sld      0  adams 155.5
## 6 1.965   5    2   3 1940 -3.0      1    sld      0  adams 220.0
print(summary(d[,c(vars,yColumn)]))
##       Size           Lot          Bath           Bed            Year     
##  Min.   :1.44   4      :29   Min.   :1.00   Min.   :2.00   Min.   :1905  
##  1st Qu.:1.86   5      :18   1st Qu.:2.00   1st Qu.:3.00   1st Qu.:1958  
##  Median :1.97   3      :12   Median :2.00   Median :3.00   Median :1970  
##  Mean   :1.97   1      : 7   Mean   :2.21   Mean   :3.45   Mean   :1969  
##  3rd Qu.:2.11   2      : 4   3rd Qu.:3.00   3rd Qu.:4.00   3rd Qu.:1980  
##  Max.   :2.90   7      : 3   Max.   :3.10   Max.   :6.00   Max.   :2005  
##                 (Other): 3                                               
##       Age             Garage        Status              Active     
##  Min.   :-6.500   Min.   :0.00   Length:76          Min.   :0.000  
##  1st Qu.:-1.225   1st Qu.:1.00   Class :character   1st Qu.:0.000  
##  Median :-0.050   Median :2.00   Mode  :character   Median :0.000  
##  Mean   :-0.059   Mean   :1.57                      Mean   :0.329  
##  3rd Qu.: 1.000   3rd Qu.:2.00                      3rd Qu.:1.000  
##  Max.   : 3.500   Max.   :3.00                      Max.   :1.000  
##                                                                    
##      Elem               Price    
##  Length:76          Min.   :156  
##  Class :character   1st Qu.:243  
##  Mode  :character   Median :276  
##                     Mean   :286  
##                     3rd Qu.:337  
##                     Max.   :450  
## 
# guess at reproducting original analysis
fOrig <- "Price ~ Size + Lot + I(Bath*Bed) + Age + I(Age*Age) + Garage + Status + Elem"
modelOrig <- lm(as.formula(fOrig),data=d)
d$modelOrig <- predict(modelOrig,newdata=d)
sqrt(mean((d$Price-d$modelOrig)^2)) # comes out to 35.0
## [1] 35.04
# data scientists always want to see the fit.  Statisticians are a little more patient
# and look at diagnostics first.
ggplot(data=d,aes_string(x='modelOrig',y=yColumn)) + 
  geom_point() + geom_abline(slope=1)

plot of chunk unnamed-chunk-3

# Look at some of the classic diagnostic
print(summary(modelOrig))
## 
## Call:
## lm(formula = as.formula(fOrig), data = d)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -67.25 -22.55  -0.42  20.63 107.69 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      79.50      59.23    1.34  0.18504    
## Size             48.59      29.68    1.64  0.10725    
## Lot2              6.59      30.21    0.22  0.82821    
## Lot3             -8.89      25.58   -0.35  0.72949    
## Lot4             10.78      22.44    0.48  0.63300    
## Lot5             29.91      23.94    1.25  0.21689    
## Lot6            185.05      56.20    3.29  0.00174 ** 
## Lot7             24.25      31.32    0.77  0.44213    
## Lot8             11.41      49.99    0.23  0.82031    
## Lot11           174.17      48.31    3.61  0.00067 ***
## I(Bath * Bed)     1.17       2.20    0.53  0.59750    
## Age               4.12       3.22    1.28  0.20575    
## I(Age * Age)      1.88       0.80    2.35  0.02214 *  
## Garage           15.12       8.90    1.70  0.09501 .  
## Statuspen       -21.99      17.38   -1.27  0.21107    
## Statussld       -34.16      14.01   -2.44  0.01805 *  
## Elemcrest        66.29      37.90    1.75  0.08581 .  
## Elemedge         63.50      33.56    1.89  0.06376 .  
## Elemedison      125.14      34.32    3.65  0.00059 ***
## Elemharris       94.88      33.00    2.88  0.00573 ** 
## Elemparker       43.29      33.81    1.28  0.20581    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 41.2 on 55 degrees of freedom
## Multiple R-squared:  0.658,  Adjusted R-squared:  0.534 
## F-statistic:  5.3 on 20 and 55 DF,  p-value: 4.51e-07
plot(modelOrig)
## Warning: not plotting observations with leverage one:
##   4, 21, 74

plot of chunk unnamed-chunk-4plot of chunk unnamed-chunk-4

## Warning: not plotting observations with leverage one:
##   4, 21, 74