Lesson on Test/Train Split
The first thing to do is load the data.
# Load the prepared baseball salary data set from disk.
salaryData <- readRDS("salaryData.rds")
# look at the data: row/column counts, then per-column summaries
dim(salaryData)
## [1] 337 19
summary(salaryData)
## Salary batting_average OBP runs
## Min. : 109 Min. :0.0630 Min. :0.063 Min. : 0.0
## 1st Qu.: 230 1st Qu.:0.2380 1st Qu.:0.297 1st Qu.: 22.0
## Median : 740 Median :0.2600 Median :0.323 Median : 41.0
## Mean :1249 Mean :0.2578 Mean :0.324 Mean : 46.7
## 3rd Qu.:2150 3rd Qu.:0.2810 3rd Qu.:0.354 3rd Qu.: 69.0
## Max. :6100 Max. :0.4570 Max. :0.486 Max. :133.0
## hits doubles triples homeruns
## Min. : 1.00 Min. : 0.00 Min. : 0.000 Min. : 0.000
## 1st Qu.: 51.00 1st Qu.: 9.00 1st Qu.: 0.000 1st Qu.: 2.000
## Median : 91.00 Median :15.00 Median : 2.000 Median : 6.000
## Mean : 92.83 Mean :16.67 Mean : 2.338 Mean : 9.098
## 3rd Qu.:136.00 3rd Qu.:23.00 3rd Qu.: 3.000 3rd Qu.:15.000
## Max. :216.00 Max. :49.00 Max. :15.000 Max. :44.000
## RBI walks strikeouts stolenbases
## Min. : 0.00 Min. : 0.00 Min. : 1.00 Min. : 0.000
## 1st Qu.: 21.00 1st Qu.: 15.00 1st Qu.: 31.00 1st Qu.: 1.000
## Median : 39.00 Median : 30.00 Median : 49.00 Median : 4.000
## Mean : 44.02 Mean : 35.02 Mean : 56.71 Mean : 8.246
## 3rd Qu.: 66.00 3rd Qu.: 49.00 3rd Qu.: 78.00 3rd Qu.:11.000
## Max. :133.00 Max. :138.00 Max. :175.00 Max. :76.000
## errors free_agency_eligibility free_agent_in_1991_2
## Min. : 0.000 Min. :0.0000 Min. :0.0000
## 1st Qu.: 3.000 1st Qu.:0.0000 1st Qu.:0.0000
## Median : 5.000 Median :0.0000 Median :0.0000
## Mean : 6.772 Mean :0.3976 Mean :0.1157
## 3rd Qu.: 9.000 3rd Qu.:1.0000 3rd Qu.:0.0000
## Max. :31.000 Max. :1.0000 Max. :1.0000
## arbitration_eligibility arbitration_in_1991_2 Player
## Min. :0.0000 Min. :0.00000 Length:337
## 1st Qu.:0.0000 1st Qu.:0.00000 Class :character
## Median :0.0000 Median :0.00000 Mode :character
## Mean :0.1929 Mean :0.02967
## 3rd Qu.:0.0000 3rd Qu.:0.00000
## Max. :1.0000 Max. :1.00000
## logSalary
## Min. :2.037
## 1st Qu.:2.362
## Median :2.869
## Mean :2.838
## 3rd Qu.:3.332
## Max. :3.785
Let’s set the outcome variable and the input variables.
# The modeling target is log(Salary). Exclude the raw outcome (Salary),
# the derived outcome (logSalary), and the player identifier from the
# candidate input variables.
outcome <- "logSalary"
vars <- setdiff(colnames(salaryData), c("Salary", "Player", "logSalary"))
vars
## [1] "batting_average" "OBP"
## [3] "runs" "hits"
## [5] "doubles" "triples"
## [7] "homeruns" "RBI"
## [9] "walks" "strikeouts"
## [11] "stolenbases" "errors"
## [13] "free_agency_eligibility" "free_agent_in_1991_2"
## [15] "arbitration_eligibility" "arbitration_in_1991_2"
Now we can split the data into a training set and a test set.
# set the random number generator seed, so the random train/test
# assignments are the same every time this script is run
set.seed(45433622)
nr <- nrow(salaryData)
# make the train/test assignments: each row lands in the test set with
# probability 0.25, so roughly 25% of the data is set aside for test
is.test <- runif(nr) <= 0.25
# check the split sizes (FALSE = training row, TRUE = test row)
summary(is.test)
## Mode FALSE TRUE NA's
## logical 261 76 0
# split the data by the test marker
test <- salaryData[is.test, ]
train <- salaryData[!is.test, ]
# put the test marker back in the data, for reproducibility
salaryData$is.test <- is.test
Run the model on the training set
# build the model formula as a string: outcome ~ var1 + var2 + ...
fmla <- paste(outcome, "~", paste(vars, collapse = "+"))
fmla
## [1] "logSalary ~ batting_average+OBP+runs+hits+doubles+triples+homeruns+RBI+walks+strikeouts+stolenbases+errors+free_agency_eligibility+free_agent_in_1991_2+arbitration_eligibility+arbitration_in_1991_2"
# fit a linear regression on the training rows only
model <- lm(fmla, data = train)
# examine the coefficients and fit statistics of the linear model
summary(model)
##
## Call:
## lm(formula = fmla, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.09000 -0.12836 -0.01524 0.14299 0.55490
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.2631291 0.1210689 18.693 < 2e-16 ***
## batting_average 0.8327396 0.9743267 0.855 0.39357
## OBP -1.1435733 0.8556458 -1.337 0.18263
## runs -0.0010018 0.0020816 -0.481 0.63075
## hits 0.0035977 0.0012131 2.966 0.00332 **
## doubles -0.0031576 0.0032206 -0.980 0.32784
## triples -0.0122794 0.0082382 -1.491 0.13737
## homeruns 0.0054996 0.0047118 1.167 0.24427
## RBI 0.0025508 0.0019029 1.340 0.18135
## walks 0.0033186 0.0016985 1.954 0.05186 .
## strikeouts -0.0018785 0.0007916 -2.373 0.01842 *
## stolenbases 0.0029310 0.0017296 1.695 0.09142 .
## errors -0.0049083 0.0028484 -1.723 0.08612 .
## free_agency_eligibility 0.6521692 0.0413078 15.788 < 2e-16 ***
## free_agent_in_1991_2 -0.0704410 0.0498863 -1.412 0.15921
## arbitration_eligibility 0.5667131 0.0460688 12.301 < 2e-16 ***
## arbitration_in_1991_2 -0.0542029 0.0914759 -0.593 0.55404
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2287 on 244 degrees of freedom
## Multiple R-squared: 0.8147, Adjusted R-squared: 0.8025
## F-statistic: 67.04 on 16 and 244 DF, p-value: < 2.2e-16
Now we’ll evaluate the model. We will look at the root mean squared error of the prediction.
# make the predictions on every row of salaryData (train and test alike)
salPred <- predict(model, newdata = salaryData)
# set up a frame with the actual outcome, the prediction, and the split marker
perf <- data.frame(logSalary = salaryData[[outcome]],
                   pred = salPred, is.test = salaryData$is.test)
sqerr <- (perf$logSalary - perf$pred)^2
# root mean squared error on the training rows
sqrt(mean(sqerr[!is.test]))
## [1] 0.2211685
# root mean squared error on the held-out test rows
sqrt(mean(sqerr[is.test]))
## [1] 0.2571019
And we can plot it, too.
library(ggplot2)
# plot predicted vs actual log salary; the test rows are distinguished
# from the training rows by color and point shape, and the y = x line
# marks perfect prediction
ggplot(perf, aes(x=pred, y=logSalary, color=is.test)) +
geom_point(aes(shape=is.test)) +
geom_abline(slope=1) +
scale_color_manual(values = c("FALSE" = "darkgray", "TRUE" = "darkblue")) +
coord_fixed()
The x axis is the predicted log salary, and the y axis is the actual log salary. The blue triangles are the test data, and the gray circles are the training data.
Now let’s try Random Forest on the exact same training and test sets
library(randomForest)
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
# fit a random forest regression on the same training rows, using the
# x/y interface with the same input variables and outcome
mod <- randomForest(train[, vars], y = train[, outcome])
# set up a frame with the actual outcome, the forest prediction, and the
# split marker (same layout as the linear-model evaluation above)
perf <- data.frame(logSalary = salaryData[[outcome]],
                   pred = as.numeric(predict(mod, newdata = salaryData)),
                   is.test = salaryData$is.test)
sqerr <- (perf$logSalary - perf$pred)^2
# root mean squared error on the training rows
sqrt(mean(sqerr[!is.test]))
## [1] 0.1098961
# root mean squared error on the held-out test rows
sqrt(mean(sqerr[is.test]))
## [1] 0.2852029
Random Forest looks better in training, but may be slightly worse in test.