Lesson on Test/Train Split

The first thing to do is load the data.

# Load the prepared salary data set (expects salaryData.rds in the working directory).
salaryData = readRDS("salaryData.rds")
# look at the data: 337 rows, 19 columns
dim(salaryData)
## [1] 337  19
# Per-column distribution summary. Note logSalary appears to be log10(Salary):
# log10(109) = 2.037 and log10(6100) = 3.785 match the logSalary range below
# — TODO confirm against the data-preparation step.
summary(salaryData)
##      Salary     batting_average       OBP             runs      
##  Min.   : 109   Min.   :0.0630   Min.   :0.063   Min.   :  0.0  
##  1st Qu.: 230   1st Qu.:0.2380   1st Qu.:0.297   1st Qu.: 22.0  
##  Median : 740   Median :0.2600   Median :0.323   Median : 41.0  
##  Mean   :1249   Mean   :0.2578   Mean   :0.324   Mean   : 46.7  
##  3rd Qu.:2150   3rd Qu.:0.2810   3rd Qu.:0.354   3rd Qu.: 69.0  
##  Max.   :6100   Max.   :0.4570   Max.   :0.486   Max.   :133.0  
##       hits           doubles         triples          homeruns     
##  Min.   :  1.00   Min.   : 0.00   Min.   : 0.000   Min.   : 0.000  
##  1st Qu.: 51.00   1st Qu.: 9.00   1st Qu.: 0.000   1st Qu.: 2.000  
##  Median : 91.00   Median :15.00   Median : 2.000   Median : 6.000  
##  Mean   : 92.83   Mean   :16.67   Mean   : 2.338   Mean   : 9.098  
##  3rd Qu.:136.00   3rd Qu.:23.00   3rd Qu.: 3.000   3rd Qu.:15.000  
##  Max.   :216.00   Max.   :49.00   Max.   :15.000   Max.   :44.000  
##       RBI             walks          strikeouts      stolenbases    
##  Min.   :  0.00   Min.   :  0.00   Min.   :  1.00   Min.   : 0.000  
##  1st Qu.: 21.00   1st Qu.: 15.00   1st Qu.: 31.00   1st Qu.: 1.000  
##  Median : 39.00   Median : 30.00   Median : 49.00   Median : 4.000  
##  Mean   : 44.02   Mean   : 35.02   Mean   : 56.71   Mean   : 8.246  
##  3rd Qu.: 66.00   3rd Qu.: 49.00   3rd Qu.: 78.00   3rd Qu.:11.000  
##  Max.   :133.00   Max.   :138.00   Max.   :175.00   Max.   :76.000  
##      errors       free_agency_eligibility free_agent_in_1991_2
##  Min.   : 0.000   Min.   :0.0000          Min.   :0.0000      
##  1st Qu.: 3.000   1st Qu.:0.0000          1st Qu.:0.0000      
##  Median : 5.000   Median :0.0000          Median :0.0000      
##  Mean   : 6.772   Mean   :0.3976          Mean   :0.1157      
##  3rd Qu.: 9.000   3rd Qu.:1.0000          3rd Qu.:0.0000      
##  Max.   :31.000   Max.   :1.0000          Max.   :1.0000      
##  arbitration_eligibility arbitration_in_1991_2    Player         
##  Min.   :0.0000          Min.   :0.00000       Length:337        
##  1st Qu.:0.0000          1st Qu.:0.00000       Class :character  
##  Median :0.0000          Median :0.00000       Mode  :character  
##  Mean   :0.1929          Mean   :0.02967                         
##  3rd Qu.:0.0000          3rd Qu.:0.00000                         
##  Max.   :1.0000          Max.   :1.00000                         
##    logSalary    
##  Min.   :2.037  
##  1st Qu.:2.362  
##  Median :2.869  
##  Mean   :2.838  
##  3rd Qu.:3.332  
##  Max.   :3.785

Let’s set the outcome variable and the input variables.

# Name the column we will predict.
outcome = "logSalary"
# Use every other column as a model input, except the raw target (Salary),
# its log transform (logSalary), and the player identifier (Player) —
# Salary would leak the answer, and Player is a unique label, not a feature.
vars = setdiff(colnames(salaryData), c("Salary", "Player", "logSalary"))
vars
##  [1] "batting_average"         "OBP"                    
##  [3] "runs"                    "hits"                   
##  [5] "doubles"                 "triples"                
##  [7] "homeruns"                "RBI"                    
##  [9] "walks"                   "strikeouts"             
## [11] "stolenbases"             "errors"                 
## [13] "free_agency_eligibility" "free_agent_in_1991_2"   
## [15] "arbitration_eligibility" "arbitration_in_1991_2"

Now we can split the data into a training set and a test set.

set.seed(45433622) # set the random number generator seed, so the random assignments are the same every time
nr = nrow(salaryData)
# make the train/test assignments (set aside 25% of the data for test)
# Each row lands in the test set independently with probability 0.25, so the
# realized fraction varies around 25% (here: 76 of 337 rows, about 22.6%).
is.test = runif(nr)<=0.25
summary(is.test)
##    Mode   FALSE    TRUE    NA's 
## logical     261      76       0
# split the data
test = salaryData[is.test,]
train = salaryData[!is.test, ]
salaryData$is.test = is.test  # put the test marker back in the data, for reproducibility

Run the model on the training set

# Build the formula string "logSalary ~ var1+var2+..." from outcome and vars,
# so the variable list stays in one place instead of being typed out here.
fmla = paste(outcome, "~", paste(vars, collapse="+")) # set up the variables
fmla
## [1] "logSalary ~ batting_average+OBP+runs+hits+doubles+triples+homeruns+RBI+walks+strikeouts+stolenbases+errors+free_agency_eligibility+free_agent_in_1991_2+arbitration_eligibility+arbitration_in_1991_2"
# fit an ordinary least squares model on the training rows only
model = lm(fmla, data=train)
summary(model)
## 
## Call:
## lm(formula = fmla, data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.09000 -0.12836 -0.01524  0.14299  0.55490 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              2.2631291  0.1210689  18.693  < 2e-16 ***
## batting_average          0.8327396  0.9743267   0.855  0.39357    
## OBP                     -1.1435733  0.8556458  -1.337  0.18263    
## runs                    -0.0010018  0.0020816  -0.481  0.63075    
## hits                     0.0035977  0.0012131   2.966  0.00332 ** 
## doubles                 -0.0031576  0.0032206  -0.980  0.32784    
## triples                 -0.0122794  0.0082382  -1.491  0.13737    
## homeruns                 0.0054996  0.0047118   1.167  0.24427    
## RBI                      0.0025508  0.0019029   1.340  0.18135    
## walks                    0.0033186  0.0016985   1.954  0.05186 .  
## strikeouts              -0.0018785  0.0007916  -2.373  0.01842 *  
## stolenbases              0.0029310  0.0017296   1.695  0.09142 .  
## errors                  -0.0049083  0.0028484  -1.723  0.08612 .  
## free_agency_eligibility  0.6521692  0.0413078  15.788  < 2e-16 ***
## free_agent_in_1991_2    -0.0704410  0.0498863  -1.412  0.15921    
## arbitration_eligibility  0.5667131  0.0460688  12.301  < 2e-16 ***
## arbitration_in_1991_2   -0.0542029  0.0914759  -0.593  0.55404    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2287 on 244 degrees of freedom
## Multiple R-squared:  0.8147, Adjusted R-squared:  0.8025 
## F-statistic: 67.04 on 16 and 244 DF,  p-value: < 2.2e-16

Now we’ll evaluate the model. We will look at the root mean squared error of the prediction.

# make the predictions on the salaryData frame
# (this scores every row, train and test alike; we separate them below)
salPred = predict(model, newdata=salaryData)

# set up a frame with the outcomes
perf = data.frame(logSalary = salaryData[[outcome]], 
                  pred = salPred, is.test=salaryData$is.test)

# per-row squared prediction error
sqerr = (perf$logSalary - perf$pred)^2
# training error
# NOTE(review): these subsets use the global is.test vector rather than
# perf$is.test; the two are identical here because perf rows are built in
# salaryData order, but perf$is.test would be the more robust choice.
sqrt(mean(sqerr[!is.test])) 
## [1] 0.2211685
# test error
sqrt(mean(sqerr[is.test])) 
## [1] 0.2571019

And we can plot it, too.

library(ggplot2)

# Predicted vs. actual log salary. Color and shape both encode the split
# (TRUE = test in dark blue, FALSE = train in dark gray); the diagonal
# y = x line marks perfect prediction, and coord_fixed() keeps both axes
# on the same scale so distance from the line is comparable everywhere.
ggplot(perf, aes(x=pred, y=logSalary, color=is.test)) + 
  geom_point(aes(shape=is.test)) +  
  geom_abline(slope=1) + 
  scale_color_manual(values = c("FALSE" = "darkgray", "TRUE" = "darkblue")) +
  coord_fixed()

The x axis is the predicted log salary, and the y axis is the actual log salary. The blue triangles are the test data, and the gray circles are the training data.

Now let’s try Random Forest on the exact same training and test sets.

library(randomForest)
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
# Fit a random forest regression (default settings) on the same training
# rows and the same predictor columns used for the linear model above.
mod = randomForest(train[,vars], y=train[,outcome])

# set up a frame with the outcomes
# (same construction as the lm evaluation, so the two models are compared
# on identical train/test rows)
perf = data.frame(logSalary = salaryData[[outcome]], 
                  pred = as.numeric(predict(mod, newdata=salaryData)), is.test=salaryData$is.test)
# per-row squared prediction error
sqerr = (perf$logSalary - perf$pred)^2
# training error
sqrt(mean(sqerr[!is.test])) 
## [1] 0.1098961
# test error
sqrt(mean(sqerr[is.test])) 
## [1] 0.2852029

Random Forest fits the training data much more closely than the linear model (RMSE 0.110 vs. 0.221), but does slightly worse on the held-out test set (RMSE 0.285 vs. 0.257) — the larger train/test gap suggests it is overfitting relative to the linear model.