Predicting runs scored using the caret R package in RStudio, based on the Lahman DB

Posted: 2014/07/22 in Data Science, Ubuntu

I have tried to improve the R2 value (coefficient of determination) of regression models that predict the runs scored by a given baseball team.

Here’s the best model I could find:

# best model: cforest with R2=0.9572 (=coefficient of determination value) with n=1384 samples in dataset
# using repeated cross-validation to reduce risks of overfitting the model
# n=1384 (n = sample size)
# R2 =0.9572 with n=1384 samples in dataset
# R2 seems to remain the same with and without repeated cross-validation
# when using cforest model
# cor=0.9783788
# enable use of 2 CPU cores while using train function which is part of caret package
# use 100% of the resources
# Enable a 2-core SOCK cluster so caret::train() can run resamples in parallel.
# NOTE(review): the cluster handle is discarded and never passed to
# stopCluster(), so the worker processes leak for the session.
library(doSNOW)
registerDoSNOW(makeCluster(2, type = "SOCK"))
library(caret)

# Work on a copy of the full data set (n = 1384 team-seasons).
year_team <- year_team_full

# Encode IDs as plain numeric predictors: factor levels -> integer codes -> double.
year_team$teamID <- as.numeric(as.integer(as.factor(year_team$teamID)))
year_team$yearID <- as.numeric(year_team$yearID)

# Repeated 10-fold cross-validation (5 repeats) to reduce overfitting risk.
ctrl <- trainControl(method = "repeatedcv", repeats = 5)

# Best model found: conditional inference forest (cforest),
# R2 = 0.9572 (coefficient of determination), cor = 0.9783788, n = 1384.
# R2 is essentially the same with and without repeated cross-validation.
lin_more_weights <- train(R ~ XRR + HR + H + BB + HBP + X2B + SB + SF,
                          data = year_team,
                          method = "cforest",
                          metric = "Rsquared",
                          tuneLength = 7,
                          trControl = ctrl)

# In-sample predictions, diagnostic scatter plot, and correlation with actual runs.
year_team$linRMore <- predict(lin_more_weights)
corr_plot("linRMore", "R", year_team)
cor(year_team$linRMore, year_team$R)

# Model summary and tuning-profile plots.
print(lin_more_weights, digits = 3)
trellis.par.set(caretTheme())
plot(lin_more_weights, metric = lin_more_weights$metric)
names(lin_more_weights)
ggplot(lin_more_weights, metric = lin_more_weights$metric)
getTrainPerf(lin_more_weights)

And the code below shows the journey I took to get to that conclusion:

# predicting runs scored using caret R package:
# tested models from http://caret.r-forge.r-project.org/modelList.html

# Modeling packages (party and its dependencies) plus MySQL connectivity.
library(RMySQL)
library(party)
library(grid)
library(zoo)
library(sandwich)
library(strucchange)
library(modeltools)
library(stats4)

# List the available databases, then connect to the Lahman baseball database.
# NOTE(review): connecting as root with no password is fine for a local
# sandbox only — do not reuse on a shared server.
lahmanDb <- dbConnect(MySQL(), user = "root",
                      host = "localhost")
result <- dbGetQuery(lahmanDb, "show databases;"); dbDisconnect(lahmanDb);
result
lahman <- dbConnect(MySQL(), user = "root", db = "lahman",
                    host = "localhost")

# Import data
# Import per-team, per-season batting totals (1954-2011, excluding the
# strike-shortened 1981 and 1994 seasons).
# The 2B and 3B aliases start with a digit, so they are backtick-quoted to be
# unambiguous to the MySQL parser; R later renames them to X2B / X3B.
year_team_full <- dbGetQuery(lahman, "
SELECT teamID, yearID, SUM(R) AS R, SUM(AB) AS AB, SUM(H) AS H,
       SUM(`2B`) AS `2B`, SUM(`3B`) AS `3B`, SUM(HR) AS HR, SUM(BB) AS BB,
       SUM(HBP) AS HBP, SUM(SF) AS SF, SUM(SH) AS SH, SUM(SB) AS SB,
       SUM(CS) AS CS
FROM batting
WHERE yearID >= 1954 AND yearID <= 2011 AND yearID != 1981 AND yearID != 1994
GROUP BY teamID, yearID
;")

# Work on a copy; year_team_full keeps the raw query result for now.
year_team <- year_team_full

# Columns 6 and 7 hold the SQL "2B" and "3B" totals; copy them to
# syntactically valid R names X2B / X3B before dropping the originals.
year_team$X2B = year_team[,6]
year_team$X3B = year_team[,7]

# Then delete invalid column names 2B and 3B:
# (deleting column 6 twice is deliberate — after the first deletion the
# old column 7 shifts left into position 6)
year_team[,6] = NULL
year_team[,6] = NULL

# Calculate AVG, SLG, OPS; then insert into data frame
year_team$AVG <- with(year_team, (H/(AB)))                    # batting average
year_team$OBP <- with(year_team, ((H+BB+HBP)/(AB+BB+HBP+SF))) # on-base percentage
year_team$SLG <- with(year_team, ((H+X2B+2*X3B+3*HR)/(AB)))   # slugging: total bases / AB
year_team$OPS <- with(year_team, OBP + SLG)                   # on-base plus slugging

# Correlation Plot Function
# Draws a scatter plot of 2 variables from a dataframe
# displaying a best-fit line and R^2 value in the legend
# Correlation Plot Function
# Draws a scatter plot of two columns of a data frame, overlays the
# least-squares best-fit line, and shows the R^2 of that fit in the legend.
#   v1, v2 : column names (strings) for the x and y variables
#   df     : data frame containing those columns
# Returns the value of legend() (a list with $rect and $text components).
corr_plot <- function(v1, v2, df) {
  plot(df[[v1]], df[[v2]], xlab = v1, ylab = v2)  # scatter plot
  linfit <- lm(df[[v2]] ~ df[[v1]])               # best-fit line
  abline(linfit)                                  # draw the line
  # Add R^2 value in legend
  legend("topleft", legend = paste("R^2:", signif(summary(linfit)$r.squared, 4)))
}

# Add 1B for calculation simplicity: singles = hits minus all extra-base hits
year_team$X1B <- with(year_team, H-X2B-X3B-HR)

# add RC — runs created estimate: (H+BB) * total bases / (AB+BB)
year_team$RC <- with(year_team,(((H+BB)*(H+X2B+2*X3B+3*HR))/(AB+BB)))

# add XRR — presumably "extrapolated runs": a fixed linear weighting of each
# offensive event (singles .50, doubles .72, triples 1.04, HR 1.44, walks/HBP
# .33, SB .18, CS -.32, outs -.098) — weights taken as given, not re-derived here
year_team$XRR <- with(year_team,(.5*(H-HR-X3B-X2B))+(.72*X2B)+(1.04*X3B)+(1.44*HR)+.33*(HBP+BB)+.18*SB-.32*CS-.098*(AB-H))

# Write the enriched table back so later sections can re-subset from it.
year_team_full <- year_team
########################################################################################

# First, only use the different types of hits as predictors.
lin_basic_weights <- lm(R ~ X1B + X2B + X3B + HR, data = year_team)

# Apply the model's coefficients to predict past runs.
year_team$linRBasic <- predict(lin_basic_weights)

# Now add BB, HBP, and SB to improve the regression's accuracy.
lin_more_weights <- lm(R ~ X1B + X2B + X3B + HR + I(BB + HBP) + SB, data = year_team)
year_team$linRMore <- predict(lin_more_weights)

# blackboost (gradient boosting): R2 = 0.9495
library(mboost)
lin_more_weights <- blackboost(R ~ AB + H + BB + HBP + SF + SB + X2B + SLG, data = year_team)
year_team$linRMore <- predict(lin_more_weights)
corr_plot("linRMore", "R", year_team)

# random forest: R2 = 0.9227
library(randomForest)
lin_more_weights <- randomForest(R ~ AB + H + BB + HBP + SF + SB + X2B + SLG, data = year_team)
year_team$linRMore <- predict(lin_more_weights)
corr_plot("linRMore", "R", year_team)

# generalized linear model: R2 = 0.9431
library(stats)
lin_more_weights <- glm(R ~ AB + H + BB + HBP + SF + SB + X2B + SLG, data = year_team)
year_team$linRMore <- predict(lin_more_weights)
corr_plot("linRMore", "R", year_team)

# XRR alone seems to be a better predictor than OPS or OBP:
lin_more_weights <- lm(R ~ XRR, data = year_team)
year_team$linRMore <- predict(lin_more_weights)
corr_plot("linRMore", "R", year_team)
# cor = 0.967535
cor(year_team$linRMore, year_team$R)

# Relationship between R (runs scored) and OBP (on-base percentage),
# also mentioned in the movie "Moneyball".
# R2 = 0.8101 = coefficient of determination
library(party)
lin_more_weights <- cforest(R ~ OBP, data = year_team)
year_team$linRMore <- predict(lin_more_weights)
corr_plot("linRMore", "R", year_team)
# cor = 0.9000299
cor(year_team$linRMore, year_team$R)

#######################################################################################
# cforest with repeated cross-validation at increasing sample sizes.
# n = 200: R2 = 0.9627, cor = 0.981167
# Enable 2 CPU cores for caret's train() (NOTE: cluster is never stopped).
library(doSNOW)
registerDoSNOW(makeCluster(2, type = "SOCK"))
library(caret)
year_team <- head(year_team_full, n = 200)
# Encode IDs as plain numeric predictors.
year_team$teamID <- as.numeric(as.integer(as.factor(year_team$teamID)))
year_team$yearID <- as.numeric(year_team$yearID)
ctrl <- trainControl(method = "repeatedcv", repeats = 5)
lin_more_weights <- train(R ~ XRR + HR + H + BB + HBP + X2B + SB + SF,
                          data = year_team,
                          method = "cforest",
                          tuneLength = 7,
                          trControl = ctrl)

year_team$linRMore <- predict(lin_more_weights)
corr_plot("linRMore", "R", year_team)
cor(year_team$linRMore, year_team$R)

# Same model at n = 500: R2 = 0.96, cor = 0.9797793
library(doSNOW)
registerDoSNOW(makeCluster(2, type = "SOCK"))
library(caret)
year_team <- head(year_team_full, n = 500)
year_team$teamID <- as.numeric(as.integer(as.factor(year_team$teamID)))
year_team$yearID <- as.numeric(year_team$yearID)
ctrl <- trainControl(method = "repeatedcv", repeats = 5)
lin_more_weights <- train(R ~ XRR + HR + H + BB + HBP + X2B + SB + SF,
                          data = year_team,
                          method = "cforest",
                          tuneLength = 7,
                          trControl = ctrl)

year_team$linRMore <- predict(lin_more_weights)
corr_plot("linRMore", "R", year_team)
cor(year_team$linRMore, year_team$R)

# Same model at n = 1000: R2 = 0.9598, cor = 0.9796931
library(doSNOW)
registerDoSNOW(makeCluster(2, type = "SOCK"))
library(caret)
year_team <- head(year_team_full, n = 1000)
year_team$teamID <- as.numeric(as.integer(as.factor(year_team$teamID)))
year_team$yearID <- as.numeric(year_team$yearID)
ctrl <- trainControl(method = "repeatedcv", repeats = 5)
lin_more_weights <- train(R ~ XRR + HR + H + BB + HBP + X2B + SB + SF,
                          data = year_team,
                          method = "cforest",
                          tuneLength = 7,
                          trControl = ctrl)

year_team$linRMore <- predict(lin_more_weights)
corr_plot("linRMore", "R", year_team)
cor(year_team$linRMore, year_team$R)

# with n=1000:
# might be a good idea to create an ensemble model that combines the
# strengths of cforest models and clustering models, because I can
# distinguish at least 7 clusters in the plot produced by the following R command:
# corr_plot('linRMore', 'R', year_team)

# best model: cforest with R2=0.9572 with n=1384 samples in dataset
# using repeated cross-validation to reduce risks of overfitting the model
# n=1384 (n = sample size)
# R2 =0.9572 with n=1384 samples in dataset
# R2 seems to remain the same with and without repeated cross-validation
# when using cforest model
# cor=0.9783788
# enable use of 2 CPU cores while using train function which is part of caret package
# use 100% of the resources
# Best model: cforest, R2 = 0.9572 (cor = 0.9783788) on the full n = 1384
# data set, with repeated cross-validation. R2 is essentially the same with
# and without repeated CV for this model.
library(doSNOW)
registerDoSNOW(makeCluster(2, type = "SOCK"))
library(caret)
year_team <- year_team_full
# Encode IDs as plain numeric predictors.
year_team$teamID <- as.numeric(as.integer(as.factor(year_team$teamID)))
year_team$yearID <- as.numeric(year_team$yearID)
ctrl <- trainControl(method = "repeatedcv", repeats = 5)
lin_more_weights <- train(R ~ XRR + HR + H + BB + HBP + X2B + SB + SF,
                          data = year_team,
                          method = "cforest",
                          metric = "Rsquared",
                          tuneLength = 7,
                          trControl = ctrl)

year_team$linRMore <- predict(lin_more_weights)
corr_plot("linRMore", "R", year_team)
cor(year_team$linRMore, year_team$R)

# Same model WITHOUT repeated cross-validation -> danger of overfitting.
# n = 1384, R2 = 0.9572, cor = 0.978368 — essentially unchanged.
# XRR again seems to be a better predictor than SLG, OPS, or OBP.
library(party)
lin_more_weights <- cforest(R ~ XRR + HR + H + BB + HBP + X2B + SB + SF, data = year_team)
year_team$linRMore <- predict(lin_more_weights)
corr_plot("linRMore", "R", year_team)
cor(year_team$linRMore, year_team$R)

# blackboost with repeated cross-validation.
# n = 200: R2 = 0.9655, cor = 0.9825951
library(doSNOW)
registerDoSNOW(makeCluster(2, type = "SOCK"))
library(caret)
year_team <- head(year_team_full, n = 200)
# Encode IDs as plain numeric predictors.
year_team$teamID <- as.numeric(as.integer(as.factor(year_team$teamID)))
year_team$yearID <- as.numeric(year_team$yearID)
ctrl <- trainControl(method = "repeatedcv", repeats = 5)
lin_more_weights <- train(R ~ XRR + HR + H + BB + HBP + X2B + SB + SF,
                          data = year_team,
                          method = "blackboost",
                          tuneLength = 7,
                          trControl = ctrl)

year_team$linRMore <- predict(lin_more_weights)
corr_plot("linRMore", "R", year_team)
cor(year_team$linRMore, year_team$R)

# bstTree with repeated cross-validation.
# n = 200: R2 = 0.9779, cor = 0.9888938
library(doSNOW)
registerDoSNOW(makeCluster(2, type = "SOCK"))
library(caret)
year_team <- head(year_team_full, n = 200)
year_team$teamID <- as.numeric(as.integer(as.factor(year_team$teamID)))
year_team$yearID <- as.numeric(year_team$yearID)
ctrl <- trainControl(method = "repeatedcv", repeats = 5)
lin_more_weights <- train(R ~ XRR + HR + H + BB + HBP + X2B + SB + SF,
                          data = year_team,
                          method = "bstTree",
                          tuneLength = 7,
                          trControl = ctrl)

year_team$linRMore <- predict(lin_more_weights)
corr_plot("linRMore", "R", year_team)
cor(year_team$linRMore, year_team$R)

# You can apply this runs scored model to both opposing teams to
# calculate the winning percentage.
# It would be interesting to know how often we can
# correctly predict who will win the game after the 3rd inning
# has been played by applying model cforest(R ~ XRR+HR+H+BB+HBP+X2B+SB+SF)
# to all the data – from both teams – from the first, second and third inning.

# Focusing only on following boosted models thanks to success of previous
# blackboost and bstTree models: gamboost, gbm,glmboost

# gamboost with repeated cross-validation.
# n = 200: R2 = 0.9567, cor = 0.9780998
library(doSNOW)
registerDoSNOW(makeCluster(2, type = "SOCK"))
library(caret)
year_team <- head(year_team_full, n = 200)
# Encode IDs as plain numeric predictors.
year_team$teamID <- as.numeric(as.integer(as.factor(year_team$teamID)))
year_team$yearID <- as.numeric(year_team$yearID)
ctrl <- trainControl(method = "repeatedcv", repeats = 5)
lin_more_weights <- train(R ~ XRR + HR + H + BB + HBP + X2B + SB + SF,
                          data = year_team,
                          method = "gamboost",
                          tuneLength = 7,
                          trControl = ctrl)

year_team$linRMore <- predict(lin_more_weights)
corr_plot("linRMore", "R", year_team)
cor(year_team$linRMore, year_team$R)

# gbm with repeated cross-validation.
# n = 200: R2 = 0.2895, cor = 0.5380201 (much worse than the others)
library(doSNOW)
registerDoSNOW(makeCluster(2, type = "SOCK"))
library(caret)
year_team <- head(year_team_full, n = 200)
year_team$teamID <- as.numeric(as.integer(as.factor(year_team$teamID)))
year_team$yearID <- as.numeric(year_team$yearID)
ctrl <- trainControl(method = "repeatedcv", repeats = 5)
lin_more_weights <- train(R ~ XRR + HR + H + BB + HBP + X2B + SB + SF,
                          data = year_team,
                          method = "gbm",
                          tuneLength = 7,
                          trControl = ctrl)

year_team$linRMore <- predict(lin_more_weights)
corr_plot("linRMore", "R", year_team)
cor(year_team$linRMore, year_team$R)

# glmboost with repeated cross-validation.
# n = 200: R2 = 0.9543, cor = 0.976859
library(doSNOW)
registerDoSNOW(makeCluster(2, type = "SOCK"))
library(caret)
year_team <- head(year_team_full, n = 200)
year_team$teamID <- as.numeric(as.integer(as.factor(year_team$teamID)))
year_team$yearID <- as.numeric(year_team$yearID)
ctrl <- trainControl(method = "repeatedcv", repeats = 5)
lin_more_weights <- train(R ~ XRR + HR + H + BB + HBP + X2B + SB + SF,
                          data = year_team,
                          method = "glmboost",
                          tuneLength = 7,
                          trControl = ctrl)

year_team$linRMore <- predict(lin_more_weights)
corr_plot("linRMore", "R", year_team)
cor(year_team$linRMore, year_team$R)

# Best boosted model SEEMS to be bstTree
# Expand to full data set with n=1384

# bstTree with repeated cross-validation on the full data set.
# n = 1384: R2 = 0.9509, cor = 0.9751362
library(doSNOW)
registerDoSNOW(makeCluster(2, type = "SOCK"))
library(caret)
year_team <- year_team_full
# Encode IDs as plain numeric predictors.
year_team$teamID <- as.numeric(as.integer(as.factor(year_team$teamID)))
year_team$yearID <- as.numeric(year_team$yearID)
ctrl <- trainControl(method = "repeatedcv", repeats = 5)
lin_more_weights <- train(R ~ XRR + HR + H + BB + HBP + X2B + SB + SF,
                          data = year_team,
                          method = "bstTree",
                          tuneLength = 7,
                          trControl = ctrl)

year_team$linRMore <- predict(lin_more_weights)
corr_plot("linRMore", "R", year_team)
cor(year_team$linRMore, year_team$R)

# blackboost with repeated cross-validation on the full data set.
# n = 1384: R2 = 0.9489, cor = 0.9741078
library(doSNOW)
registerDoSNOW(makeCluster(2, type = "SOCK"))
library(caret)
year_team <- year_team_full
year_team$teamID <- as.numeric(as.integer(as.factor(year_team$teamID)))
year_team$yearID <- as.numeric(year_team$yearID)
ctrl <- trainControl(method = "repeatedcv", repeats = 5)
lin_more_weights <- train(R ~ XRR + HR + H + BB + HBP + X2B + SB + SF,
                          data = year_team,
                          method = "blackboost",
                          tuneLength = 7,
                          trControl = ctrl)

year_team$linRMore <- predict(lin_more_weights)
corr_plot("linRMore", "R", year_team)
cor(year_team$linRMore, year_team$R)

# So the best model is cforest with R2=0.9572 with n=1384 samples in the dataset.
# Interestingly enough, cforest was also one of the best performers during the
# 'Show of Hands' Kaggle competition (part of the Analytics Edge course).
# source: https://www.kaggle.com/c/the-analytics-edge-mit-15-071x
# Show of Hands was not a regression problem, but a classification problem.
# second source: https://static.squarespace.com/static/51156277e4b0b8b2ffe11c00/t/53ad86e5e4b0b52e4e71cfab/1403881189332/Applied_Predictive_Modeling_in_R.pdf

Here is the result of using sample.split to split the data set into a train and test set:

#####################################################################################

# Hold-out evaluation: split the data into train/test with caTools::sample.split.
# train set R2 = 0.9615, test set R2 = 0.9032
library(doSNOW)
registerDoSNOW(makeCluster(2, type = "SOCK"))
library(caret)
library(mboost)
# Encode IDs as plain numeric predictors (in place, on the full table).
year_team_full$teamID <- as.numeric(as.integer(as.factor(year_team_full$teamID)))
year_team_full$yearID <- as.numeric(year_team_full$yearID)

# Install caTools on demand (require() returns FALSE instead of erroring,
# which is what makes this install-on-miss pattern work).
if (!require("caTools")) {
  install.packages("caTools", repos = "http://cran.rstudio.com/")
  library("caTools")
}

# Randomly split the data (stratified on R), 75% train.
set.seed(88)
split <- sample.split(year_team_full$R, SplitRatio = 0.75)

# Create training and testing sets.
year_team_train <- subset(year_team_full, split == TRUE)
nrow(year_team_train)
year_team_test <- subset(year_team_full, split == FALSE)
nrow(year_team_test)
# Keep a copy with the response so the test-set predictions can be scored,
# then drop R from the test set itself.
year_team_test_with_R <- year_team_test
year_team_test$R <- NULL

ctrl <- trainControl(method = "repeatedcv", repeats = 5)
lin_more_weights <- train(R ~ XRR + HR + H + BB + HBP + X2B + SB + SF,
                          data = year_team_train,
                          method = "cforest",
                          metric = "Rsquared",
                          tuneLength = 7,
                          trControl = ctrl)

# Train-set fit: R2 = 0.9615, cor = 0.9805769
year_team_train$linRMore <- predict(lin_more_weights)
corr_plot("linRMore", "R", year_team_train)
cor(year_team_train$linRMore, year_team_train$R)

# Test-set fit: R2 = 0.9032, cor = 0.9503854
year_team_test_with_R$linRMore <- predict(lin_more_weights, newdata = year_team_test)
corr_plot("linRMore", "R", year_team_test_with_R)
cor(year_team_test_with_R$linRMore, year_team_test_with_R$R)

#####################################################################################

Advertisements

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s