Archive for the ‘Data Science’ Category

# .R script showing capabilities of sparklyr R package
# Prerequisites before running this R script: 
# Ubuntu 16.04.3 LTS 64-bit, r-base (version 3.4.1 or newer), 
# RStudio 64-bit version, libssl-dev, libcurl4-openssl-dev, libxml2-dev
install.packages("httr")
install.packages("xml2")
# New features in sparklyr 0.6:
# https://blog.rstudio.com/2017/07/31/sparklyr-0-6/
install.packages("sparklyr")
install.packages("dplyr")
install.packages("ggplot2")
install.packages("tidyr")
library(sparklyr)
library(dplyr)
library(ggplot2)
library(tidyr)
set.seed(100)
# sparklyr cheat sheet: https://github.com/rstudio/cheatsheets/raw/master/source/pdfs/sparklyr.pdf
# dplyr+tidyr: https://www.rstudio.com/wp-content/uploads/2015/02/data-wrangling-cheatsheet.pdf
# sparklyr currently (2017-08-19) only supports Apache Spark version 2.2.0 or older
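# Optionally list the Spark versions that sparklyr is able to install locally
# (a quick check; the exact output depends on the installed sparklyr version):
spark_available_versions()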
# Install Spark locally:
sc_version <- "2.2.0"
spark_install(sc_version)
config <- spark_config()
# number of CPU cores to use:
config$spark.executor.cores <- 6
# amount of RAM to use for Apache Spark executors:
config$spark.executor.memory <- "4G"
# Connect to local version:
sc <- spark_connect(master = "local",
                    config = config, version = sc_version)
# Copy data to Spark memory:
import_iris <- sdf_copy_to(sc, iris, "spark_iris", overwrite = TRUE) 
# partition data:
partition_iris <- sdf_partition(import_iris,training=0.5, testing=0.5) 
# Create Hive metadata for each partition:
sdf_register(partition_iris,c("spark_iris_training","spark_iris_test")) 
# Create reference to training data in Spark table
tidy_iris <- tbl(sc,"spark_iris_training") %>% select(Species, Petal_Length, Petal_Width) 
# Spark ML Decision Tree Model
model_iris <- tidy_iris %>% ml_decision_tree(response="Species", features=c("Petal_Length","Petal_Width")) 
# Create reference to test data in Spark table
test_iris <- tbl(sc,"spark_iris_test") 
# Bring predictions data back into R memory for plotting:
pred_iris <- sdf_predict(model_iris, test_iris) %>% collect
pred_iris %>%
 inner_join(data.frame(prediction=0:2,
 lab=model_iris$model.parameters$labels)) %>%
 ggplot(aes(Petal_Length, Petal_Width, col=lab)) +
 geom_point() 
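# Optional sanity check (a minimal sketch, not part of the original script):
# compare the predicted labels with the true Species values in the locally
# collected predictions, reusing the prediction-to-label mapping from above.
pred_iris %>%
 inner_join(data.frame(prediction=0:2,
 lab=model_iris$model.parameters$labels)) %>%
 summarise(accuracy = mean(lab == Species))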
spark_disconnect(sc)
# bash shell script for Ubuntu 14.04 LTS 64-bit:
# Free disk space required: around 5 GB
# Minimum internal memory/RAM requirements: 4 GB RAM minimum
# Time required to execute 'sudo make' compilation of RStudio source code: around 45 minutes, maybe even more 
# sudo checkinstall process should take around 20 minutes to finish
# REQUIRES: newest version of wget compiled from Github sources
# Copy-paste the following commands into the Terminal one by one:
# install newest version of wget from Github sources in order to solve the following wget issue in Ubuntu 14.04 LTS:
# https://github.com/chapmanb/bcbio-nextgen/issues/1133
sudo DEBIAN_FRONTEND=noninteractive apt-get update
sudo DEBIAN_FRONTEND=noninteractive apt-get --yes --force-yes install libqtwebkit-dev checkinstall qtbase5-dev pandoc r-base
sudo DEBIAN_FRONTEND=noninteractive apt-get --yes --force-yes install libboost-all-dev cmake libqt4-dev build-essential default-jdk
sudo DEBIAN_FRONTEND=noninteractive apt-get --yes --force-yes build-dep wget
cd
sudo rm -rf wget
sudo rm /usr/bin/rstudio*
git clone https://github.com/mirror/wget.git
cd wget
./bootstrap
./configure
sudo make
sudo checkinstall
# Press 3 and ENTER and then set version to 1.17.1.13
apt-cache show wget
# Terminal output should look like this:
# apt-cache show wget
# Package: wget
# Status: install ok installed
# Priority: extra
# Section: checkinstall
# Installed-Size: 3864
# Maintainer: root
# Architecture: amd64
# Version: 1.17.1.13-1
# Provides: wget
# Conffiles:
# /etc/wgetrc 618c05b4106ad20141dcf6deada2e87f obsolete
# Description: Package created with checkinstall 1.6.2
# Description-md5: 556b8d22567101c7733f37ce6557412e
# compile and install RStudio Server from source code:
cd
git clone https://github.com/rstudio/rstudio.git
cd rstudio/
mkdir build
cd build/
cd ~/rstudio/dependencies/common
bash install-common
bash install-common
cd
bash ~/rstudio/dependencies/linux/install-dependencies-debian
bash ~/rstudio/dependencies/linux/install-dependencies-debian
cd /tmp
wget http://dl.google.com/closure-compiler/compiler-latest.zip
unzip compiler-latest.zip
rm COPYING README.md compiler-latest.zip
sudo mv compiler.jar ~/rstudio/src/gwt/tools/compiler/compiler.jar
cd ~/rstudio
sudo rm -rf build
sudo cmake -DRSTUDIO_TARGET=Server -DCMAKE_BUILD_TYPE=Release .
time sudo make
# the 'sudo make' compile process should take around 45 minutes to finish
time sudo checkinstall
# sudo checkinstall process should take around 20 minutes to finish
apt-cache show rstudio
# Terminal output should look like this:
# Package: rstudio
# Status: install ok installed
# Priority: extra
# Section: checkinstall
# Installed-Size: 293492
# Maintainer: root
# Architecture: amd64
# Version: 20160206-1
# Provides: rstudio
# Description: Package created with checkinstall 1.6.2
# Description-md5: 556b8d22567101c7733f37ce6557412e
sudo ln -s /usr/local/lib/rstudio-server/bin/rserver /usr/bin
nohup rserver &
# then use a web browser to navigate to http://127.0.0.1:8787/ to access the RStudio Server interface
# Reproducible research .R script to run in RStudio in Ubuntu 14.04 LTS 64-bit
# Prerequisites to install: 
# https://mark911.wordpress.com/2014/11/06/how-to-install-newest-version-of-r-and-rstudio-in-ubuntu-14-04-lts-using-a-bash-script/
# Further prerequisites to install in R or RStudio:
install.packages(c("Quandl", "dplyr", "ggvis", "lubridate"))
# Data set: 
# https://www.quandl.com/data/ODA/MOZ_PPPSH-Mozambique-Share-of-World-GDP-based-on-PPP
library(Quandl)
library(dplyr)
library(ggvis)
library(lubridate)
data <- Quandl("ODA/MOZ_PPPSH", authcode="FiHHoC-Gnx3CHzr9385J")
str(data)
dplyr::glimpse(data)
head(data)
tail(data)
data$year <- lubridate::year(data$Date)
min_year <- min(data$year)
max_year <- max(data$year)
# source: https://github.com/rstudio/ggvis/blob/master/vignettes/ggvis-basics.Rmd
# source: Data Wrangling Cheat Sheet:
# http://www.rstudio.com/wp-content/uploads/2015/02/data-wrangling-cheatsheet.pdf
# source: http://ggvis.rstudio.com/0.1/quick-examples.html
data %>% 
 ggvis(~Date,~Value) %>% 
 layer_points() %>% 
 layer_model_predictions(model = input_select(
 c("loess" = "loess",
 "lm" = "lm",
 "MASS::rlm" = "MASS::rlm"),
 label = "model")) %>%
 layer_smooths(se = TRUE,
 span = input_slider(min = 0.3, max = 1, value = 0.8, step = 0.1,
 label = "Smoothing span")) %>%
 add_axis("x", title = "Date") %>%
 add_axis("y", title = "Mozambique Share of World GDP based on PPP, %")

!!!! The following procedure is risky, because manually installing the 6 library dependencies below can potentially break your package manager dependencies !!!!
!!!! Use the following bash script at your own risk !!!!
!!!! It is highly recommended to run this script in Ubuntu 14.04 LTS 64-bit in a virtual machine (VirtualBox or VMware) and not in your host Ubuntu installation !!!!
However, the Scilab build was tested and works on my own real installation of Ubuntu 14.04 LTS 64-bit.
It is probably safer to follow this procedure in Ubuntu 15.04 final beta, but I have not tested that release myself, as I stick to LTS releases of Ubuntu.

Contents of bash script:

# !!!! The following procedure is risky, because manually installing the 6 library dependencies below can potentially break your package manager dependencies !!!!
# !!!! Use the following bash script at your own risk !!!!
# !!!! It is highly recommended to run this script in Ubuntu 14.04 LTS 64-bit in a virtual machine (VirtualBox or VMware) and not in your host Ubuntu installation !!!!
# However, the Scilab build was tested and works on my own real installation of Ubuntu 14.04 LTS 64-bit.
# It is probably safer to follow this procedure in Ubuntu 15.04 final beta, but I have not tested that release myself, as I stick to LTS releases of Ubuntu.
sudo DEBIAN_FRONTEND=noninteractive apt-get update
sudo DEBIAN_FRONTEND=noninteractive apt-get purge scilab
sudo DEBIAN_FRONTEND=noninteractive apt-get --yes --force-yes install checkinstall build-essential unp git
sudo DEBIAN_FRONTEND=noninteractive apt-get --yes --force-yes build-dep scilab
# manually install 6 library dependencies for scilab in Ubuntu 14.04 LTS 64-bit:
# these dependencies are available in the standard Ubuntu repositories for vivid (Ubuntu 15.04 final beta)
cd /tmp
wget --no-check-certificate https://launchpad.net/ubuntu/+source/libjogl2-java/2.2.4+dfsg-1/+build/6556107/+files/libjogl2-java-doc_2.2.4%2Bdfsg-1_all.deb
wget --no-check-certificate https://launchpad.net/ubuntu/+source/libjogl2-java/2.2.4+dfsg-1/+build/6556107/+files/libjogl2-java_2.2.4%2Bdfsg-1_all.deb
wget --no-check-certificate http://launchpadlibrarian.net/189706070/libjogl2-jni_2.2.4%2Bdfsg-1_amd64.deb
wget --no-check-certificate http://launchpadlibrarian.net/189706067/libjogl2-toolkits_2.2.4%2Bdfsg-1_all.deb
wget --no-check-certificate http://francisdavey.co.uk/ubuntu/ubuntu/pool/universe/g/gluegen2/libgluegen2-jni_2.2.4-2_amd64.deb
wget --no-check-certificate http://francisdavey.co.uk/ubuntu/ubuntu/pool/universe/g/gluegen2/libgluegen2-rt-java_2.2.4-2_all.deb
rm *i386*.deb
sudo dpkg -i libgluegen2-jni_*.deb
sudo dpkg -i libgluegen2-rt-java*.deb
sudo dpkg -i libjogl2-jni_*.deb
sudo dpkg -i libjogl2-java_*.deb
sudo dpkg -i libjogl2-toolkits_*.deb
sudo dpkg -i libjogl2-java-doc_*.deb
# compile and install the newest version of Scilab in Ubuntu 14.04 LTS 64-bit:
cd
sudo rm -rf ~/scilab
git clone https://github.com/opencollab/scilab.git
cd ~/scilab/scilab
sudo ./configure
sudo make
sudo make install
/usr/local/bin/scilab -version
# Terminal output should look similar to this output:
# Scilab version "5.6.0.0"
# scilab-branch-master
/usr/local/bin/scilab
# hardware requirements: see https://github.com/wnd-charm/wnd-charm
# Wndchrm - an open source utility for biological image analysis
# WND-CHARM is an acronym that stands for "Weighted Neighbor Distance using Compound Hierarchy of Algorithms Representing Morphology."
cd
rm -rf wnd-charm
# install prerequisites:
sudo apt-get update
sudo apt-get install build-essential libtiff4-dev libfftw3-dev libX11-dev libxt-dev libxaw7-dev phylip
sudo apt-get install python-networkx python-skimage python-sklearn checkinstall
sudo apt-get install python-matplotlib python-numpy python-pandas
sudo apt-get install python-jsonschema openbox ipython python-scipy
sudo apt-get install spyder unp python-pip build-essential python-dev swig
sudo pip install --upgrade beautifulsoup4 numpy scipy matplotlib argparse
sudo pip install --upgrade ipython mistune networkx pandas py4j runipy
sudo pip install --upgrade scikit-image scikit-learn scipy-data_fitting
sudo pip install --upgrade statsmodels jsonschema
# install wnd-charm
git clone https://github.com/wnd-charm/wnd-charm.git
cd wnd-charm
sudo ./configure
sudo make
sudo make check
# change version from 'charm' to 1 when running sudo checkinstall to avoid errors!
# so checkinstall build values should be as follows:
# 0 - Maintainer: [ root ]
# 1 - Summary: [ Package created with checkinstall 1.6.2 ]
# 2 - Name: [ wnd ]
# 3 - Version: [ 1 ] -> instead of invalid Version: [ charm ]
# 4 - Release: [ 1 ]
# 5 - License: [ GPL ]
# 6 - Group: [ checkinstall ]
# 7 - Architecture: [ amd64 ]
# 8 - Source location: [ wnd-charm ]
# 9 - Alternate source location: [ ]
# 10 - Requires: [ ]
# 11 - Provides: [ wnd ]
# 12 - Conflicts: [ ]
# 13 - Replaces: [ ]
sudo checkinstall
apt-cache show wnd
# output of apt-cache show wnd should be similar to this:
# Package: wnd
# Status: install ok installed
# Priority: extra
# Section: checkinstall
# Installed-Size: 1076
# Maintainer: root
# Architecture: amd64
# Version: 1-1
# Provides: wnd
# Description: Package created with checkinstall 1.6.2
# Description-md5: 556b8d22567101c7733f37ce6557412e
wndchrm

Run following bash script in a bash Terminal in Ubuntu 14.04 LTS 64-bit:

#!/bin/bash
# Based on: https://spark.apache.org/docs/1.1.0/building-with-maven.html
# Purpose: this script will automatically compile and install
# the newest version of maven and Apache Spark via the github sources
# Software requirements: Ubuntu 14.04 LTS 64-bit, git, build-essential,
# ant, unp, python2.7, java 1.7.0 or higher
# Minimum RAM requirements for this script: 2 Gigabytes of RAM (maybe even more) 
# Please make sure to close any web browser windows and any other 
# memory hogging applications before running this memory intensive bash script.

# First uninstall any conflicting binary packages of maven and maven2:
cd
sudo DEBIAN_FRONTEND=noninteractive add-apt-repository --yes ppa:marutter/rrutter
sudo DEBIAN_FRONTEND=noninteractive add-apt-repository --yes ppa:marutter/c2d4u
sudo DEBIAN_FRONTEND=noninteractive apt-get update

# Install tools required to build maven and Apache Spark with sparkR support:
sudo apt-get build-dep maven maven2
sudo DEBIAN_FRONTEND=noninteractive apt-get --yes --force-yes  install  r-base-core r-base
sudo DEBIAN_FRONTEND=noninteractive apt-get --yes --force-yes  install  git build-essential python-protobuf protobuf-compiler
sudo DEBIAN_FRONTEND=noninteractive apt-get --yes --force-yes  install  ant unp python2.7 openjdk-7-jre-headless 
sudo DEBIAN_FRONTEND=noninteractive apt-get --yes --force-yes  purge maven maven2

# Also remove any previously installed versions of Apache Spark:
sudo rm -rf spark*
sudo rm -rf /usr/local/spark*

# install newest version of maven
rm -rf maven*
git clone https://github.com/apache/maven.git
cd maven
ant -Dmaven.home="$HOME/apps/maven/apache-maven-SNAPSHOT"
cd ~/maven/apache-maven/target
unp apache-maven-*-bin.tar.gz
sudo rm /usr/bin/mvn
sudo ln -s ~/maven/apache-maven/target/apache-maven-*/bin/mvn  /usr/bin/mvn
mvn -v

# example of Terminal output:
#Apache Maven 3.3.2-SNAPSHOT
#Maven home: $HOME/maven/apache-maven/target/apache-maven-3.3.2-SNAPSHOT
#Java version: 1.7.0_76, vendor: Oracle Corporation
#Java home: /usr/lib/jvm/java-7-oracle/jre
#Default locale: en_US, platform encoding: UTF-8
#OS name: "linux", version: "4.0.0-040000rc3-lowlatency", arch: "amd64", family: "unix"

# install SparkR-pkg
cd
rm -rf SparkR-pkg/
git clone https://github.com/amplab-extras/SparkR-pkg.git
cd SparkR-pkg/
SPARK_VERSION=1.5.0 USE_MAVEN=1 ./install-dev.sh
# ./sparkR examples/pi.R local[2]

# install newest version of Apache Spark:
cd
git clone git://github.com/apache/spark.git
cd spark

# increase MaxPermSize to avoid out-of-memory errors during compile process:
export MAVEN_OPTS="-Xmx2g -XX:MaxPermSize=512M -XX:ReservedCodeCacheSize=512m"
mvn -PsparkR -DskipTests clean package

# End result of Apache Spark build process should look
# something like this (without any memory errors):
# [INFO] ------------------------------------------------------------------------
# [INFO] Reactor Summary:
# [INFO] 
# [INFO] Spark Project Parent POM ........................... SUCCESS [  9.048 s]
# [INFO] Spark Launcher Project ............................. SUCCESS [ 19.509 s]
# [INFO] Spark Project Networking ........................... SUCCESS [ 14.113 s]
# [INFO] Spark Project Shuffle Streaming Service ............ SUCCESS [  7.626 s]
# [INFO] Spark Project Core ................................. SUCCESS [05:46 min]
# [INFO] Spark Project Bagel ................................ SUCCESS [ 33.517 s]
# [INFO] Spark Project GraphX ............................... SUCCESS [01:45 min]
# [INFO] Spark Project Streaming ............................ SUCCESS [02:35 min]
# [INFO] Spark Project Catalyst ............................. SUCCESS [02:38 min]
# [INFO] Spark Project SQL .................................. SUCCESS [03:40 min]
# [INFO] Spark Project ML Library ........................... SUCCESS [03:46 min]
# [INFO] Spark Project Tools ................................ SUCCESS [ 19.095 s]
# [INFO] Spark Project Hive ................................. SUCCESS [03:00 min]
# [INFO] Spark Project REPL ................................. SUCCESS [01:07 min]
# [INFO] Spark Project Assembly ............................. SUCCESS [02:12 min]
# [INFO] Spark Project External Twitter ..................... SUCCESS [ 26.990 s]
# [INFO] Spark Project External Flume Sink .................. SUCCESS [ 41.008 s]
# [INFO] Spark Project External Flume ....................... SUCCESS [ 42.961 s]
# [INFO] Spark Project External MQTT ........................ SUCCESS [ 41.138 s]
# [INFO] Spark Project External ZeroMQ ...................... SUCCESS [ 27.237 s]
# [INFO] Spark Project External Kafka ....................... SUCCESS [01:04 min]
# [INFO] Spark Project Examples ............................. SUCCESS [03:53 min]
# [INFO] Spark Project External Kafka Assembly .............. SUCCESS [ 41.333 s]
# [INFO] ------------------------------------------------------------------------
# [INFO] BUILD SUCCESS
# [INFO] ------------------------------------------------------------------------
# [INFO] Total time: 36:57 min
# [INFO] Finished at: 2015-03-21T02:19:07+01:00
# [INFO] Final Memory: 83M/1292M
# [INFO] ------------------------------------------------------------------------
# Based on: https://github.com/databricks/spark-csv

# As an example, load cars.csv from GitHub into Apache Spark using pyspark and the databricks package
# com.databricks:spark-csv
cd ~/spark

# first clean up any previously downloaded files:
rm cars.csv
rm spark-csv
wget --no-check-certificate https://github.com/databricks/spark-csv/raw/master/src/test/resources/cars.csv
wget --no-check-certificate  https://github.com/databricks/spark-csv
groupId=`grep groupId spark-csv|cut -d":" -f2|cut -d" " -f2|tail -n 1`
artifactId=`grep artifactId spark-csv|cut -d":" -f2|cut -d" " -f2|tail -n 1`
version=`grep version spark-csv|tail -n 1|cut -d":" -f2|cut -d" " -f2`

# Use following command to run pyspark using four CPU cores on the local machine
# while also loading the spark-csv databricks package:
# source: https://spark.apache.org/docs/latest/programming-guide.html
bin/pyspark -v --master local[4]  --packages `echo $groupId`:`echo $artifactId`:`echo $version`

Then run the following commands in the pyspark Terminal in Ubuntu 14.04 LTS 64-bit:

# manually copy-paste following commands into the pyspark Terminal session:
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)
df = sqlContext.load(source="com.databricks.spark.csv", header="true",path = "cars.csv")
df.select("year", "model").show()
# output of last command should be similar to this:
# year model
# 2012 S 
# 1997 E350 
# 2015 Volt
# Press CTRL-D to end the pyspark session
# useful links: 
# http://ramhiser.com/2015/02/01/configuring-ipython-notebook-support-for-pyspark/
# https://spark.apache.org/docs/1.1.1/api/python/pyspark.rdd.RDD-class.html
# install R 
sudo DEBIAN_FRONTEND=noninteractive add-apt-repository ppa:marutter/rrutter
sudo DEBIAN_FRONTEND=noninteractive add-apt-repository ppa:marutter/c2d4u
sudo DEBIAN_FRONTEND=noninteractive apt-get update
sudo DEBIAN_FRONTEND=noninteractive apt-get --yes --force-yes install r-base-core r-base
# install RStudio :
# Free disk space required: around 5 GB
# Mac OS X users should use RStudio instead of R to avoid the following UNIX child process forking error:
# THE_PROCESS_HAS_FORKED_AND_YOU_CANNOT_USE_THIS_COREFOUNDATION_FUNCTIONALITY_YOU_MUST_EXEC__() to debug.
MACHINE_TYPE=`uname -m`
cd /tmp
rm rstudio*.deb
rm index.html
if [ ${MACHINE_TYPE} == 'x86_64' ]; then
 # 64-bit stuff here
sudo DEBIAN_FRONTEND=noninteractive apt-get --yes --force-yes install gdebi-core pandoc libssl0.9.8 libapparmor1
wget --no-check-certificate http://www.rstudio.com/products/rstudio/download/
wget --no-check-certificate `cat index.html|grep -v tar|grep amd64\.deb|cut -d"\"" -f2`
sudo dpkg -i rstudio*.deb
sudo DEBIAN_FRONTEND=noninteractive apt-get --yes --force-yes -f install
else
 # 32-bit stuff here
sudo DEBIAN_FRONTEND=noninteractive apt-get --yes --force-yes install gdebi-core pandoc libssl0.9.8 libapparmor1
wget --no-check-certificate http://www.rstudio.com/products/rstudio/download/
wget --no-check-certificate `cat index.html|grep -v tar|grep i386\.deb|cut -d"\"" -f2`
sudo dpkg -i rstudio*.deb
sudo DEBIAN_FRONTEND=noninteractive apt-get --yes --force-yes -f install
fi
cd $HOME
# troubleshooting information to check the rstudio installation:
uname -m
file /usr/lib/rstudio/bin/rstudio
ldd `which rstudio`

The purpose of these lists is to clarify which applications and R packages need to be installed on your PC to be able to perform bioinformatics research in Ubuntu 14.04 LTS or Windows (64-bit versions).


Order of installation of applications in Windows, Mac OS X or GNU/Linux before installing packages in R 3.1.1 or newer:

* libxml2-devel in Fedora 20 (a GNU/Linux distribution)
* gcc gcc47-c++ gcc-fortran in OpenSUSE (a GNU/Linux distribution)
* gcc g++ gfortran xmlsec1 libxml2-dev libcurl4-openssl-dev r-cran-rcurl curl in Ubuntu 14.04 LTS 32-bit or 64-bit (a GNU/Linux distribution)
* R 3.1.1 or newer
* Windows 64-bit users should run the following .cmd script as administrator:

https://courses.edx.org/courses/KIx/KIexploRx/3T2014/discussion/forum/i4x-kiX-KIexploRx-course-2014_Practicalities/threads/5451f72635c79c749c000906

The previous .cmd script takes care of installing curl, R, RStudio and other applications in Windows. Or you can download curl here:

https://mark911.wordpress.com/2015/09/27/how-to-compile-and-install-newest-version-of-curl-from-github-in-ubuntu-14-04-lts-64-bit/

https://courses.edx.org/courses/KIx/KIexploRx/3T2014/discussion/forum/4ba16c478c9948cca206bf240bcd185d/threads/541873c00579cf8ead0005db

http://stackoverflow.com/questions/23198204/curl-is-not-recognized-as-an-internal-or-external-command

* Google Chrome browser or any other browser that uses the newest version of Adobe Flash in Windows, Mac OS X and GNU/Linux: https://www.google.com/chrome/browser/
* RStudio is optional, but recommended: http://www.rstudio.com/products/rstudio/download/
* I recommend avoiding RKWard (as an R extension), as it seems to have issues with R 3.1.1 in combination with Bioconductor packages.


Order of installation of R packages in R 3.1.1 (or newer) after installing previous applications in Ubuntu 14.04 LTS or Windows (64-bit versions):

* XML (requires install first of libxml2-dev or libxml2-devel in GNU/Linux)
* RJSONIO (requires install first of gcc and gcc47-c++ in OpenSuse and install of g++ in Ubuntu)
* Rcpp (requires R version >= 3.0.0 !)
* httr (requires R version >= 3.0.0 !)
* maps
* reshape2
* RCurl (requires install first of curl binary on operating system level)
* devtools (is required to install sweSCB R package from Github)
* sweSCB (install from github, requires install first of devtools R package)
* calibrate
* pheatmap
* gplots
* Biobase, a Bioconductor package, installed through biocLite()
* DESeq2, a Bioconductor package, installed through biocLite()
* biomaRt, a Bioconductor package, installed through biocLite()
* org.Hs.eg.db, a Bioconductor package, installed through biocLite()
* topGO, a Bioconductor package, installed through biocLite()
* for those who want to do the extended, non-mandatory version of the lab: Rsubread is a Bioconductor package which is not available on Windows. If you are on Windows, just do the short version of the lab. Rsubread only works on Linux and Mac OS X 32-bit and 64-bit systems.
* Rgraphviz, a Bioconductor package, installed through biocLite()
* GEOquery, a Bioconductor package, installed through biocLite()
* limma, a Bioconductor package, installed through biocLite()

Copy and paste each line below into R or RStudio, one line at a time, pressing the ENTER key after each line.

install.packages("XML")
install.packages("RJSONIO")
install.packages("Rcpp", dependencies = TRUE)
install.packages("httr")
install.packages("maps")
install.packages("reshape2")
install.packages("RCurl")
install.packages("devtools", dependencies = TRUE)
devtools::install_github("rOpenGov/sweSCB")
install.packages("calibrate")
install.packages("pheatmap")
install.packages("gplots")

# install bioconductor packages
# warning: compiling DESeq2 may take a long time
# do NOT interrupt the process!

source("http://bioconductor.org/biocLite.R")

# run the following biocLite("BiocUpgrade") command
# if you are upgrading from an older Bioconductor version
# to Bioconductor version 3.0 (BiocInstaller 1.16.0).
# The current release of Bioconductor is version 3.0;
# it works with R version 3.1.1.
# Users of older R and Bioconductor versions must update their installation
# to take advantage of new features.

biocLite("BiocUpgrade")
biocLite("Biobase",dependencies=TRUE)
biocLite("DESeq2",dependencies=TRUE) 
biocLite("biomaRt")
biocLite("org.Hs.eg.db")
biocLite("topGO",dependencies=TRUE)
biocLite("Rsubread")
biocLite("Rgraphviz")
biocLite("GEOquery")
biocLite("limma")

Here is more info about installing the curl binary:

https://mark911.wordpress.com/2015/09/27/how-to-compile-and-install-newest-version-of-curl-from-github-in-ubuntu-14-04-lts-64-bit/

http://www.confusedbycode.com/curl/

https://courses.edx.org/courses/KIx/KIexploRx/3T2014/discussion/forum/4ba16c478c9948cca206bf240bcd185d/threads/541873c00579cf8ead0005db

Source: https://courses.edx.org/courses/KIx/KIexploRx/3T2014/discussion/forum/i4x-kiX-KIexploRx-course-2014_Technical_issues/threads/541fda54f346085445000bc8

Prerequisites for all platforms (Windows, Mac OSX and GNU/Linux):

First make sure that R version 3.1.1 (or newer) is installed.
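
A quick way to verify this from within R (a minimal check; the reported version will vary between installations):

getRversion()

getRversion() >= "3.1.1"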

Prerequisites for Windows:

Install the curl binary for Windows before importing List_P_3D_data.csv.

Windows 64-bit users should run the following .cmd script as administrator:
https://courses.edx.org/courses/KIx/KIexploRx/3T2014/discussion/forum/i4x-kiX-KIexploRx-course-2014_Practicalities/threads/5451f72635c79c749c000906

The previous .cmd script takes care of installing curl, R, RStudio and other applications in Windows. Or you can download curl here:

http://www.confusedbycode.com/curl/

I successfully installed and tested the import using this Windows curl binary:

http://www.confusedbycode.com/curl/curl-7.38.0-win64.msi

Make sure to close RGui, close RStudio and then restart RGui or RStudio before proceeding with the next steps.

Prerequisites for Fedora 20:

Run the following Terminal commands before importing List_P_3D_data.csv:

sudo yum update
sudo yum install curl curl-devel

Prerequisites for Ubuntu 14.04 LTS/Linux Mint:

Run the following Terminal commands before importing List_P_3D_data.csv:

sudo apt-get update
sudo apt-get install libcurl4-openssl-dev r-cran-rcurl curl

Then upgrade to newest version of curl using this procedure:

https://mark911.wordpress.com/2015/09/27/how-to-compile-and-install-newest-version-of-curl-from-github-in-ubuntu-14-04-lts-64-bit/

Then run the following commands in R or RStudio:

install.packages("RCurl")

library(RCurl)

URL <- "https://courses.edx.org/c4x/KIx/KIexploRx/asset/List_P_3D_data.csv"

destfile <- "List_P_3D_data.csv"

download.file(URL, destfile = destfile, method = "curl")

a <- read.csv2(destfile)
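
To confirm that the import worked, a quick structural check (illustrative only; the exact output depends on the downloaded file):

str(a)

head(a)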

This approach improves the portability, traceability and the reproducible research quality of the R code…

I have tried to improve the R2 (coefficient of determination) value of regression models that predict the runs scored by a given baseball team. 

Here’s the best model I could find:

# best model: cforest with R2=0.9572 (=coefficient of determination value) with n=1384 samples in dataset
# using repeated cross-validation to reduce risks of overfitting the model
# n=1384 (n = sample size)
# R2 =0.9572 with n=1384 samples in dataset
# R2 seems to remain the same with and without repeated cross-validation
# when using cforest model
# cor=0.9783788
# enable use of 2 CPU cores while using train function which is part of caret package
# use 100% of the resources
library(doSNOW)
registerDoSNOW(makeCluster(2, type = "SOCK"))
library(caret)
year_team <- year_team_full
year_team$teamID <- as.factor(year_team$teamID)
year_team$teamID <- as.integer(year_team$teamID)
year_team$teamID <- as.numeric(year_team$teamID)
year_team$yearID <- as.numeric(year_team$yearID)
ctrl <- trainControl(method = "repeatedcv", repeats = 5)
lin_more_weights <- train(R ~ XRR+HR+H+BB+HBP+X2B+SB+SF,
                          data = year_team,
                          method = "cforest",
                          metric = "Rsquared",
                          tuneLength = 7,
                          trControl = ctrl)

year_team$linRMore <- predict(lin_more_weights)
corr_plot('linRMore', 'R', year_team)
cor(year_team$linRMore, year_team$R)

print(lin_more_weights, digits = 3)
trellis.par.set(caretTheme())
plot(lin_more_weights, metric=lin_more_weights$metric)
names(lin_more_weights)
ggplot(lin_more_weights,metric=lin_more_weights$metric)
getTrainPerf(lin_more_weights)

And the code below shows the journey I took to get to that conclusion:

# predicting runs scored using caret R package:
# tested models from http://caret.r-forge.r-project.org/modelList.html

library(RMySQL)
library(party)
library(grid)
library(zoo)
library(sandwich)
library(strucchange)
library(modeltools)
library(stats4)
lahmanDb = dbConnect(MySQL(), user="root",
                     host="localhost")
result = dbGetQuery(lahmanDb, "show databases;"); dbDisconnect(lahmanDb);
result
lahman = dbConnect(MySQL(), user="root", db="lahman",
                   host="localhost")

# Import data
year_team_full <- dbGetQuery(lahman, "
SELECT teamID, yearID, SUM(R) AS R, SUM(AB) AS AB, SUM(H) AS H, SUM(2B) AS 2B,
SUM(3B) AS 3B, SUM(HR) AS HR, SUM(BB) AS BB, SUM(HBP) AS HBP, SUM(SF) AS SF,
SUM(SH) AS SH, SUM(SB) AS SB, SUM(CS) AS CS
FROM batting
WHERE yearID >= 1954 and 2011 >= yearID AND yearID != 1981 and yearID != 1994
GROUP BY teamID, yearID
;")

year_team <- year_team_full

year_team$X2B = year_team[,6]
year_team$X3B = year_team[,7]

# Then delete the invalid column names 2B and 3B
# (after column 6 is removed, the original column 7 shifts into position 6):
year_team[,6] = NULL
year_team[,6] = NULL

# Calculate AVG, SLG, OPS; then insert into data frame
year_team$AVG <- with(year_team, (H/(AB)))
year_team$OBP <- with(year_team, ((H+BB+HBP)/(AB+BB+HBP+SF)))
year_team$SLG <- with(year_team, ((H+X2B+2*X3B+3*HR)/(AB)))
year_team$OPS <- with(year_team, OBP + SLG)

# Correlation Plot Function
# Draws a scatter plot of 2 variables from a dataframe
# displaying a best-fit line and R^2 value in the legend
corr_plot <- function(v1, v2, df) {
plot(df[[v1]], df[[v2]], xlab=v1, ylab=v2) # Draw scatter Plot
linfit <- lm(df[[v2]]~df[[v1]]) # Calculate best-fit line
abline(linfit) # Draw best-fit line
# Add R^2 value in legend
legend("topleft", legend = paste("R^2:", signif(summary(linfit)$r.squared, 4)))
}

# Add 1B for calculation simplicity
year_team$X1B <- with(year_team, H-X2B-X3B-HR)

# add RC
year_team$RC <- with(year_team,(((H+BB)*(H+X2B+2*X3B+3*HR))/(AB+BB)))

# add XRR
year_team$XRR <- with(year_team,(.5*(H-HR-X3B-X2B))+(.72*X2B)+(1.04*X3B)+(1.44*HR)+.33*(HBP+BB)+.18*SB-.32*CS-.098*(AB-H))

year_team_full <- year_team
########################################################################################

# First we’ll only use different types of hits
lin_basic_weights <- lm(R ~ X1B + X2B + X3B + HR, data=year_team)

# Apply model’s coefficients to predict past runs
year_team$linRBasic <- predict(lin_basic_weights)

# Now let’s add in BB, HBP, and SB to improve the regression’s accuracy.
lin_more_weights <- lm(R ~ X1B + X2B + X3B + HR + I(BB + HBP) + SB, data=year_team)
year_team$linRMore <- predict(lin_more_weights)

# Now let’s use blackboost to improve the regression’s accuracy.
# R2 = 0.9495
library(mboost)
lin_more_weights <- blackboost(R ~ AB+H+BB+HBP+SF+SB+X2B+SLG, data=year_team)
year_team$linRMore <- predict(lin_more_weights)
corr_plot('linRMore', 'R', year_team)

# R2 = 0.9227
library(randomForest)
lin_more_weights <- randomForest(R ~ AB+H+BB+HBP+SF+SB+X2B+SLG, data=year_team)
year_team$linRMore <- predict(lin_more_weights)
corr_plot('linRMore', 'R', year_team)

# R2 = 0.9431
library(stats)
lin_more_weights <- glm(R ~ AB+H+BB+HBP+SF+SB+X2B+SLG, data=year_team)
year_team$linRMore <- predict(lin_more_weights)
corr_plot('linRMore', 'R', year_team)

# seems that XRR is better predictor than OPS or OBP in model below:
lin_more_weights <- lm(R ~ XRR, data=year_team)
year_team$linRMore <- predict(lin_more_weights)
corr_plot('linRMore', 'R', year_team)
# cor = 0.967535
cor(year_team$linRMore, year_team$R)

# relationship between R(runs scored) and OBP (on base percentage)
# also mentioned in the movie “Moneyball”
# R2 = 0.8101 = coefficient of determination
library(party)
lin_more_weights <- cforest(R ~ OBP, data=year_team)
year_team$linRMore <- predict(lin_more_weights)
corr_plot('linRMore', 'R', year_team)
# cor = 0.9000299
cor(year_team$linRMore, year_team$R)

#######################################################################################
# using repeated cross-validation to reduce risks of overfitting the model
# n=200 (n = sample size)
# R2 = 0.9627 with n=200 samples in dataset
# cor=0.981167
# enable use of 2 CPU cores while using train function which is part of caret package
# use 100% of the resources
library(doSNOW)
registerDoSNOW(makeCluster(2, type = "SOCK"))
library(caret)
year_team <- head(year_team_full, n=200)
year_team$teamID <- as.factor(year_team$teamID)
year_team$teamID <- as.integer(year_team$teamID)
year_team$teamID <- as.numeric(year_team$teamID)
year_team$yearID <- as.numeric(year_team$yearID)
ctrl <- trainControl(method = "repeatedcv", repeats = 5)
lin_more_weights <- train(R ~ XRR+HR+H+BB+HBP+X2B+SB+SF,
                          data = year_team,
                          method = "cforest",
                          tuneLength = 7,
                          trControl = ctrl)

year_team$linRMore <- predict(lin_more_weights)
corr_plot('linRMore', 'R', year_team)
cor(year_team$linRMore, year_team$R)

# using repeated cross-validation to reduce risks of overfitting the model
# n=500 (n = sample size)
# R2 = 0.96 with n=500 samples in dataset
# cor= 0.9797793
# enable use of 2 CPU cores while using train function which is part of caret package
# use 100% of the resources
library(doSNOW)
registerDoSNOW(makeCluster(2, type = "SOCK"))
library(caret)
year_team <- head(year_team_full, n=500)
year_team$teamID <- as.factor(year_team$teamID)
year_team$teamID <- as.integer(year_team$teamID)
year_team$teamID <- as.numeric(year_team$teamID)
year_team$yearID <- as.numeric(year_team$yearID)
ctrl <- trainControl(method = "repeatedcv", repeats = 5)
lin_more_weights <- train(R ~ XRR+HR+H+BB+HBP+X2B+SB+SF,
                          data = year_team,
                          method = "cforest",
                          tuneLength = 7,
                          trControl = ctrl)

year_team$linRMore <- predict(lin_more_weights)
corr_plot('linRMore', 'R', year_team)
cor(year_team$linRMore, year_team$R)

# using repeated cross-validation to reduce risks of overfitting the model
# n=1000 (n = sample size)
# R2 = 0.9598 with n=1000 samples in dataset
# cor= 0.9796931
# enable use of 2 CPU cores while using train function which is part of caret package
# use 100% of the resources
library(doSNOW)
registerDoSNOW(makeCluster(2, type = "SOCK"))
library(caret)
year_team <- head(year_team_full, n=1000)
year_team$teamID <- as.factor(year_team$teamID)
year_team$teamID <- as.integer(year_team$teamID)
year_team$teamID <- as.numeric(year_team$teamID)
year_team$yearID <- as.numeric(year_team$yearID)
ctrl <- trainControl(method = "repeatedcv", repeats = 5)
lin_more_weights <- train(R ~ XRR+HR+H+BB+HBP+X2B+SB+SF,
                          data = year_team,
                          method = "cforest",
                          tuneLength = 7,
                          trControl = ctrl)

year_team$linRMore <- predict(lin_more_weights)
corr_plot('linRMore', 'R', year_team)
cor(year_team$linRMore, year_team$R)

# with n=1000:
# might be a good idea to create an ensemble model that combines the
# strengths of cforest models and clustering models because I can
# distinguish at least 7 clusters in the plot of the following R command:
# corr_plot('linRMore', 'R', year_team)
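# A minimal, untested sketch of the clustering idea above (hypothetical
# illustration, not part of the original analysis): k-means with 7 centers
# on the fitted-vs-actual values, assuming year_team still holds the n=1000
# subset with the linRMore column added.
cluster_data <- scale(year_team[, c("linRMore", "R")])
set.seed(1)
km <- kmeans(cluster_data, centers = 7, nstart = 25)
table(km$cluster)
plot(year_team$linRMore, year_team$R, col = km$cluster,
     xlab = "linRMore", ylab = "R")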

# best model: cforest with R2=0.9572 with n=1384 samples in dataset
# using repeated cross-validation to reduce risks of overfitting the model
# n=1384 (n = sample size)
# R2 =0.9572 with n=1384 samples in dataset
# R2 seems to remain the same with and without repeated cross-validation
# when using cforest model
# cor=0.9783788
# enable use of 2 CPU cores while using train function which is part of caret package
# use 100% of the resources
library(doSNOW)
registerDoSNOW(makeCluster(2, type = "SOCK"))
library(caret)
year_team <- year_team_full
year_team$teamID <- as.factor(year_team$teamID)
year_team$teamID <- as.integer(year_team$teamID)
year_team$teamID <- as.numeric(year_team$teamID)
year_team$yearID <- as.numeric(year_team$yearID)
ctrl <- trainControl(method = "repeatedcv", repeats = 5)
lin_more_weights <- train(R ~ XRR+HR+H+BB+HBP+X2B+SB+SF,
                          data = year_team,
                          method = "cforest",
                          metric = "Rsquared",
                          tuneLength = 7,
                          trControl = ctrl)

year_team$linRMore <- predict(lin_more_weights)
corr_plot('linRMore', 'R', year_team)
cor(year_team$linRMore, year_team$R)

# model WITHOUT using repeated cross-validation -> danger of overfitting:
# n=1384 (n = sample size)
# R2 = 0.9572 = coefficient of determination
# R2 seems to remain the same with and without repeated cross-validation
# when using cforest model
# cor = 0.978368
# seems that XRR is better predictor than SLG, OPS or OBP in model below:
library(party)
lin_more_weights <- cforest(R ~ XRR+HR+H+BB+HBP+X2B+SB+SF, data=year_team)
year_team$linRMore <- predict(lin_more_weights)
corr_plot('linRMore', 'R', year_team)
cor(year_team$linRMore, year_team$R)

# blackboost
# using repeated cross-validation to reduce risks of overfitting the model
# n=200 (n = sample size)
# R2 = 0.9655 with n=200 samples in dataset
# cor=0.9825951
# enable use of 2 CPU cores while using train function which is part of caret package
# use 100% of the resources
library(doSNOW)
registerDoSNOW(makeCluster(2, type = "SOCK"))
library(caret)
year_team <- head(year_team_full, n=200)
year_team$teamID <- as.factor(year_team$teamID)
year_team$teamID <- as.integer(year_team$teamID)
year_team$teamID <- as.numeric(year_team$teamID)
year_team$yearID <- as.numeric(year_team$yearID)
ctrl <- trainControl(method = "repeatedcv", repeats = 5)
lin_more_weights <- train(R ~ XRR+HR+H+BB+HBP+X2B+SB+SF,
                          data = year_team,
                          method = "blackboost",
                          tuneLength = 7,
                          trControl = ctrl)

year_team$linRMore <- predict(lin_more_weights)
corr_plot('linRMore', 'R', year_team)
cor(year_team$linRMore, year_team$R)

# model: bstTree
# using repeated cross-validation to reduce risks of overfitting the model
# n=200 (n = sample size)
# R2 = 0.9779 with n=200 samples in dataset
# cor=0.9888938
# enable use of 2 CPU cores while using train function which is part of caret package
# use 100% of the resources
library(doSNOW)
registerDoSNOW(makeCluster(2, type = "SOCK"))
library(caret)
year_team <- head(year_team_full, n=200)
year_team$teamID <- as.factor(year_team$teamID)
year_team$teamID <- as.integer(year_team$teamID)
year_team$teamID <- as.numeric(year_team$teamID)
year_team$yearID <- as.numeric(year_team$yearID)
ctrl <- trainControl(method = "repeatedcv", repeats = 5)
lin_more_weights <- train(R ~ XRR+HR+H+BB+HBP+X2B+SB+SF,
                          data = year_team,
                          method = "bstTree",
                          tuneLength = 7,
                          trControl = ctrl)

year_team$linRMore <- predict(lin_more_weights)
corr_plot('linRMore', 'R', year_team)
cor(year_team$linRMore, year_team$R)

# You can apply this runs scored model to both opposing teams to
# calculate the winning percentage.
# It would be interesting to know how often we can
# correctly predict who will win the game after the 3rd inning
# has been played by applying model cforest(R ~ XRR+HR+H+BB+HBP+X2B+SB+SF)
# to all the data – from both teams – from the first, second and third inning.
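# A hedged illustration of the winning-percentage idea above: one common way to
# turn predicted runs for the two opposing teams into a win probability is the
# Pythagorean expectation. This sketch and its example values are not part of
# the original analysis.
pythagorean_win_pct <- function(runs_scored, runs_allowed, exponent = 2) {
  runs_scored^exponent / (runs_scored^exponent + runs_allowed^exponent)
}
# e.g. a team predicted to score 5.2 runs against an opponent predicted to score 4.1:
pythagorean_win_pct(5.2, 4.1)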

# Focusing only on following boosted models thanks to success of previous
# blackboost and bstTree models: gamboost, gbm,glmboost

# gamboost
# using repeated cross-validation to reduce risks of overfitting the model
# n=200 (n = sample size)
# R2=0.9567 with n=200 samples in dataset
# cor=0.9780998
# enable use of 2 CPU cores while using train function which is part of caret package
# use 100% of the resources
library(doSNOW)
registerDoSNOW(makeCluster(2, type = "SOCK"))
library(caret)
year_team <- head(year_team_full, n=200)
year_team$teamID <- as.factor(year_team$teamID)
year_team$teamID <- as.integer(year_team$teamID)
year_team$teamID <- as.numeric(year_team$teamID)
year_team$yearID <- as.numeric(year_team$yearID)
ctrl <- trainControl(method = "repeatedcv", repeats = 5)
lin_more_weights <- train(R ~ XRR+HR+H+BB+HBP+X2B+SB+SF,
                          data = year_team,
                          method = "gamboost",
                          tuneLength = 7,
                          trControl = ctrl)

year_team$linRMore <- predict(lin_more_weights)
corr_plot('linRMore', 'R', year_team)
cor(year_team$linRMore, year_team$R)

# gbm
# using repeated cross-validation to reduce risks of overfitting the model
# n=200 (n = sample size)
# R2=0.2895 with n=200 samples in dataset
# cor=0.5380201
# enable use of 2 CPU cores while using train function which is part of caret package
# use 100% of the resources
library(doSNOW)
registerDoSNOW(makeCluster(2, type = "SOCK"))
library(caret)
year_team <- head(year_team_full, n=200)
year_team$teamID <- as.factor(year_team$teamID)
year_team$teamID <- as.integer(year_team$teamID)
year_team$teamID <- as.numeric(year_team$teamID)
year_team$yearID <- as.numeric(year_team$yearID)
ctrl <- trainControl(method = "repeatedcv", repeats = 5)
lin_more_weights <- train(R ~ XRR+HR+H+BB+HBP+X2B+SB+SF,
                          data = year_team,
                          method = "gbm",
                          tuneLength = 7,
                          trControl = ctrl)

year_team$linRMore <- predict(lin_more_weights)
corr_plot('linRMore', 'R', year_team)
cor(year_team$linRMore, year_team$R)

# glmboost
# using repeated cross-validation to reduce risks of overfitting the model
# n=200 (n = sample size)
# R2=0.9543 with n=200 samples in dataset
# cor=0.976859
# enable use of 2 CPU cores while using train function which is part of caret package
# use 100% of the resources
library(doSNOW)
registerDoSNOW(makeCluster(2, type = "SOCK"))
library(caret)
year_team <- head(year_team_full, n=200)
year_team$teamID <- as.factor(year_team$teamID)
year_team$teamID <- as.integer(year_team$teamID)
year_team$teamID <- as.numeric(year_team$teamID)
year_team$yearID <- as.numeric(year_team$yearID)
ctrl <- trainControl(method = "repeatedcv", repeats = 5)
lin_more_weights <- train(R ~ XRR+HR+H+BB+HBP+X2B+SB+SF,
                          data = year_team,
                          method = "glmboost",
                          tuneLength = 7,
                          trControl = ctrl)

year_team$linRMore <- predict(lin_more_weights)
corr_plot('linRMore', 'R', year_team)
cor(year_team$linRMore, year_team$R)

# Best boosted model SEEMS to be bstTree
# Expand to full data set with n=1384

# model: bstTree
# using repeated cross-validation to reduce risks of overfitting the model
# n=1384 (n = sample size)
# R2 = 0.9509 with n=1384 samples in dataset
# cor=0.9751362
# enable use of 2 CPU cores while using train function which is part of caret package
# use 100% of the resources
library(doSNOW)
registerDoSNOW(makeCluster(2, type = "SOCK"))
library(caret)
year_team <- year_team_full
year_team$teamID <- as.factor(year_team$teamID)
year_team$teamID <- as.integer(year_team$teamID)
year_team$teamID <- as.numeric(year_team$teamID)
year_team$yearID <- as.numeric(year_team$yearID)
ctrl <- trainControl(method = "repeatedcv", repeats = 5)
lin_more_weights <- train(R ~ XRR+HR+H+BB+HBP+X2B+SB+SF,
                          data = year_team,
                          method = "bstTree",
                          tuneLength = 7,
                          trControl = ctrl)

year_team$linRMore <- predict(lin_more_weights)
corr_plot('linRMore', 'R', year_team)
cor(year_team$linRMore, year_team$R)

# blackboost
# using repeated cross-validation to reduce risks of overfitting the model
# n=1384 (n = sample size)
# R2 = 0.9489 with n=1384 samples in dataset
# cor=0.9741078
# enable use of 2 CPU cores while using train function which is part of caret package
# use 100% of the resources
library(doSNOW)
registerDoSNOW(makeCluster(2, type = "SOCK"))
library(caret)
year_team <- year_team_full
year_team$teamID <- as.factor(year_team$teamID)
year_team$teamID <- as.integer(year_team$teamID)
year_team$teamID <- as.numeric(year_team$teamID)
year_team$yearID <- as.numeric(year_team$yearID)
ctrl <- trainControl(method = "repeatedcv", repeats = 5)
lin_more_weights <- train(R ~ XRR+HR+H+BB+HBP+X2B+SB+SF,
                          data = year_team,
                          method = "blackboost",
                          tuneLength = 7,
                          trControl = ctrl)

year_team$linRMore <- predict(lin_more_weights)
corr_plot('linRMore', 'R', year_team)
cor(year_team$linRMore, year_team$R)

# So best model is cforest with R2=0.9572 with n=1384 samples in dataset.
# Interestingly enough, cforest was also one of the best performers during the
# ‘Show of Hands’ Kaggle competition (part of Analytics Edge course).
# source: https://www.kaggle.com/c/the-analytics-edge-mit-15-071x
# Show of Hands was not a regression problem, but a classification problem.
# second source: https://static.squarespace.com/static/51156277e4b0b8b2ffe11c00/t/53ad86e5e4b0b52e4e71cfab/1403881189332/Applied_Predictive_Modeling_in_R.pdf

Here is the result of using sample.split to split the data set into a train and test set:

#####################################################################################

# train set R2=0.9615
# test set R2=0.9032
library(doSNOW)
registerDoSNOW(makeCluster(2, type = "SOCK"))
library(caret)
library(mboost)
year_team_full$teamID <- as.factor(year_team_full$teamID)
year_team_full$teamID <- as.integer(year_team_full$teamID)
year_team_full$teamID <- as.numeric(year_team_full$teamID)
year_team_full$yearID <- as.numeric(year_team_full$yearID)

if (!require("caTools")) {
  install.packages("caTools", repos = "http://cran.rstudio.com/")
  library("caTools")
}

# Randomly split data
set.seed(88)
split = sample.split(year_team_full$R, SplitRatio = 0.75)

# Create training and testing sets
year_team_train = subset(year_team_full, split == TRUE)
nrow(year_team_train)
year_team_test = subset(year_team_full, split == FALSE)
nrow(year_team_test)
year_team_test_with_R <- year_team_test
year_team_test$R = NULL

ctrl <- trainControl(method = "repeatedcv", repeats = 5)
lin_more_weights <- train(R ~ XRR+HR+H+BB+HBP+X2B+SB+SF,
                          data = year_team_train,
                          method = "cforest",
                          metric = "Rsquared",
                          tuneLength = 7,
                          trControl = ctrl)

# calculate train set R2 = 0.9615, cor=0.9805769
year_team_train$linRMore <- predict(lin_more_weights)
corr_plot('linRMore', 'R', year_team_train)
cor(year_team_train$linRMore, year_team_train$R)

# calculate test set R2= 0.9032, cor=0.9503854
year_team_test_with_R$linRMore <- predict(lin_more_weights,newdata = year_team_test)
corr_plot('linRMore', 'R', year_team_test_with_R)
cor(year_team_test_with_R$linRMore, year_team_test_with_R$R)
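# Optional extra check (an illustrative sketch, not part of the original analysis):
# root-mean-square error of the cforest predictions on the held-out test set.
sqrt(mean((year_team_test_with_R$linRMore - year_team_test_with_R$R)^2))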

#####################################################################################