Posts Tagged ‘R’

# .R script showing capabilities of sparklyr R package
# Prerequisites before running this R script:
# Ubuntu 16.04 LTS 64-bit, r-base (version 3.3.3 or newer), RStudio 64-bit version

# Packages this script depends on.
required_packages <- c("sparklyr", "dplyr", "ggplot2", "tidyr")

# Install only the packages that are not present yet, so re-running the
# script does not download and reinstall every package each time.
missing_packages <- setdiff(required_packages, rownames(installed.packages()))
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}

library(sparklyr)
library(dplyr)
library(ggplot2)
library(tidyr)

# Fix the seed of R's random number generator.
set.seed(100)
# Reference material:
#   sparklyr cheat sheet: https://github.com/rstudio/cheatsheets/raw/master/source/pdfs/sparklyr.pdf
#   dplyr + tidyr cheat sheet: https://www.rstudio.com/wp-content/uploads/2015/02/data-wrangling-cheatsheet.pdf
# As of 2017-04-22 sparklyr only supports Spark 2.0.1 or 2.0.2 (not 2.1.0),
# so a local copy of Spark 2.0.2 is installed.
spark_install(version = "2.0.2")

# Start from the default configuration, then request 6 executor cores
# and 4G of executor memory.
config <- spark_config()
config[["spark.executor.cores"]] <- 6
config[["spark.executor.memory"]] <- "4G"

# Open a connection to the local Spark installation.
sc <- spark_connect(
  master = "local",
  version = "2.0.2",
  config = config
)
# Copy the iris data set into Spark memory as table "spark_iris":
import_iris <- copy_to(sc, iris, "spark_iris", overwrite = TRUE)

# Split the data into a training and a testing partition (weights 0.5 / 0.5):
partition_iris <- sdf_partition(import_iris, training = 0.5, testing = 0.5)

# Register each partition under a table name so it can be referenced via tbl():
sdf_register(partition_iris, c("spark_iris_training", "spark_iris_test"))

# Reference the training table, keeping only the columns used by the model:
tidy_iris <- tbl(sc, "spark_iris_training") %>%
  select(Species, Petal_Length, Petal_Width)

# Fit a Spark ML decision tree predicting Species from the petal measurements:
model_iris <- tidy_iris %>%
  ml_decision_tree(
    response = "Species",
    features = c("Petal_Length", "Petal_Width")
  )

# Reference the test table:
test_iris <- tbl(sc, "spark_iris_test")

# Score the test data and bring the predictions back into R memory for plotting:
pred_iris <- sdf_predict(model_iris, test_iris) %>%
  collect()

# Lookup table from the zero-based prediction index to its label. The index
# is derived from the number of labels the model reports instead of a
# hard-coded 0:2, so it does not depend on there being exactly three classes.
model_labels <- model_iris$model.parameters$labels
label_lookup <- data.frame(
  prediction = seq_along(model_labels) - 1L,
  lab = model_labels
)

# Join explicitly on "prediction" rather than on whatever columns the two
# tables happen to share, then plot the test points coloured by predicted label.
pred_iris %>%
  inner_join(label_lookup, by = "prediction") %>%
  ggplot(aes(Petal_Length, Petal_Width, col = lab)) +
  geom_point()

# Close the Spark connection.
spark_disconnect(sc)
# Reproducible research .R script to run in RStudio in Ubuntu 14.04 LTS 64-bit
# Prerequisites to install: 
# https://mark911.wordpress.com/2014/11/06/how-to-install-newest-version-of-r-and-rstudio-in-ubuntu-14-04-lts-using-a-bash-script/
# Further prerequisites to install in R or RStudio.
# Only packages that are not installed yet are fetched, so re-running the
# script does not reinstall everything each time.
required_packages <- c("Quandl", "dplyr", "ggvis", "lubridate")
missing_packages <- setdiff(required_packages, rownames(installed.packages()))
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}
# Data set:
# https://www.quandl.com/data/ODA/MOZ_PPPSH-Mozambique-Share-of-World-GDP-based-on-PPP
library(Quandl)
library(dplyr)
library(ggvis)
library(lubridate)
# Download the ODA/MOZ_PPPSH series from Quandl.
# The API key is read from the QUANDL_API_KEY environment variable instead of
# being hard-coded, so the credential is not published with the script.
# When the variable is unset the request is sent without a key.
# NOTE(review): the key that used to be embedded here has been public and
# should be rotated.
quandl_key <- Sys.getenv("QUANDL_API_KEY")
data <- if (nzchar(quandl_key)) {
  Quandl("ODA/MOZ_PPPSH", authcode = quandl_key)
} else {
  message("QUANDL_API_KEY is not set; requesting data without an API key.")
  Quandl("ODA/MOZ_PPPSH")
}

# Inspect the structure and the first and last rows of the downloaded data.
str(data)
dplyr::glimpse(data)
head(data)
tail(data)

# Derive the calendar year of each observation and the range of years covered.
data$year <- lubridate::year(data$Date)
min_year <- min(data$year)
max_year <- max(data$year)
# Sources:
#   https://github.com/rstudio/ggvis/blob/master/vignettes/ggvis-basics.Rmd
#   Data Wrangling Cheat Sheet:
#   http://www.rstudio.com/wp-content/uploads/2015/02/data-wrangling-cheatsheet.pdf
#   http://ggvis.rstudio.com/0.1/quick-examples.html

# Interactive controls: a drop-down that picks the model used for the
# prediction layer, and a slider that sets the span of the smoother.
model_choice <- input_select(
  c("loess" = "loess", "lm" = "lm", "MASS::rlm" = "MASS::rlm"),
  label = "model"
)
span_choice <- input_slider(
  min = 0.3, max = 1, value = 0.8, step = 0.1,
  label = "Smoothing span"
)

# Scatter plot of Value against Date with a model-prediction layer,
# a smoother (se = TRUE) and titled axes.
data %>%
  ggvis(~Date, ~Value) %>%
  layer_points() %>%
  layer_model_predictions(model = model_choice) %>%
  layer_smooths(se = TRUE, span = span_choice) %>%
  add_axis("x", title = "Date") %>%
  add_axis("y", title = "Mozambique Share of World GDP based on PPP, %")
# install R
# --yes lets add-apt-repository proceed without waiting for the Enter-key
# confirmation it otherwise asks for when run from a terminal;
# DEBIAN_FRONTEND=noninteractive alone does not suppress that prompt.
sudo DEBIAN_FRONTEND=noninteractive add-apt-repository --yes ppa:marutter/rrutter
sudo DEBIAN_FRONTEND=noninteractive add-apt-repository --yes ppa:marutter/c2d4u
sudo DEBIAN_FRONTEND=noninteractive apt-get update
sudo DEBIAN_FRONTEND=noninteractive apt-get --yes --force-yes install r-base-core r-base
# install RStudio :
# Free disk space required: around 5 GB
# Mac OS X users should use RStudio instead of R to avoid the following UNIX child process forking error:
# THE_PROCESS_HAS_FORKED_AND_YOU_CANNOT_USE_THIS_COREFOUNDATION_FUNCTIONALITY_YOU_MUST_EXEC__() to debug.
MACHINE_TYPE=$(uname -m)
# Work in /tmp; stop if it is unavailable so the rm commands below never run
# in an unexpected directory.
cd /tmp || exit 1
# Remove leftovers from earlier runs; -f keeps rm quiet when none exist.
rm -f rstudio*.deb
rm -f index.html
# Both architectures run the same steps; only the .deb name suffix differs.
if [ "${MACHINE_TYPE}" = 'x86_64' ]; then
  # 64-bit
  DEB_ARCH='amd64'
else
  # 32-bit
  DEB_ARCH='i386'
fi
sudo DEBIAN_FRONTEND=noninteractive apt-get --yes --force-yes install gdebi-core pandoc libssl0.9.8 libapparmor1
# Fetch the download page; wget saves it as index.html.
# NOTE(review): --no-check-certificate disables TLS certificate verification
# for a package that is later installed as root - confirm it is still needed.
wget --no-check-certificate http://www.rstudio.com/products/rstudio/download/
# Extract the .deb link(s) for this architecture from the page. The pattern is
# quoted so the backslash reaches grep and the dot is matched literally.
DEB_URLS=$(grep -v tar index.html | grep "${DEB_ARCH}\.deb" | cut -d'"' -f2)
if [ -n "${DEB_URLS}" ]; then
  # Deliberately unquoted: the page may yield several URLs, one per line.
  # shellcheck disable=SC2086
  wget --no-check-certificate ${DEB_URLS}
  sudo dpkg -i rstudio*.deb
  # Pull in any dependencies that dpkg could not resolve on its own.
  sudo DEBIAN_FRONTEND=noninteractive apt-get --yes --force-yes -f install
else
  echo "No RStudio ${DEB_ARCH} .deb link found in /tmp/index.html; RStudio was not installed." >&2
fi
cd "$HOME"
# troubleshooting information to check the rstudio installation:
# machine architecture
uname -m
# file type of the installed RStudio binary
file /usr/lib/rstudio/bin/rstudio
# Shared-library dependencies of the rstudio launcher. The path is resolved
# first so that a missing launcher is reported clearly instead of ldd being
# called with no argument.
RSTUDIO_BIN=$(command -v rstudio)
if [ -n "${RSTUDIO_BIN}" ]; then
  ldd "${RSTUDIO_BIN}"
else
  echo "rstudio was not found on PATH" >&2
fi