Archive for the ‘Data Science’ Category

# .R script showing capabilities of sparklyr R package
# Prerequisites before running this R script: 
# Ubuntu 16.04.3 LTS 64-bit, r-base (version 3.4.1 or newer), 
# RStudio 64-bit version, libssl-dev, libcurl4-openssl-dev, libxml2-dev
install.packages("httr")
install.packages("xml2")
# New features in sparklyr 0.6:
# https://blog.rstudio.com/2017/07/31/sparklyr-0-6/
install.packages("sparklyr")
install.packages("dplyr")
install.packages("ggplot2")
install.packages("tidyr")
library(sparklyr)
library(dplyr)
library(ggplot2)
library(tidyr)
set.seed(100)
# sparklyr cheat sheet: https://github.com/rstudio/cheatsheets/raw/master/source/pdfs/sparklyr.pdf
# dplyr+tidyr: https://www.rstudio.com/wp-content/uploads/2015/02/data-wrangling-cheatsheet.pdf
# sparklyr currently (2017-08-19) only supports Apache Spark version 2.2.0 or older
# Install Spark locally:
sc_version <- "2.2.0"
spark_install(sc_version)
config <- spark_config()
# number of CPU cores to use:
config$spark.executor.cores <- 6
# amount of RAM to use for Apache Spark executors:
config$spark.executor.memory <- "4G"
# Connect to local version:
sc <- spark_connect(master = "local",
                    config = config, version = sc_version)
# Copy data to Spark memory:
import_iris <- sdf_copy_to(sc, iris, "spark_iris", overwrite = TRUE) 
# Partition the data:
partition_iris <- sdf_partition(import_iris, training = 0.5, testing = 0.5)
# Create a Hive metadata entry for each partition:
sdf_register(partition_iris, c("spark_iris_training", "spark_iris_test"))
# Create a reference to the training data in the Spark table:
tidy_iris <- tbl(sc, "spark_iris_training") %>%
  select(Species, Petal_Length, Petal_Width)
# Fit a Spark ML decision tree model:
model_iris <- tidy_iris %>%
  ml_decision_tree(response = "Species",
                   features = c("Petal_Length", "Petal_Width"))
# Create a reference to the test data in the Spark table:
test_iris <- tbl(sc, "spark_iris_test")
# Run the model on the test partition and bring the predictions back into R memory for plotting:
pred_iris <- sdf_predict(model_iris, test_iris) %>% collect()
pred_iris %>%
  inner_join(data.frame(prediction = 0:2,
                        lab = model_iris$model.parameters$labels)) %>%
  ggplot(aes(Petal_Length, Petal_Width, col = lab)) +
  geom_point()
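# Optional sketch (not part of the original script): dplyr verbs on a Spark
# table are translated to Spark SQL and executed inside Spark, so summaries
# can be computed without collecting the full data set into R first.
# Uses the "spark_iris_training" table registered above:
tbl(sc, "spark_iris_training") %>%
  group_by(Species) %>%
  summarise(n = n(), mean_petal_length = mean(Petal_Length)) %>%
  collect()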
spark_disconnect(sc)
# bash shell script for Ubuntu 14.04 LTS 64-bit:
# Free disk space required: around 5 GB
# Minimum internal memory/RAM requirements: 4 GB RAM minimum
# Time required to execute 'sudo make' compilation of RStudio source code: around 45 minutes, maybe even more 
# sudo checkinstall process should take around 20 minutes to finish
# REQUIRES: newest version of wget compiled from the GitHub sources
# Copy-paste the following commands into the Terminal one by one:
# Install the newest version of wget from the GitHub sources to solve the following wget issue in Ubuntu 14.04 LTS:
# https://github.com/chapmanb/bcbio-nextgen/issues/1133
sudo DEBIAN_FRONTEND=noninteractive apt-get update
sudo DEBIAN_FRONTEND=noninteractive apt-get --yes --force-yes install libqtwebkit-dev checkinstall qtbase5-dev pandoc r-base
sudo DEBIAN_FRONTEND=noninteractive apt-get --yes --force-yes install libboost-all-dev cmake libqt4-dev build-essential default-jdk
sudo DEBIAN_FRONTEND=noninteractive apt-get --yes --force-yes build-dep wget
cd
sudo rm -rf wget
sudo rm /usr/bin/rstudio*
git clone https://github.com/mirror/wget.git
cd wget
./bootstrap
./configure
sudo make
sudo checkinstall
# When checkinstall prompts for package values, press 3 and ENTER, then set the version to 1.17.1.13
apt-cache show wget
# Terminal output should look like this:
# apt-cache show wget
# Package: wget
# Status: install ok installed
# Priority: extra
# Section: checkinstall
# Installed-Size: 3864
# Maintainer: root
# Architecture: amd64
# Version: 1.17.1.13-1
# Provides: wget
# Conffiles:
# /etc/wgetrc 618c05b4106ad20141dcf6deada2e87f obsolete
# Description: Package created with checkinstall 1.6.2
# Description-md5: 556b8d22567101c7733f37ce6557412e
# compile and install RStudio Server from source code:
cd
git clone https://github.com/rstudio/rstudio.git
cd rstudio/
mkdir build
cd build/
cd ~/rstudio/dependencies/common
bash install-common
bash install-common
cd
bash ~/rstudio/dependencies/linux/install-dependencies-debian
bash ~/rstudio/dependencies/linux/install-dependencies-debian
cd /tmp
wget http://dl.google.com/closure-compiler/compiler-latest.zip
unzip compiler-latest.zip
rm COPYING README.md compiler-latest.zip
sudo mv compiler.jar ~/rstudio/src/gwt/tools/compiler/compiler.jar
cd ~/rstudio
sudo rm -rf build
sudo cmake . -DRSTUDIO_TARGET=Server -DCMAKE_BUILD_TYPE=Release
time sudo make
# the sudo make compilation should take around 45 minutes to finish
time sudo checkinstall
# sudo checkinstall process should take around 20 minutes to finish
apt-cache show rstudio
# Terminal output should look like this:
# Package: rstudio
# Status: install ok installed
# Priority: extra
# Section: checkinstall
# Installed-Size: 293492
# Maintainer: root
# Architecture: amd64
# Version: 20160206-1
# Provides: rstudio
# Description: Package created with checkinstall 1.6.2
# Description-md5: 556b8d22567101c7733f37ce6557412e
sudo ln -s /usr/local/lib/rstudio-server/bin/rserver /usr/bin
nohup rserver &
# then use a web browser to navigate to http://127.0.0.1:8787/ to access the RStudio Server interface
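# Optional check (not part of the original procedure): verify that rserver is
# listening on its default port before opening the browser. This assumes curl
# is installed and that the default port 8787 was not changed:
curl -sI http://127.0.0.1:8787/ | head -n 1
# The first line should be an HTTP status line such as: HTTP/1.1 200 OK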
# Reproducible research .R script to run in RStudio in Ubuntu 14.04 LTS 64-bit
# Prerequisites to install: 
# https://mark911.wordpress.com/2014/11/06/how-to-install-newest-version-of-r-and-rstudio-in-ubuntu-14-04-lts-using-a-bash-script/
# Further prerequisites to install in R or RStudio:
install.packages(c("Quandl", "dplyr", "ggvis", "lubridate"))
# Data set: 
# https://www.quandl.com/data/ODA/MOZ_PPPSH-Mozambique-Share-of-World-GDP-based-on-PPP
library(Quandl)
library(dplyr)
library(ggvis)
library(lubridate)
data <- Quandl("ODA/MOZ_PPPSH", authcode="FiHHoC-Gnx3CHzr9385J")
str(data)
dplyr::glimpse(data)
head(data)
tail(data)
data$year <- lubridate::year(data$Date)
min_year <- min(data$year)
max_year <- max(data$year)
# source: https://github.com/rstudio/ggvis/blob/master/vignettes/ggvis-basics.Rmd
# source: Data Wrangling Cheat Sheet:
# http://www.rstudio.com/wp-content/uploads/2015/02/data-wrangling-cheatsheet.pdf
# source: http://ggvis.rstudio.com/0.1/quick-examples.html
data %>%
  ggvis(~Date, ~Value) %>%
  layer_points() %>%
  layer_model_predictions(model = input_select(
    c("loess" = "loess",
      "lm" = "lm",
      "MASS::rlm" = "MASS::rlm"),
    label = "model")) %>%
  layer_smooths(se = TRUE,
                span = input_slider(min = 0.3, max = 1, value = 0.8, step = 0.1,
                                    label = "Smoothing span")) %>%
  add_axis("x", title = "Date") %>%
  add_axis("y", title = "Mozambique Share of World GDP based on PPP, %")

!!!! The following procedure is risky, because manually installing the 6 library dependencies below can potentially break your package manager's dependencies !!!!
!!!! Use the following bash script at your own risk !!!!
!!!! It is highly recommended to run this script in Ubuntu 14.04 LTS 64-bit in a virtual machine (VirtualBox or VMware) and not in your host Ubuntu installation !!!!
However, the scilab build was tested and works on my own real install of Ubuntu 14.04 LTS 64-bit.
It is probably safer to follow this procedure in Ubuntu 15.04 final beta, but I have not tested Ubuntu 15.04 final beta myself, as I am sticking to LTS releases of Ubuntu.

Contents of bash script:

# !!!! The following procedure is risky, because manually installing the 6 library dependencies below can potentially break your package manager's dependencies !!!!
# !!!! Use the following bash script at your own risk !!!!
# !!!! It is highly recommended to run this script in Ubuntu 14.04 LTS 64-bit in a virtual machine (VirtualBox or VMware) and not in your host Ubuntu installation !!!!
# However, the scilab build was tested and works on my own real install of Ubuntu 14.04 LTS 64-bit.
# It is probably safer to follow this procedure in Ubuntu 15.04 final beta, but I have not tested Ubuntu 15.04 final beta myself, as I am sticking to LTS releases of Ubuntu.
sudo DEBIAN_FRONTEND=noninteractive apt-get update
sudo DEBIAN_FRONTEND=noninteractive apt-get purge scilab
sudo DEBIAN_FRONTEND=noninteractive apt-get --yes --force-yes install checkinstall build-essential unp git
sudo DEBIAN_FRONTEND=noninteractive apt-get --yes --force-yes build-dep scilab
# manually install 6 library dependencies for scilab in Ubuntu 14.04 LTS 64-bit:
# these dependencies are available in the standard Ubuntu repositories for vivid (Ubuntu 15.04 final beta)
cd /tmp
wget --no-check-certificate https://launchpad.net/ubuntu/+source/libjogl2-java/2.2.4+dfsg-1/+build/6556107/+files/libjogl2-java-doc_2.2.4%2Bdfsg-1_all.deb
wget --no-check-certificate https://launchpad.net/ubuntu/+source/libjogl2-java/2.2.4+dfsg-1/+build/6556107/+files/libjogl2-java_2.2.4%2Bdfsg-1_all.deb
wget --no-check-certificate http://launchpadlibrarian.net/189706070/libjogl2-jni_2.2.4%2Bdfsg-1_amd64.deb
wget --no-check-certificate http://launchpadlibrarian.net/189706067/libjogl2-toolkits_2.2.4%2Bdfsg-1_all.deb
wget --no-check-certificate http://francisdavey.co.uk/ubuntu/ubuntu/pool/universe/g/gluegen2/libgluegen2-jni_2.2.4-2_amd64.deb
wget --no-check-certificate http://francisdavey.co.uk/ubuntu/ubuntu/pool/universe/g/gluegen2/libgluegen2-rt-java_2.2.4-2_all.deb
rm *i386*.deb
sudo dpkg -i libgluegen2-jni_*.deb
sudo dpkg -i libgluegen2-rt-java*.deb
sudo dpkg -i libjogl2-jni_*.deb
sudo dpkg -i libjogl2-java_*.deb
sudo dpkg -i libjogl2-toolkits_*.deb
sudo dpkg -i libjogl2-java-doc_*.deb
# compile and install the newest version of scilab in Ubuntu 14.04 LTS 64-bit:
cd
sudo rm -rf ~/scilab
git clone https://github.com/opencollab/scilab.git
cd ~/scilab/scilab
sudo ./configure
sudo make
sudo make install
/usr/local/bin/scilab -version
# Terminal output should look similar to this output:
# Scilab version "5.6.0.0"
# scilab-branch-master
/usr/local/bin/scilab
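# Optional sketch (not part of the original script): scilab can also evaluate
# an expression non-interactively; the -nwni (no GUI) and -e (execute) options
# below are assumptions based on the standard scilab 5.x command-line flags.
/usr/local/bin/scilab -nwni -e "disp(2+2); exit;"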
# hardware requirements: see https://github.com/wnd-charm/wnd-charm
# Wndchrm - an open source utility for biological image analysis
# WND-CHARM is an acronym that stands for "Weighted Neighbor Distance using Compound Hierarchy of Algorithms Representing Morphology."
cd
rm -rf wnd-charm
# install prerequisites:
sudo apt-get update
sudo apt-get install build-essential libtiff4-dev libfftw3-dev libX11-dev libxt-dev libxaw7-dev phylip
sudo apt-get install python-networkx python-skimage python-sklearn checkinstall
sudo apt-get install python-matplotlib python-numpy python-pandas
sudo apt-get install python-jsonschema openbox ipython python-scipy
sudo apt-get install spyder unp python-pip build-essential python-dev swig
sudo pip install --upgrade beautifulsoup4 numpy scipy matplotlib argparse
sudo pip install --upgrade ipython mistune networkx pandas py4j runipy
sudo pip install --upgrade scikit-image scikit-learn scipy-data_fitting
sudo pip install --upgrade statsmodels jsonschema
# install wnd-charm
git clone https://github.com/wnd-charm/wnd-charm.git
cd wnd-charm
sudo ./configure
sudo make
sudo make check
# Change the Version field from 'charm' to 1 when running sudo checkinstall to avoid errors!
# The checkinstall build values should then be as follows:
# 0 - Maintainer: [ root ]
# 1 - Summary: [ Package created with checkinstall 1.6.2 ]
# 2 - Name: [ wnd ]
# 3 - Version: [ 1 ] -> instead of invalid Version: [ charm ]
# 4 - Release: [ 1 ]
# 5 - License: [ GPL ]
# 6 - Group: [ checkinstall ]
# 7 - Architecture: [ amd64 ]
# 8 - Source location: [ wnd-charm ]
# 9 - Alternate source location: [ ]
# 10 - Requires: [ ]
# 11 - Provides: [ wnd ]
# 12 - Conflicts: [ ]
# 13 - Replaces: [ ]
sudo checkinstall
apt-cache show wnd
# The output of apt-cache show wnd should be similar to this:
# Package: wnd
# Status: install ok installed
# Priority: extra
# Section: checkinstall
# Installed-Size: 1076
# Maintainer: root
# Architecture: amd64
# Version: 1-1
# Provides: wnd
# Description: Package created with checkinstall 1.6.2
# Description-md5: 556b8d22567101c7733f37ce6557412e
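# Running wndchrm with no arguments should print its usage/help text: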
wndchrm

Run the following bash script in a bash Terminal in Ubuntu 14.04 LTS 64-bit:

#!/bin/bash
# Based on: https://spark.apache.org/docs/1.1.0/building-with-maven.html
# Purpose: this script will automatically compile and install
# the newest version of maven and Apache Spark via the github sources
# Software requirements: Ubuntu 14.04 LTS 64-bit, git, build-essential,
# ant, unp, python2.7, java 1.7.0 or higher
# Minimum RAM requirements for this script: 2 Gigabytes of RAM (maybe even more) 
# Please make sure to close any web browser windows and any other 
# memory hogging applications before running this memory intensive bash script.

# First add the marutter R PPAs and refresh the package lists:
cd
sudo DEBIAN_FRONTEND=noninteractive add-apt-repository --yes ppa:marutter/rrutter
sudo DEBIAN_FRONTEND=noninteractive add-apt-repository --yes ppa:marutter/c2d4u
sudo DEBIAN_FRONTEND=noninteractive apt-get update

# Install tools required to build maven and Apache Spark with sparkR support:
sudo apt-get build-dep maven maven2
sudo DEBIAN_FRONTEND=noninteractive apt-get --yes --force-yes  install  r-base-core r-base
sudo DEBIAN_FRONTEND=noninteractive apt-get --yes --force-yes  install  git build-essential python-protobuf protobuf-compiler
sudo DEBIAN_FRONTEND=noninteractive apt-get --yes --force-yes  install  ant unp python2.7 openjdk-7-jre-headless 
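# Remove any conflicting binary packages of maven and maven2: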
sudo DEBIAN_FRONTEND=noninteractive apt-get --yes --force-yes  purge maven maven2

# Also remove any previously installed versions of Apache Spark:
sudo rm -rf spark*
sudo rm -rf /usr/local/spark*

# install newest version of maven
rm -rf maven*
git clone https://github.com/apache/maven.git
cd maven
ant -Dmaven.home="$HOME/apps/maven/apache-maven-SNAPSHOT"
cd ~/maven/apache-maven/target
unp apache-maven-*-bin.tar.gz
sudo rm /usr/bin/mvn
sudo ln -s ~/maven/apache-maven/target/apache-maven-*/bin/mvn  /usr/bin/mvn
mvn -v

# example of Terminal output:
#Apache Maven 3.3.2-SNAPSHOT
#Maven home: $HOME/maven/apache-maven/target/apache-maven-3.3.2-SNAPSHOT
#Java version: 1.7.0_76, vendor: Oracle Corporation
#Java home: /usr/lib/jvm/java-7-oracle/jre
#Default locale: en_US, platform encoding: UTF-8
#OS name: "linux", version: "4.0.0-040000rc3-lowlatency", arch: "amd64", family: "unix"

# install SparkR-pkg
cd
rm -rf SparkR-pkg/
git clone https://github.com/amplab-extras/SparkR-pkg.git
cd SparkR-pkg/
SPARK_VERSION=1.5.0 USE_MAVEN=1 ./install-dev.sh
# ./sparkR examples/pi.R local[2]

# install newest version of Apache Spark:
cd
git clone git://github.com/apache/spark.git
cd spark

# increase MaxPermSize to avoid out-of-memory errors during compile process:
export MAVEN_OPTS="-Xmx2g -XX:MaxPermSize=512M -XX:ReservedCodeCacheSize=512m"
mvn -PsparkR -DskipTests clean package

# End result of Apache Spark build process should look
# something like this (without any memory errors):
# [INFO] ------------------------------------------------------------------------
# [INFO] Reactor Summary:
# [INFO] 
# [INFO] Spark Project Parent POM ........................... SUCCESS [  9.048 s]
# [INFO] Spark Launcher Project ............................. SUCCESS [ 19.509 s]
# [INFO] Spark Project Networking ........................... SUCCESS [ 14.113 s]
# [INFO] Spark Project Shuffle Streaming Service ............ SUCCESS [  7.626 s]
# [INFO] Spark Project Core ................................. SUCCESS [05:46 min]
# [INFO] Spark Project Bagel ................................ SUCCESS [ 33.517 s]
# [INFO] Spark Project GraphX ............................... SUCCESS [01:45 min]
# [INFO] Spark Project Streaming ............................ SUCCESS [02:35 min]
# [INFO] Spark Project Catalyst ............................. SUCCESS [02:38 min]
# [INFO] Spark Project SQL .................................. SUCCESS [03:40 min]
# [INFO] Spark Project ML Library ........................... SUCCESS [03:46 min]
# [INFO] Spark Project Tools ................................ SUCCESS [ 19.095 s]
# [INFO] Spark Project Hive ................................. SUCCESS [03:00 min]
# [INFO] Spark Project REPL ................................. SUCCESS [01:07 min]
# [INFO] Spark Project Assembly ............................. SUCCESS [02:12 min]
# [INFO] Spark Project External Twitter ..................... SUCCESS [ 26.990 s]
# [INFO] Spark Project External Flume Sink .................. SUCCESS [ 41.008 s]
# [INFO] Spark Project External Flume ....................... SUCCESS [ 42.961 s]
# [INFO] Spark Project External MQTT ........................ SUCCESS [ 41.138 s]
# [INFO] Spark Project External ZeroMQ ...................... SUCCESS [ 27.237 s]
# [INFO] Spark Project External Kafka ....................... SUCCESS [01:04 min]
# [INFO] Spark Project Examples ............................. SUCCESS [03:53 min]
# [INFO] Spark Project External Kafka Assembly .............. SUCCESS [ 41.333 s]
# [INFO] ------------------------------------------------------------------------
# [INFO] BUILD SUCCESS
# [INFO] ------------------------------------------------------------------------
# [INFO] Total time: 36:57 min
# [INFO] Finished at: 2015-03-21T02:19:07+01:00
# [INFO] Final Memory: 83M/1292M
# [INFO] ------------------------------------------------------------------------
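# Optional smoke test (not part of the original script): after a successful
# build, the launcher scripts in the repository can start a local Spark shell.
# Uncomment to try it; exit the shell with CTRL-D:
# ~/spark/bin/spark-shell --master local[2]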
# Based on: https://github.com/databricks/spark-csv

# As an example, load cars.csv from GitHub into Apache Spark using pyspark and the Databricks package
# com.databricks:spark-csv
cd ~/spark

# first clean up any previously downloaded files:
rm cars.csv
rm spark-csv
wget --no-check-certificate https://github.com/databricks/spark-csv/raw/master/src/test/resources/cars.csv
wget --no-check-certificate  https://github.com/databricks/spark-csv
groupId=`grep groupId spark-csv|cut -d":" -f2|cut -d" " -f2|tail -n 1`
artifactId=`grep artifactId spark-csv|cut -d":" -f2|cut -d" " -f2|tail -n 1`
version=`grep version spark-csv|tail -n 1|cut -d":" -f2|cut -d" " -f2`

# Use following command to run pyspark using four CPU cores on the local machine
# while also loading the spark-csv databricks package:
# source: https://spark.apache.org/docs/latest/programming-guide.html
bin/pyspark -v --master local[4]  --packages `echo $groupId`:`echo $artifactId`:`echo $version`

Then run the following commands in the pyspark Terminal in Ubuntu 14.04 LTS 64-bit:

# manually copy-paste following commands into the pyspark Terminal session:
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)
df = sqlContext.load(source="com.databricks.spark.csv", header="true",path = "cars.csv")
df.select("year", "model").show()
# The output of the last command should be similar to this:
# year model
# 2012 S 
# 1997 E350 
# 2015 Volt
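# Optional (not part of the original post): before ending the session,
# printSchema() and count() are standard Spark DataFrame methods, useful to
# check what spark-csv inferred from the file:
df.printSchema()
df.count()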
# Press CTRL-D to end the pyspark session
# useful links: 
# http://ramhiser.com/2015/02/01/configuring-ipython-notebook-support-for-pyspark/
# https://spark.apache.org/docs/1.1.1/api/python/pyspark.rdd.RDD-class.html
# install R 
sudo DEBIAN_FRONTEND=noninteractive add-apt-repository ppa:marutter/rrutter
sudo DEBIAN_FRONTEND=noninteractive add-apt-repository ppa:marutter/c2d4u
sudo DEBIAN_FRONTEND=noninteractive apt-get update
sudo DEBIAN_FRONTEND=noninteractive apt-get --yes --force-yes install r-base-core r-base
# install RStudio :
# Free disk space required: around 5 GB
# Mac OS X users should use RStudio instead of R to avoid the following UNIX child process forking error:
# THE_PROCESS_HAS_FORKED_AND_YOU_CANNOT_USE_THIS_COREFOUNDATION_FUNCTIONALITY_YOU_MUST_EXEC__() to debug.
MACHINE_TYPE=`uname -m`
cd /tmp
rm rstudio*.deb
rm index.html
if [ "${MACHINE_TYPE}" == 'x86_64' ]; then
 # 64-bit stuff here
sudo DEBIAN_FRONTEND=noninteractive apt-get --yes --force-yes install gdebi-core pandoc libssl0.9.8 libapparmor1
wget --no-check-certificate http://www.rstudio.com/products/rstudio/download/
wget --no-check-certificate `cat index.html|grep -v tar|grep amd64\.deb|cut -d"\"" -f2`
sudo dpkg -i rstudio*.deb
sudo DEBIAN_FRONTEND=noninteractive apt-get --yes --force-yes -f install
else
 # 32-bit stuff here
sudo DEBIAN_FRONTEND=noninteractive apt-get --yes --force-yes install gdebi-core pandoc libssl0.9.8 libapparmor1
wget --no-check-certificate http://www.rstudio.com/products/rstudio/download/
wget --no-check-certificate `cat index.html|grep -v tar|grep i386\.deb|cut -d"\"" -f2`
sudo dpkg -i rstudio*.deb
sudo DEBIAN_FRONTEND=noninteractive apt-get --yes --force-yes -f install
fi
cd $HOME
# troubleshooting information to check the rstudio installation:
uname -m
file /usr/lib/rstudio/bin/rstudio
ldd `which rstudio`