Installing Spark and Some Simple Applications

Installing Spark with R

  • Download and install R and RStudio.
  • Download OpenJDK 17 and save it in the folder C:\Program Files\openjdk.
  • Set the environment variables for the JDK:
    • JAVA_HOME=C:\Program Files\openjdk
    • Add a new path entry: %JAVA_HOME%\bin
  • Restart your PC and check in the command prompt: java -version
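The two environment-variable steps above can also be done from the command prompt instead of the System Properties dialog. A sketch using setx (which writes to the per-user environment; the JDK folder name is the one assumed above and may differ on your machine):

```bat
:: Point JAVA_HOME at the unpacked JDK folder
setx JAVA_HOME "C:\Program Files\openjdk"

:: Append the JDK's bin folder to the user PATH
:: (use the literal path: setx-created variables are not visible
::  in the current session, so %JAVA_HOME% would expand to nothing here)
setx PATH "%PATH%;C:\Program Files\openjdk\bin"

:: Open a NEW command prompt, then verify:
java -version
```

Note that setx only affects windows opened afterwards, which is why the check must run in a fresh command prompt (or after the restart mentioned above).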
  • Run RStudio and run the commands:

install.packages("sparklyr")

library(sparklyr)

  • To install Spark:

spark_install(version = "3.4.1")

or

  • Directly download Spark from https://spark.apache.org/downloads
  • Unzip it into C:\Users\<username>\AppData\Local\spark\spark-3.5.5-bin-hadoop3
  • Set the environment variable:
    • SPARK_HOME=C:\Users\<username>\AppData\Local\spark\spark-3.5.5-bin-hadoop3
    • Add a new path entry: %SPARK_HOME%\bin
  • Restart your PC
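After the restart, you can confirm that the variable is picked up before involving R at all. A quick command-prompt check (the exact version banner will depend on your installation):

```bat
:: Should print the Spark installation folder set above
echo %SPARK_HOME%

:: Asks Spark itself to report its version
%SPARK_HOME%\bin\spark-shell --version
```

If the echo prints %SPARK_HOME% literally, the variable was not set (or the prompt was opened before it was set).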
  • Connect R to the Installed Spark:

# Forward slashes avoid R's backslash-escape problems with Windows paths
sc <- spark_connect(master = "local",
                    version = "3.5.5",
                    spark_home = "C:/Users/<username>/AppData/Local/spark/spark-3.5.5-bin-hadoop3")

  • To check that Spark is working:

spark_version(sc)

library(sparklyr)

# Connect to local Spark
sc <- spark_connect(master = "local")

# Test Spark with some simple data
library(dplyr)
df <- sdf_copy_to(sc, iris, overwrite = TRUE)
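Once iris has been copied in, a quick sanity check helps confirm the table really lives in Spark. sdf_nrow() counts rows on the Spark side, while head() followed by collect() pulls a few rows back into R (a small addition to the example above):

```r
# Number of rows in the Spark DataFrame (iris has 150 rows)
sdf_nrow(df)

# Peek at the first rows; collect() brings them back into an R data frame
head(df) %>% collect()
```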

 

Example 1: Load and View Data

# to see all rows whose Sepal_Length is greater than 5

df %>% filter(Sepal_Length > 5) %>% collect()

     

Example 2: Group and Summarize

library(sparklyr)

# Connect to local Spark (skip if sc already exists)
sc <- spark_connect(master = "local")

# to see the average mpg by number of cylinders
library(dplyr)
df <- sdf_copy_to(sc, mtcars, overwrite = TRUE)

df %>% group_by(cyl) %>%
  summarize(avg_mpg = mean(mpg)) %>% collect()
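The summary can also be sorted before it is collected; arrange() is translated to Spark SQL just like filter() and group_by(). A small extension of the example above (not part of the original):

```r
# Same summary, ordered by average mpg, highest first
df %>% group_by(cyl) %>%
  summarize(avg_mpg = mean(mpg)) %>%
  arrange(desc(avg_mpg)) %>%
  collect()
```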

Example 3: Tokenize Text

text_data <- data.frame(
  id = 1:3,
  text = c("Spark is awesome", "R is powerful", "Big data rules")
)

text_tbl <- copy_to(sc, text_data, overwrite = TRUE)

tokenized <- text_tbl %>% ft_tokenizer(input_col = "text", output_col = "words")

tokenized %>% collect()
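A common next step after tokenizing is removing stop words ("is", "the", and so on). sparklyr exposes Spark's StopWordsRemover as ft_stop_words_remover; a sketch building on the tokenized table above (the output column name is my choice):

```r
# Drop common English stop words from the token lists
cleaned <- tokenized %>%
  ft_stop_words_remover(input_col = "words", output_col = "words_clean")

cleaned %>% collect()
```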

Example 4: Simple Machine Learning

# Predict mpg from weight

library(sparklyr)

# Connect to local Spark (skip if sc already exists)
sc <- spark_connect(master = "local")

# Fit a linear model with mpg as the dependent variable and
# wt (car weight) as the independent variable, i.e. mpg = a + b*wt
library(dplyr)
df <- sdf_copy_to(sc, mtcars, overwrite = TRUE)

model <- ml_linear_regression(df, mpg ~ wt)

summary(model)

predictions <- ml_predict(model, df)

predictions %>% select(mpg, prediction) %>% collect()
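Whichever example you run, it is good practice to close the connection when you are done so the local Spark process shuts down and releases its resources:

```r
# Stop the local Spark session
spark_disconnect(sc)
```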

  
