Installing Spark with R
# Install the sparklyr package and download a local Spark distribution.
# NOTE: the installed Spark version must match the version requested by
# spark_connect() later in this document (3.5.5); installing 3.4.1 here
# while connecting with version = "3.5.5" would fail to find Spark.
install.packages("sparklyr")
library(sparklyr)
spark_install(version = "3.5.5")
Alternatively, connect to an existing Spark installation by giving its version and home directory explicitly:
# Connect to an already-installed Spark distribution via spark_home.
# Fixes relative to the original:
#  * "c:\Users\..." is invalid in an R string -- \U begins an escape
#    sequence and raises a parse error. Use forward slashes (or "\\").
#  * removed the stray leading space inside the path.
#  * the folder name said "spark-3.55" while version = "3.5.5"; the
#    directory name must match the actual installed version.
sc <- spark_connect(
  master = "local",
  version = "3.5.5",
  spark_home = "C:/Users/AppData/Local/spark/spark-3.5.5-bin-hadoop3"
)
spark_version(sc)  # confirm which Spark version the session is running
library(sparklyr)
# Connect to local Spark
sc <- spark_connect(master = "local")
# Test Spark with some simple data: copy the built-in iris data frame
# into Spark. `df` is a remote tbl_spark reference, not a local copy.
# NOTE: sdf_copy_to replaces the dots in iris column names with
# underscores (Sepal.Length -> Sepal_Length), which is why the filter
# example below uses Sepal_Length.
library(dplyr)
df <- sdf_copy_to(sc, iris, overwrite = TRUE)
Example 1: Load and View Data
# Show all rows whose Sepal_Length is greater than 5
# (the filter runs inside Spark; collect() pulls the result back to R)
df %>% filter(Sepal_Length > 5) %>% collect()
Example 2: Group and Summarize
library(sparklyr)
# Establish a connection to a local Spark instance
sc <- spark_connect(master = "local")
library(dplyr)
# Ship the built-in mtcars data to Spark as a remote table
df <- sdf_copy_to(sc, mtcars, overwrite = TRUE)
# Average mpg per cylinder count: the aggregation is translated to
# Spark SQL and executed remotely; collect() returns a local tibble.
avg_mpg_by_cyl <- df %>%
  group_by(cyl) %>%
  summarise(avg_mpg = mean(mpg)) %>%
  collect()
avg_mpg_by_cyl
Example 3: Tokenize Text
# Build a small local data frame of sentences to tokenize.
sentences <- data.frame(
  id = 1:3,
  text = c("Spark is awesome", "R is powerful", "Big data rules")
)
# Copy it into Spark (keeping the remote table name "text_data"),
# then split each sentence into a list of lowercase words using
# Spark ML's Tokenizer feature transformer and pull the result back.
sentences_tbl <- copy_to(sc, sentences, name = "text_data", overwrite = TRUE)
sentences_tbl %>%
  ft_tokenizer(input_col = "text", output_col = "words") %>%
  collect()
Example 4: Simple Machine Learning
# Predict mpg from vehicle weight: fit the linear model
# mpg = a + b * wt using Spark ML.
library(sparklyr)
# Connect to local Spark
sc <- spark_connect(master = "local")
library(dplyr)
# Copy mtcars into Spark; the model is trained on the remote table.
mtcars_tbl <- sdf_copy_to(sc, mtcars, overwrite = TRUE)
# Linear regression with mpg as the dependent variable and wt
# (weight in 1000 lbs) as the single independent variable.
fit <- ml_linear_regression(mtcars_tbl, mpg ~ wt)
summary(fit)
# Score the training data and compare actual vs fitted values.
scored <- ml_predict(fit, mtcars_tbl)
scored %>%
  select(mpg, prediction) %>%
  collect()
End of examples.
Statlearner