-
Notifications
You must be signed in to change notification settings - Fork 5
Open
Description
Spark 4.0 added support for pyspark.ml in Connect. This expands the available ML functions, but wrappers still need to be written for the other models, and the existing wrappers need to be updated.
Here is an example script showing the steps to fit a Linear Regression model:
# Example: fit a Linear Regression model by calling pyspark.ml directly
# (through reticulate) over a Spark Connect session. Lines starting with
# `#>` are output captured when the script was run.
library(sparklyr)
# Start a local Spark Connect service.
# NOTE(review): the two arguments are presumably the Spark and Scala
# versions -- confirm against the pysparklyr documentation.
pysparklyr::spark_connect_service_start("4.0", "2.13")
#> Starting Spark Connect locally ...
#> starting org.apache.spark.sql.connect.service.SparkConnectServer, logging to
#> /Users/edgar/spark/spark-4.0.0-bin-hadoop3/logs/spark-edgar-org.apache.spark.sql.connect.service.SparkConnectServer-1-edgarruiz-WL57.out
# Pause one second before connecting -- presumably to give the service
# time to come up; a fixed sleep may not be enough on a slower machine.
Sys.sleep(1)
# Open a session against the local Spark Connect endpoint.
sc <- spark_connect(
master = "sc://localhost",
method = "spark_connect",
version = "4.0"
)
#> ℹ Attempting to load 'r-sparklyr-pyspark-4.0'
#> ✔ Python environment: 'r-sparklyr-pyspark-4.0' [410ms]
#>
# Copy mtcars into Spark, replacing any existing table of the same name.
tbl_mtcars <- copy_to(sc, mtcars, overwrite = TRUE)
# New ML methodology
# Import the pyspark Python module so pyspark.ml classes can be used directly.
ps <- reticulate::import("pyspark")
# Create a VectorAssembler and configure a copy of it.
# NOTE(review): the variable `as` shares its name with base::as(); a more
# descriptive name would avoid confusion.
va <- ps$ml$feature$VectorAssembler()
as <- va$copy()
# Use wt, am and vs as inputs and write the assembled result to a
# "features" column. The `#>` line after each setter is the printed
# return value of that call.
as$setInputCols(c("wt", "am", "vs"))
#> VectorAssembler_4c696c095e90
as$setOutputCol("features")
#> VectorAssembler_4c696c095e90
# Get the Spark DataFrame object behind the tbl and pass its `pyspark_obj`
# field to the Python-side transform().
sdf_prep <- spark_dataframe(tbl_mtcars)
prep_mtcars <- as$transform(sdf_prep$pyspark_obj)
# Create the estimator and set mpg as the label column. The features column
# is not set here -- presumably the estimator's default matches the
# "features" output column configured above; confirm against the PySpark docs.
lr <- ps$ml$regression$LinearRegression()
lr$setLabelCol("mpg")
#> LinearRegression_2ed7947ac6f3
# Fit on the prepared data and print the coefficients (three values, one
# per input column).
model <- lr$fit(prep_mtcars)
model$coefficients
#> DenseVector([-3.7845, 1.4913, 3.615])
# Close the session, then stop the local Spark Connect service.
spark_disconnect(sc)
pysparklyr::spark_connect_service_stop()
#>
#> ── Stopping Spark Connect
#> - Shutdown command sent
Metadata
Assignees
Labels
No labels