diff --git a/Project.toml b/Project.toml
index d52a0a4..cd0cb0a 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "MLJModels"
 uuid = "d491faf4-2d78-11e9-2867-c94bc002c0b7"
 authors = ["Anthony D. Blaom <anthony.blaom@gmail.com>"]
-version = "0.17.6"
+version = "0.17.7"
 
 [deps]
 CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
diff --git a/src/registry/Metadata.toml b/src/registry/Metadata.toml
index bc83760..e8d4551 100644
--- a/src/registry/Metadata.toml
+++ b/src/registry/Metadata.toml
@@ -719,6 +719,114 @@
 ":reporting_operations" = "`()`"
 ":constructor" = "`nothing`"
 
+[GLM.LinearBinaryClassifier]
+":input_scitype" = "`ScientificTypesBase.Table{<:Union{AbstractVector{<:ScientificTypesBase.Continuous}, AbstractVector{<:ScientificTypesBase.Finite}}}`"
+":output_scitype" = "`ScientificTypesBase.Unknown`"
+":target_scitype" = "`AbstractVector{<:ScientificTypesBase.Binary}`"
+":fit_data_scitype" = "`Union{Tuple{ScientificTypesBase.Table{<:Union{AbstractVector{<:ScientificTypesBase.Continuous}, AbstractVector{<:ScientificTypesBase.Finite}}}, AbstractVector{<:ScientificTypesBase.Binary}}, Tuple{ScientificTypesBase.Table{<:Union{AbstractVector{<:ScientificTypesBase.Continuous}, AbstractVector{<:ScientificTypesBase.Finite}}}, AbstractVector{<:ScientificTypesBase.Binary}, AbstractVector{<:Union{ScientificTypesBase.Continuous, ScientificTypesBase.Count}}}}`"
+":predict_scitype" = "`ScientificTypesBase.Unknown`"
+":transform_scitype" = "`ScientificTypesBase.Unknown`"
+":inverse_transform_scitype" = "`ScientificTypesBase.Unknown`"
+":target_in_fit" = "`true`"
+":is_pure_julia" = "`true`"
+":package_name" = "GLM"
+":package_license" = "MIT"
+":load_path" = "MLJGLMInterface.LinearBinaryClassifier"
+":package_uuid" = "38e38edf-8417-5370-95a0-9cbb8c7f171a"
+":package_url" = "https://github.com/JuliaStats/GLM.jl"
+":is_wrapper" = "`false`"
+":supports_weights" = "`true`"
+":supports_class_weights" = "`false`"
+":supports_online" = "`false`"
+":docstring" = """```\nLinearBinaryClassifier\n```\n\nA model type for constructing a linear binary classifier, based on [GLM.jl](https://github.com/JuliaStats/GLM.jl), and implementing the MLJ model interface.\n\nFrom MLJ, the type can be imported using\n\n```\nLinearBinaryClassifier = @load LinearBinaryClassifier pkg=GLM\n```\n\nDo `model = LinearBinaryClassifier()` to construct an instance with default hyper-parameters. Provide keyword arguments to override hyper-parameter defaults, as in `LinearBinaryClassifier(fit_intercept=...)`.\n\n`LinearBinaryClassifier` is a [generalized linear model](https://en.wikipedia.org/wiki/Generalized_linear_model#Variance_function), specialised to the case of a binary target variable, with a user-specified link function. Options exist to specify an intercept or offset feature.\n\n# Training data\n\nIn MLJ or MLJBase, bind an instance `model` to data with one of:\n\n```\nmach = machine(model, X, y)\nmach = machine(model, X, y, w)\n```\n\nHere\n\n  * `X`: is any table of input features (eg, a `DataFrame`) whose columns are of scitype `Continuous`; check the scitype with `schema(X)`\n  * `y`: is the target, which can be any `AbstractVector` whose element scitype is `<:OrderedFactor(2)` or `<:Multiclass(2)`; check the scitype with `schema(y)`\n  * `w`: is a vector of `Real` per-observation weights\n\nTrain the machine using `fit!(mach, rows=...)`.\n\n# Hyper-parameters\n\n  * `fit_intercept=true`: Whether to calculate the intercept for this model.  If set to false,  no intercept will be calculated (e.g. the data is expected to be centered)\n  * `link=GLM.LogitLink`: The function which links the linear prediction function to the  probability of a particular outcome or class. This must have type `GLM.Link01`. Options  include `GLM.LogitLink()`, `GLM.ProbitLink()`, `CloglogLink()`, `CauchitLink()`.\n  * `offsetcol=nothing`: Name of the column to be used as an offset, if any.  
An offset is a  variable which is known to have a coefficient of 1.\n  * `maxiter::Integer=30`: The maximum number of iterations allowed to achieve convergence.\n  * `atol::Real=1e-6`: Absolute threshold for convergence. Convergence is achieved when the  relative change in deviance is less than `max(rtol*dev, atol)`. This term exists to avoid  failure when deviance is unchanged except for rounding errors.\n  * `rtol::Real=1e-6`: Relative threshold for convergence. Convergence is achieved when the  relative change in deviance is less than `max(rtol*dev, atol)`. This term exists to avoid  failure when deviance is unchanged except for rounding errors.\n  * `minstepfac::Real=0.001`: Minimum step fraction. Must be between 0 and 1. Lower bound for the factor used to update the linear fit.\n  * `report_keys`: `Vector` of keys for the report. Possible keys are: `:deviance`, `:dof_residual`, `:stderror`, `:vcov`, `:coef_table` and `:glm_model`. By default only `:glm_model` is excluded.\n\n# Operations\n\n  * `predict(mach, Xnew)`: Return predictions of the target given features `Xnew` having the same scitype as `X` above. Predictions are probabilistic.\n  * `predict_mode(mach, Xnew)`: Return the modes of the probabilistic predictions returned  above.\n\n# Fitted parameters\n\nThe fields of `fitted_params(mach)` are:\n\n  * `features`: The names of the features used during model fitting.\n  * `coef`: The linear coefficients determined by the model.\n  * `intercept`: The intercept determined by the model.\n\n# Report\n\nThe fields of `report(mach)` are:\n\n  * `deviance`: Measure of deviance of fitted model with respect to a perfectly fitted model. 
For a linear model, this is the weighted residual sum of squares\n  * `dof_residual`: The degrees of freedom for residuals, when meaningful.\n  * `stderror`: The standard errors of the coefficients.\n  * `vcov`: The estimated variance-covariance matrix of the coefficient estimates.\n  * `coef_table`: Table which displays coefficients and summarizes their significance and confidence intervals.\n  * `glm_model`: The raw fitted model returned by `GLM.lm`. Note this points to training data. Refer to the GLM.jl documentation for usage.\n\n# Examples\n\n```\nusing MLJ\nimport GLM # namespace must be available\n\nLinearBinaryClassifier = @load LinearBinaryClassifier pkg=GLM\nclf = LinearBinaryClassifier(fit_intercept=false, link=GLM.ProbitLink())\n\nX, y = @load_crabs\n\nmach = machine(clf, X, y) |> fit!\n\nXnew = (;FL = [8.1, 24.8, 7.2],\n        RW = [5.1, 25.7, 6.4],\n        CL = [15.9, 46.7, 14.3],\n        CW = [18.7, 59.7, 12.2],\n        BD = [6.2, 23.6, 8.4],)\n\nyhat = predict(mach, Xnew) # probabilistic predictions\npdf(yhat, levels(y)) # probability matrix\np_B = pdf.(yhat, \"B\")\nclass_labels = predict_mode(mach, Xnew)\n\nfitted_params(mach).features\nfitted_params(mach).coef\nfitted_params(mach).intercept\n\nreport(mach)\n```\n\nSee also [`LinearRegressor`](@ref), [`LinearCountRegressor`](@ref)\n"""
+":name" = "LinearBinaryClassifier"
+":human_name" = "linear binary classifier"
+":is_supervised" = "`true`"
+":prediction_type" = ":probabilistic"
+":abstract_type" = "`MLJModelInterface.Probabilistic`"
+":implemented_methods" = [":clean!", ":fit", ":fitted_params", ":predict"]
+":hyperparameters" = "`(:fit_intercept, :link, :offsetcol, :maxiter, :atol, :rtol, :minstepfac, :report_keys)`"
+":hyperparameter_types" = "`(\"Bool\", \"GLM.Link01\", \"Union{Nothing, Symbol}\", \"Integer\", \"Real\", \"Real\", \"Real\", \"Union{Nothing, AbstractVector{Symbol}}\")`"
+":hyperparameter_ranges" = "`(nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing)`"
+":iteration_parameter" = "`nothing`"
+":supports_training_losses" = "`false`"
+":reports_feature_importances" = "`false`"
+":deep_properties" = "`()`"
+":reporting_operations" = "`()`"
+":constructor" = "`nothing`"
+
+[GLM.LinearCountRegressor]
+":input_scitype" = "`ScientificTypesBase.Table{<:Union{AbstractVector{<:ScientificTypesBase.Continuous}, AbstractVector{<:ScientificTypesBase.Finite}}}`"
+":output_scitype" = "`ScientificTypesBase.Unknown`"
+":target_scitype" = "`AbstractVector{ScientificTypesBase.Count}`"
+":fit_data_scitype" = "`Union{Tuple{ScientificTypesBase.Table{<:Union{AbstractVector{<:ScientificTypesBase.Continuous}, AbstractVector{<:ScientificTypesBase.Finite}}}, AbstractVector{ScientificTypesBase.Count}}, Tuple{ScientificTypesBase.Table{<:Union{AbstractVector{<:ScientificTypesBase.Continuous}, AbstractVector{<:ScientificTypesBase.Finite}}}, AbstractVector{ScientificTypesBase.Count}, AbstractVector{<:Union{ScientificTypesBase.Continuous, ScientificTypesBase.Count}}}}`"
+":predict_scitype" = "`AbstractVector{ScientificTypesBase.Density{ScientificTypesBase.Count}}`"
+":transform_scitype" = "`ScientificTypesBase.Unknown`"
+":inverse_transform_scitype" = "`ScientificTypesBase.Unknown`"
+":target_in_fit" = "`true`"
+":is_pure_julia" = "`true`"
+":package_name" = "GLM"
+":package_license" = "MIT"
+":load_path" = "MLJGLMInterface.LinearCountRegressor"
+":package_uuid" = "38e38edf-8417-5370-95a0-9cbb8c7f171a"
+":package_url" = "https://github.com/JuliaStats/GLM.jl"
+":is_wrapper" = "`false`"
+":supports_weights" = "`true`"
+":supports_class_weights" = "`false`"
+":supports_online" = "`false`"
+":docstring" = """```\nLinearCountRegressor\n```\n\nA model type for constructing a linear count regressor, based on [GLM.jl](https://github.com/JuliaStats/GLM.jl), and implementing the MLJ model interface.\n\nFrom MLJ, the type can be imported using\n\n```\nLinearCountRegressor = @load LinearCountRegressor pkg=GLM\n```\n\nDo `model = LinearCountRegressor()` to construct an instance with default hyper-parameters. Provide keyword arguments to override hyper-parameter defaults, as in `LinearCountRegressor(fit_intercept=...)`.\n\n`LinearCountRegressor` is a [generalized linear model](https://en.wikipedia.org/wiki/Generalized_linear_model#Variance_function), specialised to the case of a `Count` target variable (non-negative, unbounded integer) with user-specified link function. Options exist to specify an intercept or offset feature.\n\n# Training data\n\nIn MLJ or MLJBase, bind an instance `model` to data with one of:\n\n```\nmach = machine(model, X, y)\nmach = machine(model, X, y, w)\n```\n\nHere\n\n  * `X`: is any table of input features (eg, a `DataFrame`) whose columns are of scitype `Continuous`; check the scitype with `schema(X)`\n  * `y`: is the target, which can be any `AbstractVector` whose element scitype is `Count`; check the scitype with `schema(y)`\n  * `w`: is a vector of `Real` per-observation weights\n\nTrain the machine using `fit!(mach, rows=...)`.\n\n# Hyper-parameters\n\n  * `fit_intercept=true`: Whether to calculate the intercept for this model. If set to false,  no intercept will be calculated (e.g. the data is expected to be centered)\n  * `distribution=Distributions.Poisson()`: The distribution which the residuals/errors of the  model should fit.\n  * `link=GLM.LogLink()`: The function which links the linear prediction function to the  probability of a particular outcome or class. 
This should be one of the following:  `GLM.IdentityLink()`, `GLM.InverseLink()`, `GLM.InverseSquareLink()`, `GLM.LogLink()`,  `GLM.SqrtLink()`.\n  * `offsetcol=nothing`: Name of the column to be used as an offset, if any.  An offset is a  variable which is known to have a coefficient of 1.\n  * `maxiter::Integer=30`: The maximum number of iterations allowed to achieve convergence.\n  * `atol::Real=1e-6`: Absolute threshold for convergence. Convergence is achieved when the  relative change in deviance is less than `max(rtol*dev, atol)`. This term exists to avoid  failure when deviance is unchanged except for rounding errors.\n  * `rtol::Real=1e-6`: Relative threshold for convergence. Convergence is achieved when the  relative change in deviance is less than `max(rtol*dev, atol)`. This term exists to avoid  failure when deviance is unchanged except for rounding errors.\n  * `minstepfac::Real=0.001`: Minimum step fraction. Must be between 0 and 1. Lower bound for the factor used to update the linear fit.\n  * `report_keys`: `Vector` of keys for the report. Possible keys are: `:deviance`, `:dof_residual`, `:stderror`, `:vcov`, `:coef_table` and `:glm_model`. By default only `:glm_model` is excluded.\n\n# Operations\n\n  * `predict(mach, Xnew)`: return predictions of the target given new features `Xnew` having  the same Scitype as `X` above. Predictions are probabilistic.\n  * `predict_mean(mach, Xnew)`: instead return the mean of each prediction above\n  * `predict_median(mach, Xnew)`: instead return the median of each prediction above.\n\n# Fitted parameters\n\nThe fields of `fitted_params(mach)` are:\n\n  * `features`: The names of the features encountered during model fitting.\n  * `coef`: The linear coefficients determined by the model.\n  * `intercept`: The intercept determined by the model.\n\n# Report\n\nThe fields of `report(mach)` are:\n\n  * `deviance`: Measure of deviance of fitted model with respect to a perfectly fitted model. 
For a linear model, this is the weighted residual sum of squares\n  * `dof_residual`: The degrees of freedom for residuals, when meaningful.\n  * `stderror`: The standard errors of the coefficients.\n  * `vcov`: The estimated variance-covariance matrix of the coefficient estimates.\n  * `coef_table`: Table which displays coefficients and summarizes their significance and confidence intervals.\n  * `glm_model`: The raw fitted model returned by `GLM.lm`. Note this points to training data. Refer to the GLM.jl documentation for usage.\n\n# Examples\n\n```\nusing MLJ\nimport MLJ.Distributions.Poisson\n\n# Generate some data whose target y looks Poisson when conditioned on\n# X:\nN = 10_000\nw = [1.0, -2.0, 3.0]\nmu(x) = exp(w'x) # mean for a log link function\nXmat = rand(N, 3)\nX = MLJ.table(Xmat)\ny = map(1:N) do i\n    x = Xmat[i, :]\n    rand(Poisson(mu(x)))\nend;\n\nCountRegressor = @load LinearCountRegressor pkg=GLM\nmodel = CountRegressor(fit_intercept=false)\nmach = machine(model, X, y)\nfit!(mach)\n\nXnew = MLJ.table(rand(3, 3))\nyhat = predict(mach, Xnew)\nyhat_point = predict_mean(mach, Xnew)\n\n# get coefficients approximating `w`:\njulia> fitted_params(mach).coef\n3-element Vector{Float64}:\n  0.9969008753103842\n -2.0255901752504775\n  3.014407534033522\n\nreport(mach)\n```\n\nSee also [`LinearRegressor`](@ref), [`LinearBinaryClassifier`](@ref)\n"""
+":name" = "LinearCountRegressor"
+":human_name" = "linear count regressor"
+":is_supervised" = "`true`"
+":prediction_type" = ":probabilistic"
+":abstract_type" = "`MLJModelInterface.Probabilistic`"
+":implemented_methods" = [":clean!", ":fit", ":fitted_params", ":predict", ":predict_mean"]
+":hyperparameters" = "`(:fit_intercept, :distribution, :link, :offsetcol, :maxiter, :atol, :rtol, :minstepfac, :report_keys)`"
+":hyperparameter_types" = "`(\"Bool\", \"Distributions.Distribution\", \"GLM.Link\", \"Union{Nothing, Symbol}\", \"Integer\", \"Real\", \"Real\", \"Real\", \"Union{Nothing, AbstractVector{Symbol}}\")`"
+":hyperparameter_ranges" = "`(nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing)`"
+":iteration_parameter" = "`nothing`"
+":supports_training_losses" = "`false`"
+":reports_feature_importances" = "`false`"
+":deep_properties" = "`()`"
+":reporting_operations" = "`()`"
+":constructor" = "`nothing`"
+
+[GLM.LinearRegressor]
+":input_scitype" = "`ScientificTypesBase.Table{<:Union{AbstractVector{<:ScientificTypesBase.Continuous}, AbstractVector{<:ScientificTypesBase.Finite}}}`"
+":output_scitype" = "`ScientificTypesBase.Unknown`"
+":target_scitype" = "`AbstractVector{ScientificTypesBase.Continuous}`"
+":fit_data_scitype" = "`Union{Tuple{ScientificTypesBase.Table{<:Union{AbstractVector{<:ScientificTypesBase.Continuous}, AbstractVector{<:ScientificTypesBase.Finite}}}, AbstractVector{ScientificTypesBase.Continuous}}, Tuple{ScientificTypesBase.Table{<:Union{AbstractVector{<:ScientificTypesBase.Continuous}, AbstractVector{<:ScientificTypesBase.Finite}}}, AbstractVector{ScientificTypesBase.Continuous}, AbstractVector{<:Union{ScientificTypesBase.Continuous, ScientificTypesBase.Count}}}}`"
+":predict_scitype" = "`AbstractVector{ScientificTypesBase.Density{ScientificTypesBase.Continuous}}`"
+":transform_scitype" = "`ScientificTypesBase.Unknown`"
+":inverse_transform_scitype" = "`ScientificTypesBase.Unknown`"
+":target_in_fit" = "`true`"
+":is_pure_julia" = "`true`"
+":package_name" = "GLM"
+":package_license" = "MIT"
+":load_path" = "MLJGLMInterface.LinearRegressor"
+":package_uuid" = "38e38edf-8417-5370-95a0-9cbb8c7f171a"
+":package_url" = "https://github.com/JuliaStats/GLM.jl"
+":is_wrapper" = "`false`"
+":supports_weights" = "`true`"
+":supports_class_weights" = "`false`"
+":supports_online" = "`false`"
+":docstring" = """```\nLinearRegressor\n```\n\nA model type for constructing a linear regressor, based on [GLM.jl](https://github.com/JuliaStats/GLM.jl), and implementing the MLJ model interface.\n\nFrom MLJ, the type can be imported using\n\n```\nLinearRegressor = @load LinearRegressor pkg=GLM\n```\n\nDo `model = LinearRegressor()` to construct an instance with default hyper-parameters. Provide keyword arguments to override hyper-parameter defaults, as in `LinearRegressor(fit_intercept=...)`.\n\n`LinearRegressor` assumes the target is a continuous variable whose conditional distribution is normal with constant variance, and whose expected value is a linear combination of the features (identity link function). Options exist to specify an intercept or offset feature.\n\n# Training data\n\nIn MLJ or MLJBase, bind an instance `model` to data with one of:\n\n```\nmach = machine(model, X, y)\nmach = machine(model, X, y, w)\n```\n\nHere\n\n  * `X`: is any table of input features (eg, a `DataFrame`) whose columns are of scitype `Continuous`; check the scitype with `schema(X)`\n  * `y`: is the target, which can be any `AbstractVector` whose element scitype is `Continuous`; check the scitype with `scitype(y)`\n  * `w`: is a vector of `Real` per-observation weights\n\n# Hyper-parameters\n\n  * `fit_intercept=true`: Whether to calculate the intercept for this model.  If set to false, no intercept will be calculated (e.g. the data is expected  to be centered)\n  * `dropcollinear=false`: Whether to drop features in the training data to ensure linear independence.  If true , only the first of each set of linearly-dependent features is used. The coefficient for redundant linearly dependent features is `0.0` and all associated statistics are set to `NaN`.\n  * `offsetcol=nothing`: Name of the column to be used as an offset, if any.  An offset is a variable which is known to have a coefficient of 1.\n  * `report_keys`: `Vector` of keys for the report. 
Possible keys are: `:deviance`, `:dof_residual`, `:stderror`, `:vcov`, `:coef_table` and `:glm_model`. By default only `:glm_model` is excluded.\n\nTrain the machine using `fit!(mach, rows=...)`.\n\n# Operations\n\n  * `predict(mach, Xnew)`: return predictions of the target given new  features `Xnew` having the same Scitype as `X` above. Predictions are  probabilistic.\n  * `predict_mean(mach, Xnew)`: instead return the mean of  each prediction above\n  * `predict_median(mach, Xnew)`: instead return the median of  each prediction above.\n\n# Fitted parameters\n\nThe fields of `fitted_params(mach)` are:\n\n  * `features`: The names of the features encountered during model fitting.\n  * `coef`: The linear coefficients determined by the model.\n  * `intercept`: The intercept determined by the model.\n\n# Report\n\nWhen all keys are enabled in `report_keys`, the following fields are available in `report(mach)`:\n\n  * `deviance`: Measure of deviance of fitted model with respect to a perfectly fitted model. For a linear model, this is the weighted residual sum of squares\n  * `dof_residual`: The degrees of freedom for residuals, when meaningful.\n  * `stderror`: The standard errors of the coefficients.\n  * `vcov`: The estimated variance-covariance matrix of the coefficient estimates.\n  * `coef_table`: Table which displays coefficients and summarizes their significance and confidence intervals.\n  * `glm_model`: The raw fitted model returned by `GLM.lm`. Note this points to training data. 
Refer to the GLM.jl documentation for usage.\n\n# Examples\n\n```\nusing MLJ\nLinearRegressor = @load LinearRegressor pkg=GLM\nglm = LinearRegressor()\n\nX, y = make_regression(100, 2) # synthetic data\nmach = machine(glm, X, y) |> fit!\n\nXnew, _ = make_regression(3, 2)\nyhat = predict(mach, Xnew) # new predictions\nyhat_point = predict_mean(mach, Xnew) # new predictions\n\nfitted_params(mach).features\nfitted_params(mach).coef # x1, x2, intercept\nfitted_params(mach).intercept\n\nreport(mach)\n```\n\nSee also [`LinearCountRegressor`](@ref), [`LinearBinaryClassifier`](@ref)\n"""
+":name" = "LinearRegressor"
+":human_name" = "linear regressor"
+":is_supervised" = "`true`"
+":prediction_type" = ":probabilistic"
+":abstract_type" = "`MLJModelInterface.Probabilistic`"
+":implemented_methods" = [":clean!", ":fit", ":fitted_params", ":predict", ":predict_mean"]
+":hyperparameters" = "`(:fit_intercept, :dropcollinear, :offsetcol, :report_keys)`"
+":hyperparameter_types" = "`(\"Bool\", \"Bool\", \"Union{Nothing, Symbol}\", \"Union{Nothing, AbstractVector{Symbol}}\")`"
+":hyperparameter_ranges" = "`(nothing, nothing, nothing, nothing)`"
+":iteration_parameter" = "`nothing`"
+":supports_training_losses" = "`false`"
+":reports_feature_importances" = "`false`"
+":deep_properties" = "`()`"
+":reporting_operations" = "`()`"
+":constructor" = "`nothing`"
+
 [CatBoost.CatBoostRegressor]
 ":input_scitype" = "`Union{ScientificTypesBase.Table{<:Union{AbstractVector{<:ScientificTypesBase.Continuous}, AbstractVector{<:ScientificTypesBase.Count}, AbstractVector{<:ScientificTypesBase.OrderedFactor}, AbstractVector{<:ScientificTypesBase.Multiclass}}}, AbstractMatrix{ScientificTypesBase.Continuous}}`"
 ":output_scitype" = "`ScientificTypesBase.Unknown`"
@@ -3995,78 +4103,6 @@
 ":reporting_operations" = "`()`"
 ":constructor" = "`IteratedModel`"
 
-[PartialLeastSquaresRegressor.KPLSRegressor]
-":input_scitype" = "`ScientificTypesBase.Table{<:AbstractVector{<:ScientificTypesBase.Continuous}}`"
-":output_scitype" = "`ScientificTypesBase.Unknown`"
-":target_scitype" = "`Union{ScientificTypesBase.Table{<:AbstractVector{<:ScientificTypesBase.Continuous}}, AbstractVector{<:ScientificTypesBase.Continuous}}`"
-":fit_data_scitype" = "`Tuple{ScientificTypesBase.Table{<:AbstractVector{<:ScientificTypesBase.Continuous}}, Union{ScientificTypesBase.Table{<:AbstractVector{<:ScientificTypesBase.Continuous}}, AbstractVector{<:ScientificTypesBase.Continuous}}}`"
-":predict_scitype" = "`Union{ScientificTypesBase.Table{<:AbstractVector{<:ScientificTypesBase.Continuous}}, AbstractVector{<:ScientificTypesBase.Continuous}}`"
-":transform_scitype" = "`ScientificTypesBase.Unknown`"
-":inverse_transform_scitype" = "`ScientificTypesBase.Unknown`"
-":target_in_fit" = "`true`"
-":is_pure_julia" = "`true`"
-":package_name" = "PartialLeastSquaresRegressor"
-":package_license" = "MIT"
-":load_path" = "PartialLeastSquaresRegressor.KPLSRegressor"
-":package_uuid" = "f4b1acfe-f311-436c-bb79-8483f53c17d5"
-":package_url" = "https://github.com/lalvim/PartialLeastSquaresRegressor.jl"
-":is_wrapper" = "`false`"
-":supports_weights" = "`false`"
-":supports_class_weights" = "`false`"
-":supports_online" = "`false`"
-":docstring" = "A Kernel Partial Least Squares Regressor. A Kernel PLS2 NIPALS algorithms. Can be used mainly for regression."
-":name" = "KPLSRegressor"
-":human_name" = "kpls regressor"
-":is_supervised" = "`true`"
-":prediction_type" = ":deterministic"
-":abstract_type" = "`MLJModelInterface.Deterministic`"
-":implemented_methods" = [":clean!", ":fit", ":predict"]
-":hyperparameters" = "`(:n_factors, :kernel, :width)`"
-":hyperparameter_types" = "`(\"Integer\", \"String\", \"Real\")`"
-":hyperparameter_ranges" = "`(nothing, nothing, nothing)`"
-":iteration_parameter" = "`nothing`"
-":supports_training_losses" = "`false`"
-":reports_feature_importances" = "`false`"
-":deep_properties" = "`()`"
-":reporting_operations" = "`()`"
-":constructor" = "`nothing`"
-
-[PartialLeastSquaresRegressor.PLSRegressor]
-":input_scitype" = "`ScientificTypesBase.Table{<:AbstractVector{<:ScientificTypesBase.Continuous}}`"
-":output_scitype" = "`ScientificTypesBase.Unknown`"
-":target_scitype" = "`Union{ScientificTypesBase.Table{<:AbstractVector{<:ScientificTypesBase.Continuous}}, AbstractVector{<:ScientificTypesBase.Continuous}}`"
-":fit_data_scitype" = "`Tuple{ScientificTypesBase.Table{<:AbstractVector{<:ScientificTypesBase.Continuous}}, Union{ScientificTypesBase.Table{<:AbstractVector{<:ScientificTypesBase.Continuous}}, AbstractVector{<:ScientificTypesBase.Continuous}}}`"
-":predict_scitype" = "`Union{ScientificTypesBase.Table{<:AbstractVector{<:ScientificTypesBase.Continuous}}, AbstractVector{<:ScientificTypesBase.Continuous}}`"
-":transform_scitype" = "`ScientificTypesBase.Unknown`"
-":inverse_transform_scitype" = "`ScientificTypesBase.Unknown`"
-":target_in_fit" = "`true`"
-":is_pure_julia" = "`true`"
-":package_name" = "PartialLeastSquaresRegressor"
-":package_license" = "MIT"
-":load_path" = "PartialLeastSquaresRegressor.PLSRegressor"
-":package_uuid" = "f4b1acfe-f311-436c-bb79-8483f53c17d5"
-":package_url" = "https://github.com/lalvim/PartialLeastSquaresRegressor.jl"
-":is_wrapper" = "`false`"
-":supports_weights" = "`false`"
-":supports_class_weights" = "`false`"
-":supports_online" = "`false`"
-":docstring" = "A Partial Least Squares Regressor. Contains PLS1, PLS2 (multi target) algorithms. Can be used mainly for regression."
-":name" = "PLSRegressor"
-":human_name" = "pls regressor"
-":is_supervised" = "`true`"
-":prediction_type" = ":deterministic"
-":abstract_type" = "`MLJModelInterface.Deterministic`"
-":implemented_methods" = [":clean!", ":fit", ":predict"]
-":hyperparameters" = "`(:n_factors,)`"
-":hyperparameter_types" = "`(\"Int64\",)`"
-":hyperparameter_ranges" = "`(nothing,)`"
-":iteration_parameter" = "`nothing`"
-":supports_training_losses" = "`false`"
-":reports_feature_importances" = "`false`"
-":deep_properties" = "`()`"
-":reporting_operations" = "`()`"
-":constructor" = "`nothing`"
-
 [PartitionedLS.PartLS]
 ":input_scitype" = "`Union{ScientificTypesBase.Table{AbstractVector{ScientificTypesBase.Continuous}}, AbstractMatrix{ScientificTypesBase.Continuous}}`"
 ":output_scitype" = "`ScientificTypesBase.Unknown`"
@@ -4463,6 +4499,42 @@
 ":reporting_operations" = "`()`"
 ":constructor" = "`nothing`"
 
+[Maxnet.MaxnetBinaryClassifier]
+":input_scitype" = "`ScientificTypesBase.Table{<:Union{AbstractVector{<:ScientificTypesBase.Continuous}, AbstractVector{<:ScientificTypesBase.Finite}}}`"
+":output_scitype" = "`ScientificTypesBase.Unknown`"
+":target_scitype" = "`AbstractVector{<:ScientificTypesBase.Binary}`"
+":fit_data_scitype" = "`Tuple{ScientificTypesBase.Table{<:Union{AbstractVector{<:ScientificTypesBase.Continuous}, AbstractVector{<:ScientificTypesBase.Finite}}}, AbstractVector{<:ScientificTypesBase.Binary}}`"
+":predict_scitype" = "`ScientificTypesBase.Unknown`"
+":transform_scitype" = "`ScientificTypesBase.Unknown`"
+":inverse_transform_scitype" = "`ScientificTypesBase.Unknown`"
+":target_in_fit" = "`true`"
+":is_pure_julia" = "`false`"
+":package_name" = "Maxnet"
+":package_license" = "MIT"
+":load_path" = "Maxnet.MaxnetBinaryClassifier"
+":package_uuid" = "81f79f80-22f2-4e41-ab86-00c11cf0f26f"
+":package_url" = "https://github.com/tiemvanderdeure/Maxnet.jl"
+":is_wrapper" = "`false`"
+":supports_weights" = "`false`"
+":supports_class_weights" = "`false`"
+":supports_online" = "`false`"
+":docstring" = """```\nMaxnetBinaryClassifier\n```\n\nA model type for constructing a Maxnet, based on [Maxnet.jl](https://github.com/tiemvanderdeure/Maxnet.jl), and implementing the MLJ model interface.\n\nFrom MLJ, the type can be imported using\n\n```\nMaxnetBinaryClassifier = @load MaxnetBinaryClassifier pkg=Maxnet\n```\n\nDo `model = MaxnetBinaryClassifier()` to construct an instance with default hyper-parameters. Provide keyword arguments to override hyper-parameter defaults, as in `MaxnetBinaryClassifier(features=...)`.\n\n# Training data\n\nIn MLJ or MLJBase, bind an instance `model` to data with\n\n```\nmach = machine(model, X, y)\n```\n\nwhere\n\n  * `X`: any table of input features (eg, a `DataFrame`) whose columns each have one of the following element scitypes: `Continuous` or `<:Multiclass`. Check `scitypes` with `schema(X)`.\n  * `y`: is the target, which can be any `AbstractVector` whose element scitype is `<:Binary`. The first class should refer to background values, and the second class to presence values.\n\n# Hyper-parameters\n\n  * `features`: Specifies which feature classes to use in the model, e.g. \"lqh\" for linear, quadratic and hinge features.    See also [Maxnet.maxnet](@ref)\n  * `regularization_multiplier = 1.0`: Adjust how tightly the model will fit. Increasing this will reduce overfitting.\n  * `regularization_function`: A function to compute the regularization of each feature class. Defaults to `Maxnet.default_regularization`\n  * `addsamplestobackground = true`: Controls whether to add presence values to the background.\n  * `n_knots = 50`: The number of knots used for Threshold and Hinge features. A higher number gives more flexibility for these features.\n  * `weight_factor = 100.0`: A `Float64` value to adjust the weight of the background samples.\n  * `link = Maxnet.CloglogLink()`: The link function to use when predicting. 
See `Maxnet.predict`\n  * `clamp = false`: Clamp values passed to `MLJBase.predict` to the range the model was trained on.\n\n# Operations\n\n  * `predict(mach, Xnew)`: return predictions of the target given features `Xnew` having the same scitype as `X` above. Predictions are  probabilistic and can be interpreted as the probability of presence.\n\n# Fitted Parameters\n\nThe fields of `fitted_params(mach)` are:\n\n  * `fitresult`: A `Tuple` where the first entry is the `Maxnet.MaxnetModel` returned by the Maxnet algorithm   and the second entry is the classes of `y`\n\n# Report\n\nThe fields of `report(mach)` are:\n\n  * `selected_variables`: A `Vector` of `Symbols` of the variables that were selected.\n  * `selected_features`: A `Vector` of `Maxnet.ModelMatrixColumn` with the features that were selected.\n  * `complexity`: the number of selected features in the model.\n\n# Example\n\n```@example\nusing MLJBase, Maxnet\np_a, env = Maxnet.bradypus()\ny = coerce(p_a, Binary)\nX = coerce(env, Count => Continuous)\n\nmach = machine(MaxnetBinaryClassifier(features = \"lqp\"), X, y)\nfit!(mach)\nyhat = MLJBase.predict(mach, env)\n\n```\n"""
+":name" = "MaxnetBinaryClassifier"
+":human_name" = "Maxnet"
+":is_supervised" = "`true`"
+":prediction_type" = ":probabilistic"
+":abstract_type" = "`MLJModelInterface.Probabilistic`"
+":implemented_methods" = [":fit", ":predict"]
+":hyperparameters" = "`(:features, :regularization_multiplier, :regularization_function, :addsamplestobackground, :n_knots, :weight_factor, :link, :clamp, :kw)`"
+":hyperparameter_types" = "`(\"Union{String, Vector{<:Maxnet.AbstractFeatureClass}}\", \"Float64\", \"Any\", \"Bool\", \"Integer\", \"Float64\", \"GLM.Link\", \"Bool\", \"Any\")`"
+":hyperparameter_ranges" = "`(nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing)`"
+":iteration_parameter" = "`nothing`"
+":supports_training_losses" = "`false`"
+":reports_feature_importances" = "`false`"
+":deep_properties" = "`()`"
+":reporting_operations" = "`()`"
+":constructor" = "`nothing`"
+
 [ParallelKMeans.KMeans]
 ":input_scitype" = "`ScientificTypesBase.Table{<:AbstractVector{<:ScientificTypesBase.Continuous}}`"
 ":output_scitype" = "`ScientificTypesBase.Table{<:AbstractVector{<:ScientificTypesBase.Continuous}}`"
@@ -6731,12 +6803,84 @@
 ":reporting_operations" = "`()`"
 ":constructor" = "`nothing`"
 
+[SymbolicRegression.SRTestRegressor]
+":input_scitype" = "`Union{ScientificTypesBase.Table{<:Union{AbstractVector{<:ScientificTypesBase.Continuous}, AbstractVector{<:ScientificTypesBase.Count}}}, AbstractMatrix{<:ScientificTypesBase.Continuous}}`"
+":output_scitype" = "`ScientificTypesBase.Unknown`"
+":target_scitype" = "`AbstractVector{<:ScientificTypesBase.Continuous}`"
+":fit_data_scitype" = "`Union{Tuple{Union{ScientificTypesBase.Table{<:Union{AbstractVector{<:ScientificTypesBase.Continuous}, AbstractVector{<:ScientificTypesBase.Count}}}, AbstractMatrix{<:ScientificTypesBase.Continuous}}, AbstractVector{<:ScientificTypesBase.Continuous}}, Tuple{Union{ScientificTypesBase.Table{<:Union{AbstractVector{<:ScientificTypesBase.Continuous}, AbstractVector{<:ScientificTypesBase.Count}}}, AbstractMatrix{<:ScientificTypesBase.Continuous}}, AbstractVector{<:ScientificTypesBase.Continuous}, AbstractVector{<:Union{ScientificTypesBase.Continuous, ScientificTypesBase.Count}}}}`"
+":predict_scitype" = "`AbstractVector{<:ScientificTypesBase.Continuous}`"
+":transform_scitype" = "`ScientificTypesBase.Unknown`"
+":inverse_transform_scitype" = "`ScientificTypesBase.Unknown`"
+":target_in_fit" = "`true`"
+":is_pure_julia" = "`true`"
+":package_name" = "SymbolicRegression"
+":package_license" = "Apache-2.0"
+":load_path" = "SymbolicRegression.MLJInterfaceModule.SRTestRegressor"
+":package_uuid" = "8254be44-1295-4e6a-a16d-46603ac705cb"
+":package_url" = "https://github.com/MilesCranmer/SymbolicRegression.jl"
+":is_wrapper" = "`false`"
+":supports_weights" = "`true`"
+":supports_class_weights" = "`false`"
+":supports_online" = "`false`"
+":docstring" = """```\nSRTestRegressor\n```\n\nA model type for constructing a Symbolic Regression via Evolutionary Search, based on\n[SymbolicRegression.jl](https://github.com/MilesCranmer/SymbolicRegression.jl), and implementing the MLJ\nmodel interface.\n\nFrom MLJ, the type can be imported using\n```\nSRTestRegressor = @load SRTestRegressor pkg=SymbolicRegression\n```\n\nDo `model = SRTestRegressor()` to construct an instance with default hyper-parameters. Provide keyword arguments to override hyper-parameter defaults, as in\n`SRTestRegressor(defaults=...)`.\n# Hyper-parameters\n\n- `defaults = nothing`\n\n- `binary_operators = nothing`\n\n- `unary_operators = nothing`\n\n- `maxsize = nothing`\n\n- `maxdepth = nothing`\n\n- `expression_spec = nothing`\n\n- `populations = nothing`\n\n- `population_size = nothing`\n\n- `ncycles_per_iteration = nothing`\n\n- `elementwise_loss = nothing`\n\n- `loss_function = nothing`\n\n- `loss_function_expression = nothing`\n\n- `dimensional_constraint_penalty = nothing`\n\n- `parsimony = nothing`\n\n- `constraints = nothing`\n\n- `nested_constraints = nothing`\n\n- `complexity_of_operators = nothing`\n\n- `complexity_of_constants = nothing`\n\n- `complexity_of_variables = nothing`\n\n- `warmup_maxsize_by = nothing`\n\n- `adaptive_parsimony_scaling = nothing`\n\n- `mutation_weights = nothing`\n\n- `crossover_probability = nothing`\n\n- `annealing = nothing`\n\n- `alpha = nothing`\n\n- `tournament_selection_n = nothing`\n\n- `tournament_selection_p = nothing`\n\n- `early_stop_condition = nothing`\n\n- `batching = nothing`\n\n- `batch_size = nothing`\n\n- `dimensionless_constants_only = false`\n\n- `complexity_mapping = nothing`\n\n- `use_frequency = true`\n\n- `use_frequency_in_tournament = true`\n\n- `should_simplify = nothing`\n\n- `perturbation_factor = nothing`\n\n- `probability_negate_constant = nothing`\n\n- `skip_mutation_failures = true`\n\n- `optimizer_algorithm = Optim.BFGS{LineSearches.InitialStatic{Float64}, 
LineSearches.BackTracking{Float64, Int64}, Nothing, Nothing, Optim.Flat}(LineSearches.InitialStatic{Float64}\n  alpha: Float64 1.0\n  scaled: Bool false\n, LineSearches.BackTracking{Float64, Int64}\n  c_1: Float64 0.0001\n  ρ_hi: Float64 0.5\n  ρ_lo: Float64 0.1\n  iterations: Int64 1000\n  order: Int64 3\n  maxstep: Float64 Inf\n  cache: Nothing nothing\n, nothing, nothing, Optim.Flat())`\n\n- `optimizer_nrestarts = 2`\n\n- `optimizer_probability = 0.14`\n\n- `optimizer_iterations = nothing`\n\n- `optimizer_f_calls_limit = nothing`\n\n- `optimizer_options = nothing`\n\n- `should_optimize_constants = true`\n\n- `migration = true`\n\n- `hof_migration = true`\n\n- `fraction_replaced = nothing`\n\n- `fraction_replaced_hof = nothing`\n\n- `topn = nothing`\n\n- `timeout_in_seconds = nothing`\n\n- `max_evals = nothing`\n\n- `input_stream = Base.TTY(RawFD(9) paused, 0 bytes waiting)`\n\n- `turbo = false`\n\n- `bumper = false`\n\n- `autodiff_backend = nothing`\n\n- `deterministic = false`\n\n- `seed = nothing`\n\n- `verbosity = nothing`\n\n- `print_precision = 5`\n\n- `progress = nothing`\n\n- `output_directory = nothing`\n\n- `save_to_file = true`\n\n- `bin_constraints = nothing`\n\n- `una_constraints = nothing`\n\n- `terminal_width = nothing`\n\n- `use_recorder = false`\n\n- `recorder_file = pysr_recorder.json`\n\n- `define_helper_functions = true`\n\n- `expression_type = nothing`\n\n- `expression_options = nothing`\n\n- `node_type = nothing`\n\n- `output_file = nothing`\n\n- `fast_cycle = false`\n\n- `npopulations = nothing`\n\n- `npop = nothing`\n\n- `niterations = 1`\n\n- `parallelism = multithreading`\n\n- `numprocs = nothing`\n\n- `procs = nothing`\n\n- `addprocs_function = nothing`\n\n- `heap_size_hint_in_bytes = nothing`\n\n- `worker_imports = nothing`\n\n- `logger = nothing`\n\n- `runtests = true`\n\n- `run_id = nothing`\n\n- `loss_type = Nothing`\n\n- `selection_method = choose_best`\n\n- `dimensions_type = 
DynamicQuantities.SymbolicDimensions{DynamicQuantities.FixedRational{Int32, 25200}}`\n\n"""
+":name" = "SRTestRegressor"
+":human_name" = "Symbolic Regression via Evolutionary Search"
+":is_supervised" = "`true`"
+":prediction_type" = ":deterministic"
+":abstract_type" = "`MLJModelInterface.Deterministic`"
+":implemented_methods" = []
+":hyperparameters" = "`(:defaults, :binary_operators, :unary_operators, :maxsize, :maxdepth, :expression_spec, :populations, :population_size, :ncycles_per_iteration, :elementwise_loss, :loss_function, :loss_function_expression, :dimensional_constraint_penalty, :parsimony, :constraints, :nested_constraints, :complexity_of_operators, :complexity_of_constants, :complexity_of_variables, :warmup_maxsize_by, :adaptive_parsimony_scaling, :mutation_weights, :crossover_probability, :annealing, :alpha, :tournament_selection_n, :tournament_selection_p, :early_stop_condition, :batching, :batch_size, :dimensionless_constants_only, :complexity_mapping, :use_frequency, :use_frequency_in_tournament, :should_simplify, :perturbation_factor, :probability_negate_constant, :skip_mutation_failures, :optimizer_algorithm, :optimizer_nrestarts, :optimizer_probability, :optimizer_iterations, :optimizer_f_calls_limit, :optimizer_options, :should_optimize_constants, :migration, :hof_migration, :fraction_replaced, :fraction_replaced_hof, :topn, :timeout_in_seconds, :max_evals, :input_stream, :turbo, :bumper, :autodiff_backend, :deterministic, :seed, :verbosity, :print_precision, :progress, :output_directory, :save_to_file, :bin_constraints, :una_constraints, :terminal_width, :use_recorder, :recorder_file, :define_helper_functions, :expression_type, :expression_options, :node_type, :output_file, :fast_cycle, :npopulations, :npop, :niterations, :parallelism, :numprocs, :procs, :addprocs_function, :heap_size_hint_in_bytes, :worker_imports, :logger, :runtests, :run_id, :loss_type, :selection_method, :dimensions_type)`"
+":hyperparameter_types" = "`(\"Union{Nothing, VersionNumber}\", \"Any\", \"Any\", \"Union{Nothing, Integer}\", \"Union{Nothing, Integer}\", \"Union{Nothing, SymbolicRegression.CoreModule.ExpressionSpecModule.AbstractExpressionSpec}\", \"Union{Nothing, Integer}\", \"Union{Nothing, Integer}\", \"Union{Nothing, Integer}\", \"Union{Nothing, Function, LossFunctions.Traits.SupervisedLoss}\", \"Union{Nothing, Function}\", \"Union{Nothing, Function}\", \"Union{Nothing, Real}\", \"Union{Nothing, Real}\", \"Any\", \"Any\", \"Any\", \"Union{Nothing, Real}\", \"Union{Nothing, Real, AbstractVector}\", \"Union{Nothing, Real}\", \"Union{Nothing, Real}\", \"Union{Nothing, SymbolicRegression.CoreModule.MutationWeightsModule.AbstractMutationWeights, NamedTuple, AbstractVector}\", \"Union{Nothing, Real}\", \"Union{Nothing, Bool}\", \"Union{Nothing, Real}\", \"Union{Nothing, Integer}\", \"Union{Nothing, Real}\", \"Union{Nothing, Function, Real}\", \"Union{Nothing, Bool}\", \"Union{Nothing, Integer}\", \"Bool\", \"Union{Nothing, Function, SymbolicRegression.CoreModule.OptionsStructModule.ComplexityMapping}\", \"Bool\", \"Bool\", \"Union{Nothing, Bool}\", \"Union{Nothing, Real}\", \"Union{Nothing, Real}\", \"Bool\", \"Union{AbstractString, Optim.AbstractOptimizer}\", \"Int64\", \"AbstractFloat\", \"Union{Nothing, Integer}\", \"Union{Nothing, Integer}\", \"Union{Nothing, Dict, NamedTuple, Optim.Options}\", \"Bool\", \"Bool\", \"Bool\", \"Union{Nothing, Real}\", \"Union{Nothing, Real}\", \"Union{Nothing, Integer}\", \"Union{Nothing, Real}\", \"Union{Nothing, Integer}\", \"IO\", \"Bool\", \"Bool\", \"Union{Nothing, ADTypes.AbstractADType, Symbol}\", \"Bool\", \"Any\", \"Union{Nothing, Integer}\", \"Integer\", \"Union{Nothing, Bool}\", \"Union{Nothing, String}\", \"Bool\", \"Any\", \"Any\", \"Union{Nothing, Integer}\", \"Bool\", \"AbstractString\", \"Bool\", \"Union{Nothing, Type{<:DynamicExpressions.ExpressionModule.AbstractExpression}}\", \"Union{Nothing, NamedTuple}\", \"Union{Nothing, 
Type{<:DynamicExpressions.NodeModule.AbstractExpressionNode}}\", \"Union{Nothing, AbstractString}\", \"Bool\", \"Union{Nothing, Integer}\", \"Union{Nothing, Integer}\", \"Int64\", \"Symbol\", \"Union{Nothing, Int64}\", \"Union{Nothing, Vector{Int64}}\", \"Union{Nothing, Function}\", \"Union{Nothing, Integer}\", \"Union{Nothing, Vector{Symbol}}\", \"Union{Nothing, SymbolicRegression.LoggingModule.AbstractSRLogger}\", \"Bool\", \"Union{Nothing, String}\", \"Type\", \"Function\", \"Type{D} where D<:DynamicQuantities.AbstractDimensions\")`"
+":hyperparameter_ranges" = "`(nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing)`"
+":iteration_parameter" = "`nothing`"
+":supports_training_losses" = "`false`"
+":reports_feature_importances" = "`false`"
+":deep_properties" = "`()`"
+":reporting_operations" = "`()`"
+":constructor" = "`nothing`"
+
+[SymbolicRegression.MultitargetSRTestRegressor]
+":input_scitype" = "`Union{ScientificTypesBase.Table{<:Union{AbstractVector{<:ScientificTypesBase.Continuous}, AbstractVector{<:ScientificTypesBase.Count}}}, AbstractMatrix{<:ScientificTypesBase.Continuous}}`"
+":output_scitype" = "`ScientificTypesBase.Unknown`"
+":target_scitype" = "`Union{ScientificTypesBase.Table{<:AbstractVector{<:ScientificTypesBase.Continuous}}, AbstractMatrix{<:ScientificTypesBase.Continuous}}`"
+":fit_data_scitype" = "`Union{Tuple{Union{ScientificTypesBase.Table{<:Union{AbstractVector{<:ScientificTypesBase.Continuous}, AbstractVector{<:ScientificTypesBase.Count}}}, AbstractMatrix{<:ScientificTypesBase.Continuous}}, Union{ScientificTypesBase.Table{<:AbstractVector{<:ScientificTypesBase.Continuous}}, AbstractMatrix{<:ScientificTypesBase.Continuous}}}, Tuple{Union{ScientificTypesBase.Table{<:Union{AbstractVector{<:ScientificTypesBase.Continuous}, AbstractVector{<:ScientificTypesBase.Count}}}, AbstractMatrix{<:ScientificTypesBase.Continuous}}, Union{ScientificTypesBase.Table{<:AbstractVector{<:ScientificTypesBase.Continuous}}, AbstractMatrix{<:ScientificTypesBase.Continuous}}, AbstractVector{<:Union{ScientificTypesBase.Continuous, ScientificTypesBase.Count}}}}`"
+":predict_scitype" = "`Union{ScientificTypesBase.Table{<:AbstractVector{<:ScientificTypesBase.Continuous}}, AbstractMatrix{<:ScientificTypesBase.Continuous}}`"
+":transform_scitype" = "`ScientificTypesBase.Unknown`"
+":inverse_transform_scitype" = "`ScientificTypesBase.Unknown`"
+":target_in_fit" = "`true`"
+":is_pure_julia" = "`true`"
+":package_name" = "SymbolicRegression"
+":package_license" = "Apache-2.0"
+":load_path" = "SymbolicRegression.MLJInterfaceModule.MultitargetSRTestRegressor"
+":package_uuid" = "8254be44-1295-4e6a-a16d-46603ac705cb"
+":package_url" = "https://github.com/MilesCranmer/SymbolicRegression.jl"
+":is_wrapper" = "`false`"
+":supports_weights" = "`true`"
+":supports_class_weights" = "`false`"
+":supports_online" = "`false`"
+":docstring" = """```\nMultitargetSRTestRegressor\n```\n\nA model type for constructing a Multi-Target Symbolic Regression via Evolutionary Search, based on\n[SymbolicRegression.jl](https://github.com/MilesCranmer/SymbolicRegression.jl), and implementing the MLJ\nmodel interface.\n\nFrom MLJ, the type can be imported using\n```\nMultitargetSRTestRegressor = @load MultitargetSRTestRegressor pkg=SymbolicRegression\n```\n\nDo `model = MultitargetSRTestRegressor()` to construct an instance with default hyper-parameters. Provide keyword arguments to override hyper-parameter defaults, as in\n`MultitargetSRTestRegressor(defaults=...)`.\n# Hyper-parameters\n\n- `defaults = nothing`\n\n- `binary_operators = nothing`\n\n- `unary_operators = nothing`\n\n- `maxsize = nothing`\n\n- `maxdepth = nothing`\n\n- `expression_spec = nothing`\n\n- `populations = nothing`\n\n- `population_size = nothing`\n\n- `ncycles_per_iteration = nothing`\n\n- `elementwise_loss = nothing`\n\n- `loss_function = nothing`\n\n- `loss_function_expression = nothing`\n\n- `dimensional_constraint_penalty = nothing`\n\n- `parsimony = nothing`\n\n- `constraints = nothing`\n\n- `nested_constraints = nothing`\n\n- `complexity_of_operators = nothing`\n\n- `complexity_of_constants = nothing`\n\n- `complexity_of_variables = nothing`\n\n- `warmup_maxsize_by = nothing`\n\n- `adaptive_parsimony_scaling = nothing`\n\n- `mutation_weights = nothing`\n\n- `crossover_probability = nothing`\n\n- `annealing = nothing`\n\n- `alpha = nothing`\n\n- `tournament_selection_n = nothing`\n\n- `tournament_selection_p = nothing`\n\n- `early_stop_condition = nothing`\n\n- `batching = nothing`\n\n- `batch_size = nothing`\n\n- `dimensionless_constants_only = false`\n\n- `complexity_mapping = nothing`\n\n- `use_frequency = true`\n\n- `use_frequency_in_tournament = true`\n\n- `should_simplify = nothing`\n\n- `perturbation_factor = nothing`\n\n- `probability_negate_constant = nothing`\n\n- `skip_mutation_failures = true`\n\n- 
`optimizer_algorithm = Optim.BFGS{LineSearches.InitialStatic{Float64}, LineSearches.BackTracking{Float64, Int64}, Nothing, Nothing, Optim.Flat}(LineSearches.InitialStatic{Float64}\n  alpha: Float64 1.0\n  scaled: Bool false\n, LineSearches.BackTracking{Float64, Int64}\n  c_1: Float64 0.0001\n  ρ_hi: Float64 0.5\n  ρ_lo: Float64 0.1\n  iterations: Int64 1000\n  order: Int64 3\n  maxstep: Float64 Inf\n  cache: Nothing nothing\n, nothing, nothing, Optim.Flat())`\n\n- `optimizer_nrestarts = 2`\n\n- `optimizer_probability = 0.14`\n\n- `optimizer_iterations = nothing`\n\n- `optimizer_f_calls_limit = nothing`\n\n- `optimizer_options = nothing`\n\n- `should_optimize_constants = true`\n\n- `migration = true`\n\n- `hof_migration = true`\n\n- `fraction_replaced = nothing`\n\n- `fraction_replaced_hof = nothing`\n\n- `topn = nothing`\n\n- `timeout_in_seconds = nothing`\n\n- `max_evals = nothing`\n\n- `input_stream = Base.TTY(RawFD(9) paused, 0 bytes waiting)`\n\n- `turbo = false`\n\n- `bumper = false`\n\n- `autodiff_backend = nothing`\n\n- `deterministic = false`\n\n- `seed = nothing`\n\n- `verbosity = nothing`\n\n- `print_precision = 5`\n\n- `progress = nothing`\n\n- `output_directory = nothing`\n\n- `save_to_file = true`\n\n- `bin_constraints = nothing`\n\n- `una_constraints = nothing`\n\n- `terminal_width = nothing`\n\n- `use_recorder = false`\n\n- `recorder_file = pysr_recorder.json`\n\n- `define_helper_functions = true`\n\n- `expression_type = nothing`\n\n- `expression_options = nothing`\n\n- `node_type = nothing`\n\n- `output_file = nothing`\n\n- `fast_cycle = false`\n\n- `npopulations = nothing`\n\n- `npop = nothing`\n\n- `niterations = 1`\n\n- `parallelism = multithreading`\n\n- `numprocs = nothing`\n\n- `procs = nothing`\n\n- `addprocs_function = nothing`\n\n- `heap_size_hint_in_bytes = nothing`\n\n- `worker_imports = nothing`\n\n- `logger = nothing`\n\n- `runtests = true`\n\n- `run_id = nothing`\n\n- `loss_type = Nothing`\n\n- `selection_method = choose_best`\n\n- 
`dimensions_type = DynamicQuantities.SymbolicDimensions{DynamicQuantities.FixedRational{Int32, 25200}}`\n\n"""
+":name" = "MultitargetSRTestRegressor"
+":human_name" = "Multi-Target Symbolic Regression via Evolutionary Search"
+":is_supervised" = "`true`"
+":prediction_type" = ":deterministic"
+":abstract_type" = "`MLJModelInterface.Deterministic`"
+":implemented_methods" = []
+":hyperparameters" = "`(:defaults, :binary_operators, :unary_operators, :maxsize, :maxdepth, :expression_spec, :populations, :population_size, :ncycles_per_iteration, :elementwise_loss, :loss_function, :loss_function_expression, :dimensional_constraint_penalty, :parsimony, :constraints, :nested_constraints, :complexity_of_operators, :complexity_of_constants, :complexity_of_variables, :warmup_maxsize_by, :adaptive_parsimony_scaling, :mutation_weights, :crossover_probability, :annealing, :alpha, :tournament_selection_n, :tournament_selection_p, :early_stop_condition, :batching, :batch_size, :dimensionless_constants_only, :complexity_mapping, :use_frequency, :use_frequency_in_tournament, :should_simplify, :perturbation_factor, :probability_negate_constant, :skip_mutation_failures, :optimizer_algorithm, :optimizer_nrestarts, :optimizer_probability, :optimizer_iterations, :optimizer_f_calls_limit, :optimizer_options, :should_optimize_constants, :migration, :hof_migration, :fraction_replaced, :fraction_replaced_hof, :topn, :timeout_in_seconds, :max_evals, :input_stream, :turbo, :bumper, :autodiff_backend, :deterministic, :seed, :verbosity, :print_precision, :progress, :output_directory, :save_to_file, :bin_constraints, :una_constraints, :terminal_width, :use_recorder, :recorder_file, :define_helper_functions, :expression_type, :expression_options, :node_type, :output_file, :fast_cycle, :npopulations, :npop, :niterations, :parallelism, :numprocs, :procs, :addprocs_function, :heap_size_hint_in_bytes, :worker_imports, :logger, :runtests, :run_id, :loss_type, :selection_method, :dimensions_type)`"
+":hyperparameter_types" = "`(\"Union{Nothing, VersionNumber}\", \"Any\", \"Any\", \"Union{Nothing, Integer}\", \"Union{Nothing, Integer}\", \"Union{Nothing, SymbolicRegression.CoreModule.ExpressionSpecModule.AbstractExpressionSpec}\", \"Union{Nothing, Integer}\", \"Union{Nothing, Integer}\", \"Union{Nothing, Integer}\", \"Union{Nothing, Function, LossFunctions.Traits.SupervisedLoss}\", \"Union{Nothing, Function}\", \"Union{Nothing, Function}\", \"Union{Nothing, Real}\", \"Union{Nothing, Real}\", \"Any\", \"Any\", \"Any\", \"Union{Nothing, Real}\", \"Union{Nothing, Real, AbstractVector}\", \"Union{Nothing, Real}\", \"Union{Nothing, Real}\", \"Union{Nothing, SymbolicRegression.CoreModule.MutationWeightsModule.AbstractMutationWeights, NamedTuple, AbstractVector}\", \"Union{Nothing, Real}\", \"Union{Nothing, Bool}\", \"Union{Nothing, Real}\", \"Union{Nothing, Integer}\", \"Union{Nothing, Real}\", \"Union{Nothing, Function, Real}\", \"Union{Nothing, Bool}\", \"Union{Nothing, Integer}\", \"Bool\", \"Union{Nothing, Function, SymbolicRegression.CoreModule.OptionsStructModule.ComplexityMapping}\", \"Bool\", \"Bool\", \"Union{Nothing, Bool}\", \"Union{Nothing, Real}\", \"Union{Nothing, Real}\", \"Bool\", \"Union{AbstractString, Optim.AbstractOptimizer}\", \"Int64\", \"AbstractFloat\", \"Union{Nothing, Integer}\", \"Union{Nothing, Integer}\", \"Union{Nothing, Dict, NamedTuple, Optim.Options}\", \"Bool\", \"Bool\", \"Bool\", \"Union{Nothing, Real}\", \"Union{Nothing, Real}\", \"Union{Nothing, Integer}\", \"Union{Nothing, Real}\", \"Union{Nothing, Integer}\", \"IO\", \"Bool\", \"Bool\", \"Union{Nothing, ADTypes.AbstractADType, Symbol}\", \"Bool\", \"Any\", \"Union{Nothing, Integer}\", \"Integer\", \"Union{Nothing, Bool}\", \"Union{Nothing, String}\", \"Bool\", \"Any\", \"Any\", \"Union{Nothing, Integer}\", \"Bool\", \"AbstractString\", \"Bool\", \"Union{Nothing, Type{<:DynamicExpressions.ExpressionModule.AbstractExpression}}\", \"Union{Nothing, NamedTuple}\", \"Union{Nothing, 
Type{<:DynamicExpressions.NodeModule.AbstractExpressionNode}}\", \"Union{Nothing, AbstractString}\", \"Bool\", \"Union{Nothing, Integer}\", \"Union{Nothing, Integer}\", \"Int64\", \"Symbol\", \"Union{Nothing, Int64}\", \"Union{Nothing, Vector{Int64}}\", \"Union{Nothing, Function}\", \"Union{Nothing, Integer}\", \"Union{Nothing, Vector{Symbol}}\", \"Union{Nothing, SymbolicRegression.LoggingModule.AbstractSRLogger}\", \"Bool\", \"Union{Nothing, String}\", \"Type\", \"Function\", \"Type{D} where D<:DynamicQuantities.AbstractDimensions\")`"
+":hyperparameter_ranges" = "`(nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing)`"
+":iteration_parameter" = "`nothing`"
+":supports_training_losses" = "`false`"
+":reports_feature_importances" = "`false`"
+":deep_properties" = "`()`"
+":reporting_operations" = "`()`"
+":constructor" = "`nothing`"
+
 [SymbolicRegression.MultitargetSRRegressor]
 ":input_scitype" = "`Union{ScientificTypesBase.Table{<:Union{AbstractVector{<:ScientificTypesBase.Continuous}, AbstractVector{<:ScientificTypesBase.Count}}}, AbstractMatrix{<:ScientificTypesBase.Continuous}}`"
 ":output_scitype" = "`ScientificTypesBase.Unknown`"
-":target_scitype" = "`Union{AbstractMatrix, ScientificTypesBase.Table{<:AbstractVector}}`"
-":fit_data_scitype" = "`Union{Tuple{Union{ScientificTypesBase.Table{<:Union{AbstractVector{<:ScientificTypesBase.Continuous}, AbstractVector{<:ScientificTypesBase.Count}}}, AbstractMatrix{<:ScientificTypesBase.Continuous}}, Union{AbstractMatrix, ScientificTypesBase.Table{<:AbstractVector}}}, Tuple{Union{ScientificTypesBase.Table{<:Union{AbstractVector{<:ScientificTypesBase.Continuous}, AbstractVector{<:ScientificTypesBase.Count}}}, AbstractMatrix{<:ScientificTypesBase.Continuous}}, Union{AbstractMatrix, ScientificTypesBase.Table{<:AbstractVector}}, AbstractVector{<:Union{ScientificTypesBase.Continuous, ScientificTypesBase.Count}}}}`"
-":predict_scitype" = "`Union{AbstractMatrix, ScientificTypesBase.Table{<:AbstractVector}}`"
+":target_scitype" = "`Union{ScientificTypesBase.Table{<:AbstractVector{<:ScientificTypesBase.Continuous}}, AbstractMatrix{<:ScientificTypesBase.Continuous}}`"
+":fit_data_scitype" = "`Union{Tuple{Union{ScientificTypesBase.Table{<:Union{AbstractVector{<:ScientificTypesBase.Continuous}, AbstractVector{<:ScientificTypesBase.Count}}}, AbstractMatrix{<:ScientificTypesBase.Continuous}}, Union{ScientificTypesBase.Table{<:AbstractVector{<:ScientificTypesBase.Continuous}}, AbstractMatrix{<:ScientificTypesBase.Continuous}}}, Tuple{Union{ScientificTypesBase.Table{<:Union{AbstractVector{<:ScientificTypesBase.Continuous}, AbstractVector{<:ScientificTypesBase.Count}}}, AbstractMatrix{<:ScientificTypesBase.Continuous}}, Union{ScientificTypesBase.Table{<:AbstractVector{<:ScientificTypesBase.Continuous}}, AbstractMatrix{<:ScientificTypesBase.Continuous}}, AbstractVector{<:Union{ScientificTypesBase.Continuous, ScientificTypesBase.Count}}}}`"
+":predict_scitype" = "`Union{ScientificTypesBase.Table{<:AbstractVector{<:ScientificTypesBase.Continuous}}, AbstractMatrix{<:ScientificTypesBase.Continuous}}`"
 ":transform_scitype" = "`ScientificTypesBase.Unknown`"
 ":inverse_transform_scitype" = "`ScientificTypesBase.Unknown`"
 ":target_in_fit" = "`true`"
@@ -6750,16 +6894,16 @@
 ":supports_weights" = "`true`"
 ":supports_class_weights" = "`false`"
 ":supports_online" = "`false`"
-":docstring" = """```\nMultitargetSRRegressor\n```\n\nA model type for constructing a Multi-Target Symbolic Regression via Evolutionary Search, based on [SymbolicRegression.jl](https://github.com/MilesCranmer/SymbolicRegression.jl), and implementing the MLJ model interface.\n\nFrom MLJ, the type can be imported using\n\n```\nMultitargetSRRegressor = @load MultitargetSRRegressor pkg=SymbolicRegression\n```\n\nDo `model = MultitargetSRRegressor()` to construct an instance with default hyper-parameters. Provide keyword arguments to override hyper-parameter defaults, as in `MultitargetSRRegressor(defaults=...)`.\n\nMulti-target Symbolic Regression regressor (`MultitargetSRRegressor`) conducts several searches for expressions that predict each target variable from a set of input variables. All data is assumed to be `Continuous`. The search is performed using an evolutionary algorithm. This algorithm is described in the paper https://arxiv.org/abs/2305.01582.\n\n# Training data\n\nIn MLJ or MLJBase, bind an instance `model` to data with\n\n```\nmach = machine(model, X, y)\n```\n\nOR\n\n```\nmach = machine(model, X, y, w)\n```\n\nHere:\n\n  * `X` is any table of input features (eg, a `DataFrame`) whose columns are of scitype\n\n`Continuous`; check column scitypes with `schema(X)`. Variable names in discovered expressions will be taken from the column names of `X`, if available. Units in columns of `X` (use `DynamicQuantities` for units) will trigger dimensional analysis to be used.\n\n  * `y` is the target, which can be any table of target variables whose element scitype is `Continuous`; check the scitype with `schema(y)`. Units in columns of `y` (use `DynamicQuantities` for units) will trigger dimensional analysis to be used.\n  * `w` is the observation weights which can either be `nothing` (default) or an `AbstractVector` whose element scitype is `Count` or `Continuous`. 
The same weights are used for all targets.\n\nTrain the machine using `fit!(mach)`, inspect the discovered expressions with `report(mach)`, and predict on new data with `predict(mach, Xnew)`. Note that unlike other regressors, symbolic regression stores a list of lists of trained models. The models chosen from each of these lists is defined by the function `selection_method` keyword argument, which by default balances accuracy and complexity. You can override this at prediction time by passing a named tuple with keys `data` and `idx`.\n\n# Hyper-parameters\n\n  * `defaults`: What set of defaults to use for `Options`. The default,   `nothing`, will simply take the default options from the current version of SymbolicRegression.   However, you may also select the defaults from an earlier version, such as `v\"0.24.5\"`.\n  * `binary_operators`: Vector of binary operators (functions) to use.   Each operator should be defined for two input scalars,   and one output scalar. All operators   need to be defined over the entire real line (excluding infinity - these   are stopped before they are input), or return `NaN` where not defined.   For speed, define it so it takes two reals   of the same type as input, and outputs the same type. For the SymbolicUtils   simplification backend, you will need to define a generic method of the   operator so it takes arbitrary types.\n  * `unary_operators`: Same, but for   unary operators (one input scalar, gives an output scalar).\n  * `constraints`: Array of pairs specifying size constraints   for each operator. The constraints for a binary operator should be a 2-tuple   (e.g., `(-1, -1)`) and the constraints for a unary operator should be an `Int`.   A size constraint is a limit to the size of the subtree   in each argument of an operator. e.g., `[(^)=>(-1, 3)]` means that the   `^` operator can have arbitrary size (`-1`) in its left argument,   but a maximum size of `3` in its right argument. 
Default is   no constraints.\n  * `batching`: Whether to evolve based on small mini-batches of data,   rather than the entire dataset.\n  * `batch_size`: What batch size to use if using batching.\n  * `elementwise_loss`: What elementwise loss function to use. Can be one of   the following losses, or any other loss of type   `SupervisedLoss`. You can also pass a function that takes   a scalar target (left argument), and scalar predicted (right   argument), and returns a scalar. This will be averaged   over the predicted data. If weights are supplied, your   function should take a third argument for the weight scalar.   Included losses:       Regression:           - `LPDistLoss{P}()`,           - `L1DistLoss()`,           - `L2DistLoss()` (mean square),           - `LogitDistLoss()`,           - `HuberLoss(d)`,           - `L1EpsilonInsLoss(ϵ)`,           - `L2EpsilonInsLoss(ϵ)`,           - `PeriodicLoss(c)`,           - `QuantileLoss(τ)`,       Classification:           - `ZeroOneLoss()`,           - `PerceptronLoss()`,           - `L1HingeLoss()`,           - `SmoothedL1HingeLoss(γ)`,           - `ModifiedHuberLoss()`,           - `L2MarginLoss()`,           - `ExpLoss()`,           - `SigmoidLoss()`,           - `DWDMarginLoss(q)`.\n  * `loss_function`: Alternatively, you may redefine the loss used   as any function of `tree::AbstractExpressionNode{T}`, `dataset::Dataset{T}`,   and `options::AbstractOptions`, so long as you output a non-negative   scalar of type `T`. This is useful if you want to use a loss   that takes into account derivatives, or correlations across   the dataset. This also means you could use a custom evaluation   for a particular expression. If you are using   `batching=true`, then your function should   accept a fourth argument `idx`, which is either `nothing`   (indicating that the full dataset should be used), or a vector   of indices to use for the batch.   
For example,\n\n    ```\n      function my_loss(tree, dataset::Dataset{T,L}, options)::L where {T,L}\n          prediction, flag = eval_tree_array(tree, dataset.X, options)\n          if !flag\n              return L(Inf)\n          end\n          return sum((prediction .- dataset.y) .^ 2) / dataset.n\n      end\n    ```\n  * `expression_type::Type{E}=Expression`: The type of expression to use.   For example, `Expression`.\n  * `node_type::Type{N}=default_node_type(Expression)`: The type of node to use for the search.   For example, `Node` or `GraphNode`. The default is computed by `default_node_type(expression_type)`.\n  * `populations`: How many populations of equations to use.\n  * `population_size`: How many equations in each population.\n  * `ncycles_per_iteration`: How many generations to consider per iteration.\n  * `tournament_selection_n`: Number of expressions considered in each tournament.\n  * `tournament_selection_p`: The fittest expression in a tournament is to be   selected with probability `p`, the next fittest with probability `p*(1-p)`,   and so forth.\n  * `topn`: Number of equations to return to the host process, and to   consider for the hall of fame.\n  * `complexity_of_operators`: What complexity should be assigned to each operator,   and the occurrence of a constant or variable. By default, this is 1   for all operators. Can be a real number as well, in which case   the complexity of an expression will be rounded to the nearest integer.   Input this in the form of, e.g., [(^) => 3, sin => 2].\n  * `complexity_of_constants`: What complexity should be assigned to use of a constant.   By default, this is 1.\n  * `complexity_of_variables`: What complexity should be assigned to use of a variable,   which can also be a vector indicating different per-variable complexity.   By default, this is 1.\n  * `complexity_mapping`: Alternatively, you can pass a function that takes   the expression as input and returns the complexity. 
Make sure that   this operates on `AbstractExpression` (and unpacks to `AbstractExpressionNode`),   and returns an integer.\n  * `alpha`: The probability of accepting an equation mutation   during regularized evolution is given by exp(-delta_loss/(alpha * T)),   where T goes from 1 to 0. Thus, alpha=infinite is the same as no annealing.\n  * `maxsize`: Maximum size of equations during the search.\n  * `maxdepth`: Maximum depth of equations during the search, by default   this is set equal to the maxsize.\n  * `parsimony`: A multiplicative factor for how much complexity is   punished.\n  * `dimensional_constraint_penalty`: An additive factor if the dimensional   constraint is violated.\n  * `dimensionless_constants_only`: Whether to only allow dimensionless   constants.\n  * `use_frequency`: Whether to use a parsimony that adapts to the   relative proportion of equations at each complexity; this will   ensure that there are a balanced number of equations considered   for every complexity.\n  * `use_frequency_in_tournament`: Whether to use the adaptive parsimony described   above inside the score, rather than just at the mutation accept/reject stage.\n  * `adaptive_parsimony_scaling`: How much to scale the adaptive parsimony term   in the loss. Increase this if the search is spending too much time   optimizing the most complex equations.\n  * `turbo`: Whether to use `LoopVectorization.@turbo` to evaluate expressions.   This can be significantly faster, but is only compatible with certain   operators. *Experimental!*\n  * `bumper`: Whether to use Bumper.jl for faster evaluation. 
*Experimental!*\n  * `migration`: Whether to migrate equations between processes.\n  * `hof_migration`: Whether to migrate equations from the hall of fame   to processes.\n  * `fraction_replaced`: What fraction of each population to replace with   migrated equations at the end of each cycle.\n  * `fraction_replaced_hof`: What fraction to replace with hall of fame   equations at the end of each cycle.\n  * `should_simplify`: Whether to simplify equations. If you   pass a custom objective, this will be set to `false`.\n  * `should_optimize_constants`: Whether to use an optimization algorithm   to periodically optimize constants in equations.\n  * `optimizer_algorithm`: Select algorithm to use for optimizing constants. Default   is `Optim.BFGS(linesearch=LineSearches.BackTracking())`.\n  * `optimizer_nrestarts`: How many different random starting positions to consider   for optimization of constants.\n  * `optimizer_probability`: Probability of performing optimization of constants at   the end of a given iteration.\n  * `optimizer_iterations`: How many optimization iterations to perform. This gets   passed to `Optim.Options` as `iterations`. The default is 8.\n  * `optimizer_f_calls_limit`: How many function calls to allow during optimization.   This gets passed to `Optim.Options` as `f_calls_limit`. The default is   `10_000`.\n  * `optimizer_options`: General options for the constant optimization. For details   we refer to the documentation on `Optim.Options` from the `Optim.jl` package.   Options can be provided here as `NamedTuple`, e.g. `(iterations=16,)`, as a   `Dict`, e.g. Dict(:x_tol => 1.0e-32,), or as an `Optim.Options` instance.\n  * `autodiff_backend`: The backend to use for differentiation, which should be   an instance of `AbstractADType` (see `ADTypes.jl`).   Default is `nothing`, which means `Optim.jl` will estimate gradients (likely   with finite differences). 
You can also pass a symbolic version of the backend   type, such as `:Zygote` for Zygote, `:Enzyme`, etc. Most backends will not   work, and many will never work due to incompatibilities, though support for some   is gradually being added.\n  * `perturbation_factor`: When mutating a constant, either   multiply or divide by (1+perturbation_factor)^(rand()+1).\n  * `probability_negate_constant`: Probability of negating a constant in the equation   when mutating it.\n  * `mutation_weights`: Relative probabilities of the mutations. The struct   `MutationWeights` (or any `AbstractMutationWeights`) should be passed to these options.   See its documentation on `MutationWeights` for the different weights.\n  * `crossover_probability`: Probability of performing crossover.\n  * `annealing`: Whether to use simulated annealing.\n  * `warmup_maxsize_by`: Whether to slowly increase the max size from 5 up to   `maxsize`. If nonzero, specifies the fraction through the search   at which the maxsize should be reached.\n  * `verbosity`: Whether to print debugging statements or   not.\n  * `print_precision`: How many digits to print when printing   equations. By default, this is 5.\n  * `output_directory`: The base directory to save output files to. Files   will be saved in a subdirectory according to the run ID. By default,   this is `./outputs`.\n  * `save_to_file`: Whether to save equations to a file during the search.\n  * `bin_constraints`: See `constraints`. This is the same, but specified for binary   operators only (for example, if you have an operator that is both a binary   and unary operator).\n  * `una_constraints`: Likewise, for unary operators.\n  * `seed`: What random seed to use. `nothing` uses no seed.\n  * `progress`: Whether to use a progress bar output (`verbosity` will   have no effect).\n  * `early_stop_condition`: Float - whether to stop early if the mean loss gets below this value.   
Function - a function taking (loss, complexity) as arguments and returning true or false.\n  * `timeout_in_seconds`: Float64 - the time in seconds after which to exit (as an alternative to the number of iterations).\n  * `max_evals`: Int (or Nothing) - the maximum number of evaluations of expressions to perform.\n  * `input_stream`: the stream to read user input from. By default, this is `stdin`. If you encounter issues   with reading from `stdin`, like a hang, you can simply pass `devnull` to this argument.\n  * `skip_mutation_failures`: Whether to simply skip over mutations that fail or are rejected, rather than to replace the mutated   expression with the original expression and proceed normally.\n  * `nested_constraints`: Specifies how many times a combination of operators can be nested. For example,   `[sin => [cos => 0], cos => [cos => 2]]` specifies that `cos` may never appear within a `sin`,   but `sin` can be nested with itself an unlimited number of times. The second term specifies that `cos`   can be nested up to 2 times within a `cos`, so that `cos(cos(cos(x)))` is allowed (as well as any combination   of `+` or `-` within it), but `cos(cos(cos(cos(x))))` is not allowed. When an operator is not specified,   it is assumed that it can be nested an unlimited number of times. This requires that there is no operator   which is used both in the unary operators and the binary operators (e.g., `-` could be both subtract, and negation).   For binary operators, both arguments are treated the same way, and the max of each argument is constrained.\n  * `deterministic`: Use a global counter for the birth time, rather than calls to `time()`. This gives   perfect resolution, and is therefore deterministic. However, it is not thread safe, and must be used   in serial mode.\n  * `define_helper_functions`: Whether to define helper functions   for constructing and evaluating trees.\n  * `niterations::Int=10`: The number of iterations to perform the search.   
More iterations will improve the results.\n  * `parallelism=:multithreading`: What parallelism mode to use.   The options are `:multithreading`, `:multiprocessing`, and `:serial`.   By default, multithreading will be used. Multithreading uses less memory,   but multiprocessing can handle multi-node compute. If using `:multithreading`   mode, the number of threads available to julia are used. If using   `:multiprocessing`, `numprocs` processes will be created dynamically if   `procs` is unset. If you have already allocated processes, pass them   to the `procs` argument and they will be used.   You may also pass a string instead of a symbol, like `\"multithreading\"`.\n  * `numprocs::Union{Int, Nothing}=nothing`:  The number of processes to use,   if you want `equation_search` to set this up automatically. By default   this will be `4`, but can be any number (you should pick a number <=   the number of cores available).\n  * `procs::Union{Vector{Int}, Nothing}=nothing`: If you have set up   a distributed run manually with `procs = addprocs()` and `@everywhere`,   pass the `procs` to this keyword argument.\n  * `addprocs_function::Union{Function, Nothing}=nothing`: If using multiprocessing   (`parallelism=:multithreading`), and are not passing `procs` manually,   then they will be allocated dynamically using `addprocs`. However,   you may also pass a custom function to use instead of `addprocs`.   This function should take a single positional argument,   which is the number of processes to use, as well as the `lazy` keyword argument.   For example, if set up on a slurm cluster, you could pass   `addprocs_function = addprocs_slurm`, which will set up slurm processes.\n  * `heap_size_hint_in_bytes::Union{Int,Nothing}=nothing`: On Julia 1.9+, you may set the `--heap-size-hint`   flag on Julia processes, recommending garbage collection once a process   is close to the recommended size. 
This is important for long-running distributed   jobs where each process has an independent memory, and can help avoid   out-of-memory errors. By default, this is set to `Sys.free_memory() / numprocs`.\n  * `worker_imports::Union{Vector{Symbol},Nothing}=nothing`: If you want to import   additional modules on each worker, pass them here as a vector of symbols.   By default some of the extensions will automatically be loaded when needed.\n  * `runtests::Bool=true`: Whether to run (quick) tests before starting the   search, to see if there will be any problems during the equation search   related to the host environment.\n  * `run_id::Union{String,Nothing}=nothing`: A unique identifier for the run.   This will be used to store outputs from the run in the `outputs` directory.   If not specified, a unique ID will be generated.\n  * `loss_type::Type=Nothing`: If you would like to use a different type   for the loss than for the data you passed, specify the type here.   Note that if you pass complex data `::Complex{L}`, then the loss   type will automatically be set to `L`.\n  * `selection_method::Function`: Function to selection expression from   the Pareto frontier for use in `predict`.   See `SymbolicRegression.MLJInterfaceModule.choose_best` for an example.   This function should return a single integer specifying   the index of the expression to use. By default, this maximizes   the score (a pound-for-pound rating) of expressions reaching the threshold   of 1.5x the minimum loss. To override this at prediction time, you can pass   a named tuple with keys `data` and `idx` to `predict`. See the Operations   section for details.\n  * `dimensions_type::AbstractDimensions`: The type of dimensions to use when storing   the units of the data. By default this is `DynamicQuantities.SymbolicDimensions`.\n\n# Operations\n\n  * `predict(mach, Xnew)`: Return predictions of the target given features `Xnew`, which   should have same scitype as `X` above. 
The expression used for prediction is defined   by the `selection_method` function, which can be seen by viewing `report(mach).best_idx`.\n  * `predict(mach, (data=Xnew, idx=i))`: Return predictions of the target given features   `Xnew`, which should have same scitype as `X` above. By passing a named tuple with keys   `data` and `idx`, you are able to specify the equation you wish to evaluate in `idx`.\n\n# Fitted parameters\n\nThe fields of `fitted_params(mach)` are:\n\n  * `best_idx::Vector{Int}`: The index of the best expression in each Pareto frontier, as determined by the `selection_method` function. Override in `predict` by passing a named tuple with keys `data` and `idx`.\n  * `equations::Vector{Vector{Node{T}}}`: The expressions discovered by the search, represented in a dominating Pareto frontier (i.e., the best expressions found for each complexity). The outer vector is indexed by target variable, and the inner vector is ordered by increasing complexity. `T` is equal to the element type of the passed data.\n  * `equation_strings::Vector{Vector{String}}`: The expressions discovered by the search, represented as strings for easy inspection.\n\n# Report\n\nThe fields of `report(mach)` are:\n\n  * `best_idx::Vector{Int}`: The index of the best expression in each Pareto frontier,  as determined by the `selection_method` function. Override in `predict` by passing  a named tuple with keys `data` and `idx`.\n  * `equations::Vector{Vector{Node{T}}}`: The expressions discovered by the search, represented in a dominating Pareto frontier (i.e., the best expressions found for each complexity). 
The outer vector is indexed by target variable, and the inner vector is ordered by increasing complexity.\n  * `equation_strings::Vector{Vector{String}}`: The expressions discovered by the search, represented as strings for easy inspection.\n  * `complexities::Vector{Vector{Int}}`: The complexity of each expression in each Pareto frontier.\n  * `losses::Vector{Vector{L}}`: The loss of each expression in each Pareto frontier, according to the loss function specified in the model. The type `L` is the loss type, which is usually the same as the element type of data passed (i.e., `T`), but can differ if complex data types are passed.\n  * `scores::Vector{Vector{L}}`: A metric which considers both the complexity and loss of an expression, equal to the change in the log-loss divided by the change in complexity, relative to the previous expression along the Pareto frontier. A larger score aims to indicate an expression is more likely to be the true expression generating the data, but this is very problem-dependent and generally several other factors should be considered.\n\n# Examples\n\n```julia\nusing MLJ\nMultitargetSRRegressor = @load MultitargetSRRegressor pkg=SymbolicRegression\nX = (a=rand(100), b=rand(100), c=rand(100))\nY = (y1=(@. cos(X.c) * 2.1 - 0.9), y2=(@. X.a * X.b + X.c))\nmodel = MultitargetSRRegressor(binary_operators=[+, -, *], unary_operators=[exp], niterations=100)\nmach = machine(model, X, Y)\nfit!(mach)\ny_hat = predict(mach, X)\n# View the equations used:\nr = report(mach)\nfor (output_index, (eq, i)) in enumerate(zip(r.equation_strings, r.best_idx))\n    println(\"Equation used for \", output_index, \": \", eq[i])\nend\n```\n\nSee also [`SRRegressor`](@ref).\n"""
+":docstring" = """```\nMultitargetSRRegressor\n```\n\nA model type for constructing a Multi-Target Symbolic Regression via Evolutionary Search, based on [SymbolicRegression.jl](https://github.com/MilesCranmer/SymbolicRegression.jl), and implementing the MLJ model interface.\n\nFrom MLJ, the type can be imported using\n\n```\nMultitargetSRRegressor = @load MultitargetSRRegressor pkg=SymbolicRegression\n```\n\nDo `model = MultitargetSRRegressor()` to construct an instance with default hyper-parameters. Provide keyword arguments to override hyper-parameter defaults, as in `MultitargetSRRegressor(defaults=...)`.\n\nMulti-target Symbolic Regression regressor (`MultitargetSRRegressor`) conducts several searches for expressions that predict each target variable from a set of input variables. All data is assumed to be `Continuous`. The search is performed using an evolutionary algorithm. This algorithm is described in the paper https://arxiv.org/abs/2305.01582.\n\n# Training data\n\nIn MLJ or MLJBase, bind an instance `model` to data with\n\n```\nmach = machine(model, X, y)\n```\n\nOR\n\n```\nmach = machine(model, X, y, w)\n```\n\nHere:\n\n  * `X` is any table of input features (eg, a `DataFrame`) whose columns are of scitype\n\n`Continuous`; check column scitypes with `schema(X)`. Variable names in discovered expressions will be taken from the column names of `X`, if available. Units in columns of `X` (use `DynamicQuantities` for units) will trigger dimensional analysis to be used.\n\n  * `y` is the target, which can be any table of target variables whose element scitype is `Continuous`; check the scitype with `schema(y)`. Units in columns of `y` (use `DynamicQuantities` for units) will trigger dimensional analysis to be used.\n  * `w` is the observation weights which can either be `nothing` (default) or an `AbstractVector` whose element scitype is `Count` or `Continuous`. 
The same weights are used for all targets.\n\nTrain the machine using `fit!(mach)`, inspect the discovered expressions with `report(mach)`, and predict on new data with `predict(mach, Xnew)`. Note that unlike other regressors, symbolic regression stores a list of lists of trained models. The model chosen from each of these lists is determined by the `selection_method` keyword argument (a function), which by default balances accuracy and complexity. You can override this at prediction time by passing a named tuple with keys `data` and `idx`.\n\n# Hyper-parameters\n\n  * `defaults`: What set of defaults to use for `Options`. The default,   `nothing`, will simply take the default options from the current version of SymbolicRegression.   However, you may also select the defaults from an earlier version, such as `v\"0.24.5\"`.\n  * `binary_operators`: Vector of binary operators (functions) to use.   Each operator should be defined for two input scalars,   and one output scalar. All operators   need to be defined over the entire real line (excluding infinity - these   are stopped before they are input), or return `NaN` where not defined.   For speed, define it so it takes two reals   of the same type as input, and outputs the same type. For the SymbolicUtils   simplification backend, you will need to define a generic method of the   operator so it takes arbitrary types.\n  * `unary_operators`: Same, but for   unary operators (one input scalar, gives an output scalar).\n  * `constraints`: Array of pairs specifying size constraints   for each operator. The constraints for a binary operator should be a 2-tuple   (e.g., `(-1, -1)`) and the constraints for a unary operator should be an `Int`.   A size constraint is a limit to the size of the subtree   in each argument of an operator. e.g., `[(^)=>(-1, 3)]` means that the   `^` operator can have arbitrary size (`-1`) in its left argument,   but a maximum size of `3` in its right argument. 
Default is   no constraints.\n  * `batching`: Whether to evolve based on small mini-batches of data,   rather than the entire dataset.\n  * `batch_size`: What batch size to use if using batching.\n  * `elementwise_loss`: What elementwise loss function to use. Can be one of   the following losses, or any other loss of type   `SupervisedLoss`. You can also pass a function that takes   a scalar target (left argument), and scalar predicted (right   argument), and returns a scalar. This will be averaged   over the predicted data. If weights are supplied, your   function should take a third argument for the weight scalar.   Included losses:       Regression:           - `LPDistLoss{P}()`,           - `L1DistLoss()`,           - `L2DistLoss()` (mean square),           - `LogitDistLoss()`,           - `HuberLoss(d)`,           - `L1EpsilonInsLoss(ϵ)`,           - `L2EpsilonInsLoss(ϵ)`,           - `PeriodicLoss(c)`,           - `QuantileLoss(τ)`,       Classification:           - `ZeroOneLoss()`,           - `PerceptronLoss()`,           - `L1HingeLoss()`,           - `SmoothedL1HingeLoss(γ)`,           - `ModifiedHuberLoss()`,           - `L2MarginLoss()`,           - `ExpLoss()`,           - `SigmoidLoss()`,           - `DWDMarginLoss(q)`.\n  * `loss_function`: Alternatively, you may redefine the loss used   as any function of `tree::AbstractExpressionNode{T}`, `dataset::Dataset{T}`,   and `options::AbstractOptions`, so long as you output a non-negative   scalar of type `T`. This is useful if you want to use a loss   that takes into account derivatives, or correlations across   the dataset. This also means you could use a custom evaluation   for a particular expression. If you are using   `batching=true`, then your function should   accept a fourth argument `idx`, which is either `nothing`   (indicating that the full dataset should be used), or a vector   of indices to use for the batch.   
For example,\n\n    ```\n      function my_loss(tree, dataset::Dataset{T,L}, options)::L where {T,L}\n          prediction, flag = eval_tree_array(tree, dataset.X, options)\n          if !flag\n              return L(Inf)\n          end\n          return sum((prediction .- dataset.y) .^ 2) / dataset.n\n      end\n    ```\n  * `loss_function_expression`: Similar to `loss_function`, but takes `AbstractExpression` instead of `AbstractExpressionNode` as its first argument. Useful for `TemplateExpressionSpec`.\n  * `expression_spec::AbstractExpressionSpec`: A specification of what types of expressions to use in the   search. For example, `ExpressionSpec()` (default). You can also see `TemplateExpressionSpec` and   `ParametricExpressionSpec` for specialized cases.\n  * `populations`: How many populations of equations to use.\n  * `population_size`: How many equations in each population.\n  * `ncycles_per_iteration`: How many generations to consider per iteration.\n  * `tournament_selection_n`: Number of expressions considered in each tournament.\n  * `tournament_selection_p`: The fittest expression in a tournament is to be   selected with probability `p`, the next fittest with probability `p*(1-p)`,   and so forth.\n  * `topn`: Number of equations to return to the host process, and to   consider for the hall of fame.\n  * `complexity_of_operators`: What complexity should be assigned to each operator,   and the occurrence of a constant or variable. By default, this is 1   for all operators. Can be a real number as well, in which case   the complexity of an expression will be rounded to the nearest integer.   Input this in the form of, e.g., [(^) => 3, sin => 2].\n  * `complexity_of_constants`: What complexity should be assigned to use of a constant.   By default, this is 1.\n  * `complexity_of_variables`: What complexity should be assigned to use of a variable,   which can also be a vector indicating different per-variable complexity.   
By default, this is 1.\n  * `complexity_mapping`: Alternatively, you can pass a function that takes   the expression as input and returns the complexity. Make sure that   this operates on `AbstractExpression` (and unpacks to `AbstractExpressionNode`),   and returns an integer.\n  * `alpha`: The probability of accepting an equation mutation   during regularized evolution is given by exp(-delta_loss/(alpha * T)),   where T goes from 1 to 0. Thus, alpha=infinite is the same as no annealing.\n  * `maxsize`: Maximum size of equations during the search.\n  * `maxdepth`: Maximum depth of equations during the search, by default   this is set equal to the maxsize.\n  * `parsimony`: A multiplicative factor for how much complexity is   punished.\n  * `dimensional_constraint_penalty`: An additive factor if the dimensional   constraint is violated.\n  * `dimensionless_constants_only`: Whether to only allow dimensionless   constants.\n  * `use_frequency`: Whether to use a parsimony that adapts to the   relative proportion of equations at each complexity; this will   ensure that there are a balanced number of equations considered   for every complexity.\n  * `use_frequency_in_tournament`: Whether to use the adaptive parsimony described   above inside the score, rather than just at the mutation accept/reject stage.\n  * `adaptive_parsimony_scaling`: How much to scale the adaptive parsimony term   in the loss. Increase this if the search is spending too much time   optimizing the most complex equations.\n  * `turbo`: Whether to use `LoopVectorization.@turbo` to evaluate expressions.   This can be significantly faster, but is only compatible with certain   operators. *Experimental!*\n  * `bumper`: Whether to use Bumper.jl for faster evaluation. 
*Experimental!*\n  * `migration`: Whether to migrate equations between processes.\n  * `hof_migration`: Whether to migrate equations from the hall of fame   to processes.\n  * `fraction_replaced`: What fraction of each population to replace with   migrated equations at the end of each cycle.\n  * `fraction_replaced_hof`: What fraction to replace with hall of fame   equations at the end of each cycle.\n  * `should_simplify`: Whether to simplify equations. If you   pass a custom objective, this will be set to `false`.\n  * `should_optimize_constants`: Whether to use an optimization algorithm   to periodically optimize constants in equations.\n  * `optimizer_algorithm`: Select algorithm to use for optimizing constants. Default   is `Optim.BFGS(linesearch=LineSearches.BackTracking())`.\n  * `optimizer_nrestarts`: How many different random starting positions to consider   for optimization of constants.\n  * `optimizer_probability`: Probability of performing optimization of constants at   the end of a given iteration.\n  * `optimizer_iterations`: How many optimization iterations to perform. This gets   passed to `Optim.Options` as `iterations`. The default is 8.\n  * `optimizer_f_calls_limit`: How many function calls to allow during optimization.   This gets passed to `Optim.Options` as `f_calls_limit`. The default is   `10_000`.\n  * `optimizer_options`: General options for the constant optimization. For details   we refer to the documentation on `Optim.Options` from the `Optim.jl` package.   Options can be provided here as `NamedTuple`, e.g. `(iterations=16,)`, as a   `Dict`, e.g. Dict(:x_tol => 1.0e-32,), or as an `Optim.Options` instance.\n  * `autodiff_backend`: The backend to use for differentiation, which should be   an instance of `AbstractADType` (see `ADTypes.jl`).   Default is `nothing`, which means `Optim.jl` will estimate gradients (likely   with finite differences). 
You can also pass a symbolic version of the backend   type, such as `:Zygote` for Zygote, `:Enzyme`, etc. Most backends will not   work, and many will never work due to incompatibilities, though support for some   is gradually being added.\n  * `perturbation_factor`: When mutating a constant, either   multiply or divide by (1+perturbation_factor)^(rand()+1).\n  * `probability_negate_constant`: Probability of negating a constant in the equation   when mutating it.\n  * `mutation_weights`: Relative probabilities of the mutations. The struct   `MutationWeights` (or any `AbstractMutationWeights`) should be passed to these options.   See its documentation on `MutationWeights` for the different weights.\n  * `crossover_probability`: Probability of performing crossover.\n  * `annealing`: Whether to use simulated annealing.\n  * `warmup_maxsize_by`: Whether to slowly increase the max size from 5 up to   `maxsize`. If nonzero, specifies the fraction through the search   at which the maxsize should be reached.\n  * `verbosity`: Whether to print debugging statements or   not.\n  * `print_precision`: How many digits to print when printing   equations. By default, this is 5.\n  * `output_directory`: The base directory to save output files to. Files   will be saved in a subdirectory according to the run ID. By default,   this is `./outputs`.\n  * `save_to_file`: Whether to save equations to a file during the search.\n  * `bin_constraints`: See `constraints`. This is the same, but specified for binary   operators only (for example, if you have an operator that is both a binary   and unary operator).\n  * `una_constraints`: Likewise, for unary operators.\n  * `seed`: What random seed to use. `nothing` uses no seed.\n  * `progress`: Whether to use a progress bar output (`verbosity` will   have no effect).\n  * `early_stop_condition`: Float - whether to stop early if the mean loss gets below this value.   
Function - a function taking (loss, complexity) as arguments and returning true or false.\n  * `timeout_in_seconds`: Float64 - the time in seconds after which to exit (as an alternative to the number of iterations).\n  * `max_evals`: Int (or Nothing) - the maximum number of evaluations of expressions to perform.\n  * `input_stream`: the stream to read user input from. By default, this is `stdin`. If you encounter issues   with reading from `stdin`, like a hang, you can simply pass `devnull` to this argument.\n  * `skip_mutation_failures`: Whether to simply skip over mutations that fail or are rejected, rather than to replace the mutated   expression with the original expression and proceed normally.\n  * `nested_constraints`: Specifies how many times a combination of operators can be nested. For example,   `[sin => [cos => 0], cos => [cos => 2]]` specifies that `cos` may never appear within a `sin`,   but `sin` can be nested with itself an unlimited number of times. The second term specifies that `cos`   can be nested up to 2 times within a `cos`, so that `cos(cos(cos(x)))` is allowed (as well as any combination   of `+` or `-` within it), but `cos(cos(cos(cos(x))))` is not allowed. When an operator is not specified,   it is assumed that it can be nested an unlimited number of times. This requires that there is no operator   which is used both in the unary operators and the binary operators (e.g., `-` could be both subtract, and negation).   For binary operators, both arguments are treated the same way, and the max of each argument is constrained.\n  * `deterministic`: Use a global counter for the birth time, rather than calls to `time()`. This gives   perfect resolution, and is therefore deterministic. However, it is not thread safe, and must be used   in serial mode.\n  * `define_helper_functions`: Whether to define helper functions   for constructing and evaluating trees.\n  * `niterations::Int=10`: The number of iterations to perform the search.   
More iterations will improve the results.\n  * `parallelism=:multithreading`: What parallelism mode to use.   The options are `:multithreading`, `:multiprocessing`, and `:serial`.   By default, multithreading will be used. Multithreading uses less memory,   but multiprocessing can handle multi-node compute. If using `:multithreading`   mode, the number of threads available to julia are used. If using   `:multiprocessing`, `numprocs` processes will be created dynamically if   `procs` is unset. If you have already allocated processes, pass them   to the `procs` argument and they will be used.   You may also pass a string instead of a symbol, like `\"multithreading\"`.\n  * `numprocs::Union{Int, Nothing}=nothing`:  The number of processes to use,   if you want `equation_search` to set this up automatically. By default   this will be `4`, but can be any number (you should pick a number <=   the number of cores available).\n  * `procs::Union{Vector{Int}, Nothing}=nothing`: If you have set up   a distributed run manually with `procs = addprocs()` and `@everywhere`,   pass the `procs` to this keyword argument.\n  * `addprocs_function::Union{Function, Nothing}=nothing`: If using multiprocessing   (`parallelism=:multiprocessing`), and are not passing `procs` manually,   then they will be allocated dynamically using `addprocs`. However,   you may also pass a custom function to use instead of `addprocs`.   This function should take a single positional argument,   which is the number of processes to use, as well as the `lazy` keyword argument.   For example, if set up on a slurm cluster, you could pass   `addprocs_function = addprocs_slurm`, which will set up slurm processes.\n  * `heap_size_hint_in_bytes::Union{Int,Nothing}=nothing`: On Julia 1.9+, you may set the `--heap-size-hint`   flag on Julia processes, recommending garbage collection once a process   is close to the recommended size. 
This is important for long-running distributed   jobs where each process has an independent memory, and can help avoid   out-of-memory errors. By default, this is set to `Sys.free_memory() / numprocs`.\n  * `worker_imports::Union{Vector{Symbol},Nothing}=nothing`: If you want to import   additional modules on each worker, pass them here as a vector of symbols.   By default some of the extensions will automatically be loaded when needed.\n  * `runtests::Bool=true`: Whether to run (quick) tests before starting the   search, to see if there will be any problems during the equation search   related to the host environment.\n  * `run_id::Union{String,Nothing}=nothing`: A unique identifier for the run.   This will be used to store outputs from the run in the `outputs` directory.   If not specified, a unique ID will be generated.\n  * `loss_type::Type=Nothing`: If you would like to use a different type   for the loss than for the data you passed, specify the type here.   Note that if you pass complex data `::Complex{L}`, then the loss   type will automatically be set to `L`.\n  * `selection_method::Function`: Function to select an expression from   the Pareto frontier for use in `predict`.   See `SymbolicRegression.MLJInterfaceModule.choose_best` for an example.   This function should return a single integer specifying   the index of the expression to use. By default, this maximizes   the score (a pound-for-pound rating) of expressions reaching the threshold   of 1.5x the minimum loss. To override this at prediction time, you can pass   a named tuple with keys `data` and `idx` to `predict`. See the Operations   section for details.\n  * `dimensions_type::AbstractDimensions`: The type of dimensions to use when storing   the units of the data. By default this is `DynamicQuantities.SymbolicDimensions`.\n\n# Operations\n\n  * `predict(mach, Xnew)`: Return predictions of the target given features `Xnew`, which   should have same scitype as `X` above. 
The expression used for prediction is defined   by the `selection_method` function, which can be seen by viewing `report(mach).best_idx`.\n  * `predict(mach, (data=Xnew, idx=i))`: Return predictions of the target given features   `Xnew`, which should have same scitype as `X` above. By passing a named tuple with keys   `data` and `idx`, you are able to specify the equation you wish to evaluate in `idx`.\n\n# Fitted parameters\n\nThe fields of `fitted_params(mach)` are:\n\n  * `best_idx::Vector{Int}`: The index of the best expression in each Pareto frontier, as determined by the `selection_method` function. Override in `predict` by passing a named tuple with keys `data` and `idx`.\n  * `equations::Vector{Vector{Node{T}}}`: The expressions discovered by the search, represented in a dominating Pareto frontier (i.e., the best expressions found for each complexity). The outer vector is indexed by target variable, and the inner vector is ordered by increasing complexity. `T` is equal to the element type of the passed data.\n  * `equation_strings::Vector{Vector{String}}`: The expressions discovered by the search, represented as strings for easy inspection.\n\n# Report\n\nThe fields of `report(mach)` are:\n\n  * `best_idx::Vector{Int}`: The index of the best expression in each Pareto frontier,  as determined by the `selection_method` function. Override in `predict` by passing  a named tuple with keys `data` and `idx`.\n  * `equations::Vector{Vector{Node{T}}}`: The expressions discovered by the search, represented in a dominating Pareto frontier (i.e., the best expressions found for each complexity). 
The outer vector is indexed by target variable, and the inner vector is ordered by increasing complexity.\n  * `equation_strings::Vector{Vector{String}}`: The expressions discovered by the search, represented as strings for easy inspection.\n  * `complexities::Vector{Vector{Int}}`: The complexity of each expression in each Pareto frontier.\n  * `losses::Vector{Vector{L}}`: The loss of each expression in each Pareto frontier, according to the loss function specified in the model. The type `L` is the loss type, which is usually the same as the element type of data passed (i.e., `T`), but can differ if complex data types are passed.\n  * `scores::Vector{Vector{L}}`: A metric which considers both the complexity and loss of an expression, equal to the change in the log-loss divided by the change in complexity, relative to the previous expression along the Pareto frontier. A larger score aims to indicate an expression is more likely to be the true expression generating the data, but this is very problem-dependent and generally several other factors should be considered.\n\n# Examples\n\n```julia\nusing MLJ\nMultitargetSRRegressor = @load MultitargetSRRegressor pkg=SymbolicRegression\nX = (a=rand(100), b=rand(100), c=rand(100))\nY = (y1=(@. cos(X.c) * 2.1 - 0.9), y2=(@. X.a * X.b + X.c))\nmodel = MultitargetSRRegressor(binary_operators=[+, -, *], unary_operators=[exp], niterations=100)\nmach = machine(model, X, Y)\nfit!(mach)\ny_hat = predict(mach, X)\n# View the equations used:\nr = report(mach)\nfor (output_index, (eq, i)) in enumerate(zip(r.equation_strings, r.best_idx))\n    println(\"Equation used for \", output_index, \": \", eq[i])\nend\n```\n\nSee also [`SRRegressor`](@ref).\n"""
 ":name" = "MultitargetSRRegressor"
 ":human_name" = "Multi-Target Symbolic Regression via Evolutionary Search"
 ":is_supervised" = "`true`"
 ":prediction_type" = ":deterministic"
 ":abstract_type" = "`MLJModelInterface.Deterministic`"
 ":implemented_methods" = []
-":hyperparameters" = "`(:defaults, :binary_operators, :unary_operators, :maxsize, :maxdepth, :expression_type, :expression_options, :node_type, :populations, :population_size, :ncycles_per_iteration, :elementwise_loss, :loss_function, :dimensional_constraint_penalty, :parsimony, :constraints, :nested_constraints, :complexity_of_operators, :complexity_of_constants, :complexity_of_variables, :warmup_maxsize_by, :adaptive_parsimony_scaling, :mutation_weights, :crossover_probability, :annealing, :alpha, :probability_negate_constant, :tournament_selection_n, :tournament_selection_p, :early_stop_condition, :batching, :batch_size, :dimensionless_constants_only, :complexity_mapping, :use_frequency, :use_frequency_in_tournament, :should_simplify, :perturbation_factor, :skip_mutation_failures, :optimizer_algorithm, :optimizer_nrestarts, :optimizer_probability, :optimizer_iterations, :optimizer_f_calls_limit, :optimizer_options, :should_optimize_constants, :migration, :hof_migration, :fraction_replaced, :fraction_replaced_hof, :topn, :timeout_in_seconds, :max_evals, :input_stream, :turbo, :bumper, :autodiff_backend, :deterministic, :seed, :verbosity, :print_precision, :progress, :output_directory, :save_to_file, :bin_constraints, :una_constraints, :terminal_width, :use_recorder, :recorder_file, :define_helper_functions, :output_file, :fast_cycle, :npopulations, :npop, :niterations, :parallelism, :numprocs, :procs, :addprocs_function, :heap_size_hint_in_bytes, :worker_imports, :logger, :runtests, :run_id, :loss_type, :selection_method, :dimensions_type)`"
-":hyperparameter_types" = "`(\"Union{Nothing, VersionNumber}\", \"Any\", \"Any\", \"Union{Nothing, Integer}\", \"Union{Nothing, Integer}\", \"Type{<:DynamicExpressions.ExpressionModule.AbstractExpression}\", \"NamedTuple\", \"Type{<:DynamicExpressions.NodeModule.AbstractExpressionNode}\", \"Union{Nothing, Integer}\", \"Union{Nothing, Integer}\", \"Union{Nothing, Integer}\", \"Union{Nothing, Function, LossFunctions.Traits.SupervisedLoss}\", \"Union{Nothing, Function}\", \"Union{Nothing, Real}\", \"Union{Nothing, Real}\", \"Any\", \"Any\", \"Any\", \"Union{Nothing, Real}\", \"Union{Nothing, Real, AbstractVector}\", \"Union{Nothing, Real}\", \"Union{Nothing, Real}\", \"Union{Nothing, SymbolicRegression.CoreModule.MutationWeightsModule.AbstractMutationWeights, NamedTuple, AbstractVector}\", \"Union{Nothing, Real}\", \"Union{Nothing, Bool}\", \"Union{Nothing, Real}\", \"Union{Nothing, Real}\", \"Union{Nothing, Integer}\", \"Union{Nothing, Real}\", \"Union{Nothing, Function, Real}\", \"Union{Nothing, Bool}\", \"Union{Nothing, Integer}\", \"Bool\", \"Union{Nothing, Function, SymbolicRegression.CoreModule.OptionsStructModule.ComplexityMapping}\", \"Bool\", \"Bool\", \"Union{Nothing, Bool}\", \"Union{Nothing, Real}\", \"Bool\", \"Union{AbstractString, Optim.AbstractOptimizer}\", \"Int64\", \"AbstractFloat\", \"Union{Nothing, Integer}\", \"Union{Nothing, Integer}\", \"Union{Nothing, Dict, NamedTuple, Optim.Options}\", \"Bool\", \"Bool\", \"Bool\", \"Union{Nothing, Real}\", \"Union{Nothing, Real}\", \"Union{Nothing, Integer}\", \"Union{Nothing, Real}\", \"Union{Nothing, Integer}\", \"IO\", \"Bool\", \"Bool\", \"Union{Nothing, ADTypes.AbstractADType, Symbol}\", \"Bool\", \"Any\", \"Union{Nothing, Integer}\", \"Integer\", \"Union{Nothing, Bool}\", \"Union{Nothing, String}\", \"Bool\", \"Any\", \"Any\", \"Union{Nothing, Integer}\", \"Bool\", \"AbstractString\", \"Bool\", \"Union{Nothing, AbstractString}\", \"Bool\", \"Union{Nothing, Integer}\", \"Union{Nothing, Integer}\", 
\"Int64\", \"Symbol\", \"Union{Nothing, Int64}\", \"Union{Nothing, Vector{Int64}}\", \"Union{Nothing, Function}\", \"Union{Nothing, Integer}\", \"Union{Nothing, Vector{Symbol}}\", \"Union{Nothing, SymbolicRegression.LoggingModule.AbstractSRLogger}\", \"Bool\", \"Union{Nothing, String}\", \"Any\", \"Function\", \"Type{D} where D<:DynamicQuantities.AbstractDimensions\")`"
-":hyperparameter_ranges" = "`(nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing)`"
+":hyperparameters" = "`(:defaults, :binary_operators, :unary_operators, :maxsize, :maxdepth, :expression_spec, :populations, :population_size, :ncycles_per_iteration, :elementwise_loss, :loss_function, :loss_function_expression, :dimensional_constraint_penalty, :parsimony, :constraints, :nested_constraints, :complexity_of_operators, :complexity_of_constants, :complexity_of_variables, :warmup_maxsize_by, :adaptive_parsimony_scaling, :mutation_weights, :crossover_probability, :annealing, :alpha, :tournament_selection_n, :tournament_selection_p, :early_stop_condition, :batching, :batch_size, :dimensionless_constants_only, :complexity_mapping, :use_frequency, :use_frequency_in_tournament, :should_simplify, :perturbation_factor, :probability_negate_constant, :skip_mutation_failures, :optimizer_algorithm, :optimizer_nrestarts, :optimizer_probability, :optimizer_iterations, :optimizer_f_calls_limit, :optimizer_options, :should_optimize_constants, :migration, :hof_migration, :fraction_replaced, :fraction_replaced_hof, :topn, :timeout_in_seconds, :max_evals, :input_stream, :turbo, :bumper, :autodiff_backend, :deterministic, :seed, :verbosity, :print_precision, :progress, :output_directory, :save_to_file, :bin_constraints, :una_constraints, :terminal_width, :use_recorder, :recorder_file, :define_helper_functions, :expression_type, :expression_options, :node_type, :output_file, :fast_cycle, :npopulations, :npop, :niterations, :parallelism, :numprocs, :procs, :addprocs_function, :heap_size_hint_in_bytes, :worker_imports, :logger, :runtests, :run_id, :loss_type, :selection_method, :dimensions_type)`"
+":hyperparameter_types" = "`(\"Union{Nothing, VersionNumber}\", \"Any\", \"Any\", \"Union{Nothing, Integer}\", \"Union{Nothing, Integer}\", \"Union{Nothing, SymbolicRegression.CoreModule.ExpressionSpecModule.AbstractExpressionSpec}\", \"Union{Nothing, Integer}\", \"Union{Nothing, Integer}\", \"Union{Nothing, Integer}\", \"Union{Nothing, Function, LossFunctions.Traits.SupervisedLoss}\", \"Union{Nothing, Function}\", \"Union{Nothing, Function}\", \"Union{Nothing, Real}\", \"Union{Nothing, Real}\", \"Any\", \"Any\", \"Any\", \"Union{Nothing, Real}\", \"Union{Nothing, Real, AbstractVector}\", \"Union{Nothing, Real}\", \"Union{Nothing, Real}\", \"Union{Nothing, SymbolicRegression.CoreModule.MutationWeightsModule.AbstractMutationWeights, NamedTuple, AbstractVector}\", \"Union{Nothing, Real}\", \"Union{Nothing, Bool}\", \"Union{Nothing, Real}\", \"Union{Nothing, Integer}\", \"Union{Nothing, Real}\", \"Union{Nothing, Function, Real}\", \"Union{Nothing, Bool}\", \"Union{Nothing, Integer}\", \"Bool\", \"Union{Nothing, Function, SymbolicRegression.CoreModule.OptionsStructModule.ComplexityMapping}\", \"Bool\", \"Bool\", \"Union{Nothing, Bool}\", \"Union{Nothing, Real}\", \"Union{Nothing, Real}\", \"Bool\", \"Union{AbstractString, Optim.AbstractOptimizer}\", \"Int64\", \"AbstractFloat\", \"Union{Nothing, Integer}\", \"Union{Nothing, Integer}\", \"Union{Nothing, Dict, NamedTuple, Optim.Options}\", \"Bool\", \"Bool\", \"Bool\", \"Union{Nothing, Real}\", \"Union{Nothing, Real}\", \"Union{Nothing, Integer}\", \"Union{Nothing, Real}\", \"Union{Nothing, Integer}\", \"IO\", \"Bool\", \"Bool\", \"Union{Nothing, ADTypes.AbstractADType, Symbol}\", \"Bool\", \"Any\", \"Union{Nothing, Integer}\", \"Integer\", \"Union{Nothing, Bool}\", \"Union{Nothing, String}\", \"Bool\", \"Any\", \"Any\", \"Union{Nothing, Integer}\", \"Bool\", \"AbstractString\", \"Bool\", \"Union{Nothing, Type{<:DynamicExpressions.ExpressionModule.AbstractExpression}}\", \"Union{Nothing, NamedTuple}\", \"Union{Nothing, 
Type{<:DynamicExpressions.NodeModule.AbstractExpressionNode}}\", \"Union{Nothing, AbstractString}\", \"Bool\", \"Union{Nothing, Integer}\", \"Union{Nothing, Integer}\", \"Int64\", \"Symbol\", \"Union{Nothing, Int64}\", \"Union{Nothing, Vector{Int64}}\", \"Union{Nothing, Function}\", \"Union{Nothing, Integer}\", \"Union{Nothing, Vector{Symbol}}\", \"Union{Nothing, SymbolicRegression.LoggingModule.AbstractSRLogger}\", \"Bool\", \"Union{Nothing, String}\", \"Type\", \"Function\", \"Type{D} where D<:DynamicQuantities.AbstractDimensions\")`"
+":hyperparameter_ranges" = "`(nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing)`"
 ":iteration_parameter" = "`nothing`"
 ":supports_training_losses" = "`false`"
 ":reports_feature_importances" = "`false`"
@@ -6770,9 +6914,9 @@
 [SymbolicRegression.SRRegressor]
 ":input_scitype" = "`Union{ScientificTypesBase.Table{<:Union{AbstractVector{<:ScientificTypesBase.Continuous}, AbstractVector{<:ScientificTypesBase.Count}}}, AbstractMatrix{<:ScientificTypesBase.Continuous}}`"
 ":output_scitype" = "`ScientificTypesBase.Unknown`"
-":target_scitype" = "`AbstractVector`"
-":fit_data_scitype" = "`Union{Tuple{Union{ScientificTypesBase.Table{<:Union{AbstractVector{<:ScientificTypesBase.Continuous}, AbstractVector{<:ScientificTypesBase.Count}}}, AbstractMatrix{<:ScientificTypesBase.Continuous}}, AbstractVector}, Tuple{Union{ScientificTypesBase.Table{<:Union{AbstractVector{<:ScientificTypesBase.Continuous}, AbstractVector{<:ScientificTypesBase.Count}}}, AbstractMatrix{<:ScientificTypesBase.Continuous}}, AbstractVector, AbstractVector{<:Union{ScientificTypesBase.Continuous, ScientificTypesBase.Count}}}}`"
-":predict_scitype" = "`AbstractVector`"
+":target_scitype" = "`AbstractVector{<:ScientificTypesBase.Continuous}`"
+":fit_data_scitype" = "`Union{Tuple{Union{ScientificTypesBase.Table{<:Union{AbstractVector{<:ScientificTypesBase.Continuous}, AbstractVector{<:ScientificTypesBase.Count}}}, AbstractMatrix{<:ScientificTypesBase.Continuous}}, AbstractVector{<:ScientificTypesBase.Continuous}}, Tuple{Union{ScientificTypesBase.Table{<:Union{AbstractVector{<:ScientificTypesBase.Continuous}, AbstractVector{<:ScientificTypesBase.Count}}}, AbstractMatrix{<:ScientificTypesBase.Continuous}}, AbstractVector{<:ScientificTypesBase.Continuous}, AbstractVector{<:Union{ScientificTypesBase.Continuous, ScientificTypesBase.Count}}}}`"
+":predict_scitype" = "`AbstractVector{<:ScientificTypesBase.Continuous}`"
 ":transform_scitype" = "`ScientificTypesBase.Unknown`"
 ":inverse_transform_scitype" = "`ScientificTypesBase.Unknown`"
 ":target_in_fit" = "`true`"
@@ -6786,16 +6930,16 @@
 ":supports_weights" = "`true`"
 ":supports_class_weights" = "`false`"
 ":supports_online" = "`false`"
-":docstring" = """```\nSRRegressor\n```\n\nA model type for constructing a Symbolic Regression via Evolutionary Search, based on [SymbolicRegression.jl](https://github.com/MilesCranmer/SymbolicRegression.jl), and implementing the MLJ model interface.\n\nFrom MLJ, the type can be imported using\n\n```\nSRRegressor = @load SRRegressor pkg=SymbolicRegression\n```\n\nDo `model = SRRegressor()` to construct an instance with default hyper-parameters. Provide keyword arguments to override hyper-parameter defaults, as in `SRRegressor(defaults=...)`.\n\nSingle-target Symbolic Regression regressor (`SRRegressor`) searches for symbolic expressions that predict a single target variable from a set of input variables. All data is assumed to be `Continuous`. The search is performed using an evolutionary algorithm. This algorithm is described in the paper https://arxiv.org/abs/2305.01582.\n\n# Training data\n\nIn MLJ or MLJBase, bind an instance `model` to data with\n\n```\nmach = machine(model, X, y)\n```\n\nOR\n\n```\nmach = machine(model, X, y, w)\n```\n\nHere:\n\n  * `X` is any table of input features (eg, a `DataFrame`) whose columns are of scitype `Continuous`; check column scitypes with `schema(X)`. Variable names in discovered expressions will be taken from the column names of `X`, if available. Units in columns of `X` (use `DynamicQuantities` for units) will trigger dimensional analysis to be used.\n  * `y` is the target, which can be any `AbstractVector` whose element scitype is   `Continuous`; check the scitype with `scitype(y)`. Units in `y` (use `DynamicQuantities`   for units) will trigger dimensional analysis to be used.\n  * `w` is the observation weights which can either be `nothing` (default) or an `AbstractVector` whose element scitype is `Count` or `Continuous`.\n\nTrain the machine using `fit!(mach)`, inspect the discovered expressions with `report(mach)`, and predict on new data with `predict(mach, Xnew)`. 
Note that unlike other regressors, symbolic regression stores a list of trained models. The model chosen from this list is defined by the function `selection_method` keyword argument, which by default balances accuracy and complexity. You can override this at prediction time by passing a named tuple with keys `data` and `idx`.\n\n# Hyper-parameters\n\n  * `defaults`: What set of defaults to use for `Options`. The default,   `nothing`, will simply take the default options from the current version of SymbolicRegression.   However, you may also select the defaults from an earlier version, such as `v\"0.24.5\"`.\n  * `binary_operators`: Vector of binary operators (functions) to use.   Each operator should be defined for two input scalars,   and one output scalar. All operators   need to be defined over the entire real line (excluding infinity - these   are stopped before they are input), or return `NaN` where not defined.   For speed, define it so it takes two reals   of the same type as input, and outputs the same type. For the SymbolicUtils   simplification backend, you will need to define a generic method of the   operator so it takes arbitrary types.\n  * `unary_operators`: Same, but for   unary operators (one input scalar, gives an output scalar).\n  * `constraints`: Array of pairs specifying size constraints   for each operator. The constraints for a binary operator should be a 2-tuple   (e.g., `(-1, -1)`) and the constraints for a unary operator should be an `Int`.   A size constraint is a limit to the size of the subtree   in each argument of an operator. e.g., `[(^)=>(-1, 3)]` means that the   `^` operator can have arbitrary size (`-1`) in its left argument,   but a maximum size of `3` in its right argument. 
Default is   no constraints.\n  * `batching`: Whether to evolve based on small mini-batches of data,   rather than the entire dataset.\n  * `batch_size`: What batch size to use if using batching.\n  * `elementwise_loss`: What elementwise loss function to use. Can be one of   the following losses, or any other loss of type   `SupervisedLoss`. You can also pass a function that takes   a scalar target (left argument), and scalar predicted (right   argument), and returns a scalar. This will be averaged   over the predicted data. If weights are supplied, your   function should take a third argument for the weight scalar.   Included losses:       Regression:           - `LPDistLoss{P}()`,           - `L1DistLoss()`,           - `L2DistLoss()` (mean square),           - `LogitDistLoss()`,           - `HuberLoss(d)`,           - `L1EpsilonInsLoss(ϵ)`,           - `L2EpsilonInsLoss(ϵ)`,           - `PeriodicLoss(c)`,           - `QuantileLoss(τ)`,       Classification:           - `ZeroOneLoss()`,           - `PerceptronLoss()`,           - `L1HingeLoss()`,           - `SmoothedL1HingeLoss(γ)`,           - `ModifiedHuberLoss()`,           - `L2MarginLoss()`,           - `ExpLoss()`,           - `SigmoidLoss()`,           - `DWDMarginLoss(q)`.\n  * `loss_function`: Alternatively, you may redefine the loss used   as any function of `tree::AbstractExpressionNode{T}`, `dataset::Dataset{T}`,   and `options::AbstractOptions`, so long as you output a non-negative   scalar of type `T`. This is useful if you want to use a loss   that takes into account derivatives, or correlations across   the dataset. This also means you could use a custom evaluation   for a particular expression. If you are using   `batching=true`, then your function should   accept a fourth argument `idx`, which is either `nothing`   (indicating that the full dataset should be used), or a vector   of indices to use for the batch.   
For example,\n\n    ```\n      function my_loss(tree, dataset::Dataset{T,L}, options)::L where {T,L}\n          prediction, flag = eval_tree_array(tree, dataset.X, options)\n          if !flag\n              return L(Inf)\n          end\n          return sum((prediction .- dataset.y) .^ 2) / dataset.n\n      end\n    ```\n  * `expression_type::Type{E}=Expression`: The type of expression to use.   For example, `Expression`.\n  * `node_type::Type{N}=default_node_type(Expression)`: The type of node to use for the search.   For example, `Node` or `GraphNode`. The default is computed by `default_node_type(expression_type)`.\n  * `populations`: How many populations of equations to use.\n  * `population_size`: How many equations in each population.\n  * `ncycles_per_iteration`: How many generations to consider per iteration.\n  * `tournament_selection_n`: Number of expressions considered in each tournament.\n  * `tournament_selection_p`: The fittest expression in a tournament is to be   selected with probability `p`, the next fittest with probability `p*(1-p)`,   and so forth.\n  * `topn`: Number of equations to return to the host process, and to   consider for the hall of fame.\n  * `complexity_of_operators`: What complexity should be assigned to each operator,   and the occurrence of a constant or variable. By default, this is 1   for all operators. Can be a real number as well, in which case   the complexity of an expression will be rounded to the nearest integer.   Input this in the form of, e.g., [(^) => 3, sin => 2].\n  * `complexity_of_constants`: What complexity should be assigned to use of a constant.   By default, this is 1.\n  * `complexity_of_variables`: What complexity should be assigned to use of a variable,   which can also be a vector indicating different per-variable complexity.   By default, this is 1.\n  * `complexity_mapping`: Alternatively, you can pass a function that takes   the expression as input and returns the complexity. 
Make sure that   this operates on `AbstractExpression` (and unpacks to `AbstractExpressionNode`),   and returns an integer.\n  * `alpha`: The probability of accepting an equation mutation   during regularized evolution is given by exp(-delta_loss/(alpha * T)),   where T goes from 1 to 0. Thus, alpha=infinite is the same as no annealing.\n  * `maxsize`: Maximum size of equations during the search.\n  * `maxdepth`: Maximum depth of equations during the search, by default   this is set equal to the maxsize.\n  * `parsimony`: A multiplicative factor for how much complexity is   punished.\n  * `dimensional_constraint_penalty`: An additive factor if the dimensional   constraint is violated.\n  * `dimensionless_constants_only`: Whether to only allow dimensionless   constants.\n  * `use_frequency`: Whether to use a parsimony that adapts to the   relative proportion of equations at each complexity; this will   ensure that there are a balanced number of equations considered   for every complexity.\n  * `use_frequency_in_tournament`: Whether to use the adaptive parsimony described   above inside the score, rather than just at the mutation accept/reject stage.\n  * `adaptive_parsimony_scaling`: How much to scale the adaptive parsimony term   in the loss. Increase this if the search is spending too much time   optimizing the most complex equations.\n  * `turbo`: Whether to use `LoopVectorization.@turbo` to evaluate expressions.   This can be significantly faster, but is only compatible with certain   operators. *Experimental!*\n  * `bumper`: Whether to use Bumper.jl for faster evaluation. 
*Experimental!*\n  * `migration`: Whether to migrate equations between processes.\n  * `hof_migration`: Whether to migrate equations from the hall of fame   to processes.\n  * `fraction_replaced`: What fraction of each population to replace with   migrated equations at the end of each cycle.\n  * `fraction_replaced_hof`: What fraction to replace with hall of fame   equations at the end of each cycle.\n  * `should_simplify`: Whether to simplify equations. If you   pass a custom objective, this will be set to `false`.\n  * `should_optimize_constants`: Whether to use an optimization algorithm   to periodically optimize constants in equations.\n  * `optimizer_algorithm`: Select algorithm to use for optimizing constants. Default   is `Optim.BFGS(linesearch=LineSearches.BackTracking())`.\n  * `optimizer_nrestarts`: How many different random starting positions to consider   for optimization of constants.\n  * `optimizer_probability`: Probability of performing optimization of constants at   the end of a given iteration.\n  * `optimizer_iterations`: How many optimization iterations to perform. This gets   passed to `Optim.Options` as `iterations`. The default is 8.\n  * `optimizer_f_calls_limit`: How many function calls to allow during optimization.   This gets passed to `Optim.Options` as `f_calls_limit`. The default is   `10_000`.\n  * `optimizer_options`: General options for the constant optimization. For details   we refer to the documentation on `Optim.Options` from the `Optim.jl` package.   Options can be provided here as `NamedTuple`, e.g. `(iterations=16,)`, as a   `Dict`, e.g. Dict(:x_tol => 1.0e-32,), or as an `Optim.Options` instance.\n  * `autodiff_backend`: The backend to use for differentiation, which should be   an instance of `AbstractADType` (see `ADTypes.jl`).   Default is `nothing`, which means `Optim.jl` will estimate gradients (likely   with finite differences). 
You can also pass a symbolic version of the backend   type, such as `:Zygote` for Zygote, `:Enzyme`, etc. Most backends will not   work, and many will never work due to incompatibilities, though support for some   is gradually being added.\n  * `perturbation_factor`: When mutating a constant, either   multiply or divide by (1+perturbation_factor)^(rand()+1).\n  * `probability_negate_constant`: Probability of negating a constant in the equation   when mutating it.\n  * `mutation_weights`: Relative probabilities of the mutations. The struct   `MutationWeights` (or any `AbstractMutationWeights`) should be passed to these options.   See its documentation on `MutationWeights` for the different weights.\n  * `crossover_probability`: Probability of performing crossover.\n  * `annealing`: Whether to use simulated annealing.\n  * `warmup_maxsize_by`: Whether to slowly increase the max size from 5 up to   `maxsize`. If nonzero, specifies the fraction through the search   at which the maxsize should be reached.\n  * `verbosity`: Whether to print debugging statements or   not.\n  * `print_precision`: How many digits to print when printing   equations. By default, this is 5.\n  * `output_directory`: The base directory to save output files to. Files   will be saved in a subdirectory according to the run ID. By default,   this is `./outputs`.\n  * `save_to_file`: Whether to save equations to a file during the search.\n  * `bin_constraints`: See `constraints`. This is the same, but specified for binary   operators only (for example, if you have an operator that is both a binary   and unary operator).\n  * `una_constraints`: Likewise, for unary operators.\n  * `seed`: What random seed to use. `nothing` uses no seed.\n  * `progress`: Whether to use a progress bar output (`verbosity` will   have no effect).\n  * `early_stop_condition`: Float - whether to stop early if the mean loss gets below this value.   
Function - a function taking (loss, complexity) as arguments and returning true or false.\n  * `timeout_in_seconds`: Float64 - the time in seconds after which to exit (as an alternative to the number of iterations).\n  * `max_evals`: Int (or Nothing) - the maximum number of evaluations of expressions to perform.\n  * `input_stream`: the stream to read user input from. By default, this is `stdin`. If you encounter issues   with reading from `stdin`, like a hang, you can simply pass `devnull` to this argument.\n  * `skip_mutation_failures`: Whether to simply skip over mutations that fail or are rejected, rather than to replace the mutated   expression with the original expression and proceed normally.\n  * `nested_constraints`: Specifies how many times a combination of operators can be nested. For example,   `[sin => [cos => 0], cos => [cos => 2]]` specifies that `cos` may never appear within a `sin`,   but `sin` can be nested with itself an unlimited number of times. The second term specifies that `cos`   can be nested up to 2 times within a `cos`, so that `cos(cos(cos(x)))` is allowed (as well as any combination   of `+` or `-` within it), but `cos(cos(cos(cos(x))))` is not allowed. When an operator is not specified,   it is assumed that it can be nested an unlimited number of times. This requires that there is no operator   which is used both in the unary operators and the binary operators (e.g., `-` could be both subtract, and negation).   For binary operators, both arguments are treated the same way, and the max of each argument is constrained.\n  * `deterministic`: Use a global counter for the birth time, rather than calls to `time()`. This gives   perfect resolution, and is therefore deterministic. However, it is not thread safe, and must be used   in serial mode.\n  * `define_helper_functions`: Whether to define helper functions   for constructing and evaluating trees.\n  * `niterations::Int=10`: The number of iterations to perform the search.   
More iterations will improve the results.\n  * `parallelism=:multithreading`: What parallelism mode to use.   The options are `:multithreading`, `:multiprocessing`, and `:serial`.   By default, multithreading will be used. Multithreading uses less memory,   but multiprocessing can handle multi-node compute. If using `:multithreading`   mode, the number of threads available to julia are used. If using   `:multiprocessing`, `numprocs` processes will be created dynamically if   `procs` is unset. If you have already allocated processes, pass them   to the `procs` argument and they will be used.   You may also pass a string instead of a symbol, like `\"multithreading\"`.\n  * `numprocs::Union{Int, Nothing}=nothing`:  The number of processes to use,   if you want `equation_search` to set this up automatically. By default   this will be `4`, but can be any number (you should pick a number <=   the number of cores available).\n  * `procs::Union{Vector{Int}, Nothing}=nothing`: If you have set up   a distributed run manually with `procs = addprocs()` and `@everywhere`,   pass the `procs` to this keyword argument.\n  * `addprocs_function::Union{Function, Nothing}=nothing`: If using multiprocessing   (`parallelism=:multithreading`), and are not passing `procs` manually,   then they will be allocated dynamically using `addprocs`. However,   you may also pass a custom function to use instead of `addprocs`.   This function should take a single positional argument,   which is the number of processes to use, as well as the `lazy` keyword argument.   For example, if set up on a slurm cluster, you could pass   `addprocs_function = addprocs_slurm`, which will set up slurm processes.\n  * `heap_size_hint_in_bytes::Union{Int,Nothing}=nothing`: On Julia 1.9+, you may set the `--heap-size-hint`   flag on Julia processes, recommending garbage collection once a process   is close to the recommended size. 
This is important for long-running distributed   jobs where each process has an independent memory, and can help avoid   out-of-memory errors. By default, this is set to `Sys.free_memory() / numprocs`.\n  * `worker_imports::Union{Vector{Symbol},Nothing}=nothing`: If you want to import   additional modules on each worker, pass them here as a vector of symbols.   By default some of the extensions will automatically be loaded when needed.\n  * `runtests::Bool=true`: Whether to run (quick) tests before starting the   search, to see if there will be any problems during the equation search   related to the host environment.\n  * `run_id::Union{String,Nothing}=nothing`: A unique identifier for the run.   This will be used to store outputs from the run in the `outputs` directory.   If not specified, a unique ID will be generated.\n  * `loss_type::Type=Nothing`: If you would like to use a different type   for the loss than for the data you passed, specify the type here.   Note that if you pass complex data `::Complex{L}`, then the loss   type will automatically be set to `L`.\n  * `selection_method::Function`: Function to selection expression from   the Pareto frontier for use in `predict`.   See `SymbolicRegression.MLJInterfaceModule.choose_best` for an example.   This function should return a single integer specifying   the index of the expression to use. By default, this maximizes   the score (a pound-for-pound rating) of expressions reaching the threshold   of 1.5x the minimum loss. To override this at prediction time, you can pass   a named tuple with keys `data` and `idx` to `predict`. See the Operations   section for details.\n  * `dimensions_type::AbstractDimensions`: The type of dimensions to use when storing   the units of the data. By default this is `DynamicQuantities.SymbolicDimensions`.\n\n# Operations\n\n  * `predict(mach, Xnew)`: Return predictions of the target given features `Xnew`, which   should have same scitype as `X` above. 
The expression used for prediction is defined   by the `selection_method` function, which can be seen by viewing `report(mach).best_idx`.\n  * `predict(mach, (data=Xnew, idx=i))`: Return predictions of the target given features   `Xnew`, which should have same scitype as `X` above. By passing a named tuple with keys   `data` and `idx`, you are able to specify the equation you wish to evaluate in `idx`.\n\n# Fitted parameters\n\nThe fields of `fitted_params(mach)` are:\n\n  * `best_idx::Int`: The index of the best expression in the Pareto frontier,  as determined by the `selection_method` function. Override in `predict` by passing   a named tuple with keys `data` and `idx`.\n  * `equations::Vector{Node{T}}`: The expressions discovered by the search, represented in a dominating Pareto frontier (i.e., the best expressions found for each complexity). `T` is equal to the element type of the passed data.\n  * `equation_strings::Vector{String}`: The expressions discovered by the search, represented as strings for easy inspection.\n\n# Report\n\nThe fields of `report(mach)` are:\n\n  * `best_idx::Int`: The index of the best expression in the Pareto frontier,  as determined by the `selection_method` function. Override in `predict` by passing  a named tuple with keys `data` and `idx`.\n  * `equations::Vector{Node{T}}`: The expressions discovered by the search, represented in a dominating Pareto frontier (i.e., the best expressions found for each complexity).\n  * `equation_strings::Vector{String}`: The expressions discovered by the search, represented as strings for easy inspection.\n  * `complexities::Vector{Int}`: The complexity of each expression in the Pareto frontier.\n  * `losses::Vector{L}`: The loss of each expression in the Pareto frontier, according to the loss function specified in the model. 
The type `L` is the loss type, which is usually the same as the element type of data passed (i.e., `T`), but can differ if complex data types are passed.\n  * `scores::Vector{L}`: A metric which considers both the complexity and loss of an expression, equal to the change in the log-loss divided by the change in complexity, relative to the previous expression along the Pareto frontier. A larger score aims to indicate an expression is more likely to be the true expression generating the data, but this is very problem-dependent and generally several other factors should be considered.\n\n# Examples\n\n```julia\nusing MLJ\nSRRegressor = @load SRRegressor pkg=SymbolicRegression\nX, y = @load_boston\nmodel = SRRegressor(binary_operators=[+, -, *], unary_operators=[exp], niterations=100)\nmach = machine(model, X, y)\nfit!(mach)\ny_hat = predict(mach, X)\n# View the equation used:\nr = report(mach)\nprintln(\"Equation used:\", r.equation_strings[r.best_idx])\n```\n\nWith units and variable names:\n\n```julia\nusing MLJ\nusing DynamicQuantities\nSRegressor = @load SRRegressor pkg=SymbolicRegression\n\nX = (; x1=rand(32) .* us\"km/h\", x2=rand(32) .* us\"km\")\ny = @. X.x2 / X.x1 + 0.5us\"h\"\nmodel = SRRegressor(binary_operators=[+, -, *, /])\nmach = machine(model, X, y)\nfit!(mach)\ny_hat = predict(mach, X)\n# View the equation used:\nr = report(mach)\nprintln(\"Equation used:\", r.equation_strings[r.best_idx])\n```\n\nSee also [`MultitargetSRRegressor`](@ref).\n"""
+":docstring" = """```\nSRRegressor\n```\n\nA model type for constructing a Symbolic Regression via Evolutionary Search, based on [SymbolicRegression.jl](https://github.com/MilesCranmer/SymbolicRegression.jl), and implementing the MLJ model interface.\n\nFrom MLJ, the type can be imported using\n\n```\nSRRegressor = @load SRRegressor pkg=SymbolicRegression\n```\n\nDo `model = SRRegressor()` to construct an instance with default hyper-parameters. Provide keyword arguments to override hyper-parameter defaults, as in `SRRegressor(defaults=...)`.\n\nSingle-target Symbolic Regression regressor (`SRRegressor`) searches for symbolic expressions that predict a single target variable from a set of input variables. All data is assumed to be `Continuous`. The search is performed using an evolutionary algorithm. This algorithm is described in the paper https://arxiv.org/abs/2305.01582.\n\n# Training data\n\nIn MLJ or MLJBase, bind an instance `model` to data with\n\n```\nmach = machine(model, X, y)\n```\n\nOR\n\n```\nmach = machine(model, X, y, w)\n```\n\nHere:\n\n  * `X` is any table of input features (eg, a `DataFrame`) whose columns are of scitype `Continuous`; check column scitypes with `schema(X)`. Variable names in discovered expressions will be taken from the column names of `X`, if available. Units in columns of `X` (use `DynamicQuantities` for units) will trigger dimensional analysis to be used.\n  * `y` is the target, which can be any `AbstractVector` whose element scitype is   `Continuous`; check the scitype with `scitype(y)`. Units in `y` (use `DynamicQuantities`   for units) will trigger dimensional analysis to be used.\n  * `w` is the observation weights which can either be `nothing` (default) or an `AbstractVector` whose element scitype is `Count` or `Continuous`.\n\nTrain the machine using `fit!(mach)`, inspect the discovered expressions with `report(mach)`, and predict on new data with `predict(mach, Xnew)`. 
Note that unlike other regressors, symbolic regression stores a list of trained models. The model chosen from this list is defined by the function `selection_method` keyword argument, which by default balances accuracy and complexity. You can override this at prediction time by passing a named tuple with keys `data` and `idx`.\n\n# Hyper-parameters\n\n  * `defaults`: What set of defaults to use for `Options`. The default,   `nothing`, will simply take the default options from the current version of SymbolicRegression.   However, you may also select the defaults from an earlier version, such as `v\"0.24.5\"`.\n  * `binary_operators`: Vector of binary operators (functions) to use.   Each operator should be defined for two input scalars,   and one output scalar. All operators   need to be defined over the entire real line (excluding infinity - these   are stopped before they are input), or return `NaN` where not defined.   For speed, define it so it takes two reals   of the same type as input, and outputs the same type. For the SymbolicUtils   simplification backend, you will need to define a generic method of the   operator so it takes arbitrary types.\n  * `unary_operators`: Same, but for   unary operators (one input scalar, gives an output scalar).\n  * `constraints`: Array of pairs specifying size constraints   for each operator. The constraints for a binary operator should be a 2-tuple   (e.g., `(-1, -1)`) and the constraints for a unary operator should be an `Int`.   A size constraint is a limit to the size of the subtree   in each argument of an operator. e.g., `[(^)=>(-1, 3)]` means that the   `^` operator can have arbitrary size (`-1`) in its left argument,   but a maximum size of `3` in its right argument. 
Default is   no constraints.\n  * `batching`: Whether to evolve based on small mini-batches of data,   rather than the entire dataset.\n  * `batch_size`: What batch size to use if using batching.\n  * `elementwise_loss`: What elementwise loss function to use. Can be one of   the following losses, or any other loss of type   `SupervisedLoss`. You can also pass a function that takes   a scalar target (left argument), and scalar predicted (right   argument), and returns a scalar. This will be averaged   over the predicted data. If weights are supplied, your   function should take a third argument for the weight scalar.   Included losses:       Regression:           - `LPDistLoss{P}()`,           - `L1DistLoss()`,           - `L2DistLoss()` (mean square),           - `LogitDistLoss()`,           - `HuberLoss(d)`,           - `L1EpsilonInsLoss(ϵ)`,           - `L2EpsilonInsLoss(ϵ)`,           - `PeriodicLoss(c)`,           - `QuantileLoss(τ)`,       Classification:           - `ZeroOneLoss()`,           - `PerceptronLoss()`,           - `L1HingeLoss()`,           - `SmoothedL1HingeLoss(γ)`,           - `ModifiedHuberLoss()`,           - `L2MarginLoss()`,           - `ExpLoss()`,           - `SigmoidLoss()`,           - `DWDMarginLoss(q)`.\n  * `loss_function`: Alternatively, you may redefine the loss used   as any function of `tree::AbstractExpressionNode{T}`, `dataset::Dataset{T}`,   and `options::AbstractOptions`, so long as you output a non-negative   scalar of type `T`. This is useful if you want to use a loss   that takes into account derivatives, or correlations across   the dataset. This also means you could use a custom evaluation   for a particular expression. If you are using   `batching=true`, then your function should   accept a fourth argument `idx`, which is either `nothing`   (indicating that the full dataset should be used), or a vector   of indices to use for the batch.   
For example,\n\n    ```\n      function my_loss(tree, dataset::Dataset{T,L}, options)::L where {T,L}\n          prediction, flag = eval_tree_array(tree, dataset.X, options)\n          if !flag\n              return L(Inf)\n          end\n          return sum((prediction .- dataset.y) .^ 2) / dataset.n\n      end\n    ```\n  * `loss_function_expression`: Similar to `loss_function`, but takes `AbstractExpression` instead of `AbstractExpressionNode` as its first argument. Useful for `TemplateExpressionSpec`.\n  * `expression_spec::AbstractExpressionSpec`: A specification of what types of expressions to use in the   search. For example, `ExpressionSpec()` (default). You can also see `TemplateExpressionSpec` and   `ParametricExpressionSpec` for specialized cases.\n  * `populations`: How many populations of equations to use.\n  * `population_size`: How many equations in each population.\n  * `ncycles_per_iteration`: How many generations to consider per iteration.\n  * `tournament_selection_n`: Number of expressions considered in each tournament.\n  * `tournament_selection_p`: The fittest expression in a tournament is to be   selected with probability `p`, the next fittest with probability `p*(1-p)`,   and so forth.\n  * `topn`: Number of equations to return to the host process, and to   consider for the hall of fame.\n  * `complexity_of_operators`: What complexity should be assigned to each operator,   and the occurrence of a constant or variable. By default, this is 1   for all operators. Can be a real number as well, in which case   the complexity of an expression will be rounded to the nearest integer.   Input this in the form of, e.g., [(^) => 3, sin => 2].\n  * `complexity_of_constants`: What complexity should be assigned to use of a constant.   By default, this is 1.\n  * `complexity_of_variables`: What complexity should be assigned to use of a variable,   which can also be a vector indicating different per-variable complexity.   
By default, this is 1.\n  * `complexity_mapping`: Alternatively, you can pass a function that takes   the expression as input and returns the complexity. Make sure that   this operates on `AbstractExpression` (and unpacks to `AbstractExpressionNode`),   and returns an integer.\n  * `alpha`: The probability of accepting an equation mutation   during regularized evolution is given by exp(-delta_loss/(alpha * T)),   where T goes from 1 to 0. Thus, alpha=infinite is the same as no annealing.\n  * `maxsize`: Maximum size of equations during the search.\n  * `maxdepth`: Maximum depth of equations during the search, by default   this is set equal to the maxsize.\n  * `parsimony`: A multiplicative factor for how much complexity is   punished.\n  * `dimensional_constraint_penalty`: An additive factor if the dimensional   constraint is violated.\n  * `dimensionless_constants_only`: Whether to only allow dimensionless   constants.\n  * `use_frequency`: Whether to use a parsimony that adapts to the   relative proportion of equations at each complexity; this will   ensure that there are a balanced number of equations considered   for every complexity.\n  * `use_frequency_in_tournament`: Whether to use the adaptive parsimony described   above inside the score, rather than just at the mutation accept/reject stage.\n  * `adaptive_parsimony_scaling`: How much to scale the adaptive parsimony term   in the loss. Increase this if the search is spending too much time   optimizing the most complex equations.\n  * `turbo`: Whether to use `LoopVectorization.@turbo` to evaluate expressions.   This can be significantly faster, but is only compatible with certain   operators. *Experimental!*\n  * `bumper`: Whether to use Bumper.jl for faster evaluation. 
*Experimental!*\n  * `migration`: Whether to migrate equations between processes.\n  * `hof_migration`: Whether to migrate equations from the hall of fame   to processes.\n  * `fraction_replaced`: What fraction of each population to replace with   migrated equations at the end of each cycle.\n  * `fraction_replaced_hof`: What fraction to replace with hall of fame   equations at the end of each cycle.\n  * `should_simplify`: Whether to simplify equations. If you   pass a custom objective, this will be set to `false`.\n  * `should_optimize_constants`: Whether to use an optimization algorithm   to periodically optimize constants in equations.\n  * `optimizer_algorithm`: Select algorithm to use for optimizing constants. Default   is `Optim.BFGS(linesearch=LineSearches.BackTracking())`.\n  * `optimizer_nrestarts`: How many different random starting positions to consider   for optimization of constants.\n  * `optimizer_probability`: Probability of performing optimization of constants at   the end of a given iteration.\n  * `optimizer_iterations`: How many optimization iterations to perform. This gets   passed to `Optim.Options` as `iterations`. The default is 8.\n  * `optimizer_f_calls_limit`: How many function calls to allow during optimization.   This gets passed to `Optim.Options` as `f_calls_limit`. The default is   `10_000`.\n  * `optimizer_options`: General options for the constant optimization. For details   we refer to the documentation on `Optim.Options` from the `Optim.jl` package.   Options can be provided here as `NamedTuple`, e.g. `(iterations=16,)`, as a   `Dict`, e.g. Dict(:x_tol => 1.0e-32,), or as an `Optim.Options` instance.\n  * `autodiff_backend`: The backend to use for differentiation, which should be   an instance of `AbstractADType` (see `ADTypes.jl`).   Default is `nothing`, which means `Optim.jl` will estimate gradients (likely   with finite differences). 
You can also pass a symbolic version of the backend   type, such as `:Zygote` for Zygote, `:Enzyme`, etc. Most backends will not   work, and many will never work due to incompatibilities, though support for some   is gradually being added.\n  * `perturbation_factor`: When mutating a constant, either   multiply or divide by (1+perturbation_factor)^(rand()+1).\n  * `probability_negate_constant`: Probability of negating a constant in the equation   when mutating it.\n  * `mutation_weights`: Relative probabilities of the mutations. The struct   `MutationWeights` (or any `AbstractMutationWeights`) should be passed to these options.   See its documentation on `MutationWeights` for the different weights.\n  * `crossover_probability`: Probability of performing crossover.\n  * `annealing`: Whether to use simulated annealing.\n  * `warmup_maxsize_by`: Whether to slowly increase the max size from 5 up to   `maxsize`. If nonzero, specifies the fraction through the search   at which the maxsize should be reached.\n  * `verbosity`: Whether to print debugging statements or   not.\n  * `print_precision`: How many digits to print when printing   equations. By default, this is 5.\n  * `output_directory`: The base directory to save output files to. Files   will be saved in a subdirectory according to the run ID. By default,   this is `./outputs`.\n  * `save_to_file`: Whether to save equations to a file during the search.\n  * `bin_constraints`: See `constraints`. This is the same, but specified for binary   operators only (for example, if you have an operator that is both a binary   and unary operator).\n  * `una_constraints`: Likewise, for unary operators.\n  * `seed`: What random seed to use. `nothing` uses no seed.\n  * `progress`: Whether to use a progress bar output (`verbosity` will   have no effect).\n  * `early_stop_condition`: Float - whether to stop early if the mean loss gets below this value.   
Function - a function taking (loss, complexity) as arguments and returning true or false.\n  * `timeout_in_seconds`: Float64 - the time in seconds after which to exit (as an alternative to the number of iterations).\n  * `max_evals`: Int (or Nothing) - the maximum number of evaluations of expressions to perform.\n  * `input_stream`: the stream to read user input from. By default, this is `stdin`. If you encounter issues   with reading from `stdin`, like a hang, you can simply pass `devnull` to this argument.\n  * `skip_mutation_failures`: Whether to simply skip over mutations that fail or are rejected, rather than to replace the mutated   expression with the original expression and proceed normally.\n  * `nested_constraints`: Specifies how many times a combination of operators can be nested. For example,   `[sin => [cos => 0], cos => [cos => 2]]` specifies that `cos` may never appear within a `sin`,   but `sin` can be nested with itself an unlimited number of times. The second term specifies that `cos`   can be nested up to 2 times within a `cos`, so that `cos(cos(cos(x)))` is allowed (as well as any combination   of `+` or `-` within it), but `cos(cos(cos(cos(x))))` is not allowed. When an operator is not specified,   it is assumed that it can be nested an unlimited number of times. This requires that there is no operator   which is used both in the unary operators and the binary operators (e.g., `-` could be both subtract, and negation).   For binary operators, both arguments are treated the same way, and the max of each argument is constrained.\n  * `deterministic`: Use a global counter for the birth time, rather than calls to `time()`. This gives   perfect resolution, and is therefore deterministic. However, it is not thread safe, and must be used   in serial mode.\n  * `define_helper_functions`: Whether to define helper functions   for constructing and evaluating trees.\n  * `niterations::Int=10`: The number of iterations to perform the search.   
More iterations will improve the results.\n  * `parallelism=:multithreading`: What parallelism mode to use.   The options are `:multithreading`, `:multiprocessing`, and `:serial`.   By default, multithreading will be used. Multithreading uses less memory,   but multiprocessing can handle multi-node compute. If using `:multithreading`   mode, the number of threads available to julia are used. If using   `:multiprocessing`, `numprocs` processes will be created dynamically if   `procs` is unset. If you have already allocated processes, pass them   to the `procs` argument and they will be used.   You may also pass a string instead of a symbol, like `\"multithreading\"`.\n  * `numprocs::Union{Int, Nothing}=nothing`:  The number of processes to use,   if you want `equation_search` to set this up automatically. By default   this will be `4`, but can be any number (you should pick a number <=   the number of cores available).\n  * `procs::Union{Vector{Int}, Nothing}=nothing`: If you have set up   a distributed run manually with `procs = addprocs()` and `@everywhere`,   pass the `procs` to this keyword argument.\n  * `addprocs_function::Union{Function, Nothing}=nothing`: If using multiprocessing   (`parallelism=:multiprocessing`), and are not passing `procs` manually,   then they will be allocated dynamically using `addprocs`. However,   you may also pass a custom function to use instead of `addprocs`.   This function should take a single positional argument,   which is the number of processes to use, as well as the `lazy` keyword argument.   For example, if set up on a slurm cluster, you could pass   `addprocs_function = addprocs_slurm`, which will set up slurm processes.\n  * `heap_size_hint_in_bytes::Union{Int,Nothing}=nothing`: On Julia 1.9+, you may set the `--heap-size-hint`   flag on Julia processes, recommending garbage collection once a process   is close to the recommended size. 
This is important for long-running distributed   jobs where each process has an independent memory, and can help avoid   out-of-memory errors. By default, this is set to `Sys.free_memory() / numprocs`.\n  * `worker_imports::Union{Vector{Symbol},Nothing}=nothing`: If you want to import   additional modules on each worker, pass them here as a vector of symbols.   By default some of the extensions will automatically be loaded when needed.\n  * `runtests::Bool=true`: Whether to run (quick) tests before starting the   search, to see if there will be any problems during the equation search   related to the host environment.\n  * `run_id::Union{String,Nothing}=nothing`: A unique identifier for the run.   This will be used to store outputs from the run in the `outputs` directory.   If not specified, a unique ID will be generated.\n  * `loss_type::Type=Nothing`: If you would like to use a different type   for the loss than for the data you passed, specify the type here.   Note that if you pass complex data `::Complex{L}`, then the loss   type will automatically be set to `L`.\n  * `selection_method::Function`: Function to select an expression from   the Pareto frontier for use in `predict`.   See `SymbolicRegression.MLJInterfaceModule.choose_best` for an example.   This function should return a single integer specifying   the index of the expression to use. By default, this maximizes   the score (a pound-for-pound rating) of expressions reaching the threshold   of 1.5x the minimum loss. To override this at prediction time, you can pass   a named tuple with keys `data` and `idx` to `predict`. See the Operations   section for details.\n  * `dimensions_type::AbstractDimensions`: The type of dimensions to use when storing   the units of the data. By default this is `DynamicQuantities.SymbolicDimensions`.\n\n# Operations\n\n  * `predict(mach, Xnew)`: Return predictions of the target given features `Xnew`, which   should have same scitype as `X` above. 
The expression used for prediction is defined   by the `selection_method` function, which can be seen by viewing `report(mach).best_idx`.\n  * `predict(mach, (data=Xnew, idx=i))`: Return predictions of the target given features   `Xnew`, which should have same scitype as `X` above. By passing a named tuple with keys   `data` and `idx`, you are able to specify the equation you wish to evaluate in `idx`.\n\n# Fitted parameters\n\nThe fields of `fitted_params(mach)` are:\n\n  * `best_idx::Int`: The index of the best expression in the Pareto frontier,  as determined by the `selection_method` function. Override in `predict` by passing   a named tuple with keys `data` and `idx`.\n  * `equations::Vector{Node{T}}`: The expressions discovered by the search, represented in a dominating Pareto frontier (i.e., the best expressions found for each complexity). `T` is equal to the element type of the passed data.\n  * `equation_strings::Vector{String}`: The expressions discovered by the search, represented as strings for easy inspection.\n\n# Report\n\nThe fields of `report(mach)` are:\n\n  * `best_idx::Int`: The index of the best expression in the Pareto frontier,  as determined by the `selection_method` function. Override in `predict` by passing  a named tuple with keys `data` and `idx`.\n  * `equations::Vector{Node{T}}`: The expressions discovered by the search, represented in a dominating Pareto frontier (i.e., the best expressions found for each complexity).\n  * `equation_strings::Vector{String}`: The expressions discovered by the search, represented as strings for easy inspection.\n  * `complexities::Vector{Int}`: The complexity of each expression in the Pareto frontier.\n  * `losses::Vector{L}`: The loss of each expression in the Pareto frontier, according to the loss function specified in the model. 
The type `L` is the loss type, which is usually the same as the element type of data passed (i.e., `T`), but can differ if complex data types are passed.\n  * `scores::Vector{L}`: A metric which considers both the complexity and loss of an expression, equal to the change in the log-loss divided by the change in complexity, relative to the previous expression along the Pareto frontier. A larger score aims to indicate an expression is more likely to be the true expression generating the data, but this is very problem-dependent and generally several other factors should be considered.\n\n# Examples\n\n```julia\nusing MLJ\nSRRegressor = @load SRRegressor pkg=SymbolicRegression\nX, y = @load_boston\nmodel = SRRegressor(binary_operators=[+, -, *], unary_operators=[exp], niterations=100)\nmach = machine(model, X, y)\nfit!(mach)\ny_hat = predict(mach, X)\n# View the equation used:\nr = report(mach)\nprintln(\"Equation used:\", r.equation_strings[r.best_idx])\n```\n\nWith units and variable names:\n\n```julia\nusing MLJ\nusing DynamicQuantities\nSRRegressor = @load SRRegressor pkg=SymbolicRegression\n\nX = (; x1=rand(32) .* us\"km/h\", x2=rand(32) .* us\"km\")\ny = @. X.x2 / X.x1 + 0.5us\"h\"\nmodel = SRRegressor(binary_operators=[+, -, *, /])\nmach = machine(model, X, y)\nfit!(mach)\ny_hat = predict(mach, X)\n# View the equation used:\nr = report(mach)\nprintln(\"Equation used:\", r.equation_strings[r.best_idx])\n```\n\nSee also [`MultitargetSRRegressor`](@ref).\n"""
 ":name" = "SRRegressor"
 ":human_name" = "Symbolic Regression via Evolutionary Search"
 ":is_supervised" = "`true`"
 ":prediction_type" = ":deterministic"
 ":abstract_type" = "`MLJModelInterface.Deterministic`"
 ":implemented_methods" = []
-":hyperparameters" = "`(:defaults, :binary_operators, :unary_operators, :maxsize, :maxdepth, :expression_type, :expression_options, :node_type, :populations, :population_size, :ncycles_per_iteration, :elementwise_loss, :loss_function, :dimensional_constraint_penalty, :parsimony, :constraints, :nested_constraints, :complexity_of_operators, :complexity_of_constants, :complexity_of_variables, :warmup_maxsize_by, :adaptive_parsimony_scaling, :mutation_weights, :crossover_probability, :annealing, :alpha, :probability_negate_constant, :tournament_selection_n, :tournament_selection_p, :early_stop_condition, :batching, :batch_size, :dimensionless_constants_only, :complexity_mapping, :use_frequency, :use_frequency_in_tournament, :should_simplify, :perturbation_factor, :skip_mutation_failures, :optimizer_algorithm, :optimizer_nrestarts, :optimizer_probability, :optimizer_iterations, :optimizer_f_calls_limit, :optimizer_options, :should_optimize_constants, :migration, :hof_migration, :fraction_replaced, :fraction_replaced_hof, :topn, :timeout_in_seconds, :max_evals, :input_stream, :turbo, :bumper, :autodiff_backend, :deterministic, :seed, :verbosity, :print_precision, :progress, :output_directory, :save_to_file, :bin_constraints, :una_constraints, :terminal_width, :use_recorder, :recorder_file, :define_helper_functions, :output_file, :fast_cycle, :npopulations, :npop, :niterations, :parallelism, :numprocs, :procs, :addprocs_function, :heap_size_hint_in_bytes, :worker_imports, :logger, :runtests, :run_id, :loss_type, :selection_method, :dimensions_type)`"
-":hyperparameter_types" = "`(\"Union{Nothing, VersionNumber}\", \"Any\", \"Any\", \"Union{Nothing, Integer}\", \"Union{Nothing, Integer}\", \"Type{<:DynamicExpressions.ExpressionModule.AbstractExpression}\", \"NamedTuple\", \"Type{<:DynamicExpressions.NodeModule.AbstractExpressionNode}\", \"Union{Nothing, Integer}\", \"Union{Nothing, Integer}\", \"Union{Nothing, Integer}\", \"Union{Nothing, Function, LossFunctions.Traits.SupervisedLoss}\", \"Union{Nothing, Function}\", \"Union{Nothing, Real}\", \"Union{Nothing, Real}\", \"Any\", \"Any\", \"Any\", \"Union{Nothing, Real}\", \"Union{Nothing, Real, AbstractVector}\", \"Union{Nothing, Real}\", \"Union{Nothing, Real}\", \"Union{Nothing, SymbolicRegression.CoreModule.MutationWeightsModule.AbstractMutationWeights, NamedTuple, AbstractVector}\", \"Union{Nothing, Real}\", \"Union{Nothing, Bool}\", \"Union{Nothing, Real}\", \"Union{Nothing, Real}\", \"Union{Nothing, Integer}\", \"Union{Nothing, Real}\", \"Union{Nothing, Function, Real}\", \"Union{Nothing, Bool}\", \"Union{Nothing, Integer}\", \"Bool\", \"Union{Nothing, Function, SymbolicRegression.CoreModule.OptionsStructModule.ComplexityMapping}\", \"Bool\", \"Bool\", \"Union{Nothing, Bool}\", \"Union{Nothing, Real}\", \"Bool\", \"Union{AbstractString, Optim.AbstractOptimizer}\", \"Int64\", \"AbstractFloat\", \"Union{Nothing, Integer}\", \"Union{Nothing, Integer}\", \"Union{Nothing, Dict, NamedTuple, Optim.Options}\", \"Bool\", \"Bool\", \"Bool\", \"Union{Nothing, Real}\", \"Union{Nothing, Real}\", \"Union{Nothing, Integer}\", \"Union{Nothing, Real}\", \"Union{Nothing, Integer}\", \"IO\", \"Bool\", \"Bool\", \"Union{Nothing, ADTypes.AbstractADType, Symbol}\", \"Bool\", \"Any\", \"Union{Nothing, Integer}\", \"Integer\", \"Union{Nothing, Bool}\", \"Union{Nothing, String}\", \"Bool\", \"Any\", \"Any\", \"Union{Nothing, Integer}\", \"Bool\", \"AbstractString\", \"Bool\", \"Union{Nothing, AbstractString}\", \"Bool\", \"Union{Nothing, Integer}\", \"Union{Nothing, Integer}\", 
\"Int64\", \"Symbol\", \"Union{Nothing, Int64}\", \"Union{Nothing, Vector{Int64}}\", \"Union{Nothing, Function}\", \"Union{Nothing, Integer}\", \"Union{Nothing, Vector{Symbol}}\", \"Union{Nothing, SymbolicRegression.LoggingModule.AbstractSRLogger}\", \"Bool\", \"Union{Nothing, String}\", \"Any\", \"Function\", \"Type{D} where D<:DynamicQuantities.AbstractDimensions\")`"
-":hyperparameter_ranges" = "`(nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing)`"
+":hyperparameters" = "`(:defaults, :binary_operators, :unary_operators, :maxsize, :maxdepth, :expression_spec, :populations, :population_size, :ncycles_per_iteration, :elementwise_loss, :loss_function, :loss_function_expression, :dimensional_constraint_penalty, :parsimony, :constraints, :nested_constraints, :complexity_of_operators, :complexity_of_constants, :complexity_of_variables, :warmup_maxsize_by, :adaptive_parsimony_scaling, :mutation_weights, :crossover_probability, :annealing, :alpha, :tournament_selection_n, :tournament_selection_p, :early_stop_condition, :batching, :batch_size, :dimensionless_constants_only, :complexity_mapping, :use_frequency, :use_frequency_in_tournament, :should_simplify, :perturbation_factor, :probability_negate_constant, :skip_mutation_failures, :optimizer_algorithm, :optimizer_nrestarts, :optimizer_probability, :optimizer_iterations, :optimizer_f_calls_limit, :optimizer_options, :should_optimize_constants, :migration, :hof_migration, :fraction_replaced, :fraction_replaced_hof, :topn, :timeout_in_seconds, :max_evals, :input_stream, :turbo, :bumper, :autodiff_backend, :deterministic, :seed, :verbosity, :print_precision, :progress, :output_directory, :save_to_file, :bin_constraints, :una_constraints, :terminal_width, :use_recorder, :recorder_file, :define_helper_functions, :expression_type, :expression_options, :node_type, :output_file, :fast_cycle, :npopulations, :npop, :niterations, :parallelism, :numprocs, :procs, :addprocs_function, :heap_size_hint_in_bytes, :worker_imports, :logger, :runtests, :run_id, :loss_type, :selection_method, :dimensions_type)`"
+":hyperparameter_types" = "`(\"Union{Nothing, VersionNumber}\", \"Any\", \"Any\", \"Union{Nothing, Integer}\", \"Union{Nothing, Integer}\", \"Union{Nothing, SymbolicRegression.CoreModule.ExpressionSpecModule.AbstractExpressionSpec}\", \"Union{Nothing, Integer}\", \"Union{Nothing, Integer}\", \"Union{Nothing, Integer}\", \"Union{Nothing, Function, LossFunctions.Traits.SupervisedLoss}\", \"Union{Nothing, Function}\", \"Union{Nothing, Function}\", \"Union{Nothing, Real}\", \"Union{Nothing, Real}\", \"Any\", \"Any\", \"Any\", \"Union{Nothing, Real}\", \"Union{Nothing, Real, AbstractVector}\", \"Union{Nothing, Real}\", \"Union{Nothing, Real}\", \"Union{Nothing, SymbolicRegression.CoreModule.MutationWeightsModule.AbstractMutationWeights, NamedTuple, AbstractVector}\", \"Union{Nothing, Real}\", \"Union{Nothing, Bool}\", \"Union{Nothing, Real}\", \"Union{Nothing, Integer}\", \"Union{Nothing, Real}\", \"Union{Nothing, Function, Real}\", \"Union{Nothing, Bool}\", \"Union{Nothing, Integer}\", \"Bool\", \"Union{Nothing, Function, SymbolicRegression.CoreModule.OptionsStructModule.ComplexityMapping}\", \"Bool\", \"Bool\", \"Union{Nothing, Bool}\", \"Union{Nothing, Real}\", \"Union{Nothing, Real}\", \"Bool\", \"Union{AbstractString, Optim.AbstractOptimizer}\", \"Int64\", \"AbstractFloat\", \"Union{Nothing, Integer}\", \"Union{Nothing, Integer}\", \"Union{Nothing, Dict, NamedTuple, Optim.Options}\", \"Bool\", \"Bool\", \"Bool\", \"Union{Nothing, Real}\", \"Union{Nothing, Real}\", \"Union{Nothing, Integer}\", \"Union{Nothing, Real}\", \"Union{Nothing, Integer}\", \"IO\", \"Bool\", \"Bool\", \"Union{Nothing, ADTypes.AbstractADType, Symbol}\", \"Bool\", \"Any\", \"Union{Nothing, Integer}\", \"Integer\", \"Union{Nothing, Bool}\", \"Union{Nothing, String}\", \"Bool\", \"Any\", \"Any\", \"Union{Nothing, Integer}\", \"Bool\", \"AbstractString\", \"Bool\", \"Union{Nothing, Type{<:DynamicExpressions.ExpressionModule.AbstractExpression}}\", \"Union{Nothing, NamedTuple}\", \"Union{Nothing, 
Type{<:DynamicExpressions.NodeModule.AbstractExpressionNode}}\", \"Union{Nothing, AbstractString}\", \"Bool\", \"Union{Nothing, Integer}\", \"Union{Nothing, Integer}\", \"Int64\", \"Symbol\", \"Union{Nothing, Int64}\", \"Union{Nothing, Vector{Int64}}\", \"Union{Nothing, Function}\", \"Union{Nothing, Integer}\", \"Union{Nothing, Vector{Symbol}}\", \"Union{Nothing, SymbolicRegression.LoggingModule.AbstractSRLogger}\", \"Bool\", \"Union{Nothing, String}\", \"Type\", \"Function\", \"Type{D} where D<:DynamicQuantities.AbstractDimensions\")`"
+":hyperparameter_ranges" = "`(nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing)`"
 ":iteration_parameter" = "`nothing`"
 ":supports_training_losses" = "`false`"
 ":reports_feature_importances" = "`false`"
@@ -7909,9 +8053,9 @@
 ":prediction_type" = ":unknown"
 ":abstract_type" = "`MLJModelInterface.UnsupervisedDetector`"
 ":implemented_methods" = [":clean!", ":reformat", ":selectrows", ":fit", ":transform"]
-":hyperparameters" = "`(:whitening, :rule_of_thumb)`"
-":hyperparameter_types" = "`(\"Bool\", \"Bool\")`"
-":hyperparameter_ranges" = "`(nothing, nothing)`"
+":hyperparameters" = "`(:model,)`"
+":hyperparameter_types" = "`(\"PythonCall.Core.Py\",)`"
+":hyperparameter_ranges" = "`(nothing,)`"
 ":iteration_parameter" = "`nothing`"
 ":supports_training_losses" = "`false`"
 ":reports_feature_importances" = "`false`"
@@ -8531,109 +8675,37 @@
 ":reporting_operations" = "`()`"
 ":constructor" = "`nothing`"
 
-[GLM.LinearBinaryClassifier]
-":input_scitype" = "`ScientificTypesBase.Table{<:Union{AbstractVector{<:ScientificTypesBase.Continuous}, AbstractVector{<:ScientificTypesBase.Finite}}}`"
+[MLJTransforms.EntityEmbedder]
+":input_scitype" = "`ScientificTypesBase.Unknown`"
 ":output_scitype" = "`ScientificTypesBase.Unknown`"
-":target_scitype" = "`AbstractVector{<:ScientificTypesBase.Binary}`"
-":fit_data_scitype" = "`Union{Tuple{ScientificTypesBase.Table{<:Union{AbstractVector{<:ScientificTypesBase.Continuous}, AbstractVector{<:ScientificTypesBase.Finite}}}, AbstractVector{<:ScientificTypesBase.Binary}}, Tuple{ScientificTypesBase.Table{<:Union{AbstractVector{<:ScientificTypesBase.Continuous}, AbstractVector{<:ScientificTypesBase.Finite}}}, AbstractVector{<:ScientificTypesBase.Binary}, AbstractVector{<:Union{ScientificTypesBase.Continuous, ScientificTypesBase.Count}}}}`"
+":target_scitype" = "`ScientificTypesBase.Unknown`"
+":fit_data_scitype" = "`Tuple{ScientificTypesBase.Unknown, ScientificTypesBase.Unknown}`"
 ":predict_scitype" = "`ScientificTypesBase.Unknown`"
 ":transform_scitype" = "`ScientificTypesBase.Unknown`"
 ":inverse_transform_scitype" = "`ScientificTypesBase.Unknown`"
 ":target_in_fit" = "`true`"
 ":is_pure_julia" = "`true`"
-":package_name" = "GLM"
-":package_license" = "MIT"
-":load_path" = "MLJGLMInterface.LinearBinaryClassifier"
-":package_uuid" = "38e38edf-8417-5370-95a0-9cbb8c7f171a"
-":package_url" = "https://github.com/JuliaStats/GLM.jl"
-":is_wrapper" = "`false`"
-":supports_weights" = "`true`"
-":supports_class_weights" = "`false`"
-":supports_online" = "`false`"
-":docstring" = """```\nLinearBinaryClassifier\n```\n\nA model type for constructing a linear binary classifier, based on [GLM.jl](https://github.com/JuliaStats/GLM.jl), and implementing the MLJ model interface.\n\nFrom MLJ, the type can be imported using\n\n```\nLinearBinaryClassifier = @load LinearBinaryClassifier pkg=GLM\n```\n\nDo `model = LinearBinaryClassifier()` to construct an instance with default hyper-parameters. Provide keyword arguments to override hyper-parameter defaults, as in `LinearBinaryClassifier(fit_intercept=...)`.\n\n`LinearBinaryClassifier` is a [generalized linear model](https://en.wikipedia.org/wiki/Generalized_linear_model#Variance_function), specialised to the case of a binary target variable, with a user-specified link function. Options exist to specify an intercept or offset feature.\n\n# Training data\n\nIn MLJ or MLJBase, bind an instance `model` to data with one of:\n\n```\nmach = machine(model, X, y)\nmach = machine(model, X, y, w)\n```\n\nHere\n\n  * `X`: is any table of input features (eg, a `DataFrame`) whose columns are of scitype `Continuous`; check the scitype with `schema(X)`\n  * `y`: is the target, which can be any `AbstractVector` whose element scitype is `<:OrderedFactor(2)` or `<:Multiclass(2)`; check the scitype with `schema(y)`\n  * `w`: is a vector of `Real` per-observation weights\n\nTrain the machine using `fit!(mach, rows=...)`.\n\n# Hyper-parameters\n\n  * `fit_intercept=true`: Whether to calculate the intercept for this model.  If set to false,  no intercept will be calculated (e.g. the data is expected to be centered)\n  * `link=GLM.LogitLink`: The function which links the linear prediction function to the  probability of a particular outcome or class. This must have type `GLM.Link01`. Options  include `GLM.LogitLink()`, `GLM.ProbitLink()`, `CloglogLink(),`CauchitLink()`.\n  * `offsetcol=nothing`: Name of the column to be used as an offset, if any.  
An offset is a  variable which is known to have a coefficient of 1.\n  * `maxiter::Integer=30`: The maximum number of iterations allowed to achieve convergence.\n  * `atol::Real=1e-6`: Absolute threshold for convergence. Convergence is achieved when the  relative change in deviance is less than `max(rtol*dev, atol). This term exists to avoid  failure when deviance is unchanged except for rounding errors.\n  * `rtol::Real=1e-6`: Relative threshold for convergence. Convergence is achieved when the  relative change in deviance is less than `max(rtol*dev, atol). This term exists to avoid  failure when deviance is unchanged except for rounding errors.\n  * `minstepfac::Real=0.001`: Minimum step fraction. Must be between 0 and 1. Lower bound for the factor used to update the linear fit.\n  * `report_keys`: `Vector` of keys for the report. Possible keys are: `:deviance`, `:dof_residual`, `:stderror`, `:vcov`, `:coef_table` and `:glm_model`. By default only `:glm_model` is excluded.\n\n# Operations\n\n  * `predict(mach, Xnew)`: Return predictions of the target given features `Xnew` having the same scitype as `X` above. Predictions are probabilistic.\n  * `predict_mode(mach, Xnew)`: Return the modes of the probabilistic predictions returned  above.\n\n# Fitted parameters\n\nThe fields of `fitted_params(mach)` are:\n\n  * `features`: The names of the features used during model fitting.\n  * `coef`: The linear coefficients determined by the model.\n  * `intercept`: The intercept determined by the model.\n\n# Report\n\nThe fields of `report(mach)` are:\n\n  * `deviance`: Measure of deviance of fitted model with respect to a perfectly fitted model. 
For a linear model, this is the weighted residual sum of squares\n  * `dof_residual`: The degrees of freedom for residuals, when meaningful.\n  * `stderror`: The standard errors of the coefficients.\n  * `vcov`: The estimated variance-covariance matrix of the coefficient estimates.\n  * `coef_table`: Table which displays coefficients and summarizes their significance and confidence intervals.\n  * `glm_model`: The raw fitted model returned by `GLM.lm`. Note this points to training data. Refer to the GLM.jl documentation for usage.\n\n# Examples\n\n```\nusing MLJ\nimport GLM # namespace must be available\n\nLinearBinaryClassifier = @load LinearBinaryClassifier pkg=GLM\nclf = LinearBinaryClassifier(fit_intercept=false, link=GLM.ProbitLink())\n\nX, y = @load_crabs\n\nmach = machine(clf, X, y) |> fit!\n\nXnew = (;FL = [8.1, 24.8, 7.2],\n        RW = [5.1, 25.7, 6.4],\n        CL = [15.9, 46.7, 14.3],\n        CW = [18.7, 59.7, 12.2],\n        BD = [6.2, 23.6, 8.4],)\n\nyhat = predict(mach, Xnew) # probabilistic predictions\npdf(yhat, levels(y)) # probability matrix\np_B = pdf.(yhat, \"B\")\nclass_labels = predict_mode(mach, Xnew)\n\nfitted_params(mach).features\nfitted_params(mach).coef\nfitted_params(mach).intercept\n\nreport(mach)\n```\n\nSee also [`LinearRegressor`](@ref), [`LinearCountRegressor`](@ref)\n"""
-":name" = "LinearBinaryClassifier"
-":human_name" = "linear binary classifier"
-":is_supervised" = "`true`"
-":prediction_type" = ":probabilistic"
-":abstract_type" = "`MLJModelInterface.Probabilistic`"
-":implemented_methods" = [":clean!", ":fit", ":fitted_params", ":predict"]
-":hyperparameters" = "`(:fit_intercept, :link, :offsetcol, :maxiter, :atol, :rtol, :minstepfac, :report_keys)`"
-":hyperparameter_types" = "`(\"Bool\", \"GLM.Link01\", \"Union{Nothing, Symbol}\", \"Integer\", \"Real\", \"Real\", \"Real\", \"Union{Nothing, AbstractVector{Symbol}}\")`"
-":hyperparameter_ranges" = "`(nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing)`"
-":iteration_parameter" = "`nothing`"
-":supports_training_losses" = "`false`"
-":reports_feature_importances" = "`false`"
-":deep_properties" = "`()`"
-":reporting_operations" = "`()`"
-":constructor" = "`nothing`"
-
-[GLM.LinearCountRegressor]
-":input_scitype" = "`ScientificTypesBase.Table{<:Union{AbstractVector{<:ScientificTypesBase.Continuous}, AbstractVector{<:ScientificTypesBase.Finite}}}`"
-":output_scitype" = "`ScientificTypesBase.Unknown`"
-":target_scitype" = "`AbstractVector{ScientificTypesBase.Count}`"
-":fit_data_scitype" = "`Union{Tuple{ScientificTypesBase.Table{<:Union{AbstractVector{<:ScientificTypesBase.Continuous}, AbstractVector{<:ScientificTypesBase.Finite}}}, AbstractVector{ScientificTypesBase.Count}}, Tuple{ScientificTypesBase.Table{<:Union{AbstractVector{<:ScientificTypesBase.Continuous}, AbstractVector{<:ScientificTypesBase.Finite}}}, AbstractVector{ScientificTypesBase.Count}, AbstractVector{<:Union{ScientificTypesBase.Continuous, ScientificTypesBase.Count}}}}`"
-":predict_scitype" = "`AbstractVector{ScientificTypesBase.Density{ScientificTypesBase.Count}}`"
-":transform_scitype" = "`ScientificTypesBase.Unknown`"
-":inverse_transform_scitype" = "`ScientificTypesBase.Unknown`"
-":target_in_fit" = "`true`"
-":is_pure_julia" = "`true`"
-":package_name" = "GLM"
-":package_license" = "MIT"
-":load_path" = "MLJGLMInterface.LinearCountRegressor"
-":package_uuid" = "38e38edf-8417-5370-95a0-9cbb8c7f171a"
-":package_url" = "https://github.com/JuliaStats/GLM.jl"
-":is_wrapper" = "`false`"
-":supports_weights" = "`true`"
-":supports_class_weights" = "`false`"
-":supports_online" = "`false`"
-":docstring" = """```\nLinearCountRegressor\n```\n\nA model type for constructing a linear count regressor, based on [GLM.jl](https://github.com/JuliaStats/GLM.jl), and implementing the MLJ model interface.\n\nFrom MLJ, the type can be imported using\n\n```\nLinearCountRegressor = @load LinearCountRegressor pkg=GLM\n```\n\nDo `model = LinearCountRegressor()` to construct an instance with default hyper-parameters. Provide keyword arguments to override hyper-parameter defaults, as in `LinearCountRegressor(fit_intercept=...)`.\n\n`LinearCountRegressor` is a [generalized linear model](https://en.wikipedia.org/wiki/Generalized_linear_model#Variance_function), specialised to the case of a `Count` target variable (non-negative, unbounded integer) with user-specified link function. Options exist to specify an intercept or offset feature.\n\n# Training data\n\nIn MLJ or MLJBase, bind an instance `model` to data with one of:\n\n```\nmach = machine(model, X, y)\nmach = machine(model, X, y, w)\n```\n\nHere\n\n  * `X`: is any table of input features (eg, a `DataFrame`) whose columns are of scitype `Continuous`; check the scitype with `schema(X)`\n  * `y`: is the target, which can be any `AbstractVector` whose element scitype is `Count`; check the scitype with `schema(y)`\n  * `w`: is a vector of `Real` per-observation weights\n\nTrain the machine using `fit!(mach, rows=...)`.\n\n# Hyper-parameters\n\n  * `fit_intercept=true`: Whether to calculate the intercept for this model. If set to false,  no intercept will be calculated (e.g. the data is expected to be centered)\n  * `distribution=Distributions.Poisson()`: The distribution which the residuals/errors of the  model should fit.\n  * `link=GLM.LogLink()`: The function which links the linear prediction function to the  probability of a particular outcome or class. 
This should be one of the following:  `GLM.IdentityLink()`, `GLM.InverseLink()`, `GLM.InverseSquareLink()`, `GLM.LogLink()`,  `GLM.SqrtLink()`.\n  * `offsetcol=nothing`: Name of the column to be used as an offset, if any.  An offset is a  variable which is known to have a coefficient of 1.\n  * `maxiter::Integer=30`: The maximum number of iterations allowed to achieve convergence.\n  * `atol::Real=1e-6`: Absolute threshold for convergence. Convergence is achieved when the  relative change in deviance is less than `max(rtol*dev, atol). This term exists to avoid  failure when deviance is unchanged except for rounding errors.\n  * `rtol::Real=1e-6`: Relative threshold for convergence. Convergence is achieved when the  relative change in deviance is less than `max(rtol*dev, atol). This term exists to avoid  failure when deviance is unchanged except for rounding errors.\n  * `minstepfac::Real=0.001`: Minimum step fraction. Must be between 0 and 1. Lower bound for the factor used to update the linear fit.\n  * `report_keys`: `Vector` of keys for the report. Possible keys are: `:deviance`, `:dof_residual`, `:stderror`, `:vcov`, `:coef_table` and `:glm_model`. By default only `:glm_model` is excluded.\n\n# Operations\n\n  * `predict(mach, Xnew)`: return predictions of the target given new features `Xnew` having  the same Scitype as `X` above. Predictions are probabilistic.\n  * `predict_mean(mach, Xnew)`: instead return the mean of each prediction above\n  * `predict_median(mach, Xnew)`: instead return the median of each prediction above.\n\n# Fitted parameters\n\nThe fields of `fitted_params(mach)` are:\n\n  * `features`: The names of the features encountered during model fitting.\n  * `coef`: The linear coefficients determined by the model.\n  * `intercept`: The intercept determined by the model.\n\n# Report\n\nThe fields of `report(mach)` are:\n\n  * `deviance`: Measure of deviance of fitted model with respect to a perfectly fitted model. 
For a linear model, this is the weighted residual sum of squares\n  * `dof_residual`: The degrees of freedom for residuals, when meaningful.\n  * `stderror`: The standard errors of the coefficients.\n  * `vcov`: The estimated variance-covariance matrix of the coefficient estimates.\n  * `coef_table`: Table which displays coefficients and summarizes their significance and confidence intervals.\n  * `glm_model`: The raw fitted model returned by `GLM.lm`. Note this points to training data. Refer to the GLM.jl documentation for usage.\n\n# Examples\n\n```\nusing MLJ\nimport MLJ.Distributions.Poisson\n\n# Generate some data whose target y looks Poisson when conditioned on\n# X:\nN = 10_000\nw = [1.0, -2.0, 3.0]\nmu(x) = exp(w'x) # mean for a log link function\nXmat = rand(N, 3)\nX = MLJ.table(Xmat)\ny = map(1:N) do i\n    x = Xmat[i, :]\n    rand(Poisson(mu(x)))\nend;\n\nCountRegressor = @load LinearCountRegressor pkg=GLM\nmodel = CountRegressor(fit_intercept=false)\nmach = machine(model, X, y)\nfit!(mach)\n\nXnew = MLJ.table(rand(3, 3))\nyhat = predict(mach, Xnew)\nyhat_point = predict_mean(mach, Xnew)\n\n# get coefficients approximating `w`:\njulia> fitted_params(mach).coef\n3-element Vector{Float64}:\n  0.9969008753103842\n -2.0255901752504775\n  3.014407534033522\n\nreport(mach)\n```\n\nSee also [`LinearRegressor`](@ref), [`LinearBinaryClassifier`](@ref)\n"""
-":name" = "LinearCountRegressor"
-":human_name" = "linear count regressor"
-":is_supervised" = "`true`"
-":prediction_type" = ":probabilistic"
-":abstract_type" = "`MLJModelInterface.Probabilistic`"
-":implemented_methods" = [":clean!", ":fit", ":fitted_params", ":predict", ":predict_mean"]
-":hyperparameters" = "`(:fit_intercept, :distribution, :link, :offsetcol, :maxiter, :atol, :rtol, :minstepfac, :report_keys)`"
-":hyperparameter_types" = "`(\"Bool\", \"Distributions.Distribution\", \"GLM.Link\", \"Union{Nothing, Symbol}\", \"Integer\", \"Real\", \"Real\", \"Real\", \"Union{Nothing, AbstractVector{Symbol}}\")`"
-":hyperparameter_ranges" = "`(nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing)`"
-":iteration_parameter" = "`nothing`"
-":supports_training_losses" = "`false`"
-":reports_feature_importances" = "`false`"
-":deep_properties" = "`()`"
-":reporting_operations" = "`()`"
-":constructor" = "`nothing`"
-
-[GLM.LinearRegressor]
-":input_scitype" = "`ScientificTypesBase.Table{<:Union{AbstractVector{<:ScientificTypesBase.Continuous}, AbstractVector{<:ScientificTypesBase.Finite}}}`"
-":output_scitype" = "`ScientificTypesBase.Unknown`"
-":target_scitype" = "`AbstractVector{ScientificTypesBase.Continuous}`"
-":fit_data_scitype" = "`Union{Tuple{ScientificTypesBase.Table{<:Union{AbstractVector{<:ScientificTypesBase.Continuous}, AbstractVector{<:ScientificTypesBase.Finite}}}, AbstractVector{ScientificTypesBase.Continuous}}, Tuple{ScientificTypesBase.Table{<:Union{AbstractVector{<:ScientificTypesBase.Continuous}, AbstractVector{<:ScientificTypesBase.Finite}}}, AbstractVector{ScientificTypesBase.Continuous}, AbstractVector{<:Union{ScientificTypesBase.Continuous, ScientificTypesBase.Count}}}}`"
-":predict_scitype" = "`AbstractVector{ScientificTypesBase.Density{ScientificTypesBase.Continuous}}`"
-":transform_scitype" = "`ScientificTypesBase.Unknown`"
-":inverse_transform_scitype" = "`ScientificTypesBase.Unknown`"
-":target_in_fit" = "`true`"
-":is_pure_julia" = "`true`"
-":package_name" = "GLM"
-":package_license" = "MIT"
-":load_path" = "MLJGLMInterface.LinearRegressor"
-":package_uuid" = "38e38edf-8417-5370-95a0-9cbb8c7f171a"
-":package_url" = "https://github.com/JuliaStats/GLM.jl"
-":is_wrapper" = "`false`"
-":supports_weights" = "`true`"
+":package_name" = "MLJTransforms"
+":package_license" = "unknown"
+":load_path" = "MLJTransforms.EntityEmbedder"
+":package_uuid" = "23777cdb-d90c-4eb0-a694-7c2b83d5c1d6"
+":package_url" = "https://github.com/JuliaAI/MLJTransforms.jl"
+":is_wrapper" = "`true`"
+":supports_weights" = "`false`"
 ":supports_class_weights" = "`false`"
 ":supports_online" = "`false`"
-":docstring" = """```\nLinearRegressor\n```\n\nA model type for constructing a linear regressor, based on [GLM.jl](https://github.com/JuliaStats/GLM.jl), and implementing the MLJ model interface.\n\nFrom MLJ, the type can be imported using\n\n```\nLinearRegressor = @load LinearRegressor pkg=GLM\n```\n\nDo `model = LinearRegressor()` to construct an instance with default hyper-parameters. Provide keyword arguments to override hyper-parameter defaults, as in `LinearRegressor(fit_intercept=...)`.\n\n`LinearRegressor` assumes the target is a continuous variable whose conditional distribution is normal with constant variance, and whose expected value is a linear combination of the features (identity link function). Options exist to specify an intercept or offset feature.\n\n# Training data\n\nIn MLJ or MLJBase, bind an instance `model` to data with one of:\n\n```\nmach = machine(model, X, y)\nmach = machine(model, X, y, w)\n```\n\nHere\n\n  * `X`: is any table of input features (eg, a `DataFrame`) whose columns are of scitype `Continuous`; check the scitype with `schema(X)`\n  * `y`: is the target, which can be any `AbstractVector` whose element scitype is `Continuous`; check the scitype with `scitype(y)`\n  * `w`: is a vector of `Real` per-observation weights\n\n# Hyper-parameters\n\n  * `fit_intercept=true`: Whether to calculate the intercept for this model.  If set to false, no intercept will be calculated (e.g. the data is expected  to be centered)\n  * `dropcollinear=false`: Whether to drop features in the training data to ensure linear independence.  If true , only the first of each set of linearly-dependent features is used. The coefficient for redundant linearly dependent features is `0.0` and all associated statistics are set to `NaN`.\n  * `offsetcol=nothing`: Name of the column to be used as an offset, if any.  An offset is a variable which is known to have a coefficient of 1.\n  * `report_keys`: `Vector` of keys for the report. 
Possible keys are: `:deviance`, `:dof_residual`, `:stderror`, `:vcov`, `:coef_table` and `:glm_model`. By default only `:glm_model` is excluded.\n\nTrain the machine using `fit!(mach, rows=...)`.\n\n# Operations\n\n  * `predict(mach, Xnew)`: return predictions of the target given new  features `Xnew` having the same Scitype as `X` above. Predictions are  probabilistic.\n  * `predict_mean(mach, Xnew)`: instead return the mean of  each prediction above\n  * `predict_median(mach, Xnew)`: instead return the median of  each prediction above.\n\n# Fitted parameters\n\nThe fields of `fitted_params(mach)` are:\n\n  * `features`: The names of the features encountered during model fitting.\n  * `coef`: The linear coefficients determined by the model.\n  * `intercept`: The intercept determined by the model.\n\n# Report\n\nWhen all keys are enabled in `report_keys`, the following fields are available in `report(mach)`:\n\n  * `deviance`: Measure of deviance of fitted model with respect to a perfectly fitted model. For a linear model, this is the weighted residual sum of squares\n  * `dof_residual`: The degrees of freedom for residuals, when meaningful.\n  * `stderror`: The standard errors of the coefficients.\n  * `vcov`: The estimated variance-covariance matrix of the coefficient estimates.\n  * `coef_table`: Table which displays coefficients and summarizes their significance and confidence intervals.\n  * `glm_model`: The raw fitted model returned by `GLM.lm`. Note this points to training data. 
Refer to the GLM.jl documentation for usage.\n\n# Examples\n\n```\nusing MLJ\nLinearRegressor = @load LinearRegressor pkg=GLM\nglm = LinearRegressor()\n\nX, y = make_regression(100, 2) # synthetic data\nmach = machine(glm, X, y) |> fit!\n\nXnew, _ = make_regression(3, 2)\nyhat = predict(mach, Xnew) # new predictions\nyhat_point = predict_mean(mach, Xnew) # new predictions\n\nfitted_params(mach).features\nfitted_params(mach).coef # x1, x2, intercept\nfitted_params(mach).intercept\n\nreport(mach)\n```\n\nSee also [`LinearCountRegressor`](@ref), [`LinearBinaryClassifier`](@ref)\n"""
-":name" = "LinearRegressor"
-":human_name" = "linear regressor"
-":is_supervised" = "`true`"
-":prediction_type" = ":probabilistic"
-":abstract_type" = "`MLJModelInterface.Probabilistic`"
-":implemented_methods" = [":clean!", ":fit", ":fitted_params", ":predict", ":predict_mean"]
-":hyperparameters" = "`(:fit_intercept, :dropcollinear, :offsetcol, :report_keys)`"
-":hyperparameter_types" = "`(\"Bool\", \"Bool\", \"Union{Nothing, Symbol}\", \"Union{Nothing, AbstractVector{Symbol}}\")`"
-":hyperparameter_ranges" = "`(nothing, nothing, nothing, nothing)`"
+":docstring" = """```\nEntityEmbedder(; model=mljflux_neural_model)\n```\n\n`EntityEmbedder` implements entity embeddings as in the \"Entity Embeddings of Categorical Variables\" paper by Cheng Guo, Felix Berkhahn.\n\n# Training data\n\nIn MLJ (or MLJBase) bind an instance unsupervised `model` to data with\n\n```\nmach = machine(model, X, y)\n```\n\nHere:\n\n  * `X` is any table of input features supported by the model being wrapped. Features to be transformed must  have element scitype `Multiclass` or `OrderedFactor`. Use `schema(X)` to   check scitypes.\n  * `y` is the target, which can be any `AbstractVector` supported by the model being wrapped.\n\nTrain the machine using `fit!(mach)`.\n\n# Hyper-parameters\n\n  * `model`: The supervised MLJFlux neural network model to be used for entity embedding.  This must be one of these: `MLJFlux.NeuralNetworkClassifier`, `NeuralNetworkBinaryClassifier`, `MLJFlux.NeuralNetworkRegressor`,`MLJFlux.MultitargetNeuralNetworkRegressor`. The selected model may have hyperparameters  that may affect embedding performance, the most notable of which could be the `builder` argument.\n\n# Operations\n\n  * `transform(mach, Xnew)`: Transform the categorical features of `Xnew` into dense `Continuous` vectors using the trained `MLJFlux.EntityEmbedderLayer` layer present in the network.   
Check relevant documentation [here](https://fluxml.ai/MLJFlux.jl/dev/) and in particular, the `embedding_dims` hyperparameter.\n\n# Examples\n\n```julia\nusing MLJ\nusing CategoricalArrays\n\n# Setup some data\nN = 200\nX = (;\n    Column1 = repeat(Float32[1.0, 2.0, 3.0, 4.0, 5.0], Int(N / 5)),\n    Column2 = categorical(repeat(['a', 'b', 'c', 'd', 'e'], Int(N / 5))),\n    Column3 = categorical(repeat([\"b\", \"c\", \"d\", \"f\", \"f\"], Int(N / 5)), ordered = true),\n    Column4 = repeat(Float32[1.0, 2.0, 3.0, 4.0, 5.0], Int(N / 5)),\n    Column5 = randn(Float32, N),\n    Column6 = categorical(\n        repeat([\"group1\", \"group1\", \"group2\", \"group2\", \"group3\"], Int(N / 5)),\n    ),\n)\ny = categorical([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])           # Classification\n\n# Initiate model\nEntityEmbedder = @load EntityEmbedder pkg=MLJFlux\nNeuralNetworkClassifier = @load NeuralNetworkClassifier pkg=MLJFlux\n\nclf = NeuralNetworkClassifier(embedding_dims=Dict(:Column2 => 2, :Column3 => 2))\n\nemb = EntityEmbedder(clf)\n\n# Construct machine\nmach = machine(emb, X, y)\n\n# Train model\nfit!(mach)\n\n# Transform data using model to encode categorical columns\nXnew = transform(mach, X)\nXnew\n```\n\nSee also [`NeuralNetworkClassifier`, `NeuralNetworkRegressor`](@ref)\n"""
+":name" = "EntityEmbedder"
+":human_name" = "entity embedder"
+":is_supervised" = "`false`"
+":prediction_type" = ":unknown"
+":abstract_type" = "`MLJModelInterface.Unsupervised`"
+":implemented_methods" = [":fit", ":fitted_params", ":training_losses", ":transform"]
+":hyperparameters" = "`(:model,)`"
+":hyperparameter_types" = "`(\"Union{MLJFlux.MLJFluxDeterministic, MLJFlux.MLJFluxProbabilistic}\",)`"
+":hyperparameter_ranges" = "`(nothing,)`"
 ":iteration_parameter" = "`nothing`"
-":supports_training_losses" = "`false`"
+":supports_training_losses" = "`true`"
 ":reports_feature_importances" = "`false`"
 ":deep_properties" = "`()`"
 ":reporting_operations" = "`()`"
@@ -8658,7 +8730,7 @@
 ":supports_weights" = "`false`"
 ":supports_class_weights" = "`false`"
 ":supports_online" = "`false`"
-":docstring" = """```\nMultitargetNeuralNetworkRegressor\n```\n\nA model type for constructing a multitarget neural network regressor, based on [MLJFlux.jl](https://github.com/alan-turing-institute/MLJFlux.jl), and implementing the MLJ model interface.\n\nFrom MLJ, the type can be imported using\n\n```\nMultitargetNeuralNetworkRegressor = @load MultitargetNeuralNetworkRegressor pkg=MLJFlux\n```\n\nDo `model = MultitargetNeuralNetworkRegressor()` to construct an instance with default hyper-parameters. Provide keyword arguments to override hyper-parameter defaults, as in `MultitargetNeuralNetworkRegressor(builder=...)`.\n\n`MultitargetNeuralNetworkRegressor` is for training a data-dependent Flux.jl neural network to predict a multi-valued `Continuous` target, represented as a table, given a table of `Continuous` features. Users provide a recipe for constructing the network, based on properties of the data that is encountered, by specifying an appropriate `builder`. See MLJFlux documentation for more on builders.\n\nIn addition to features with `Continuous` scientific element type, this model supports categorical features in the input table. If present, such features are embedded into dense vectors by the use of an additional `EntityEmbedder` layer after the input, as described in Entity Embeddings of Categorical Variables by Cheng Guo, Felix Berkhahn arXiv, 2016.\n\n# Training data\n\nIn MLJ or MLJBase, bind an instance `model` to data with\n\n```\nmach = machine(model, X, y)\n```\n\nHere:\n\n  * `X` provides input features and is either: (i) a `Matrix` with `Continuous` element scitype (typically `Float32`); or (ii) a table of input features (eg, a `DataFrame`) whose columns have `Continuous`, `Multiclass` or `OrderedFactor` element scitype; check column scitypes with `schema(X)`.  If any `Multiclass` or `OrderedFactor` features appear, the constructed network will use an `EntityEmbedder` layer to transform them into dense vectors. 
If `X` is a `Matrix`, it is assumed that columns correspond to features and rows corresponding to observations.\n\n  * `y` is the target, which can be any table or matrix of output targets whose element scitype is `Continuous`; check column scitypes with `schema(y)`. If `y` is a `Matrix`, it is assumed to have columns corresponding to variables and rows corresponding to observations.\n\n# Hyper-parameters\n\n  * `builder=MLJFlux.Linear(σ=Flux.relu)`: An MLJFlux builder that constructs a neural network. Possible `builders` include: `Linear`, `Short`, and `MLP`. See MLJFlux documentation for more on builders, and the example below for using the `@builder` convenience macro.\n  * `optimiser::Optimisers.Adam()`: An Optimisers.jl optimiser. The optimiser performs the updating of the weights of the network. To choose a learning rate (the update rate of the optimizer), a good rule of thumb is to start out at `10e-3`, and tune using powers of `10` between `1` and `1e-7`.\n  * `loss=Flux.mse`: The loss function which the network will optimize. Should be a function which can be called in the form `loss(yhat, y)`.  Possible loss functions are listed in [the Flux loss function documentation](https://fluxml.ai/Flux.jl/stable/models/losses/). For a regression task, natural loss functions are:\n\n      * `Flux.mse`\n      * `Flux.mae`\n      * `Flux.msle`\n      * `Flux.huber_loss`\n\n    Currently MLJ measures are not supported as loss functions here.\n  * `epochs::Int=10`: The duration of training, in epochs. Typically, one epoch represents one pass through the complete the training dataset.\n  * `batch_size::int=1`: the batch size to be used for training, representing the number of samples per update of the network weights. Typically, batch size is between `8` and `512`. Increassing batch size may accelerate training if `acceleration=CUDALibs()` and a GPU is available.\n  * `lambda::Float64=0`: The strength of the weight regularization penalty. 
Can be any value in the range `[0, ∞)`. Note the history reports unpenalized losses.\n  * `alpha::Float64=0`: The L2/L1 mix of regularization, in the range `[0, 1]`. A value of 0 represents L2 regularization, and a value of 1 represents L1 regularization.\n  * `rng::Union{AbstractRNG, Int64}`: The random number generator or seed used during training. The default is `Random.default_rng()`.\n  * `optimizer_changes_trigger_retraining::Bool=false`: Defines what happens when re-fitting a machine if the associated optimiser has changed. If `true`, the associated machine will retrain from scratch on `fit!` call, otherwise it will not.\n  * `acceleration::AbstractResource=CPU1()`: Defines on what hardware training is done. For Training on GPU, use `CUDALibs()`.\n  * `embedding_dims`: a `Dict` whose keys are names of categorical features, given as symbols, and whose values are numbers representing the desired dimensionality of the entity embeddings of such features: an integer value of `7`, say, sets the embedding dimensionality to `7`; a float value of `0.5`, say, sets the embedding dimensionality to `ceil(0.5 * c)`, where `c` is the number of feature levels.  Unspecified feature dimensionality defaults to `min(c - 1, 10)`.\n\n# Operations\n\n  * `predict(mach, Xnew)`: return predictions of the target given new features `Xnew` having the same scitype as `X` above. Predictions are deterministic.\n  * `transform(mach, Xnew)`: Assuming `Xnew` has the same schema as `X`, transform the categorical features of `Xnew` into dense `Continuous` vectors using the `MLJFlux.EntityEmbedder` layer present in the network. 
Does nothing in case the model was trained on an input `X` that lacks categorical features.\n\n# Fitted parameters\n\nThe fields of `fitted_params(mach)` are:\n\n  * `chain`: The trained \"chain\" (Flux.jl model), namely the series of layers,  functions, and activations  which make up the neural network.\n\n# Report\n\nThe fields of `report(mach)` are:\n\n  * `training_losses`: A vector of training losses (penalised if `lambda != 0`) in  historical order, of length `epochs + 1`.  The first element is the pre-training loss.\n\n# Examples\n\nIn this example we apply a multi-target regression model to synthetic data:\n\n```julia\nusing MLJ\nimport MLJFlux\nusing Flux\nimport Optimisers\n```\n\nFirst, we generate some synthetic data (needs MLJBase 0.20.16 or higher):\n\n```julia\nX, y = make_regression(100, 9; n_targets = 2) # both tables\nschema(y)\nschema(X)\n```\n\nSplitting off a test set:\n\n```julia\n(X, Xtest), (y, ytest) = partition((X, y), 0.7, multi=true);\n```\n\nNext, we can define a `builder`, making use of a convenience macro to do so.  
In the following `@builder` call, `n_in` is a proxy for the number input features and `n_out` the number of target variables (both known at `fit!` time), while `rng` is a proxy for a RNG (which will be passed from the `rng` field of `model` defined below).\n\n```julia\nbuilder = MLJFlux.@builder begin\n    init=Flux.glorot_uniform(rng)\n    Chain(\n        Dense(n_in, 64, relu, init=init),\n        Dense(64, 32, relu, init=init),\n        Dense(32, n_out, init=init),\n    )\nend\n```\n\nInstantiating the regression model:\n\n```julia\nMultitargetNeuralNetworkRegressor = @load MultitargetNeuralNetworkRegressor\nmodel = MultitargetNeuralNetworkRegressor(builder=builder, rng=123, epochs=20)\n```\n\nWe will arrange for standardization of the the target by wrapping our model in  `TransformedTargetModel`, and standardization of the features by inserting the wrapped  model in a pipeline:\n\n```julia\npipe = Standardizer |> TransformedTargetModel(model, transformer=Standardizer)\n```\n\nIf we fit with a high verbosity (>1), we will see the losses during training. We can also see the losses in the output of `report(mach)`\n\n```julia\nmach = machine(pipe, X, y)\nfit!(mach, verbosity=2)\n\n# first element initial loss, 2:end per epoch training losses\nreport(mach).transformed_target_model_deterministic.model.training_losses\n```\n\nFor experimenting with learning rate, see the [`NeuralNetworkRegressor`](@ref) example.\n\n```\npipe.transformed_target_model_deterministic.model.optimiser = Optimisers.Adam(0.0001)\n```\n\nWith the learning rate fixed, we can now compute a CV estimate of the performance (using all data bound to `mach`) and compare this with performance on the test set:\n\n```julia\n\n# CV estimate, based on `(X, y)`:\nevaluate!(mach, resampling=CV(nfolds=5), measure=multitarget_l2)\n\n# loss for `(Xtest, test)`:\nfit!(mach) # trains on all data `(X, y)`\nyhat = predict(mach, Xtest)\nmultitarget_l2(yhat, ytest)\n```\n\nSee also [`NeuralNetworkRegressor`](@ref)\n"""
+":docstring" = """```\nMultitargetNeuralNetworkRegressor\n```\n\nA model type for constructing a multitarget neural network regressor, based on [MLJFlux.jl](https://github.com/alan-turing-institute/MLJFlux.jl), and implementing the MLJ model interface.\n\nFrom MLJ, the type can be imported using\n\n```\nMultitargetNeuralNetworkRegressor = @load MultitargetNeuralNetworkRegressor pkg=MLJFlux\n```\n\nDo `model = MultitargetNeuralNetworkRegressor()` to construct an instance with default hyper-parameters. Provide keyword arguments to override hyper-parameter defaults, as in `MultitargetNeuralNetworkRegressor(builder=...)`.\n\n`MultitargetNeuralNetworkRegressor` is for training a data-dependent Flux.jl neural network to predict a multi-valued `Continuous` target, represented as a table, given a table of `Continuous` features. Users provide a recipe for constructing the network, based on properties of the data that is encountered, by specifying an appropriate `builder`. See MLJFlux documentation for more on builders.\n\nIn addition to features with `Continuous` scientific element type, this model supports categorical features in the input table. If present, such features are embedded into dense vectors by the use of an additional `EntityEmbedderLayer` layer after the input, as described in Entity Embeddings of Categorical Variables by Cheng Guo, Felix Berkhahn arXiv, 2016.\n\n# Training data\n\nIn MLJ or MLJBase, bind an instance `model` to data with\n\n```\nmach = machine(model, X, y)\n```\n\nHere:\n\n  * `X` provides input features and is either: (i) a `Matrix` with `Continuous` element scitype (typically `Float32`); or (ii) a table of input features (eg, a `DataFrame`) whose columns have `Continuous`, `Multiclass` or `OrderedFactor` element scitype; check column scitypes with `schema(X)`.  If any `Multiclass` or `OrderedFactor` features appear, the constructed network will use an `EntityEmbedderLayer` layer to transform them into dense vectors. 
If `X` is a `Matrix`, it is assumed that columns correspond to features and rows corresponding to observations.\n\n  * `y` is the target, which can be any table or matrix of output targets whose element scitype is `Continuous`; check column scitypes with `schema(y)`. If `y` is a `Matrix`, it is assumed to have columns corresponding to variables and rows corresponding to observations.\n\n# Hyper-parameters\n\n  * `builder=MLJFlux.Linear(σ=Flux.relu)`: An MLJFlux builder that constructs a neural network. Possible `builders` include: `Linear`, `Short`, and `MLP`. See MLJFlux documentation for more on builders, and the example below for using the `@builder` convenience macro.\n  * `optimiser::Optimisers.Adam()`: An Optimisers.jl optimiser. The optimiser performs the updating of the weights of the network. To choose a learning rate (the update rate of the optimizer), a good rule of thumb is to start out at `10e-3`, and tune using powers of `10` between `1` and `1e-7`.\n  * `loss=Flux.mse`: The loss function which the network will optimize. Should be a function which can be called in the form `loss(yhat, y)`.  Possible loss functions are listed in [the Flux loss function documentation](https://fluxml.ai/Flux.jl/stable/models/losses/). For a regression task, natural loss functions are:\n\n      * `Flux.mse`\n      * `Flux.mae`\n      * `Flux.msle`\n      * `Flux.huber_loss`\n\n    Currently MLJ measures are not supported as loss functions here.\n  * `epochs::Int=10`: The duration of training, in epochs. Typically, one epoch represents one pass through the complete the training dataset.\n  * `batch_size::int=1`: the batch size to be used for training, representing the number of samples per update of the network weights. Typically, batch size is between `8` and `512`. Increassing batch size may accelerate training if `acceleration=CUDALibs()` and a GPU is available.\n  * `lambda::Float64=0`: The strength of the weight regularization penalty. 
Can be any value in the range `[0, ∞)`. Note the history reports unpenalized losses.\n  * `alpha::Float64=0`: The L2/L1 mix of regularization, in the range `[0, 1]`. A value of 0 represents L2 regularization, and a value of 1 represents L1 regularization.\n  * `rng::Union{AbstractRNG, Int64}`: The random number generator or seed used during training. The default is `Random.default_rng()`.\n  * `optimizer_changes_trigger_retraining::Bool=false`: Defines what happens when re-fitting a machine if the associated optimiser has changed. If `true`, the associated machine will retrain from scratch on `fit!` call, otherwise it will not.\n  * `acceleration::AbstractResource=CPU1()`: Defines on what hardware training is done. For Training on GPU, use `CUDALibs()`.\n  * `embedding_dims`: a `Dict` whose keys are names of categorical features, given as symbols, and whose values are numbers representing the desired dimensionality of the entity embeddings of such features: an integer value of `7`, say, sets the embedding dimensionality to `7`; a float value of `0.5`, say, sets the embedding dimensionality to `ceil(0.5 * c)`, where `c` is the number of feature levels.  Unspecified feature dimensionality defaults to `min(c - 1, 10)`.\n\n# Operations\n\n  * `predict(mach, Xnew)`: return predictions of the target given new features `Xnew` having the same scitype as `X` above. Predictions are deterministic.\n  * `transform(mach, Xnew)`: Assuming `Xnew` has the same schema as `X`, transform the categorical features of `Xnew` into dense `Continuous` vectors using the `MLJFlux.EntityEmbedderLayer` layer present in the network. 
Does nothing in case the model was trained on an input `X` that lacks categorical features.\n\n# Fitted parameters\n\nThe fields of `fitted_params(mach)` are:\n\n  * `chain`: The trained \"chain\" (Flux.jl model), namely the series of layers,  functions, and activations  which make up the neural network.\n\n# Report\n\nThe fields of `report(mach)` are:\n\n  * `training_losses`: A vector of training losses (penalised if `lambda != 0`) in  historical order, of length `epochs + 1`.  The first element is the pre-training loss.\n\n# Examples\n\nIn this example we apply a multi-target regression model to synthetic data:\n\n```julia\nusing MLJ\nimport MLJFlux\nusing Flux\nimport Optimisers\n```\n\nFirst, we generate some synthetic data (needs MLJBase 0.20.16 or higher):\n\n```julia\nX, y = make_regression(100, 9; n_targets = 2) # both tables\nschema(y)\nschema(X)\n```\n\nSplitting off a test set:\n\n```julia\n(X, Xtest), (y, ytest) = partition((X, y), 0.7, multi=true);\n```\n\nNext, we can define a `builder`, making use of a convenience macro to do so.  
In the following `@builder` call, `n_in` is a proxy for the number input features and `n_out` the number of target variables (both known at `fit!` time), while `rng` is a proxy for a RNG (which will be passed from the `rng` field of `model` defined below).\n\n```julia\nbuilder = MLJFlux.@builder begin\n    init=Flux.glorot_uniform(rng)\n    Chain(\n        Dense(n_in, 64, relu, init=init),\n        Dense(64, 32, relu, init=init),\n        Dense(32, n_out, init=init),\n    )\nend\n```\n\nInstantiating the regression model:\n\n```julia\nMultitargetNeuralNetworkRegressor = @load MultitargetNeuralNetworkRegressor\nmodel = MultitargetNeuralNetworkRegressor(builder=builder, rng=123, epochs=20)\n```\n\nWe will arrange for standardization of the the target by wrapping our model in  `TransformedTargetModel`, and standardization of the features by inserting the wrapped  model in a pipeline:\n\n```julia\npipe = Standardizer |> TransformedTargetModel(model, transformer=Standardizer)\n```\n\nIf we fit with a high verbosity (>1), we will see the losses during training. We can also see the losses in the output of `report(mach)`\n\n```julia\nmach = machine(pipe, X, y)\nfit!(mach, verbosity=2)\n\n# first element initial loss, 2:end per epoch training losses\nreport(mach).transformed_target_model_deterministic.model.training_losses\n```\n\nFor experimenting with learning rate, see the [`NeuralNetworkRegressor`](@ref) example.\n\n```\npipe.transformed_target_model_deterministic.model.optimiser = Optimisers.Adam(0.0001)\n```\n\nWith the learning rate fixed, we can now compute a CV estimate of the performance (using all data bound to `mach`) and compare this with performance on the test set:\n\n```julia\n\n# CV estimate, based on `(X, y)`:\nevaluate!(mach, resampling=CV(nfolds=5), measure=multitarget_l2)\n\n# loss for `(Xtest, test)`:\nfit!(mach) # trains on all data `(X, y)`\nyhat = predict(mach, Xtest)\nmultitarget_l2(yhat, ytest)\n```\n\nSee also [`NeuralNetworkRegressor`](@ref)\n"""
 ":name" = "MultitargetNeuralNetworkRegressor"
 ":human_name" = "multitarget neural network regressor"
 ":is_supervised" = "`true`"
@@ -8694,7 +8766,7 @@
 ":supports_weights" = "`false`"
 ":supports_class_weights" = "`false`"
 ":supports_online" = "`false`"
-":docstring" = """```\nNeuralNetworkClassifier\n```\n\nA model type for constructing a neural network classifier, based on [MLJFlux.jl](https://github.com/alan-turing-institute/MLJFlux.jl), and implementing the MLJ model interface.\n\nFrom MLJ, the type can be imported using\n\n```\nNeuralNetworkClassifier = @load NeuralNetworkClassifier pkg=MLJFlux\n```\n\nDo `model = NeuralNetworkClassifier()` to construct an instance with default hyper-parameters. Provide keyword arguments to override hyper-parameter defaults, as in `NeuralNetworkClassifier(builder=...)`.\n\n`NeuralNetworkClassifier` is for training a data-dependent Flux.jl neural network for making probabilistic predictions of a `Multiclass` or `OrderedFactor` target, given a table of `Continuous` features. Users provide a recipe for constructing  the network, based on properties of the data that is encountered, by specifying  an appropriate `builder`. See MLJFlux documentation for more on builders.\n\nIn addition to features with `Continuous` scientific element type, this model supports categorical features in the input table. If present, such features are embedded into dense vectors by the use of an additional `EntityEmbedder` layer after the input, as described in Entity Embeddings of Categorical Variables by Cheng Guo, Felix Berkhahn arXiv, 2016.\n\n# Training data\n\nIn MLJ or MLJBase, bind an instance `model` to data with\n\n```\nmach = machine(model, X, y)\n```\n\nHere:\n\n  * `X` provides input features and is either: (i) a `Matrix` with `Continuous` element scitype (typically `Float32`); or (ii) a table of input features (eg, a `DataFrame`) whose columns have `Continuous`, `Multiclass` or `OrderedFactor` element scitype; check column scitypes with `schema(X)`.  If any `Multiclass` or `OrderedFactor` features appear, the constructed network will use an `EntityEmbedder` layer to transform them into dense vectors. 
If `X` is a `Matrix`, it is assumed that columns correspond to features and rows corresponding to observations.\n\n  * `y` is the target, which can be any `AbstractVector` whose element scitype is `Multiclass` or `OrderedFactor`; check the scitype with `scitype(y)`\n\nTrain the machine with `fit!(mach, rows=...)`.\n\n# Hyper-parameters\n\n  * `builder=MLJFlux.Short()`: An MLJFlux builder that constructs a neural network. Possible  `builders` include: `MLJFlux.Linear`, `MLJFlux.Short`, and `MLJFlux.MLP`. See  MLJFlux.jl documentation for examples of user-defined builders. See also `finaliser`  below.\n  * `optimiser::Optimisers.Adam()`: An Optimisers.jl optimiser. The optimiser performs the updating of the weights of the network. To choose a learning rate (the update rate of the optimizer), a good rule of thumb is to start out at `10e-3`, and tune using powers of `10` between `1` and `1e-7`.\n  * `loss=Flux.crossentropy`: The loss function which the network will optimize. Should be a function which can be called in the form `loss(yhat, y)`.  Possible loss functions are listed in [the Flux loss function documentation](https://fluxml.ai/Flux.jl/stable/models/losses/). For a classification task, the most natural loss functions are:\n\n      * `Flux.crossentropy`: Standard multiclass classification loss, also known as the log loss.\n      * `Flux.logitcrossentopy`: Mathematically equal to crossentropy, but numerically more stable than finalising the outputs with `softmax` and then calculating crossentropy. You will need to specify `finaliser=identity` to remove MLJFlux's default softmax finaliser, and understand that the output of `predict` is then unnormalized (no longer probabilistic).\n      * `Flux.tversky_loss`: Used with imbalanced data to give more weight to false negatives.\n      * `Flux.focal_loss`: Used with highly imbalanced data. 
Weights harder examples more than easier examples.\n\n    Currently MLJ measures are not supported values of `loss`.\n  * `epochs::Int=10`: The duration of training, in epochs. Typically, one epoch represents one pass through the complete the training dataset.\n  * `batch_size::int=1`: the batch size to be used for training, representing the number of samples per update of the network weights.] Typically, batch size is between `8` and `512`. Increassing batch size may accelerate training if `acceleration=CUDALibs()` and a GPU is available.\n  * `lambda::Float64=0`: The strength of the weight regularization penalty. Can be any value in the range `[0, ∞)`. Note the history reports unpenalized losses.\n  * `alpha::Float64=0`: The L2/L1 mix of regularization, in the range `[0, 1]`. A value of 0 represents L2 regularization, and a value of 1 represents L1 regularization.\n  * `rng::Union{AbstractRNG, Int64}`: The random number generator or seed used during training. The default is `Random.default_rng()`.\n  * `optimizer_changes_trigger_retraining::Bool=false`: Defines what happens when re-fitting a machine if the associated optimiser has changed. If `true`, the associated machine will retrain from scratch on `fit!` call, otherwise it will not.\n  * `acceleration::AbstractResource=CPU1()`: Defines on what hardware training is done. For Training on GPU, use `CUDALibs()`.\n  * `finaliser=Flux.softmax`: The final activation function of the neural network (applied after the network defined by `builder`). Defaults to `Flux.softmax`.\n  * `embedding_dims`: a `Dict` whose keys are names of categorical features, given as symbols, and whose values are numbers representing the desired dimensionality of the entity embeddings of such features: an integer value of `7`, say, sets the embedding dimensionality to `7`; a float value of `0.5`, say, sets the embedding dimensionality to `ceil(0.5 * c)`, where `c` is the number of feature levels.  
Unspecified feature dimensionality defaults to `min(c - 1, 10)`.\n\n# Operations\n\n  * `predict(mach, Xnew)`: return predictions of the target given new features `Xnew`, which should have the same scitype as `X` above. Predictions are probabilistic but uncalibrated.\n  * `predict_mode(mach, Xnew)`: Return the modes of the probabilistic predictions returned above.\n  * `transform(mach, Xnew)`: Assuming `Xnew` has the same schema as `X`, transform the categorical features of `Xnew` into dense `Continuous` vectors using the `MLJFlux.EntityEmbedder` layer present in the network. Does nothing in case the model was trained on an input `X` that lacks categorical features.\n\n# Fitted parameters\n\nThe fields of `fitted_params(mach)` are:\n\n  * `chain`: The trained \"chain\" (Flux.jl model), namely the series of layers,  functions, and activations which make up the neural network. This includes  the final layer specified by `finaliser` (eg, `softmax`).\n\n# Report\n\nThe fields of `report(mach)` are:\n\n  * `training_losses`: A vector of training losses (penalised if `lambda != 0`) in  historical order, of length `epochs + 1`.  The first element is the pre-training loss.\n\n# Examples\n\nIn this example we build a classification model using the Iris dataset. This is a very basic example, using a default builder and no standardization.  
For a more advanced illustration, see [`NeuralNetworkRegressor`](@ref) or [`ImageClassifier`](@ref), and examples in the MLJFlux.jl documentation.\n\n```julia\nusing MLJ\nusing Flux\nimport RDatasets\nimport Optimisers\n```\n\nFirst, we can load the data:\n\n```julia\niris = RDatasets.dataset(\"datasets\", \"iris\");\ny, X = unpack(iris, ==(:Species), rng=123); # a vector and a table\nNeuralNetworkClassifier = @load NeuralNetworkClassifier pkg=MLJFlux\nclf = NeuralNetworkClassifier()\n```\n\nNext, we can train the model:\n\n```julia\nmach = machine(clf, X, y)\nfit!(mach)\n```\n\nWe can train the model in an incremental fashion, altering the learning rate as we go, provided `optimizer_changes_trigger_retraining` is `false` (the default). Here, we also change the number of (total) iterations:\n\n```julia\nclf.optimiser = Optimisers.Adam(clf.optimiser.eta * 2)\nclf.epochs = clf.epochs + 5\n\nfit!(mach, verbosity=2) # trains 5 more epochs\n```\n\nWe can inspect the mean training loss using the `cross_entropy` function:\n\n```julia\ntraining_loss = cross_entropy(predict(mach, X), y)\n```\n\nAnd we can access the Flux chain (model) using `fitted_params`:\n\n```julia\nchain = fitted_params(mach).chain\n```\n\nFinally, we can see how the out-of-sample performance changes over time, using MLJ's `learning_curve` function:\n\n```julia\nr = range(clf, :epochs, lower=1, upper=200, scale=:log10)\ncurve = learning_curve(clf, X, y,\n                     range=r,\n                     resampling=Holdout(fraction_train=0.7),\n                     measure=cross_entropy)\nusing Plots\nplot(curve.parameter_values,\n     curve.measurements,\n     xlab=curve.parameter_name,\n     xscale=curve.parameter_scale,\n     ylab = \"Cross Entropy\")\n\n```\n\nSee also [`ImageClassifier`](@ref), [`NeuralNetworkBinaryClassifier`](@ref).\n"""
+":docstring" = """```\nNeuralNetworkClassifier\n```\n\nA model type for constructing a neural network classifier, based on [MLJFlux.jl](https://github.com/alan-turing-institute/MLJFlux.jl), and implementing the MLJ model interface.\n\nFrom MLJ, the type can be imported using\n\n```\nNeuralNetworkClassifier = @load NeuralNetworkClassifier pkg=MLJFlux\n```\n\nDo `model = NeuralNetworkClassifier()` to construct an instance with default hyper-parameters. Provide keyword arguments to override hyper-parameter defaults, as in `NeuralNetworkClassifier(builder=...)`.\n\n`NeuralNetworkClassifier` is for training a data-dependent Flux.jl neural network for making probabilistic predictions of a `Multiclass` or `OrderedFactor` target, given a table of `Continuous` features. Users provide a recipe for constructing  the network, based on properties of the data that is encountered, by specifying  an appropriate `builder`. See MLJFlux documentation for more on builders.\n\nIn addition to features with `Continuous` scientific element type, this model supports categorical features in the input table. If present, such features are embedded into dense vectors by the use of an additional `EntityEmbedderLayer` layer after the input, as described in Entity Embeddings of Categorical Variables by Cheng Guo, Felix Berkhahn arXiv, 2016.\n\n# Training data\n\nIn MLJ or MLJBase, bind an instance `model` to data with\n\n```\nmach = machine(model, X, y)\n```\n\nHere:\n\n  * `X` provides input features and is either: (i) a `Matrix` with `Continuous` element scitype (typically `Float32`); or (ii) a table of input features (eg, a `DataFrame`) whose columns have `Continuous`, `Multiclass` or `OrderedFactor` element scitype; check column scitypes with `schema(X)`.  If any `Multiclass` or `OrderedFactor` features appear, the constructed network will use an `EntityEmbedderLayer` layer to transform them into dense vectors. 
If `X` is a `Matrix`, it is assumed that columns correspond to features and rows correspond to observations.\n\n  * `y` is the target, which can be any `AbstractVector` whose element scitype is `Multiclass` or `OrderedFactor`; check the scitype with `scitype(y)`\n\nTrain the machine with `fit!(mach, rows=...)`.\n\n# Hyper-parameters\n\n  * `builder=MLJFlux.Short()`: An MLJFlux builder that constructs a neural network. Possible  `builders` include: `MLJFlux.Linear`, `MLJFlux.Short`, and `MLJFlux.MLP`. See  MLJFlux.jl documentation for examples of user-defined builders. See also `finaliser`  below.\n  * `optimiser::Optimisers.Adam()`: An Optimisers.jl optimiser. The optimiser performs the updating of the weights of the network. To choose a learning rate (the update rate of the optimizer), a good rule of thumb is to start out at `10e-3`, and tune using powers of `10` between `1` and `1e-7`.\n  * `loss=Flux.crossentropy`: The loss function which the network will optimize. Should be a function which can be called in the form `loss(yhat, y)`.  Possible loss functions are listed in [the Flux loss function documentation](https://fluxml.ai/Flux.jl/stable/models/losses/). For a classification task, the most natural loss functions are:\n\n      * `Flux.crossentropy`: Standard multiclass classification loss, also known as the log loss.\n      * `Flux.logitcrossentropy`: Mathematically equal to crossentropy, but numerically more stable than finalising the outputs with `softmax` and then calculating crossentropy. You will need to specify `finaliser=identity` to remove MLJFlux's default softmax finaliser, and understand that the output of `predict` is then unnormalized (no longer probabilistic).\n      * `Flux.tversky_loss`: Used with imbalanced data to give more weight to false negatives.\n      * `Flux.focal_loss`: Used with highly imbalanced data. 
Weights harder examples more than easier examples.\n\n    Currently MLJ measures are not supported as values of `loss`.\n  * `epochs::Int=10`: The duration of training, in epochs. Typically, one epoch represents one pass through the complete training dataset.\n  * `batch_size::Int=1`: The batch size to be used for training, representing the number of samples per update of the network weights. Typically, batch size is between `8` and `512`. Increasing batch size may accelerate training if `acceleration=CUDALibs()` and a GPU is available.\n  * `lambda::Float64=0`: The strength of the weight regularization penalty. Can be any value in the range `[0, ∞)`. Note the history reports unpenalized losses.\n  * `alpha::Float64=0`: The L2/L1 mix of regularization, in the range `[0, 1]`. A value of 0 represents L2 regularization, and a value of 1 represents L1 regularization.\n  * `rng::Union{AbstractRNG, Int64}`: The random number generator or seed used during training. The default is `Random.default_rng()`.\n  * `optimizer_changes_trigger_retraining::Bool=false`: Defines what happens when re-fitting a machine if the associated optimiser has changed. If `true`, the associated machine will retrain from scratch on `fit!` call, otherwise it will not.\n  * `acceleration::AbstractResource=CPU1()`: Defines on what hardware training is done. For training on GPU, use `CUDALibs()`.\n  * `finaliser=Flux.softmax`: The final activation function of the neural network (applied after the network defined by `builder`). Defaults to `Flux.softmax`.\n  * `embedding_dims`: a `Dict` whose keys are names of categorical features, given as symbols, and whose values are numbers representing the desired dimensionality of the entity embeddings of such features: an integer value of `7`, say, sets the embedding dimensionality to `7`; a float value of `0.5`, say, sets the embedding dimensionality to `ceil(0.5 * c)`, where `c` is the number of feature levels.  
Unspecified feature dimensionality defaults to `min(c - 1, 10)`.\n\n# Operations\n\n  * `predict(mach, Xnew)`: return predictions of the target given new features `Xnew`, which should have the same scitype as `X` above. Predictions are probabilistic but uncalibrated.\n  * `predict_mode(mach, Xnew)`: Return the modes of the probabilistic predictions returned above.\n  * `transform(mach, Xnew)`: Assuming `Xnew` has the same schema as `X`, transform the categorical features of `Xnew` into dense `Continuous` vectors using the `MLJFlux.EntityEmbedderLayer` layer present in the network. Does nothing in case the model was trained on an input `X` that lacks categorical features.\n\n# Fitted parameters\n\nThe fields of `fitted_params(mach)` are:\n\n  * `chain`: The trained \"chain\" (Flux.jl model), namely the series of layers,  functions, and activations which make up the neural network. This includes  the final layer specified by `finaliser` (eg, `softmax`).\n\n# Report\n\nThe fields of `report(mach)` are:\n\n  * `training_losses`: A vector of training losses (penalised if `lambda != 0`) in  historical order, of length `epochs + 1`.  The first element is the pre-training loss.\n\n# Examples\n\nIn this example we build a classification model using the Iris dataset. This is a very basic example, using a default builder and no standardization.  
For a more advanced illustration, see [`NeuralNetworkRegressor`](@ref) or [`ImageClassifier`](@ref), and examples in the MLJFlux.jl documentation.\n\n```julia\nusing MLJ\nusing Flux\nimport RDatasets\nimport Optimisers\n```\n\nFirst, we can load the data:\n\n```julia\niris = RDatasets.dataset(\"datasets\", \"iris\");\ny, X = unpack(iris, ==(:Species), rng=123); # a vector and a table\nNeuralNetworkClassifier = @load NeuralNetworkClassifier pkg=MLJFlux\nclf = NeuralNetworkClassifier()\n```\n\nNext, we can train the model:\n\n```julia\nmach = machine(clf, X, y)\nfit!(mach)\n```\n\nWe can train the model in an incremental fashion, altering the learning rate as we go, provided `optimizer_changes_trigger_retraining` is `false` (the default). Here, we also change the number of (total) iterations:\n\n```julia\nclf.optimiser = Optimisers.Adam(clf.optimiser.eta * 2)\nclf.epochs = clf.epochs + 5\n\nfit!(mach, verbosity=2) # trains 5 more epochs\n```\n\nWe can inspect the mean training loss using the `cross_entropy` function:\n\n```julia\ntraining_loss = cross_entropy(predict(mach, X), y)\n```\n\nAnd we can access the Flux chain (model) using `fitted_params`:\n\n```julia\nchain = fitted_params(mach).chain\n```\n\nFinally, we can see how the out-of-sample performance changes over time, using MLJ's `learning_curve` function:\n\n```julia\nr = range(clf, :epochs, lower=1, upper=200, scale=:log10)\ncurve = learning_curve(clf, X, y,\n                     range=r,\n                     resampling=Holdout(fraction_train=0.7),\n                     measure=cross_entropy)\nusing Plots\nplot(curve.parameter_values,\n     curve.measurements,\n     xlab=curve.parameter_name,\n     xscale=curve.parameter_scale,\n     ylab = \"Cross Entropy\")\n\n```\n\nSee also [`ImageClassifier`](@ref), [`NeuralNetworkBinaryClassifier`](@ref).\n"""
 ":name" = "NeuralNetworkClassifier"
 ":human_name" = "neural network classifier"
 ":is_supervised" = "`true`"
@@ -8766,7 +8838,7 @@
 ":supports_weights" = "`false`"
 ":supports_class_weights" = "`false`"
 ":supports_online" = "`false`"
-":docstring" = """```\nNeuralNetworkBinaryClassifier\n```\n\nA model type for constructing a neural network binary classifier, based on [MLJFlux.jl](https://github.com/alan-turing-institute/MLJFlux.jl), and implementing the MLJ model interface.\n\nFrom MLJ, the type can be imported using\n\n```\nNeuralNetworkBinaryClassifier = @load NeuralNetworkBinaryClassifier pkg=MLJFlux\n```\n\nDo `model = NeuralNetworkBinaryClassifier()` to construct an instance with default hyper-parameters. Provide keyword arguments to override hyper-parameter defaults, as in `NeuralNetworkBinaryClassifier(builder=...)`.\n\n`NeuralNetworkBinaryClassifier` is for training a data-dependent Flux.jl neural network for making probabilistic predictions of a binary (`Multiclass{2}` or `OrderedFactor{2}`) target, given a table of `Continuous` features. Users provide a recipe for constructing  the network, based on properties of the data that is encountered, by specifying  an appropriate `builder`. See MLJFlux documentation for more on builders.\n\nIn addition to features with `Continuous` scientific element type, this model supports categorical features in the input table. If present, such features are embedded into dense vectors by the use of an additional `EntityEmbedder` layer after the input, as described in Entity Embeddings of Categorical Variables by Cheng Guo, Felix Berkhahn arXiv, 2016.\n\n# Training data\n\nIn MLJ or MLJBase, bind an instance `model` to data with\n\n```\nmach = machine(model, X, y)\n```\n\nHere:\n\n  * `X` provides input features and is either: (i) a `Matrix` with `Continuous` element scitype (typically `Float32`); or (ii) a table of input features (eg, a `DataFrame`) whose columns have `Continuous`, `Multiclass` or `OrderedFactor` element scitype; check column scitypes with `schema(X)`.  If any `Multiclass` or `OrderedFactor` features appear, the constructed network will use an `EntityEmbedder` layer to transform them into dense vectors. 
If `X` is a `Matrix`, it is assumed that columns correspond to features and rows corresponding to observations.\n\n  * `y` is the target, which can be any `AbstractVector` whose element scitype is `Multiclass{2}` or `OrderedFactor{2}`; check the scitype with `scitype(y)`\n\nTrain the machine with `fit!(mach, rows=...)`.\n\n# Hyper-parameters\n\n  * `builder=MLJFlux.Short()`: An MLJFlux builder that constructs a neural network. Possible  `builders` include: `MLJFlux.Linear`, `MLJFlux.Short`, and `MLJFlux.MLP`. See  MLJFlux.jl documentation for examples of user-defined builders. See also `finaliser`  below.\n  * `optimiser::Flux.Adam()`: A `Flux.Optimise` optimiser. The optimiser performs the updating of the weights of the network. For further reference, see [the Flux optimiser documentation](https://fluxml.ai/Flux.jl/stable/training/optimisers/). To choose a learning rate (the update rate of the optimizer), a good rule of thumb is to start out at `10e-3`, and tune using powers of `10` between `1` and `1e-7`.\n  * `loss=Flux.binarycrossentropy`: The loss function which the network will optimize. Should be a function which can be called in the form `loss(yhat, y)`.  Possible loss functions are listed in [the Flux loss function documentation](https://fluxml.ai/Flux.jl/stable/models/losses/). For a classification task, the most natural loss functions are:\n\n      * `Flux.binarycrossentropy`: Standard binary classification loss, also known as the log loss.\n      * `Flux.logitbinarycrossentropy`: Mathematically equal to crossentropy, but numerically more stable than finalising the outputs with `σ` and then calculating crossentropy. 
You will need to specify `finaliser=identity` to remove MLJFlux's default sigmoid finaliser, and understand that the output of `predict` is then unnormalized (no longer probabilistic).\n      * `Flux.tversky_loss`: Used with imbalanced data to give more weight to false negatives.\n      * `Flux.binary_focal_loss`: Used with highly imbalanced data. Weights harder examples more than easier examples.\n\n    Currently MLJ measures are not supported values of `loss`.\n  * `epochs::Int=10`: The duration of training, in epochs. Typically, one epoch represents one pass through the complete the training dataset.\n  * `batch_size::int=1`: the batch size to be used for training, representing the number of samples per update of the network weights. Typically, batch size is between `8` and `512`. Increassing batch size may accelerate training if `acceleration=CUDALibs()` and a GPU is available.\n  * `lambda::Float64=0`: The strength of the weight regularization penalty. Can be any value in the range `[0, ∞)`.\n  * `alpha::Float64=0`: The L2/L1 mix of regularization, in the range `[0, 1]`. A value of 0 represents L2 regularization, and a value of 1 represents L1 regularization.\n  * `rng::Union{AbstractRNG, Int64}`: The random number generator or seed used during training.\n  * `optimizer_changes_trigger_retraining::Bool=false`: Defines what happens when re-fitting a machine if the associated optimiser has changed. If `true`, the associated machine will retrain from scratch on `fit!` call, otherwise it will not.\n  * `acceleration::AbstractResource=CPU1()`: Defines on what hardware training is done. For Training on GPU, use `CUDALibs()`.\n  * `finaliser=Flux.σ`: The final activation function of the neural network (applied after the network defined by `builder`). 
Defaults to `Flux.σ`.\n  * `embedding_dims`: a `Dict` whose keys are names of categorical features, given as symbols, and whose values are numbers representing the desired dimensionality of the entity embeddings of such features: an integer value of `7`, say, sets the embedding dimensionality to `7`; a float value of `0.5`, say, sets the embedding dimensionality to `ceil(0.5 * c)`, where `c` is the number of feature levels.  Unspecified feature dimensionality defaults to `min(c - 1, 10)`.\n\n# Operations\n\n  * `predict(mach, Xnew)`: return predictions of the target given new features `Xnew`, which should have the same scitype as `X` above. Predictions are probabilistic but uncalibrated.\n  * `predict_mode(mach, Xnew)`: Return the modes of the probabilistic predictions returned above.\n  * `transform(mach, Xnew)`: Assuming `Xnew` has the same schema as `X`, transform the categorical features of `Xnew` into dense `Continuous` vectors using the `MLJFlux.EntityEmbedder` layer present in the network. Does nothing in case the model was trained on an input `X` that lacks categorical features.\n\n# Fitted parameters\n\nThe fields of `fitted_params(mach)` are:\n\n  * `chain`: The trained \"chain\" (Flux.jl model), namely the series of layers,  functions, and activations which make up the neural network. This includes  the final layer specified by `finaliser` (eg, `softmax`).\n\n# Report\n\nThe fields of `report(mach)` are:\n\n  * `training_losses`: A vector of training losses (penalised if `lambda != 0`) in  historical order, of length `epochs + 1`.  The first element is the pre-training loss.\n\n# Examples\n\nIn this example we build a classification model using the Iris dataset. This is a very basic example, using a default builder and no standardization.  
For a more advanced illustration, see [`NeuralNetworkRegressor`](@ref) or [`ImageClassifier`](@ref), and examples in the MLJFlux.jl documentation.\n\n```julia\nusing MLJ, Flux\nimport Optimisers\nimport RDatasets\n```\n\nFirst, we can load the data:\n\n```julia\nmtcars = RDatasets.dataset(\"datasets\", \"mtcars\");\ny, X = unpack(mtcars, ==(:VS), in([:MPG, :Cyl, :Disp, :HP, :WT, :QSec]));\n```\n\nNote that `y` is a vector and `X` a table.\n\n```julia\ny = categorical(y) # classifier takes catogorical input\nX_f32 = Float32.(X) # To match floating point type of the neural network layers\nNeuralNetworkBinaryClassifier = @load NeuralNetworkBinaryClassifier pkg=MLJFlux\nbclf = NeuralNetworkBinaryClassifier()\n```\n\nNext, we can train the model:\n\n```julia\nmach = machine(bclf, X_f32, y)\nfit!(mach)\n```\n\nWe can train the model in an incremental fashion, altering the learning rate as we go, provided `optimizer_changes_trigger_retraining` is `false` (the default). Here, we also change the number of (total) iterations:\n\n```julia-repl\njulia> bclf.optimiser\nAdam(0.001, (0.9, 0.999), 1.0e-8)\n```\n\n```julia\nbclf.optimiser = Optimisers.Adam(eta = bclf.optimiser.eta * 2)\nbclf.epochs = bclf.epochs + 5\n\nfit!(mach, verbosity=2) # trains 5 more epochs\n```\n\nWe can inspect the mean training loss using the `cross_entropy` function:\n\n```julia\ntraining_loss = cross_entropy(predict(mach, X_f32), y)\n```\n\nAnd we can access the Flux chain (model) using `fitted_params`:\n\n```julia\nchain = fitted_params(mach).chain\n```\n\nFinally, we can see how the out-of-sample performance changes over time, using MLJ's `learning_curve` function:\n\n```julia\nr = range(bclf, :epochs, lower=1, upper=200, scale=:log10)\ncurve = learning_curve(\n    bclf,\n    X_f32,\n    y,\n    range=r,\n    resampling=Holdout(fraction_train=0.7),\n    measure=cross_entropy,\n)\nusing Plots\nplot(\n   curve.parameter_values,\n   curve.measurements,\n   xlab=curve.parameter_name,\n   
xscale=curve.parameter_scale,\n   ylab = \"Cross Entropy\",\n)\n\n```\n\nSee also [`ImageClassifier`](@ref).\n"""
+":docstring" = """```\nNeuralNetworkBinaryClassifier\n```\n\nA model type for constructing a neural network binary classifier, based on [MLJFlux.jl](https://github.com/alan-turing-institute/MLJFlux.jl), and implementing the MLJ model interface.\n\nFrom MLJ, the type can be imported using\n\n```\nNeuralNetworkBinaryClassifier = @load NeuralNetworkBinaryClassifier pkg=MLJFlux\n```\n\nDo `model = NeuralNetworkBinaryClassifier()` to construct an instance with default hyper-parameters. Provide keyword arguments to override hyper-parameter defaults, as in `NeuralNetworkBinaryClassifier(builder=...)`.\n\n`NeuralNetworkBinaryClassifier` is for training a data-dependent Flux.jl neural network for making probabilistic predictions of a binary (`Multiclass{2}` or `OrderedFactor{2}`) target, given a table of `Continuous` features. Users provide a recipe for constructing  the network, based on properties of the data that is encountered, by specifying  an appropriate `builder`. See MLJFlux documentation for more on builders.\n\nIn addition to features with `Continuous` scientific element type, this model supports categorical features in the input table. If present, such features are embedded into dense vectors by the use of an additional `EntityEmbedderLayer` layer after the input, as described in Entity Embeddings of Categorical Variables by Cheng Guo, Felix Berkhahn arXiv, 2016.\n\n# Training data\n\nIn MLJ or MLJBase, bind an instance `model` to data with\n\n```\nmach = machine(model, X, y)\n```\n\nHere:\n\n  * `X` provides input features and is either: (i) a `Matrix` with `Continuous` element scitype (typically `Float32`); or (ii) a table of input features (eg, a `DataFrame`) whose columns have `Continuous`, `Multiclass` or `OrderedFactor` element scitype; check column scitypes with `schema(X)`.  If any `Multiclass` or `OrderedFactor` features appear, the constructed network will use an `EntityEmbedderLayer` layer to transform them into dense vectors. 
If `X` is a `Matrix`, it is assumed that columns correspond to features and rows corresponding to observations.\n\n  * `y` is the target, which can be any `AbstractVector` whose element scitype is `Multiclass{2}` or `OrderedFactor{2}`; check the scitype with `scitype(y)`\n\nTrain the machine with `fit!(mach, rows=...)`.\n\n# Hyper-parameters\n\n  * `builder=MLJFlux.Short()`: An MLJFlux builder that constructs a neural network. Possible  `builders` include: `MLJFlux.Linear`, `MLJFlux.Short`, and `MLJFlux.MLP`. See  MLJFlux.jl documentation for examples of user-defined builders. See also `finaliser`  below.\n  * `optimiser::Flux.Adam()`: A `Flux.Optimise` optimiser. The optimiser performs the updating of the weights of the network. For further reference, see [the Flux optimiser documentation](https://fluxml.ai/Flux.jl/stable/training/optimisers/). To choose a learning rate (the update rate of the optimizer), a good rule of thumb is to start out at `10e-3`, and tune using powers of `10` between `1` and `1e-7`.\n  * `loss=Flux.binarycrossentropy`: The loss function which the network will optimize. Should be a function which can be called in the form `loss(yhat, y)`.  Possible loss functions are listed in [the Flux loss function documentation](https://fluxml.ai/Flux.jl/stable/models/losses/). For a classification task, the most natural loss functions are:\n\n      * `Flux.binarycrossentropy`: Standard binary classification loss, also known as the log loss.\n      * `Flux.logitbinarycrossentropy`: Mathematically equal to crossentropy, but numerically more stable than finalising the outputs with `σ` and then calculating crossentropy. 
You will need to specify `finaliser=identity` to remove MLJFlux's default sigmoid finaliser, and understand that the output of `predict` is then unnormalized (no longer probabilistic).\n      * `Flux.tversky_loss`: Used with imbalanced data to give more weight to false negatives.\n      * `Flux.binary_focal_loss`: Used with highly imbalanced data. Weights harder examples more than easier examples.\n\n    Currently MLJ measures are not supported as values of `loss`.\n  * `epochs::Int=10`: The duration of training, in epochs. Typically, one epoch represents one pass through the complete training dataset.\n  * `batch_size::Int=1`: The batch size to be used for training, representing the number of samples per update of the network weights. Typically, batch size is between `8` and `512`. Increasing batch size may accelerate training if `acceleration=CUDALibs()` and a GPU is available.\n  * `lambda::Float64=0`: The strength of the weight regularization penalty. Can be any value in the range `[0, ∞)`.\n  * `alpha::Float64=0`: The L2/L1 mix of regularization, in the range `[0, 1]`. A value of 0 represents L2 regularization, and a value of 1 represents L1 regularization.\n  * `rng::Union{AbstractRNG, Int64}`: The random number generator or seed used during training.\n  * `optimizer_changes_trigger_retraining::Bool=false`: Defines what happens when re-fitting a machine if the associated optimiser has changed. If `true`, the associated machine will retrain from scratch on `fit!` call, otherwise it will not.\n  * `acceleration::AbstractResource=CPU1()`: Defines on what hardware training is done. For training on GPU, use `CUDALibs()`.\n  * `finaliser=Flux.σ`: The final activation function of the neural network (applied after the network defined by `builder`). 
Defaults to `Flux.σ`.\n  * `embedding_dims`: a `Dict` whose keys are names of categorical features, given as symbols, and whose values are numbers representing the desired dimensionality of the entity embeddings of such features: an integer value of `7`, say, sets the embedding dimensionality to `7`; a float value of `0.5`, say, sets the embedding dimensionality to `ceil(0.5 * c)`, where `c` is the number of feature levels.  Unspecified feature dimensionality defaults to `min(c - 1, 10)`.\n\n# Operations\n\n  * `predict(mach, Xnew)`: return predictions of the target given new features `Xnew`, which should have the same scitype as `X` above. Predictions are probabilistic but uncalibrated.\n  * `predict_mode(mach, Xnew)`: Return the modes of the probabilistic predictions returned above.\n  * `transform(mach, Xnew)`: Assuming `Xnew` has the same schema as `X`, transform the categorical features of `Xnew` into dense `Continuous` vectors using the `MLJFlux.EntityEmbedderLayer` layer present in the network. Does nothing in case the model was trained on an input `X` that lacks categorical features.\n\n# Fitted parameters\n\nThe fields of `fitted_params(mach)` are:\n\n  * `chain`: The trained \"chain\" (Flux.jl model), namely the series of layers,  functions, and activations which make up the neural network. This includes  the final layer specified by `finaliser` (eg, `softmax`).\n\n# Report\n\nThe fields of `report(mach)` are:\n\n  * `training_losses`: A vector of training losses (penalised if `lambda != 0`) in  historical order, of length `epochs + 1`.  The first element is the pre-training loss.\n\n# Examples\n\nIn this example we build a classification model using the Iris dataset. This is a very basic example, using a default builder and no standardization.  
For a more advanced illustration, see [`NeuralNetworkRegressor`](@ref) or [`ImageClassifier`](@ref), and examples in the MLJFlux.jl documentation.\n\n```julia\nusing MLJ, Flux\nimport Optimisers\nimport RDatasets\n```\n\nFirst, we can load the data:\n\n```julia\nmtcars = RDatasets.dataset(\"datasets\", \"mtcars\");\ny, X = unpack(mtcars, ==(:VS), in([:MPG, :Cyl, :Disp, :HP, :WT, :QSec]));\n```\n\nNote that `y` is a vector and `X` a table.\n\n```julia\ny = categorical(y) # classifier takes catogorical input\nX_f32 = Float32.(X) # To match floating point type of the neural network layers\nNeuralNetworkBinaryClassifier = @load NeuralNetworkBinaryClassifier pkg=MLJFlux\nbclf = NeuralNetworkBinaryClassifier()\n```\n\nNext, we can train the model:\n\n```julia\nmach = machine(bclf, X_f32, y)\nfit!(mach)\n```\n\nWe can train the model in an incremental fashion, altering the learning rate as we go, provided `optimizer_changes_trigger_retraining` is `false` (the default). Here, we also change the number of (total) iterations:\n\n```julia-repl\njulia> bclf.optimiser\nAdam(0.001, (0.9, 0.999), 1.0e-8)\n```\n\n```julia\nbclf.optimiser = Optimisers.Adam(eta = bclf.optimiser.eta * 2)\nbclf.epochs = bclf.epochs + 5\n\nfit!(mach, verbosity=2) # trains 5 more epochs\n```\n\nWe can inspect the mean training loss using the `cross_entropy` function:\n\n```julia\ntraining_loss = cross_entropy(predict(mach, X_f32), y)\n```\n\nAnd we can access the Flux chain (model) using `fitted_params`:\n\n```julia\nchain = fitted_params(mach).chain\n```\n\nFinally, we can see how the out-of-sample performance changes over time, using MLJ's `learning_curve` function:\n\n```julia\nr = range(bclf, :epochs, lower=1, upper=200, scale=:log10)\ncurve = learning_curve(\n    bclf,\n    X_f32,\n    y,\n    range=r,\n    resampling=Holdout(fraction_train=0.7),\n    measure=cross_entropy,\n)\nusing Plots\nplot(\n   curve.parameter_values,\n   curve.measurements,\n   xlab=curve.parameter_name,\n   
xscale=curve.parameter_scale,\n   ylab = \"Cross Entropy\",\n)\n\n```\n\nSee also [`ImageClassifier`](@ref).\n"""
 ":name" = "NeuralNetworkBinaryClassifier"
 ":human_name" = "neural network binary classifier"
 ":is_supervised" = "`true`"
@@ -8802,7 +8874,7 @@
 ":supports_weights" = "`false`"
 ":supports_class_weights" = "`false`"
 ":supports_online" = "`false`"
-":docstring" = """```\nNeuralNetworkRegressor\n```\n\nA model type for constructing a neural network regressor, based on [MLJFlux.jl](https://github.com/alan-turing-institute/MLJFlux.jl), and implementing the MLJ model interface.\n\nFrom MLJ, the type can be imported using\n\n```\nNeuralNetworkRegressor = @load NeuralNetworkRegressor pkg=MLJFlux\n```\n\nDo `model = NeuralNetworkRegressor()` to construct an instance with default hyper-parameters. Provide keyword arguments to override hyper-parameter defaults, as in `NeuralNetworkRegressor(builder=...)`.\n\n`NeuralNetworkRegressor` is for training a data-dependent Flux.jl neural network to predict a `Continuous` target, given a table of `Continuous` features. Users provide a recipe for constructing the network, based on properties of the data that is encountered, by specifying an appropriate `builder`. See MLJFlux documentation for more on builders.\n\nIn addition to features with `Continuous` scientific element type, this model supports categorical features in the input table. If present, such features are embedded into dense vectors by the use of an additional `EntityEmbedder` layer after the input, as described in Entity Embeddings of Categorical Variables by Cheng Guo, Felix Berkhahn arXiv, 2016.\n\n# Training data\n\nIn MLJ or MLJBase, bind an instance `model` to data with\n\n```\nmach = machine(model, X, y)\n```\n\nHere:\n\n  * `X` provides input features and is either: (i) a `Matrix` with `Continuous` element scitype (typically `Float32`); or (ii) a table of input features (eg, a `DataFrame`) whose columns have `Continuous`, `Multiclass` or `OrderedFactor` element scitype; check column scitypes with `schema(X)`.  If any `Multiclass` or `OrderedFactor` features appear, the constructed network will use an `EntityEmbedder` layer to transform them into dense vectors. 
If `X` is a `Matrix`, it is assumed that columns correspond to features and rows corresponding to observations.\n\n  * `y` is the target, which can be any `AbstractVector` whose element scitype is `Continuous`; check the scitype with `scitype(y)`\n\nTrain the machine with `fit!(mach, rows=...)`.\n\n# Hyper-parameters\n\n  * `builder=MLJFlux.Linear(σ=Flux.relu)`: An MLJFlux builder that constructs a neural  network. Possible `builders` include: `MLJFlux.Linear`, `MLJFlux.Short`, and  `MLJFlux.MLP`. See MLJFlux documentation for more on builders, and the example below  for using the `@builder` convenience macro.\n  * `optimiser::Optimisers.Adam()`: An Optimisers.jl optimiser. The optimiser performs the updating of the weights of the network. To choose a learning rate (the update rate of the optimizer), a good rule of thumb is to start out at `10e-3`, and tune using powers of `10` between `1` and `1e-7`.\n  * `loss=Flux.mse`: The loss function which the network will optimize. Should be a function which can be called in the form `loss(yhat, y)`.  Possible loss functions are listed in [the Flux loss function documentation](https://fluxml.ai/Flux.jl/stable/models/losses/). For a regression task, natural loss functions are:\n\n      * `Flux.mse`\n      * `Flux.mae`\n      * `Flux.msle`\n      * `Flux.huber_loss`\n\n    Currently MLJ measures are not supported as loss functions here.\n  * `epochs::Int=10`: The duration of training, in epochs. Typically, one epoch represents one pass through the complete the training dataset.\n  * `batch_size::int=1`: the batch size to be used for training, representing the number of samples per update of the network weights. Typically, batch size is between `8` and `512`. Increasing batch size may accelerate training if `acceleration=CUDALibs()` and a GPU is available.\n  * `lambda::Float64=0`: The strength of the weight regularization penalty. Can be any value in the range `[0, ∞)`. 
Note the history reports unpenalized losses.\n  * `alpha::Float64=0`: The L2/L1 mix of regularization, in the range `[0, 1]`. A value of 0 represents L2 regularization, and a value of 1 represents L1 regularization.\n  * `rng::Union{AbstractRNG, Int64}`: The random number generator or seed used during training. The default is `Random.default_rng()`.\n  * `optimizer_changes_trigger_retraining::Bool=false`: Defines what happens when re-fitting a machine if the associated optimiser has changed. If `true`, the associated machine will retrain from scratch on `fit!` call, otherwise it will not.\n  * `acceleration::AbstractResource=CPU1()`: Defines on what hardware training is done. For Training on GPU, use `CUDALibs()`.\n  * `embedding_dims`: a `Dict` whose keys are names of categorical features, given as symbols, and whose values are numbers representing the desired dimensionality of the entity embeddings of such features: an integer value of `7`, say, sets the embedding dimensionality to `7`; a float value of `0.5`, say, sets the embedding dimensionality to `ceil(0.5 * c)`, where `c` is the number of feature levels.  Unspecified feature dimensionality defaults to `min(c - 1, 10)`.\n\n# Operations\n\n  * `predict(mach, Xnew)`: return predictions of the target given new features `Xnew`, which should have the same scitype as `X` above.\n  * `transform(mach, Xnew)`: Assuming `Xnew` has the same schema as `X`, transform the categorical features of `Xnew` into dense `Continuous` vectors using the `MLJFlux.EntityEmbedder` layer present in the network. 
Does nothing in case the model was trained on an input `X` that lacks categorical features.\n\n# Fitted parameters\n\nThe fields of `fitted_params(mach)` are:\n\n  * `chain`: The trained \"chain\" (Flux.jl model), namely the series of layers, functions,  and activations which make up the neural network.\n\n# Report\n\nThe fields of `report(mach)` are:\n\n  * `training_losses`: A vector of training losses (penalized if `lambda != 0`) in  historical order, of length `epochs + 1`.  The first element is the pre-training loss.\n\n# Examples\n\nIn this example we build a regression model for the Boston house price dataset.\n\n```julia\nusing MLJ\nimport MLJFlux\nusing Flux\nimport Optimisers\n```\n\nFirst, we load in the data: The `:MEDV` column becomes the target vector `y`, and all remaining columns go into a table `X`, with the exception of `:CHAS`:\n\n```julia\ndata = OpenML.load(531); # Loads from https://www.openml.org/d/531\ny, X = unpack(data, ==(:MEDV), !=(:CHAS); rng=123);\n\nscitype(y)\nschema(X)\n```\n\nSince MLJFlux models do not handle ordered factors, we'll treat `:RAD` as `Continuous`:\n\n```julia\nX = coerce(X, :RAD=>Continuous)\n```\n\nSplitting off a test set:\n\n```julia\n(X, Xtest), (y, ytest) = partition((X, y), 0.7, multi=true);\n```\n\nNext, we can define a `builder`, making use of a convenience macro to do so.  In the following `@builder` call, `n_in` is a proxy for the number input features (which will be known at `fit!` time) and `rng` is a proxy for a RNG (which will be passed from the `rng` field of `model` defined below). We also have the parameter `n_out` which is the number of output features. 
As we are doing single target regression, the value passed will always be `1`, but the builder we define will also work for [`MultitargetNeuralNetworkRegressor`](@ref).\n\n```julia\nbuilder = MLJFlux.@builder begin\n    init=Flux.glorot_uniform(rng)\n    Chain(\n        Dense(n_in, 64, relu, init=init),\n        Dense(64, 32, relu, init=init),\n        Dense(32, n_out, init=init),\n    )\nend\n```\n\nInstantiating a model:\n\n```julia\nNeuralNetworkRegressor = @load NeuralNetworkRegressor pkg=MLJFlux\nmodel = NeuralNetworkRegressor(\n    builder=builder,\n    rng=123,\n    epochs=20\n)\n```\n\nWe arrange for standardization of the the target by wrapping our model in `TransformedTargetModel`, and standardization of the features by inserting the wrapped model in a pipeline:\n\n```julia\npipe = Standardizer |> TransformedTargetModel(model, transformer=Standardizer)\n```\n\nIf we fit with a high verbosity (>1), we will see the losses during training. We can also see the losses in the output of `report(mach)`.\n\n```julia\nmach = machine(pipe, X, y)\nfit!(mach, verbosity=2)\n\n# first element initial loss, 2:end per epoch training losses\nreport(mach).transformed_target_model_deterministic.model.training_losses\n```\n\n## Experimenting with learning rate\n\nWe can visually compare how the learning rate affects the predictions:\n\n```julia\nusing Plots\n\nrates = rates = [5e-5, 1e-4, 0.005, 0.001, 0.05]\nplt=plot()\n\nforeach(rates) do η\n  pipe.transformed_target_model_deterministic.model.optimiser = Optimisers.Adam(η)\n  fit!(mach, force=true, verbosity=0)\n  losses =\n      report(mach).transformed_target_model_deterministic.model.training_losses[3:end]\n  plot!(1:length(losses), losses, label=η)\nend\n\nplt\n\npipe.transformed_target_model_deterministic.model.optimiser.eta = Optimisers.Adam(0.0001)\n```\n\nWith the learning rate fixed, we compute a CV estimate of the performance (using all data bound to `mach`) and compare this with performance on the test 
set:\n\n```julia\n# CV estimate, based on `(X, y)`:\nevaluate!(mach, resampling=CV(nfolds=5), measure=l2)\n\n# loss for `(Xtest, test)`:\nfit!(mach) # train on `(X, y)`\nyhat = predict(mach, Xtest)\nl2(yhat, ytest)\n```\n\nThese losses, for the pipeline model, refer to the target on the original, unstandardized, scale.\n\nFor implementing stopping criterion and other iteration controls, refer to examples linked from the MLJFlux documentation.\n\nSee also [`MultitargetNeuralNetworkRegressor`](@ref)\n"""
+":docstring" = """```\nNeuralNetworkRegressor\n```\n\nA model type for constructing a neural network regressor, based on [MLJFlux.jl](https://github.com/alan-turing-institute/MLJFlux.jl), and implementing the MLJ model interface.\n\nFrom MLJ, the type can be imported using\n\n```\nNeuralNetworkRegressor = @load NeuralNetworkRegressor pkg=MLJFlux\n```\n\nDo `model = NeuralNetworkRegressor()` to construct an instance with default hyper-parameters. Provide keyword arguments to override hyper-parameter defaults, as in `NeuralNetworkRegressor(builder=...)`.\n\n`NeuralNetworkRegressor` is for training a data-dependent Flux.jl neural network to predict a `Continuous` target, given a table of `Continuous` features. Users provide a recipe for constructing the network, based on properties of the data that is encountered, by specifying an appropriate `builder`. See MLJFlux documentation for more on builders.\n\nIn addition to features with `Continuous` scientific element type, this model supports categorical features in the input table. If present, such features are embedded into dense vectors by the use of an additional `EntityEmbedderLayer` layer after the input, as described in Entity Embeddings of Categorical Variables by Cheng Guo, Felix Berkhahn arXiv, 2016.\n\n# Training data\n\nIn MLJ or MLJBase, bind an instance `model` to data with\n\n```\nmach = machine(model, X, y)\n```\n\nHere:\n\n  * `X` provides input features and is either: (i) a `Matrix` with `Continuous` element scitype (typically `Float32`); or (ii) a table of input features (eg, a `DataFrame`) whose columns have `Continuous`, `Multiclass` or `OrderedFactor` element scitype; check column scitypes with `schema(X)`.  If any `Multiclass` or `OrderedFactor` features appear, the constructed network will use an `EntityEmbedderLayer` layer to transform them into dense vectors. 
If `X` is a `Matrix`, it is assumed that columns correspond to features and rows correspond to observations.\n\n  * `y` is the target, which can be any `AbstractVector` whose element scitype is `Continuous`; check the scitype with `scitype(y)`\n\nTrain the machine with `fit!(mach, rows=...)`.\n\n# Hyper-parameters\n\n  * `builder=MLJFlux.Linear(σ=Flux.relu)`: An MLJFlux builder that constructs a neural  network. Possible `builders` include: `MLJFlux.Linear`, `MLJFlux.Short`, and  `MLJFlux.MLP`. See MLJFlux documentation for more on builders, and the example below  for using the `@builder` convenience macro.\n  * `optimiser=Optimisers.Adam()`: An Optimisers.jl optimiser. The optimiser performs the updating of the weights of the network. To choose a learning rate (the update rate of the optimizer), a good rule of thumb is to start out at `10e-3`, and tune using powers of `10` between `1` and `1e-7`.\n  * `loss=Flux.mse`: The loss function which the network will optimize. Should be a function which can be called in the form `loss(yhat, y)`.  Possible loss functions are listed in [the Flux loss function documentation](https://fluxml.ai/Flux.jl/stable/models/losses/). For a regression task, natural loss functions are:\n\n      * `Flux.mse`\n      * `Flux.mae`\n      * `Flux.msle`\n      * `Flux.huber_loss`\n\n    Currently MLJ measures are not supported as loss functions here.\n  * `epochs::Int=10`: The duration of training, in epochs. Typically, one epoch represents one pass through the complete training dataset.\n  * `batch_size::Int=1`: The batch size to be used for training, representing the number of samples per update of the network weights. Typically, batch size is between `8` and `512`. Increasing batch size may accelerate training if `acceleration=CUDALibs()` and a GPU is available.\n  * `lambda::Float64=0`: The strength of the weight regularization penalty. Can be any value in the range `[0, ∞)`. 
Note the history reports unpenalized losses.\n  * `alpha::Float64=0`: The L2/L1 mix of regularization, in the range `[0, 1]`. A value of 0 represents L2 regularization, and a value of 1 represents L1 regularization.\n  * `rng::Union{AbstractRNG, Int64}`: The random number generator or seed used during training. The default is `Random.default_rng()`.\n  * `optimizer_changes_trigger_retraining::Bool=false`: Defines what happens when re-fitting a machine if the associated optimiser has changed. If `true`, the associated machine will retrain from scratch on `fit!` call, otherwise it will not.\n  * `acceleration::AbstractResource=CPU1()`: Defines on what hardware training is done. For Training on GPU, use `CUDALibs()`.\n  * `embedding_dims`: a `Dict` whose keys are names of categorical features, given as symbols, and whose values are numbers representing the desired dimensionality of the entity embeddings of such features: an integer value of `7`, say, sets the embedding dimensionality to `7`; a float value of `0.5`, say, sets the embedding dimensionality to `ceil(0.5 * c)`, where `c` is the number of feature levels.  Unspecified feature dimensionality defaults to `min(c - 1, 10)`.\n\n# Operations\n\n  * `predict(mach, Xnew)`: return predictions of the target given new features `Xnew`, which should have the same scitype as `X` above.\n  * `transform(mach, Xnew)`: Assuming `Xnew` has the same schema as `X`, transform the categorical features of `Xnew` into dense `Continuous` vectors using the `MLJFlux.EntityEmbedderLayer` layer present in the network. 
Does nothing in case the model was trained on an input `X` that lacks categorical features.\n\n# Fitted parameters\n\nThe fields of `fitted_params(mach)` are:\n\n  * `chain`: The trained \"chain\" (Flux.jl model), namely the series of layers, functions,  and activations which make up the neural network.\n\n# Report\n\nThe fields of `report(mach)` are:\n\n  * `training_losses`: A vector of training losses (penalized if `lambda != 0`) in  historical order, of length `epochs + 1`.  The first element is the pre-training loss.\n\n# Examples\n\nIn this example we build a regression model for the Boston house price dataset.\n\n```julia\nusing MLJ\nimport MLJFlux\nusing Flux\nimport Optimisers\n```\n\nFirst, we load in the data: The `:MEDV` column becomes the target vector `y`, and all remaining columns go into a table `X`, with the exception of `:CHAS`:\n\n```julia\ndata = OpenML.load(531); # Loads from https://www.openml.org/d/531\ny, X = unpack(data, ==(:MEDV), !=(:CHAS); rng=123);\n\nscitype(y)\nschema(X)\n```\n\nSince MLJFlux models do not handle ordered factors, we'll treat `:RAD` as `Continuous`:\n\n```julia\nX = coerce(X, :RAD=>Continuous)\n```\n\nSplitting off a test set:\n\n```julia\n(X, Xtest), (y, ytest) = partition((X, y), 0.7, multi=true);\n```\n\nNext, we can define a `builder`, making use of a convenience macro to do so.  In the following `@builder` call, `n_in` is a proxy for the number input features (which will be known at `fit!` time) and `rng` is a proxy for a RNG (which will be passed from the `rng` field of `model` defined below). We also have the parameter `n_out` which is the number of output features. 
As we are doing single target regression, the value passed will always be `1`, but the builder we define will also work for [`MultitargetNeuralNetworkRegressor`](@ref).\n\n```julia\nbuilder = MLJFlux.@builder begin\n    init=Flux.glorot_uniform(rng)\n    Chain(\n        Dense(n_in, 64, relu, init=init),\n        Dense(64, 32, relu, init=init),\n        Dense(32, n_out, init=init),\n    )\nend\n```\n\nInstantiating a model:\n\n```julia\nNeuralNetworkRegressor = @load NeuralNetworkRegressor pkg=MLJFlux\nmodel = NeuralNetworkRegressor(\n    builder=builder,\n    rng=123,\n    epochs=20\n)\n```\n\nWe arrange for standardization of the target by wrapping our model in `TransformedTargetModel`, and standardization of the features by inserting the wrapped model in a pipeline:\n\n```julia\npipe = Standardizer |> TransformedTargetModel(model, transformer=Standardizer)\n```\n\nIf we fit with a high verbosity (>1), we will see the losses during training. We can also see the losses in the output of `report(mach)`.\n\n```julia\nmach = machine(pipe, X, y)\nfit!(mach, verbosity=2)\n\n# first element initial loss, 2:end per epoch training losses\nreport(mach).transformed_target_model_deterministic.model.training_losses\n```\n\n## Experimenting with learning rate\n\nWe can visually compare how the learning rate affects the predictions:\n\n```julia\nusing Plots\n\nrates = [5e-5, 1e-4, 0.005, 0.001, 0.05]\nplt=plot()\n\nforeach(rates) do η\n  pipe.transformed_target_model_deterministic.model.optimiser = Optimisers.Adam(η)\n  fit!(mach, force=true, verbosity=0)\n  losses =\n      report(mach).transformed_target_model_deterministic.model.training_losses[3:end]\n  plot!(1:length(losses), losses, label=η)\nend\n\nplt\n\npipe.transformed_target_model_deterministic.model.optimiser = Optimisers.Adam(0.0001)\n```\n\nWith the learning rate fixed, we compute a CV estimate of the performance (using all data bound to `mach`) and compare this with performance on the test 
set:\n\n```julia\n# CV estimate, based on `(X, y)`:\nevaluate!(mach, resampling=CV(nfolds=5), measure=l2)\n\n# loss for `(Xtest, ytest)`:\nfit!(mach) # train on `(X, y)`\nyhat = predict(mach, Xtest)\nl2(yhat, ytest)\n```\n\nThese losses, for the pipeline model, refer to the target on the original, unstandardized, scale.\n\nFor implementing stopping criterion and other iteration controls, refer to examples linked from the MLJFlux documentation.\n\nSee also [`MultitargetNeuralNetworkRegressor`](@ref)\n"""
 ":name" = "NeuralNetworkRegressor"
 ":human_name" = "neural network regressor"
 ":is_supervised" = "`true`"
diff --git a/src/registry/Models.toml b/src/registry/Models.toml
index 79e18d4..1de1c8a 100644
--- a/src/registry/Models.toml
+++ b/src/registry/Models.toml
@@ -1,13 +1,14 @@
 BetaML = ["RandomForestRegressor", "GaussianMixtureImputer", "RandomForestClassifier", "RandomForestImputer", "PerceptronClassifier", "AutoEncoder", "DecisionTreeRegressor", "PegasosClassifier", "KMeansClusterer", "NeuralNetworkRegressor", "MultitargetGaussianMixtureRegressor", "GaussianMixtureRegressor", "MultitargetNeuralNetworkRegressor", "DecisionTreeClassifier", "GeneralImputer", "NeuralNetworkClassifier", "SimpleImputer", "GaussianMixtureClusterer", "KernelPerceptronClassifier", "KMedoidsClusterer"]
+MLJEnsembles = ["EnsembleModel"]
 CatBoost = ["CatBoostRegressor", "CatBoostClassifier"]
 NearestNeighborModels = ["KNNClassifier", "MultitargetKNNClassifier", "MultitargetKNNRegressor", "KNNRegressor"]
 MLJScikitLearnInterface = ["ProbabilisticSGDClassifier", "RidgeCVClassifier", "LogisticClassifier", "RandomForestRegressor", "ElasticNetCVRegressor", "PerceptronClassifier", "MultiTaskLassoRegressor", "LinearRegressor", "HDBSCAN", "DBSCAN", "RidgeRegressor", "LassoLarsICRegressor", "ARDRegressor", "SVMNuRegressor", "RidgeClassifier", "SGDRegressor", "ComplementNBClassifier", "HuberRegressor", "SVMNuClassifier", "GradientBoostingClassifier", "GaussianProcessRegressor", "SVMLinearRegressor", "LarsRegressor", "MeanShift", "HistGradientBoostingClassifier", "AdaBoostRegressor", "AffinityPropagation", "MultiTaskLassoCVRegressor", "OrthogonalMatchingPursuitRegressor", "BernoulliNBClassifier", "PassiveAggressiveClassifier", "RidgeCVRegressor", "SVMRegressor", "GaussianNBClassifier", "ExtraTreesClassifier", "KMeans", "MultiTaskElasticNetCVRegressor", "LassoLarsCVRegressor", "OrthogonalMatchingPursuitCVRegressor", "AdaBoostClassifier", "PassiveAggressiveRegressor", "BayesianRidgeRegressor", "GaussianProcessClassifier", "BaggingClassifier", "OPTICS", "RANSACRegressor", "KNeighborsRegressor", "HistGradientBoostingRegressor", "MiniBatchKMeans", "LassoCVRegressor", "DummyRegressor", "BisectingKMeans", "LassoLarsRegressor", "LarsCVRegressor", "KNeighborsClassifier", "SVMLinearClassifier", "FeatureAgglomeration", "DummyClassifier", "BaggingRegressor", "BayesianQDA", "BayesianLDA", "SGDClassifier", "TheilSenRegressor", "SpectralClustering", "Birch", "AgglomerativeClustering", "ElasticNetRegressor", "RandomForestClassifier", "LogisticCVClassifier", "MultiTaskElasticNetRegressor", "ExtraTreesRegressor", "LassoRegressor", "MultinomialNBClassifier", "GradientBoostingRegressor", "SVMClassifier"]
 OutlierDetectionNeighbors = ["ABODDetector", "DNNDetector", "LOFDetector", "KNNDetector", "COFDetector"]
 SIRUS = ["StableRulesClassifier", "StableForestClassifier", "StableRulesRegressor", "StableForestRegressor"]
 MLJIteration = ["IteratedModel"]
-PartialLeastSquaresRegressor = ["KPLSRegressor", "PLSRegressor"]
 PartitionedLS = ["PartLS"]
 MLJLinearModels = ["QuantileRegressor", "LogisticClassifier", "MultinomialClassifier", "LADRegressor", "RidgeRegressor", "RobustRegressor", "ElasticNetRegressor", "LinearRegressor", "LassoRegressor", "HuberRegressor"]
+Maxnet = ["MaxnetBinaryClassifier"]
 ParallelKMeans = ["KMeans"]
 NaiveBayes = ["GaussianNBClassifier", "MultinomialNBClassifier"]
 MLJBase = ["Pipeline", "Resampler", "Stack", "TransformedTargetModel"]
@@ -24,7 +25,7 @@ LightGBM = ["LGBMClassifier", "LGBMRegressor"]
 LaplaceRedux = ["LaplaceClassifier", "LaplaceRegressor"]
 XGBoost = ["XGBoostCount", "XGBoostRegressor", "XGBoostClassifier"]
 EvoTrees = ["EvoTreeClassifier", "EvoTreeGaussian", "EvoTreeMLE", "EvoTreeRegressor", "EvoTreeCount"]
-SymbolicRegression = ["MultitargetSRRegressor", "SRRegressor"]
+SymbolicRegression = ["SRTestRegressor", "MultitargetSRTestRegressor", "MultitargetSRRegressor", "SRRegressor"]
 MLJModels = ["ConstantClassifier", "Standardizer", "DeterministicConstantClassifier", "UnivariateTimeTypeToContinuous", "OneHotEncoder", "ContinuousEncoder", "UnivariateBoxCoxTransformer", "InteractionTransformer", "ConstantRegressor", "UnivariateDiscretizer", "BinaryThresholdPredictor", "FillImputer", "DeterministicConstantRegressor", "UnivariateStandardizer", "UnivariateFillImputer"]
 OneRule = ["OneRuleClassifier"]
 OutlierDetectionPython = ["MCDDetector", "COPODDetector", "HBOSDetector", "IForestDetector", "SOSDetector", "ABODDetector", "LOFDetector", "PCADetector", "INNEDetector", "OCSVMDetector", "ECODDetector", "SODDetector", "LODADetector", "KDEDetector", "CDDetector", "KNNDetector", "GMMDetector", "COFDetector", "CBLOFDetector", "LOCIDetector", "LMDDDetector", "RODDetector"]
@@ -32,5 +33,5 @@ SelfOrganizingMaps = ["SelfOrganizingMap"]
 LIBSVM = ["SVC", "EpsilonSVR", "LinearSVC", "ProbabilisticSVC", "NuSVR", "NuSVC", "ProbabilisticNuSVC", "OneClassSVM"]
 TSVD = ["TSVDTransformer"]
 GLM = ["LinearBinaryClassifier", "LinearCountRegressor", "LinearRegressor"]
+MLJTransforms = ["EntityEmbedder"]
 MLJFlux = ["MultitargetNeuralNetworkRegressor", "NeuralNetworkClassifier", "ImageClassifier", "NeuralNetworkBinaryClassifier", "NeuralNetworkRegressor"]
-MLJEnsembles = ["EnsembleModel"]
diff --git a/src/registry/Project.toml b/src/registry/Project.toml
index 994e5bd..61c2aca 100644
--- a/src/registry/Project.toml
+++ b/src/registry/Project.toml
@@ -27,12 +27,12 @@ MLJTestInterface = "72560011-54dd-4dc2-94f3-c5de45b75ecd"
 MLJText = "5e27fcf9-6bac-46ba-8580-b5712f3d6387"
 MLJTuning = "03970b2e-30c4-11ea-3135-d1576263f10f"
 MLJXGBoostInterface = "54119dfa-1dab-4055-a167-80440f4f7a91"
+Maxnet = "81f79f80-22f2-4e41-ab86-00c11cf0f26f"
 NearestNeighborModels = "636a865e-7cf4-491e-846c-de09b730eb36"
 OneRule = "90484964-6d6a-4979-af09-8657dbed84ff"
 OutlierDetectionNeighbors = "51249a0a-cb36-4849-8e04-30c7f8d311bb"
 OutlierDetectionPython = "2449c660-d36c-460e-a68b-92ab3c865b3e"
 ParallelKMeans = "42b8e9d4-006b-409a-8472-7f34b3fb58af"
-PartialLeastSquaresRegressor = "f4b1acfe-f311-436c-bb79-8483f53c17d5"
 PartitionedLS = "19f41c5e-8610-11e9-2f2a-0d67e7c5027f"
 Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
 SIRUS = "cdeec39e-fb35-4959-aadb-a1dd5dede958"