
Commit 511ce6c

add LearnAPI.clone; tweak contract for update
1 parent 29ccc3b commit 511ce6c

10 files changed: 124 additions & 131 deletions

Project.toml

Lines changed: 3 additions & 1 deletion
@@ -13,9 +13,11 @@ julia = "1.6"
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54"
+Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
+Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

 [targets]
-test = ["DataFrames", "LinearAlgebra", "MLUtils", "Serialization", "Tables", "Test"]
+test = ["DataFrames", "LinearAlgebra", "MLUtils", "Random", "Serialization", "Statistics", "Tables", "Test"]

ROADMAP.md

Lines changed: 0 additions & 5 deletions
@@ -39,11 +39,6 @@
 - [ ] meta-algorithms

 - [ ] In a utility package provide:
-  - [ ] Method to clone an algorithm with user-specified property (hyperparameter)
-    replacement in `LearnAPI.clone(algorithm, p1=value1, p22=value2, ...)` (since
-    `algorithm` can have any type, can't really overload `Base.replace` without
-    piracy). This will be needed in tuning meta-algorithms. Or should this be in
-    LearnAPI.jl proper, to expose it to all users?
   - [ ] Methods to facilitate common-use case data interfaces: support simultaneously
     `fit` data of the form `data = (X, y)` where `X` is table *or* matrix, and `data` a
     table with target specified by hyperparameter; here `obs` will return a thin wrapping

docs/src/reference.md

Lines changed: 14 additions & 0 deletions
@@ -91,9 +91,15 @@ named_properties = NamedTuple{properties}(getproperty.(Ref(algorithm), propertie
 @assert algorithm == LearnAPI.constructor(algorithm)(; named_properties...)
 ```

+which can be tested with `@assert `[`LearnAPI.clone(algorithm)`](@ref)` == algorithm`.
+
 Note that if `algorithm` is an instance of a *mutable* struct, this requirement
 generally requires overloading `Base.==` for the struct.

+No LearnAPI.jl method is permitted to mutate an algorithm. In particular, one should make
+deep copies of RNG hyperparameters before using them in a new implementation of
+[`fit`](@ref).
+
 #### Composite algorithms (wrappers)

 A *composite algorithm* is one with at least one property that can take other algorithms
@@ -179,6 +185,14 @@ Most algorithms will also implement [`predict`](@ref) and/or [`transform`](@ref)
 record general information about the algorithm. Only [`LearnAPI.constructor`](@ref) and
 [`LearnAPI.functions`](@ref) are universally compulsory.

+
+## Utilities
+
+```@docs
+LearnAPI.clone
+LearnAPI.@trait
+```
+
 ---

 ¹ We acknowledge users may not like this terminology, and may know "algorithm" by some
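
The new no-mutation paragraph is easiest to violate when an algorithm holds an RNG hyperparameter. Below is a minimal sketch of the intended pattern, assuming a hypothetical `PerceptronClassifier` type with an `rng` field; none of these names come from the commit itself.

```julia
using Random

# Hypothetical algorithm holding an RNG hyperparameter:
struct PerceptronClassifier
    epochs::Int
    rng::AbstractRNG
end
PerceptronClassifier(; epochs=10, rng=Random.default_rng()) =
    PerceptronClassifier(epochs, rng)

# Sketch of a `fit` implementation respecting the no-mutation rule:
function fit(algorithm::PerceptronClassifier, data; verbosity=1)
    rng = deepcopy(algorithm.rng)   # work on a copy; `algorithm` is left untouched
    coefficients = randn(rng, 3)    # stand-in for a real training loop drawing from `rng`
    return (; algorithm, coefficients)
end
```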

docs/src/traits.md

Lines changed: 0 additions & 1 deletion
@@ -105,5 +105,4 @@ LearnAPI.iteration_parameter
 LearnAPI.fit_observation_scitype
 LearnAPI.target_observation_scitype
 LearnAPI.predict_or_transform_mutates
-LearnAPI.@trait
 ```

src/LearnAPI.jl

Lines changed: 1 addition & 0 deletions
@@ -11,6 +11,7 @@ include("target_weights_features.jl")
 include("obs.jl")
 include("accessor_functions.jl")
 include("traits.jl")
+include("clone.jl")

 export @trait
 export fit, update, update_observations, update_features

src/clone.jl

Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
+"""
+    LearnAPI.clone(algorithm; replacements...)
+
+Return a shallow copy of `algorithm` with the specified hyperparameter replacements.
+
+```julia
+clone(algorithm; epochs=100, learning_rate=0.01)
+```
+
+It is guaranteed that `LearnAPI.clone(algorithm) == algorithm`.
+
+"""
+function clone(algorithm; replacements...)
+    reps = NamedTuple(replacements)
+    names = propertynames(algorithm)
+    rep_names = keys(reps)
+
+    new_values = map(names) do name
+        name in rep_names && return getproperty(reps, name)
+        getproperty(algorithm, name)
+    end
+    return LearnAPI.constructor(algorithm)(NamedTuple{names}(new_values)...)
+end
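
For orientation, a small usage sketch of the new utility; `MyRidge` and its `lambda` field are hypothetical stand-ins, not part of the commit:

```julia
using LearnAPI

struct MyRidge                            # hypothetical algorithm type
    lambda::Float64
end
MyRidge(; lambda=0.1) = MyRidge(lambda)   # keyword constructor, per the LearnAPI contract
LearnAPI.constructor(::MyRidge) = MyRidge

algorithm = MyRidge(lambda=0.1)

@assert LearnAPI.clone(algorithm) == algorithm   # the documented guarantee

# A tuning meta-algorithm can generate candidates without mutating `algorithm`:
candidates = [LearnAPI.clone(algorithm; lambda=l) for l in (0.01, 0.1, 1.0)]
@assert [c.lambda for c in candidates] == [0.01, 0.1, 1.0]
```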

src/fit_update.jl

Lines changed: 14 additions & 7 deletions
@@ -59,14 +59,15 @@ Return an updated version of the `model` object returned by a previous [`fit`](@
 `update` call, but with the specified hyperparameter replacements, in the form `p1=value1,
 p2=value2, ...`.

-Provided that `data` is identical with the data presented in a preceding `fit` call, as in
-the example below, execution is semantically equivalent to the call `fit(algorithm,
-data)`, where `algorithm` is `LearnAPI.algorithm(model)` with the specified
-replacements. In some cases (typically, when changing an iteration parameter) there may be
-a performance benefit to using `update` instead of retraining ab initio.
+Provided that `data` is identical with the data presented in a preceding `fit` call *and*
+there is at most one hyperparameter replacement, as in the example below, execution is
+semantically equivalent to the call `fit(algorithm, data)`, where `algorithm` is
+`LearnAPI.algorithm(model)` with the specified replacements. In some cases (typically,
+when changing an iteration parameter) there may be a performance benefit to using `update`
+instead of retraining ab initio.

-If `data` differs from that in the preceding `fit` or `update` call, then behaviour is
-algorithm-specific.
+If `data` differs from that in the preceding `fit` or `update` call, or there is more than
+one hyperparameter replacement, then behaviour is algorithm-specific.

 ```julia
 algorithm = MyForest(ntrees=100)
@@ -85,6 +86,8 @@ See also [`fit`](@ref), [`update_observations`](@ref), [`update_features`](@ref)
 Implementation is optional. The signature must include
 `verbosity`. $(DOC_IMPLEMENTED_METHODS(":(LearnAPI.update)"))

+See also [`LearnAPI.clone`](@ref).
+
 """
 update(model, data1, datas...; kwargs...) = update(model, (data1, datas...); kwargs...)

@@ -119,6 +122,8 @@ See also [`fit`](@ref), [`update`](@ref), [`update_features`](@ref).
 Implementation is optional. The signature must include
 `verbosity`. $(DOC_IMPLEMENTED_METHODS(":(LearnAPI.update_observations)"))

+See also [`LearnAPI.clone`](@ref).
+
 """
 update_observations(algorithm, data1, datas...; kwargs...) =
     update_observations(algorithm, (data1, datas...); kwargs...)
@@ -144,6 +149,8 @@ See also [`fit`](@ref), [`update`](@ref), [`update_features`](@ref).
 Implementation is optional. The signature must include
 `verbosity`. $(DOC_IMPLEMENTED_METHODS(":(LearnAPI.update_features)"))

+See also [`LearnAPI.clone`](@ref).
+
 """
 update_features(algorithm, data1, datas...; kwargs...) =
     update_features(algorithm, (data1, datas...); kwargs...)
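
To make the tweaked contract concrete, here is a self-contained toy sketch. Every name below (`MyForest`, `MyForestModel`, `seed`, `trees`) is invented for illustration and is not part of LearnAPI.jl: with identical data and a single `ntrees` replacement, `update` is semantically equivalent to `fit` on a clone, but only the extra "trees" are grown.

```julia
using LearnAPI
using Random

# Hypothetical ensemble algorithm: `ntrees` random numbers stand in for trees.
struct MyForest
    ntrees::Int
    seed::Int
end
MyForest(; ntrees=100, seed=123) = MyForest(ntrees, seed)
LearnAPI.constructor(::MyForest) = MyForest

struct MyForestModel
    algorithm::MyForest
    rng::Xoshiro             # RNG state after training, so `update` can resume it
    trees::Vector{Float64}   # stand-ins for real trees
end
LearnAPI.algorithm(model::MyForestModel) = model.algorithm

function LearnAPI.fit(algorithm::MyForest, data; verbosity=1)
    rng = Xoshiro(algorithm.seed)   # fresh RNG; the algorithm itself is never mutated
    trees = [rand(rng) * sum(data) for _ in 1:algorithm.ntrees]
    return MyForestModel(algorithm, rng, trees)
end

# With identical `data` and `ntrees` as the single replacement, this is semantically
# equivalent to `fit(LearnAPI.clone(algorithm; ntrees=ntrees), data)`, but only the
# extra trees are grown:
function LearnAPI.update(model::MyForestModel, data; verbosity=1, ntrees)
    algorithm = LearnAPI.clone(LearnAPI.algorithm(model); ntrees=ntrees)
    rng = copy(model.rng)
    extra = ntrees - length(model.trees)
    trees = vcat(model.trees, [rand(rng) * sum(data) for _ in 1:extra])
    return MyForestModel(algorithm, rng, trees)
end

data = rand(10)
model = LearnAPI.fit(MyForest(ntrees=100), data)
model_a = LearnAPI.update(model, data; ntrees=150)
model_b = LearnAPI.fit(LearnAPI.clone(MyForest(ntrees=100); ntrees=150), data)
@assert model_a.trees == model_b.trees   # the documented equivalence, in this toy case
```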

src/traits.jl

Lines changed: 31 additions & 17 deletions
@@ -105,29 +105,43 @@ value is non-empty.
 All new implementations must overload this trait. Here's a checklist for elements in the
 return value:

-| symbol                            | implementation/overloading compulsory? | include in returned tuple?         |
-|-----------------------------------|----------------------------------------|------------------------------------|
-| `:(LearnAPI.fit)`                 | yes                                    | yes                                |
-| `:(LearnAPI.algorithm)`           | yes                                    | yes                                |
-| `:(LearnAPI.minimize)`            | no                                     | yes                                |
-| `:(LearnAPI.obs)`                 | no                                     | yes                                |
-| `:(LearnAPI.features)`            | no                                     | yes, unless `fit` consumes no data |
-| `:(LearnAPI.update)`              | no                                     | only if implemented                |
-| `:(LearnAPI.update_observations)` | no                                     | only if implemented                |
-| `:(LearnAPI.update_features)`     | no                                     | only if implemented                |
-| `:(LearnAPI.target)`              | no                                     | only if implemented                |
-| `:(LearnAPI.weights)`             | no                                     | only if implemented                |
-| `:(LearnAPI.predict)`             | no                                     | only if implemented                |
-| `:(LearnAPI.transform)`           | no                                     | only if implemented                |
-| `:(LearnAPI.inverse_transform)`   | no                                     | only if implemented                |
-| <accessor functions>              | no                                     | only if implemented                |
+| expression                        | implementation compulsory? | include in returned tuple?         |
+|-----------------------------------|----------------------------|------------------------------------|
+| `:(LearnAPI.fit)`                 | yes                        | yes                                |
+| `:(LearnAPI.algorithm)`           | yes                        | yes                                |
+| `:(LearnAPI.minimize)`            | no                         | yes                                |
+| `:(LearnAPI.obs)`                 | no                         | yes                                |
+| `:(LearnAPI.features)`            | no                         | yes, unless `fit` consumes no data |
+| `:(LearnAPI.target)`              | no                         | only if implemented                |
+| `:(LearnAPI.weights)`             | no                         | only if implemented                |
+| `:(LearnAPI.update)`              | no                         | only if implemented                |
+| `:(LearnAPI.update_observations)` | no                         | only if implemented                |
+| `:(LearnAPI.update_features)`     | no                         | only if implemented                |
+| `:(LearnAPI.predict)`             | no                         | only if implemented                |
+| `:(LearnAPI.transform)`           | no                         | only if implemented                |
+| `:(LearnAPI.inverse_transform)`   | no                         | only if implemented                |
+| <accessor functions>              | no                         | only if implemented                |

 Also include any implemented accessor functions, both those owned by LearnAPI.jl, and any
 algorithm-specific ones. The LearnAPI.jl accessor functions are: $ACCESSOR_FUNCTIONS_LIST.

 """
 functions(::Any) = ()
-
+functions() = (
+    :(LearnAPI.fit),
+    :(LearnAPI.algorithm),
+    :(LearnAPI.minimize),
+    :(LearnAPI.obs),
+    :(LearnAPI.features),
+    :(LearnAPI.target),
+    :(LearnAPI.weights),
+    :(LearnAPI.update),
+    :(LearnAPI.update_observations),
+    :(LearnAPI.update_features),
+    :(LearnAPI.predict),
+    :(LearnAPI.transform),
+    :(LearnAPI.inverse_transform),
+)

 """
     LearnAPI.kinds_of_proxy(algorithm)
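
As a sketch of how an implementation might satisfy the revised checklist, assume a hypothetical `MyTransformer` type that implements `transform` in addition to the compulsory entries (nothing below is part of the commit itself):

```julia
using LearnAPI

struct MyTransformer end   # hypothetical algorithm type

# List the compulsory entries plus whatever else this algorithm actually implements:
LearnAPI.functions(::MyTransformer) = (
    :(LearnAPI.fit),
    :(LearnAPI.algorithm),
    :(LearnAPI.minimize),
    :(LearnAPI.obs),
    :(LearnAPI.features),
    :(LearnAPI.transform),
)
```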

test/clone.jl

Lines changed: 22 additions & 0 deletions
@@ -0,0 +1,22 @@
+using Test
+using LearnAPI
+
+struct Potato
+    x
+    y
+end
+
+Potato(; x=1, y=2) = Potato(x, y)
+LearnAPI.constructor(::Potato) = Potato
+
+@test LearnAPI.clone(Potato()) == Potato()
+
+p = LearnAPI.clone(Potato(), y=20)
+@test p.y == 20
+@test p.x == 1
+
+q = LearnAPI.clone(Potato(), y=20, x=10)
+@test q.y == 20
+@test q.x == 10
+
+true

test/runtests.jl

Lines changed: 16 additions & 100 deletions
@@ -1,103 +1,19 @@
 using Test

-@testset "tools.jl" begin
-    include("tools.jl")
+test_files = [
+    "tools.jl",
+    "traits.jl",
+    "clone.jl",
+    "integration/regression.jl",
+    "integration/static_algorithms.jl",
+]
+
+files = isempty(ARGS) ? test_files : ARGS
+
+for file in files
+    quote
+        @testset $file begin
+            include($file)
+        end
+    end |> eval
 end
-
-@testset "traits.jl" begin
-    include("traits.jl")
-end
-
-# # INTEGRATION TESTS
-
-@testset "regression" begin
-    include("integration/regression.jl")
-end
-
-# @testset "classification" begin
-#     include("integration/classification.jl")
-# end
-
-# @testset "clustering" begin
-#     include("integration/clustering.jl")
-# end
-
-# @testset "gradient_descent" begin
-#     include("integration/gradient_descent.jl")
-# end
-
-# @testset "iterative_algorithms" begin
-#     include("integration/iterative_algorithms.jl")
-# end
-
-# @testset "incremental_algorithms" begin
-#     include("integration/incremental_algorithms.jl")
-# end
-
-# @testset "dimension_reduction" begin
-#     include("integration/dimension_reduction.jl")
-# end
-
-# @testset "encoders" begin
-#     include("integration/encoders.jl")
-# end
-
-@testset "static_algorithms" begin
-    include("integration/static_algorithms.jl")
-end
-
-# @testset "missing_value_imputation" begin
-#     include("integration/missing_value_imputation.jl")
-# end
-
-# @testset "ensemble_algorithms" begin
-#     include("integration/ensemble_algorithms.jl")
-# end
-
-# @testset "wrappers" begin
-#     include("integration/wrappers.jl")
-# end
-
-# @testset "time_series_forecasting" begin
-#     include("integration/time_series_forecasting.jl")
-# end
-
-# @testset "time_series_classification" begin
-#     include("integration/time_series_classification.jl")
-# end
-
-# @testset "survival_analysis" begin
-#     include("integration/survival_analysis.jl")
-# end
-
-# @testset "distribution_fitters" begin
-#     include("integration/distribution_fitters.jl")
-# end
-
-# @testset "Bayesian_algorithms" begin
-#     include("integration/Bayesian_algorithms.jl")
-# end
-
-# @testset "outlier_detection" begin
-#     include("integration/outlier_detection.jl")
-# end
-
-# @testset "collaborative_filtering" begin
-#     include("integration/collaborative_filtering.jl")
-# end
-
-# @testset "text_analysis" begin
-#     include("integration/text_analysis.jl")
-# end
-
-# @testset "audio_analysis" begin
-#     include("integration/audio_analysis.jl")
-# end
-
-# @testset "natural_language_processing" begin
-#     include("integration/natural_language_processing.jl")
-# end
-
-# @testset "image_processing" begin
-#     include("integration/image_processing.jl")
-# end
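
One way to exercise the new `ARGS` hook from the REPL, relying on the standard `Pkg.test` `test_args` keyword rather than anything added in this commit:

```julia
using Pkg

# Forward extra arguments to test/runtests.jl as ARGS, running only the clone tests:
Pkg.test("LearnAPI"; test_args=["clone.jl"])
```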
