Skip to content

Commit aed7b79

Browse files
authored
Merge pull request #72 from alan-turing-institute/dev
Patch release 0.7.2
2 parents 18d07f0 + b28cdef commit aed7b79

File tree

8 files changed

+70
-45
lines changed

8 files changed

+70
-45
lines changed

Project.toml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
name = "MLJBase"
22
uuid = "a7f614a8-145f-11e9-1d2a-a57a1082229d"
33
authors = ["Anthony D. Blaom <[email protected]>"]
4-
version = "0.7.1"
4+
version = "0.7.2"
55

66
[deps]
77
CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
@@ -12,7 +12,6 @@ OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
1212
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
1313
Requires = "ae029012-a4dd-5104-9daa-d747884805df"
1414
ScientificTypes = "321657f4-b219-11e9-178b-2701a2544e81"
15-
SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
1615
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
1716
StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
1817
Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"

src/MLJBase.jl

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ export fit, update, clean!
99
export predict, predict_mean, predict_mode, fitted_params
1010
export transform, inverse_transform, se, evaluate, best
1111
export info, info_dict
12+
export is_same_except
1213

1314
export load_path, package_url, package_name, package_uuid # model_traits.jl
1415
export input_scitype, supports_weights # model_traits.jl
@@ -74,7 +75,6 @@ import Missings.levels
7475
using Statistics
7576
using Random
7677
using InteractiveUtils
77-
using SparseArrays
7878

7979

8080
## CONSTANTS
@@ -125,16 +125,22 @@ abstract type UnsupervisedNetwork <: Unsupervised end
125125
# `fit(model, verbosity::Integer, training_args...) -> fitresult, cache, report`
126126
# or, one of the simplified versions
127127
# `fit(model, training_args...) -> fitresult`
128-
# `fit(model, X, ys...) -> fitresult`
129-
fit(model::Model, verbosity::Integer, args...) = fit(model, args...), nothing, nothing
128+
fit(model::Model, verbosity::Integer, args...) =
129+
fit(model, args...), nothing, nothing
130130

131131
# each model interface may optionally overload the following refitting
132132
# method:
133133
update(model::Model, verbosity, fitresult, cache, args...) =
134134
fit(model, verbosity, args...)
135135

136+
# fallbacks for supervised models that don't support sample weights:
137+
fit(model::Supervised, verbosity::Integer, X, y, w) =
138+
fit(model, verbosity, X, y)
139+
update(model::Supervised, verbosity, fitresult, cache, X, y, w) =
140+
update(model, verbosity, fitresult, cache, X, y)
141+
136142
# methods dispatched on a model and fit-result are called
137-
# *operations*. supervised models must implement a `predict`
143+
# *operations*. Supervised models must implement a `predict`
138144
# operation (extending the `predict` method of StatsBase).
139145

140146
# unsupervised methods must implement this operation:

src/data.jl

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -205,7 +205,7 @@ integer arrays, in which case `d` is broadcast over all elements.
205205
julia> d(int(v)) == v
206206
true
207207
208-
*Warning:* It is *not* true that `int(d(u)) == u` always holds.
208+
*Warning:* It is *not* true that `int(d(u)) == u` always holds.
209209
210210
See also: [`int`](@ref), [`classes`](@ref).
211211
@@ -239,9 +239,11 @@ output, unless `transpose=true`.
239239
"""
240240
matrix(X; kwargs...) = matrix(Val(ScientificTypes.trait(X)), X; kwargs...)
241241
matrix(::Val{:other}, X; kwargs...) = throw(ArgumentError)
242-
matrix(::Val{:other}, X::AbstractMatrix; kwargs...) = X
242+
matrix(::Val{:other}, X::AbstractMatrix; transpose=false) =
243+
transpose ? permutedims(X) : X
243244

244245
matrix(::Val{:table}, X; kwargs...) = Tables.matrix(X; kwargs...)
246+
245247
# matrix(::Val{:table, X)
246248
# cols = Tables.columns(X) # property-accessible object
247249
# mat = reduce(hcat, [getproperty(cols, ftr) for ftr in propertynames(cols)])

src/distributions.jl

Lines changed: 12 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -150,10 +150,10 @@ function Base.show(stream::IO, d::UnivariateFinite)
150150
# instantiation of d
151151
x1 = d.decoder(first(raw))
152152
p1 = d.prob_given_class[first(raw)]
153-
str = "UnivariateFinite($x1=>$p1"
153+
str = "UnivariateFinite($x1=>$(round(p1, sigdigits=3))"
154154
pairs = (d.decoder(r)=>d.prob_given_class[r] for r in raw[2:end])
155155
for pair in pairs
156-
str *= ", $(pair[1])=>$(pair[2])"
156+
str *= ", $(pair[1])=>$(round(pair[2], sigdigits=3))"
157157
end
158158
str *= ")"
159159
print(stream, str)
@@ -199,18 +199,14 @@ function average(dvec::AbstractVector{UnivariateFinite{L,U,T}};
199199
end
200200

201201
# get all refs:
202-
refs = reduce(union, [keys(d.prob_given_class) for d in dvec])
203-
204-
# pad each individual dicts so they have common keys:
205-
z = LittleDict{U,T}([x => zero(T) for x in refs]...)
206-
prob_given_class_vec = map(dvec) do d
207-
merge(z, d.prob_given_class)
208-
end
202+
refs = Tuple(reduce(union, [keys(d.prob_given_class) for d in dvec]))
209203

210204
# initialize the prob dictionary for the distribution sum:
211-
prob_given_class = LittleDict{U,T}()
212-
for x in refs
213-
prob_given_class[x] = zero(T)
205+
prob_given_class = LittleDict{U,T}(refs, zeros(T, length(refs)))
206+
207+
# make vector of all the distributions dicts padded to have same common keys:
208+
prob_given_class_vec = map(dvec) do d
209+
merge(prob_given_class, d.prob_given_class)
214210
end
215211

216212
# sum up:
@@ -232,15 +228,10 @@ function average(dvec::AbstractVector{UnivariateFinite{L,U,T}};
232228
end
233229

234230
return UnivariateFinite(first(dvec).decoder, prob_given_class)
235-
236231
end
237232

238233
function _pdf(d::UnivariateFinite{L,U,T}, ref) where {L,U,T}
239-
if haskey(d.prob_given_class, ref)
240-
return d.prob_given_class[ref]
241-
else
242-
return zero(T)
243-
end
234+
return get(d.prob_given_class, ref, zero(T))
244235
end
245236

246237
Distributions.pdf(d::UnivariateFinite{L,U,T},
@@ -336,7 +327,9 @@ function Distributions.fit(d::Type{<:UnivariateFinite},
336327
isempty(vpure) && error("No non-missing data to fit. ")
337328
N = length(vpure)
338329
count_given_class = Dist.countmap(vpure)
339-
prob_given_class = LittleDict([x=>c/N for (x, c) in count_given_class])
330+
classes = Tuple(keys(count_given_class))
331+
probs = values(count_given_class)./N
332+
prob_given_class = LittleDict(classes, probs)
340333
return UnivariateFinite(prob_given_class)
341334
end
342335

src/equality.jl

Lines changed: 26 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,29 @@
1-
# by default, MLJType objects are `==` if: (i) they have ==
2-
# type, AND (ii) they have the same set of *defined* fields AND (iii)
3-
# their defined field values are `==` OR the values are both
4-
# AbstractRNG objects.
5-
import Base.==
6-
function ==(m1::M1, m2::M2) where {M1<:MLJType,M2<:MLJType}
1+
"""
2+
is_same_except(m1::MLJType, m2::MLJType, exceptions::Symbol...)
3+
4+
Returns `true` only if the following conditions all hold:
5+
6+
- `m1` and `m2` have the same type.
7+
8+
- `m1` and `m2` have the same undefined fields.
9+
10+
- Corresponding fields agree, or are listed as
11+
`exceptions`, or have `AbstractRNG` as values (one or both)
12+
13+
Note that Base.== is overloaded such that `m1 == m2` if and only if
14+
`is_same_except(m1, m2)`.
15+
16+
"""
17+
function is_same_except(m1::M1, m2::M2,
18+
exceptions::Symbol...) where {M1<:MLJType,M2<:MLJType}
719
if typeof(m1) != typeof(m2)
820
return false
921
end
1022
defined1 = filter(fieldnames(M1)|>collect) do fld
11-
isdefined(m1, fld)
23+
isdefined(m1, fld) && !(fld in exceptions)
1224
end
1325
defined2 = filter(fieldnames(M1)|>collect) do fld
14-
isdefined(m2, fld)
26+
isdefined(m2, fld) && !(fld in exceptions)
1527
end
1628
if defined1 != defined2
1729
return false
@@ -20,17 +32,21 @@ function ==(m1::M1, m2::M2) where {M1<:MLJType,M2<:MLJType}
2032
for fld in defined1
2133
same_values = same_values &&
2234
(getfield(m1, fld) == getfield(m2, fld) ||
23-
getfield(m1, fld) isa AbstractRNG)
35+
getfield(m1, fld) isa AbstractRNG) ||
36+
getfield(m2, fld) isa AbstractRNG
2437
end
2538
return same_values
2639
end
2740

41+
import Base.==
42+
43+
==(m1::M1, m2::M2) where {M1<:MLJType,M2<:MLJType} = is_same_except(m1, m2)
44+
2845
# for using `replace` or `replace!` on collections of MLJType objects
2946
# (eg, Model objects in a learning network) we need a stricter
3047
# equality:
3148
MLJBase.isequal(m1::MLJType, m2::MLJType) = (m1 === m2)
3249

33-
3450
## TODO: Do we need to overload hash here?
3551
function Base.in(x::MLJType, itr::Set)
3652
anymissing = false

test/data.jl

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,9 @@ module TestData
44
using Test
55
using DataFrames
66
import TypedTables
7-
using StatsBase
7+
# using StatsBase
88
# using JuliaDB
9-
using SparseArrays
9+
# using SparseArrays
1010
using CategoricalArrays
1111
import Tables
1212
using ScientificTypes
@@ -172,6 +172,9 @@ end
172172
tab = table(A)
173173
selectcols(tab, 1) == v
174174

175+
@test matrix(B) == B
176+
@test matrix(B, transpose=true) == permutedims(B)
177+
175178
end
176179

177180
## TABLE INDEXING

test/distributions.jl

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
module TestDistributions
22

3-
# using Revise
43
using Test
54
using MLJBase
65
using CategoricalArrays

test/equality.jl

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,20 +5,27 @@ using Test
55
mutable struct Foo <: MLJType
66
rng::AbstractRNG
77
x::Int
8+
y::Int
89
end
910

1011
mutable struct Bar <: MLJType
1112
rng::AbstractRNG
1213
x::Int
14+
y::Int
1315
end
1416

15-
f1 = Foo(MersenneTwister(7), 1)
16-
f2 = Foo(MersenneTwister(8), 1)
17+
f1 = Foo(MersenneTwister(7), 1, 2)
18+
f2 = Foo(MersenneTwister(8), 1, 2)
1719
@test f1.rng != f2.rng
1820
@test f1 == f2
19-
f1.x = 2
21+
f1.x = 10
2022
@test f1 != f2
21-
b = Bar(MersenneTwister(7), 1)
22-
@test f1 != b
23+
b = Bar(MersenneTwister(7), 1, 2)
24+
@test f2 != b
25+
26+
@test is_same_except(f1, f2, :x)
27+
f1.y = 20
28+
@test f1 != f2
29+
@test is_same_except(f1, f2, :x, :y)
2330

2431
true

0 commit comments

Comments
 (0)