Support for rank deficiency in bootstrap (#755)

palday · web-flow · commit 0a873b0f9f9f · 2024-03-19T16:29:10.000-05:00
* coef! method

* use coef! instead of fixef! in bootstrap

* ignore version specific manifests

* slight efficiency tweak

* fixef! with different target eltype for GLMM

* simulate! tweaks

* add test for rank deficient bootstrap

* behavior note

* version bump

* NEWS update

* tweak nightly runner
diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml
@@ -30,8 +30,7 @@ jobs:
           version: ${{ matrix.julia-version }}
       - uses: julia-actions/cache@v1
         with:
-            cache-compiled: "true"
-      - uses: julia-actions/julia-buildpkg@v1
+            cache-compiled: true
       - uses: julia-actions/julia-runtest@v1
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.gitignore b/.gitignore
@@ -4,6 +4,7 @@ docs/src/assets/*.svg
 test/scratch.jl
 tune.json
 Manifest.toml
+Manifest-v1.*.toml
 settings.json
 docs/jmd
 LocalPreferences.toml
diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,7 @@
+MixedModels v4.23.0 Release Notes
+==============================
+* Support for rank deficiency in the parametric bootstrap. [#755]
+
 MixedModels v4.22.5 Release Notes
 ==============================
 * Use `muladd` where possible to enable fused multiply-add (FMA) on architectures with hardware support. FMA will generally improve computational speed and gives more accurate rounding. [#740]
@@ -504,3 +508,4 @@ Package dependencies
 [#740]: https://github.com/JuliaStats/MixedModels.jl/issues/740
 [#744]: https://github.com/JuliaStats/MixedModels.jl/issues/744
 [#748]: https://github.com/JuliaStats/MixedModels.jl/issues/748
+[#755]: https://github.com/JuliaStats/MixedModels.jl/issues/755
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "MixedModels"
 uuid = "ff71e718-51f3-5ec2-a782-8ffcbfa3c316"
 author = ["Phillip Alday <me@phillipalday.com>", "Douglas Bates <dmbates@gmail.com>", "Jose Bayoan Santiago Calderon <jbs3hp@virginia.edu>"]
-version = "4.22.5"
+version = "4.23.0"
 
 [deps]
 Arrow = "69666777-d1a9-59fb-9406-91d4454c9d45"
diff --git a/src/bootstrap.jl b/src/bootstrap.jl
@@ -80,7 +80,7 @@ been constructed with the same structure as the source of the saved replicates.
 
 The two-argument method constructs a [`MixedModelBootstrap`](@ref) with the same eltype as `m`.
 If an eltype is specified as the third argument, then a `MixedModelBootstrap` is returned.
-If a subtype of `MixedModelFitCollection` is specified as the third argument, then that 
+If a subtype of `MixedModelFitCollection` is specified as the third argument, then that
 is the return type.
 
 See also [`savereplicates`](@ref), [`restoreoptsum!`](@ref).
@@ -91,7 +91,7 @@ end
 
 # why this weird second method? it allows us to define custom types and write methods
 # to load into those types directly. For example, we could define a `PowerAnalysis <: MixedModelFitCollection`
-# in MixedModelsSim and then overload this method to get a convenient object. 
+# in MixedModelsSim and then overload this method to get a convenient object.
 # Also, this allows us to write `restorereplicateS(f, m, ::Type{<:MixedModelNonparametricBootstrap})` for
 # entities in MixedModels bootstrap
 function restorereplicates(
@@ -120,14 +120,14 @@ end
 """
     savereplicates(f, b::MixedModelFitCollection)
 
-Save the replicates associated with a [`MixedModelFitCollection`](@ref), 
-e.g. [`MixedModelBootstrap`](@ref) as an Arrow file. 
+Save the replicates associated with a [`MixedModelFitCollection`](@ref),
+e.g. [`MixedModelBootstrap`](@ref) as an Arrow file.
 
 See also [`restorereplicates`](@ref), [`saveoptsum`](@ref)
 
 !!! note
     **Only** the replicates are saved, not the entire contents of the `MixedModelFitCollection`.
-    `restorereplicates` requires a model compatible with the bootstrap to restore the full object. 
+    `restorereplicates` requires a model compatible with the bootstrap to restore the full object.
 """
 savereplicates(f, b::MixedModelFitCollection) = Arrow.write(f, b.fits)
 
@@ -200,6 +200,14 @@ fit during the bootstrapping process. For example, `optsum_overrides=(;ftol_rel=
 reduces the convergence criterion, which can greatly speed up the bootstrap fits.
 Taking advantage of this speed up to increase `n` can often lead to better estimates
 of coverage intervals.
+
+
+!!! note
+    All coefficients are bootstrapped. In the rank deficient case, the inestimatable coefficients are
+    treated as -0.0 in the simulations underlying the bootstrap, which will generally result
+    in their estimate from the simulated data also being being inestimable and thus set to -0.0.
+    **However this behavior may change in future releases to explicitly drop the
+    extraneous columns before simulation and thus not include their estimates in the bootstrap result.**
 """
 function parametricbootstrap(
     rng::AbstractRNG,
@@ -234,7 +242,7 @@ function parametricbootstrap(
     # this seemed to slow things down?!
     # _copy_away_from_lowerbd!(m.optsum.initial, morig.optsum.final, m.lowerbd; incr=0.05)
 
-    β_names = Tuple(Symbol.(fixefnames(morig)))
+    β_names = Tuple(Symbol.(coefnames(morig)))
 
     use_threads && Base.depwarn(
         "use_threads is deprecated and will be removed in a future release",
@@ -247,7 +255,7 @@ function parametricbootstrap(
         (
             objective=ftype.(m.objective),
             σ=ismissing(m.σ) ? missing : ftype(m.σ),
-            β=NamedTuple{β_names}(fixef!(βsc, m)),
+            β=NamedTuple{β_names}(coef!(βsc, m)),
             se=SVector{p,ftype}(stderror!(βsc, m)),
             θ=SVector{k,ftype}(getθ!(θsc, m)),
         )
@@ -331,8 +339,8 @@ end
 
 Compute bootstrap confidence intervals for coefficients and variance components, with confidence level level (by default 95%).
 
-The keyword argument `method` determines whether the `:shortest`, i.e. highest density, interval is used 
-or the `:equaltail`, i.e. quantile-based, interval is used. For historical reasons, the default is `:shortest`, 
+The keyword argument `method` determines whether the `:shortest`, i.e. highest density, interval is used
+or the `:equaltail`, i.e. quantile-based, interval is used. For historical reasons, the default is `:shortest`,
 but `:equaltail` gives the interval that is most comparable to the profile and Wald confidence intervals.
 
 !!! note
@@ -621,7 +629,7 @@ end
 function σρ!(v::AbstractVector{T}, t::LowerTriangular{T}, σ::T) where {T}
     """
         σρ!(v, t, σ)
-    push! `σ` times the row lengths (σs) and the inner products of normalized rows (ρs) of `t` onto `v` 
+    push! `σ` times the row lengths (σs) and the inner products of normalized rows (ρs) of `t` onto `v`
     """
     dat = t.data
     for i in axes(dat, 1)
diff --git a/src/generalizedlinearmixedmodel.jl b/src/generalizedlinearmixedmodel.jl
@@ -113,8 +113,8 @@ StatsAPI.deviance(m::GeneralizedLinearMixedModel) = deviance(m, m.optsum.nAGQ)
 
 fixef(m::GeneralizedLinearMixedModel) = m.β
 
-function fixef!(v::AbstractVector{T}, m::GeneralizedLinearMixedModel{T}) where {T}
-    return copyto!(fill!(v, -zero(T)), m.β)
+function fixef!(v::AbstractVector{Tv}, m::GeneralizedLinearMixedModel{T}) where {Tv,T}
+    return copyto!(fill!(v, -zero(Tv)), m.β)
 end
 
 objective(m::GeneralizedLinearMixedModel) = deviance(m)
diff --git a/src/linearmixedmodel.jl b/src/linearmixedmodel.jl
@@ -267,8 +267,12 @@ function StatsAPI.fit(
 end
 
 function StatsAPI.coef(m::LinearMixedModel{T}) where {T}
+    return coef!(Vector{T}(undef, length(m.feterm.piv)), m)
+end
+
+function coef!(v::AbstractVector{Tv}, m::MixedModel{T}) where {Tv,T}
     piv = m.feterm.piv
-    return invpermute!(fixef!(similar(piv, T), m), piv)
+    return invpermute!(fixef!(v, m), piv)
 end
 
 βs(m::LinearMixedModel) = NamedTuple{(Symbol.(coefnames(m))...,)}(coef(m))
diff --git a/src/simulate.jl b/src/simulate.jl
@@ -110,9 +110,6 @@ scale.
 The `wts` argument is currently ignored except for `GeneralizedLinearMixedModel`
 models with a `Binomial` distribution.
 
-!!! warning
-    Models are assumed to be full rank.
-
 !!! note
     Note that `simulate!` methods with a `y::AbstractVector` as the first argument
     (besides the RNG) and `simulate` methods return the simulated response. This is
@@ -152,20 +149,17 @@ end
 function simulate!(
     rng::AbstractRNG, y::AbstractVector, m::LinearMixedModel{T}; β=m.β, σ=m.σ, θ=m.θ
 ) where {T}
-    length(β) == length(fixef(m)) ||
-        length(β) == length(coef(m)) ||
+    length(β) == length(m.feterm.piv) || length(β) == m.feterm.rank ||
         throw(ArgumentError("You must specify all (non-singular) βs"))
 
     β = convert(Vector{T}, β)
     σ = T(σ)
     θ = convert(Vector{T}, θ)
     isempty(θ) || setθ!(m, θ)
 
-    if length(β) ≠ length(coef(m))
-        padding = length(coef(m)) - length(β)
-        for ii in 1:padding
-            push!(β, -0.0)
-        end
+    if length(β) ≠ length(m.feterm.piv)
+        padding = length(model.feterm.piv) - m.feterm.rank
+        append!(β, fill(-0.0, padding))
     end
 
     # initialize y to standard normal
@@ -234,8 +228,7 @@ function _simulate!(
     θ,
     resp=nothing,
 ) where {T}
-    length(β) == length(fixef(m)) ||
-        length(β) == length(coef(m)) ||
+    length(β) == length(m.feterm.piv) || length(β) == m.feterm.rank ||
         throw(ArgumentError("You must specify all (non-singular) βs"))
 
     dispersion_parameter(m) ||
@@ -254,11 +247,9 @@ function _simulate!(
 
     d = m.resp.d
 
-    if length(β) ≠ length(coef(m))
-        padding = length(coef(m)) - length(β)
-        for ii in 1:padding
-            push!(β, -0.0)
-        end
+    if length(β) ≠ length(m.feterm.piv)
+        padding = length(model.feterm.piv) - m.feterm.rank
+        append!(β, fill(-0.0, padding))
     end
 
     fast = (length(m.θ) == length(m.optsum.final))
diff --git a/test/bootstrap.jl b/test/bootstrap.jl
@@ -189,7 +189,7 @@ end
         @test pb1 == restorereplicates(seekstart(io), MixedModels.unfit!(deepcopy(m1)))
 
         @test pb1 ≈ restorereplicates(seekstart(io), m1, Float16) rtol=1
-end
+    end
 
     @testset "Bernoulli simulate! and GLMM bootstrap" begin
         contra = dataset(:contra)
@@ -220,6 +220,22 @@ end
                                                        σ=2, progress=false)
         @test sum(issingular(bs)) == 0
     end
+
+    @testset "Rank deficient" begin
+        rng = MersenneTwister(0);
+        x = rand(rng, 100);
+        data = (x = x, x2 = 1.5 .* x, y = rand(rng, 100), z = repeat('A':'T', 5))
+        model = @suppress fit(MixedModel, @formula(y ~ x + x2 + (1|z)), data; progress=false)
+        boot = quickboot(model, 10)
+
+        dropped_idx = model.feterm.piv[end]
+        dropped_coef = coefnames(model)[dropped_idx]
+        @test all(boot.β) do nt
+            # if we're the dropped coef, then we must be -0.0
+            # need isequal because of -0.0
+           return nt.coefname != dropped_coef || isequal(nt.β, -0.0)
+        end
+    end
 end
 
 @testset "show and summary" begin
@@ -236,7 +252,7 @@ end
     @test nrow(df) == 151
     @test propertynames(df) == collect(propertynames(pr.tbl))
 
-    @testset "CI method comparison" begin    
+    @testset "CI method comparison" begin
         level = 0.68
         ci_boot_equaltail = confint(pb; level, method=:equaltail)
         ci_boot_shortest = confint(pb; level, method=:shortest)