diff --git a/Project.toml b/Project.toml index fc56beb8..2e977db4 100644 --- a/Project.toml +++ b/Project.toml @@ -13,6 +13,7 @@ julia = "1" [extras] InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" +BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" [targets] -test = ["InteractiveUtils", "Test"] +test = ["InteractiveUtils", "Test", "BenchmarkTools"] diff --git a/benchmark/bench_matrix_ops.jl b/benchmark/bench_matrix_ops.jl index 2795c1e2..e484aabd 100644 --- a/benchmark/bench_matrix_ops.jl +++ b/benchmark/bench_matrix_ops.jl @@ -41,5 +41,50 @@ for f in [*, \] end end +# Multiply-add +function benchmark_matmul(s,N1,N2,ArrayType) + if ArrayType <: MArray + Mat = MMatrix + A = rand(Mat{N1,N2}) + B = rand(Mat{N2,N2}) + C = rand(Mat{N1,N2}) + label = "MMatrix" + elseif ArrayType <: SizedArray + Mat = SizedMatrix + A = rand(Mat{N1,N2}) + B = rand(Mat{N2,N2}) + C = rand(Mat{N1,N2}) + label = "SizedMatrix" + elseif ArrayType <: Array + A = rand(N1,N2) + B = rand(N2,N2) + C = rand(N1,N2) + label = "Matrix" + end + α,β = 1.0, 1.0 + s1 = s["mul!(C,A,B)"][string(N1, pad=2) * string(N2, pad=2)] = BenchmarkGroup() + s2 = s["mul!(C,A,B,α,β)"][string(N1, pad=2) * string(N2, pad=2)] = BenchmarkGroup() + s3 = s["mul!(B,A',C)"][string(N1, pad=2) * string(N2, pad=2)] = BenchmarkGroup() + s4 = s["mul!(B,A',C,α,β)"][string(N1, pad=2) * string(N2, pad=2)] = BenchmarkGroup() + + s1[label] = @benchmarkable mul!($C,$A,$B) + s2[label] = @benchmarkable mul!($C,$A,$B,$α,$β) + s3[label] = @benchmarkable mul!($B,Transpose($A),$C) + s4[label] = @benchmarkable mul!($B,Transpose($A),$C,$α,$β) +end + +begin + suite["mul!(C,A,B)"] = BenchmarkGroup(["inplace", "multiply-add"]) + suite["mul!(C,A,B,α,β)"] = BenchmarkGroup(["inplace", "multiply-add"]) + suite["mul!(B,A',C)"] = BenchmarkGroup(["inplace", "multiply-add"]) + suite["mul!(B,A',C,α,β)"] = BenchmarkGroup(["inplace", "multiply-add"]) + for N in matrix_sizes + for Mat in (MMatrix, SizedMatrix, Matrix) + benchmark_matmul(suite, N+1, N, Mat) + end + end +end + + end # module BenchMatrixOps.suite diff --git a/src/StaticArrays.jl b/src/StaticArrays.jl index fa520eb9..5502a61e 100644 --- a/src/StaticArrays.jl +++ b/src/StaticArrays.jl @@ -126,6 +126,7 @@ include("mapreduce.jl") include("sort.jl") include("arraymath.jl") include("linalg.jl") +include("matrix_multiply_add.jl") include("matrix_multiply.jl") include("det.jl") include("inv.jl") diff --git a/src/matrix_multiply.jl b/src/matrix_multiply.jl index 15996b05..b9bbb284 100644 --- a/src/matrix_multiply.jl +++ b/src/matrix_multiply.jl @@ -13,14 +13,6 @@ import LinearAlgebra: BlasFloat, matprod, mul! @inline *(A::StaticArray{Tuple{N,1},<:Any,2}, B::Adjoint{<:Any,<:StaticVector}) where {N} = vec(A) * B @inline *(A::StaticArray{Tuple{N,1},<:Any,2}, B::Transpose{<:Any,<:StaticVector}) where {N} = vec(A) * B -@inline mul!(dest::StaticVecOrMat, A::StaticMatrix, B::StaticVector) = _mul!(Size(dest), dest, Size(A), Size(B), A, B) -@inline mul!(dest::StaticVecOrMat, A::StaticMatrix, B::StaticMatrix) = _mul!(Size(dest), dest, Size(A), Size(B), A, B) -@inline mul!(dest::StaticVecOrMat, A::StaticVector, B::StaticMatrix) = mul!(dest, reshape(A, Size(Size(A)[1], 1)), B) -@inline mul!(dest::StaticVecOrMat, A::StaticVector, B::Transpose{<:Any, <:StaticVector}) = _mul!(Size(dest), dest, Size(A), Size(B), A, B) -@inline mul!(dest::StaticVecOrMat, A::StaticVector, B::Adjoint{<:Any, <:StaticVector}) = _mul!(Size(dest), dest, Size(A), Size(B), A, B) -#@inline *{TA<:LinearAlgebra.BlasFloat,Tb}(A::StaticMatrix{TA}, b::StaticVector{Tb}) - - # Implementations @@ -97,10 +89,14 @@ end # Heuristic choice between BLAS and explicit unrolling (or chunk-based unrolling) if sa[1]*sa[2]*sb[2] >= 14*14*14 + Sa = TSize{size(S),false}() + Sb = TSize{sa,false}() + Sc = TSize{sb,false}() + _add = MulAddMul(true,false) return quote @_inline_meta C = similar(a, T, $S) - mul_blas!($S, C, Sa, Sb, a, b) + mul_blas!($Sa, C, $Sa, $Sb, a, b, $_add) return C end elseif sa[1]*sa[2]*sb[2] < 8*8*8 @@ -177,7 +173,7 @@ end # Do a custom b[:, k2] to return a SVector (an isbitstype type) rather than (possibly) a mutable type. Avoids allocation == faster tmp_type_in = :(SVector{$(sb[1]), T}) tmp_type_out = :(SVector{$(sa[1]), T}) - vect_exprs = [:($(Symbol("tmp_$k2"))::$tmp_type_out = partly_unrolled_multiply(Size(a), Size($(sb[1])), a, + vect_exprs = [:($(Symbol("tmp_$k2"))::$tmp_type_out = partly_unrolled_multiply(TSize(a), TSize($(sb[1])), a, $(Expr(:call, tmp_type_in, [Expr(:ref, :b, LinearIndices(sb)[i, k2]) for i = 1:sb[1]]...)))::$tmp_type_out) for k2 = 1:sb[2]] @@ -193,201 +189,4 @@ end end end -@generated function partly_unrolled_multiply(::Size{sa}, ::Size{sb}, a::StaticMatrix{<:Any, <:Any, Ta}, b::StaticArray{<:Tuple, Tb}) where {sa, sb, Ta, Tb} - if sa[2] != sb[1] - throw(DimensionMismatch("Tried to multiply arrays of size $sa and $sb")) - end - - if sa[2] != 0 - exprs = [reduce((ex1,ex2) -> :(+($ex1,$ex2)), [:(a[$(LinearIndices(sa)[k, j])]*b[$j]) for j = 1:sa[2]]) for k = 1:sa[1]] - else - exprs = [:(zero(promote_op(matprod,Ta,Tb))) for k = 1:sa[1]] - end - - return quote - $(Expr(:meta,:noinline)) - @inbounds return SVector(tuple($(exprs...))) - end -end - -# TODO aliasing problems if c === b? -@generated function _mul!(::Size{sc}, c::StaticVector, ::Size{sa}, ::Size{sb}, a::StaticMatrix, b::StaticVector) where {sa, sb, sc} - if sb[1] != sa[2] || sc[1] != sa[1] - throw(DimensionMismatch("Tried to multiply arrays of size $sa and $sb and assign to array of size $sc")) - end - - if sa[2] != 0 - exprs = [:(c[$k] = $(reduce((ex1,ex2) -> :(+($ex1,$ex2)), [:(a[$(LinearIndices(sa)[k, j])]*b[$j]) for j = 1:sa[2]]))) for k = 1:sa[1]] - else - exprs = [:(c[$k] = zero(eltype(c))) for k = 1:sa[1]] - end - - return quote - @_inline_meta - @inbounds $(Expr(:block, exprs...)) - return c - end -end - -@generated function _mul!(::Size{sc}, c::StaticMatrix, ::Size{sa}, ::Size{sb}, a::StaticVector, - b::Union{Transpose{<:Any, <:StaticVector}, Adjoint{<:Any, <:StaticVector}}) where {sa, sb, sc} - if sa[1] != sc[1] || sb[2] != sc[2] - throw(DimensionMismatch("Tried to multiply arrays of size $sa and $sb and assign to array of size $sc")) - end - - exprs = [:(c[$(LinearIndices(sc)[i, j])] = a[$i] * b[$j]) for i = 1:sa[1], j = 1:sb[2]] - - return quote - @_inline_meta - @inbounds $(Expr(:block, exprs...)) - return c - end -end - -@generated function _mul!(Sc::Size{sc}, c::StaticMatrix{<:Any, <:Any, Tc}, Sa::Size{sa}, Sb::Size{sb}, a::StaticMatrix{<:Any, <:Any, Ta}, b::StaticMatrix{<:Any, <:Any, Tb}) where {sa, sb, sc, Ta, Tb, Tc} - can_blas = Tc == Ta && Tc == Tb && Tc <: BlasFloat - - if can_blas - if sa[1] * sa[2] * sb[2] < 4*4*4 - return quote - @_inline_meta - mul_unrolled!(Sc, c, Sa, Sb, a, b) - return c - end - elseif sa[1] * sa[2] * sb[2] < 14*14*14 # Something seems broken for this one with large matrices (becomes allocating) - return quote - @_inline_meta - mul_unrolled_chunks!(Sc, c, Sa, Sb, a, b) - return c - end - else - return quote - @_inline_meta - mul_blas!(Sc, c, Sa, Sb, a, b) - return c - end - end - else - if sa[1] * sa[2] * sb[2] < 4*4*4 - return quote - @_inline_meta - mul_unrolled!(Sc, c, Sa, Sb, a, b) - return c - end - else - return quote - @_inline_meta - mul_unrolled_chunks!(Sc, c, Sa, Sb, a, b) - return c - end - end - end -end - - -@generated function mul_blas!(::Size{s}, c::StaticMatrix{<:Any, <:Any, T}, ::Size{sa}, ::Size{sb}, a::StaticMatrix{<:Any, <:Any, T}, b::StaticMatrix{<:Any, <:Any, T}) where {s,sa,sb, T <: BlasFloat} - if sb[1] != sa[2] || sa[1] != s[1] || sb[2] != s[2] - throw(DimensionMismatch("Tried to multiply arrays of size $sa and $sb and assign to array of size $s")) - end - - if sa[1] > 0 && sa[2] > 0 && sb[2] > 0 - # This code adapted from `gemm!()` in base/linalg/blas.jl - - if T == Float64 - gemm = :dgemm_ - elseif T == Float32 - gemm = :sgemm_ - elseif T == Complex{Float64} - gemm = :zgemm_ - else # T == Complex{Float32} - gemm = :cgemm_ - end - - blascall = quote - ccall((LinearAlgebra.BLAS.@blasfunc($gemm), LinearAlgebra.BLAS.libblas), Nothing, - (Ref{UInt8}, Ref{UInt8}, Ref{LinearAlgebra.BLAS.BlasInt}, Ref{LinearAlgebra.BLAS.BlasInt}, - Ref{LinearAlgebra.BLAS.BlasInt}, Ref{$T}, Ptr{$T}, Ref{LinearAlgebra.BLAS.BlasInt}, - Ptr{$T}, Ref{LinearAlgebra.BLAS.BlasInt}, Ref{$T}, Ptr{$T}, - Ref{LinearAlgebra.BLAS.BlasInt}), - transA, transB, m, n, - ka, alpha, a, strideA, - b, strideB, beta, c, - strideC) - end - - return quote - alpha = one(T) - beta = zero(T) - transA = 'N' - transB = 'N' - m = $(sa[1]) - ka = $(sa[2]) - kb = $(sb[1]) - n = $(sb[2]) - strideA = $(sa[1]) - strideB = $(sb[1]) - strideC = $(s[1]) - - $blascall - - return c - end - else - throw(DimensionMismatch("Cannot call BLAS gemm with zero-dimension arrays, attempted $sa * $sb -> $s.")) - end -end - - -@generated function mul_unrolled!(::Size{sc}, c::StaticMatrix, ::Size{sa}, ::Size{sb}, a::StaticMatrix, b::StaticMatrix) where {sa, sb, sc} - if sb[1] != sa[2] || sa[1] != sc[1] || sb[2] != sc[2] - throw(DimensionMismatch("Tried to multiply arrays of size $sa and $sb and assign to array of size $sc")) - end - - if sa[2] != 0 - exprs = [:(c[$(LinearIndices(sc)[k1, k2])] = $(reduce((ex1,ex2) -> :(+($ex1,$ex2)), [:(a[$(LinearIndices(sa)[k1, j])]*b[$(LinearIndices(sb)[j, k2])]) for j = 1:sa[2]]))) for k1 = 1:sa[1], k2 = 1:sb[2]] - else - exprs = [:(c[$(LinearIndices(sc)[k1, k2])] = zero(eltype(c))) for k1 = 1:sa[1], k2 = 1:sb[2]] - end - - return quote - @_inline_meta - @inbounds $(Expr(:block, exprs...)) - end -end - -@generated function mul_unrolled_chunks!(::Size{sc}, c::StaticMatrix, ::Size{sa}, ::Size{sb}, a::StaticMatrix, b::StaticMatrix) where {sa, sb, sc} - if sb[1] != sa[2] || sa[1] != sc[1] || sb[2] != sc[2] - throw(DimensionMismatch("Tried to multiply arrays of size $sa and $sb and assign to array of size $sc")) - end - - #vect_exprs = [:($(Symbol("tmp_$k2")) = partly_unrolled_multiply(A, B[:, $k2])) for k2 = 1:sB[2]] - - # Do a custom b[:, k2] to return a SVector (an isbitstype type) rather than a mutable type. Avoids allocation == faster - tmp_type = SVector{sb[1], eltype(c)} - vect_exprs = [:($(Symbol("tmp_$k2")) = partly_unrolled_multiply($(Size(sa)), $(Size(sb[1])), a, $(Expr(:call, tmp_type, [Expr(:ref, :b, LinearIndices(sb)[i, k2]) for i = 1:sb[1]]...)))) for k2 = 1:sb[2]] - - exprs = [:(c[$(LinearIndices(sc)[k1, k2])] = $(Symbol("tmp_$k2"))[$k1]) for k1 = 1:sa[1], k2 = 1:sb[2]] - - return quote - @_inline_meta - @inbounds $(Expr(:block, vect_exprs...)) - @inbounds $(Expr(:block, exprs...)) - end -end - -#function mul_blas(a, b, c, A, B) -#q -#end - -# The idea here is to get pointers to stack variables and call BLAS. -# This saves an aweful lot of time compared to copying SArray's to Ref{SArray{...}} -# and using BLAS should be fastest for (very) large SArrays - -# Here is an LLVM function that gets the pointer to its input, %x -# After this we would make the ccall above. # -# define i8* @f(i32 %x) #0 { -# %1 = alloca i32, align 4 -# store i32 %x, i32* %1, align 4 -# ret i32* %1 -# } diff --git a/src/matrix_multiply_add.jl b/src/matrix_multiply_add.jl new file mode 100644 index 00000000..1802b8bb --- /dev/null +++ b/src/matrix_multiply_add.jl @@ -0,0 +1,283 @@ +# import LinearAlgebra.MulAddMul + +abstract type MulAddMul{T} end + +struct AlphaBeta{T} <: MulAddMul{T} + α::T + β::T + function AlphaBeta{T}(α,β) where T <: Real + new{T}(α,β) + end +end +@inline AlphaBeta(α::A,β::B) where {A,B} = AlphaBeta{promote_type(A,B)}(α,β) +@inline alpha(ab::AlphaBeta) = ab.α +@inline beta(ab::AlphaBeta) = ab.β + +struct NoMulAdd{T} <: MulAddMul{T} end +@inline alpha(ma::NoMulAdd{T}) where T = one(T) +@inline beta(ma::NoMulAdd{T}) where T = zero(T) + +""" Size that stores whether a Matrix is a Transpose +Useful when selecting multiplication methods, and avoiding allocations when dealing with +the `Transpose` type by passing around the original matrix. +Should pair with `parent`. +""" +struct TSize{S,T} + function TSize{S,T}() where {S,T} + new{S::Tuple{Vararg{StaticDimension}},T::Bool}() + end +end +TSize(A::Type{<:Transpose{<:Any,<:StaticArray}}) = TSize{size(A),true}() +TSize(A::Type{<:Adjoint{<:Real,<:StaticArray}}) = TSize{size(A),true}() # can't handle complex adjoints yet +TSize(A::Type{<:StaticArray}) = TSize{size(A),false}() +TSize(A::StaticArrayLike) = TSize(typeof(A)) +TSize(S::Size{s}, T=false) where s = TSize{s,T}() +TSize(s::Number) = TSize(Size(s)) +istranpose(::TSize{<:Any,T}) where T = T +size(::TSize{S}) where S = S +Size(::TSize{S}) where S = Size{S}() +Base.transpose(::TSize{S,T}) where {S,T} = TSize{reverse(S),!T}() + +# Get the parent of transposed arrays, or the array itself if it has no parent +# QUESTION: maybe call this something else? +Base.parent(A::Union{<:Transpose{<:Any,<:StaticArray}, <:Adjoint{<:Any,<:StaticArray}}) = A.parent +Base.parent(A::StaticArray) = A + +# 5-argument matrix multiplication +# To avoid allocations, strip away Transpose type and store tranpose info in Size +@inline LinearAlgebra.mul!(dest::StaticVecOrMatLike, A::StaticVecOrMatLike, B::StaticVecOrMatLike, + α::Real, β::Real) = _mul!(TSize(dest), parent(dest), TSize(A), TSize(B), parent(A), parent(B), + AlphaBeta(α,β)) + +@inline LinearAlgebra.mul!(dest::StaticVecOrMatLike, A::StaticVecOrMatLike{T}, + B::StaticVecOrMatLike{T}) where T = + _mul!(TSize(dest), parent(dest), TSize(A), TSize(B), parent(A), parent(B), NoMulAdd{T}()) + + +"Calculate the product of the dimensions being multiplied. Useful as a heuristic for unrolling." +@inline multiplied_dimension(A::Type{<:StaticVecOrMatLike}, B::Type{<:StaticVecOrMatLike}) = + prod(size(A)) * size(B,2) + +"Validate the dimensions of a matrix multiplication, including matrix-vector products" +function check_dims(::Size{sc}, ::Size{sa}, ::Size{sb}) where {sa,sb,sc} + if sb[1] != sa[2] || sc[1] != sa[1] + return false + elseif length(sc) == 2 || length(sb) == 2 + sc2 = length(sc) == 1 ? 1 : sc[2] + sb2 = length(sb) == 1 ? 1 : sb[2] + if sc2 != sb2 + return false + end + end + return true +end + +""" Combine left and right sides of an assignment expression, short-cutting + lhs = α * rhs + β * lhs, + element-wise. +If α = 1, the multiplication by α is removed. If β = 0, the second rhs term is removed. +""" +function _muladd_expr(lhs::Array{Expr}, rhs::Array{Expr}, ::Type{<:AlphaBeta}) + @assert length(lhs) == length(rhs) + n = length(rhs) + rhs = [:(α * $(expr)) for expr in rhs] + rhs = [:($(lhs[k]) * β + $(rhs[k])) for k = 1:n] + exprs = [:($(lhs[k]) = $(rhs[k])) for k = 1:n] + _assign(lhs, rhs) + return exprs +end + +@inline _muladd_expr(lhs::Array{Expr}, rhs::Array{Expr}, ::Type{<:MulAddMul}) = _assign(lhs, rhs) + +@inline function _assign(lhs::Array{Expr}, rhs::Array{Expr}) + @assert length(lhs) == length(rhs) + [:($(lhs[k]) = $(rhs[k])) for k = 1:length(lhs)] +end + +"Obtain an expression for the linear index of var[k,j], taking transposes into account" +@inline _lind(A::Type{<:TSize}, k::Int, j::Int) = _lind(:a, A, k, j) +function _lind(var::Symbol, A::Type{TSize{sa,tA}}, k::Int, j::Int) where {sa,tA} + if tA + return :($var[$(LinearIndices(reverse(sa))[j, k])]) + else + return :($var[$(LinearIndices(sa)[k, j])]) + end +end + +# Matrix-vector multiplication +@generated function _mul!(Sc::TSize{sc}, c::StaticVecOrMatLike, Sa::TSize{sa}, Sb::TSize{sb}, + a::StaticMatrix, b::StaticVector, _add::MulAddMul, + ::Val{col}=Val(1)) where {sa, sb, sc, col} + if sa[2] != sb[1] || sc[1] != sa[1] + throw(DimensionMismatch("Tried to multiply arrays of size $sa and $sb and assign to array of size $sc")) + end + + if sa[2] != 0 + lhs = [:($(_lind(:c,Sc,k,col))) for k = 1:sa[1]] + ab = [:($(reduce((ex1,ex2) -> :(+($ex1,$ex2)), + [:($(_lind(Sa,k,j))*b[$j]) for j = 1:sa[2]]))) for k = 1:sa[1]] + exprs = _muladd_expr(lhs, ab, _add) + else + exprs = [:(c[$k] = zero(eltype(c))) for k = 1:sa[1]] + end + + return quote + # @_inline_meta + # α = _add.alpha + # β = _add.beta + α = alpha(_add) + β = beta(_add) + @inbounds $(Expr(:block, exprs...)) + return c + end +end + +# Outer product +@generated function _mul!(::TSize{sc}, c::StaticMatrix, ::TSize{sa,false}, ::TSize{sb,true}, + a::StaticVector, b::StaticVector, _add::MulAddMul) where {sa, sb, sc} + if sc[1] != sa[1] || sc[2] != sb[2] + throw(DimensionMismatch("Tried to multiply arrays of size $sa and $sb and assign to array of size $sc")) + end + + lhs = [:(c[$(LinearIndices(sc)[i,j])]) for i = 1:sa[1], j = 1:sb[2]] + ab = [:(a[$i] * b[$j]) for i = 1:sa[1], j = 1:sb[2]] + exprs = _muladd_expr(lhs, ab, _add) + + return quote + @_inline_meta + α = alpha(_add) + β = beta(_add) + @inbounds $(Expr(:block, exprs...)) + return c + end +end + +# Matrix-matrix multiplication +@generated function _mul!(Sc::TSize{sc}, c::StaticMatrixLike, + Sa::TSize{sa}, Sb::TSize{sb}, + a::StaticMatrixLike, b::StaticMatrixLike, + _add::MulAddMul) where {sa, sb, sc} + Ta,Tb,Tc = eltype(a), eltype(b), eltype(c) + can_blas = Tc == Ta && Tc == Tb && Tc <: BlasFloat + + mult_dim = multiplied_dimension(a,b) + if mult_dim < 4*4*4 + return quote + @_inline_meta + muladd_unrolled_all!(Sc, c, Sa, Sb, a, b, _add) + return c + end + elseif mult_dim < 14*14*14 # Something seems broken for this one with large matrices (becomes allocating) + return quote + @_inline_meta + muladd_unrolled_chunks!(Sc, c, Sa, Sb, a, b, _add) + return c + end + else + if can_blas + return quote + @_inline_meta + mul_blas!(Sc, c, Sa, Sb, a, b, _add) + return c + end + else + return quote + @_inline_meta + muladd_unrolled_chunks!(Sc, c, Sa, Sb, a, b, _add) + return c + end + end + end +end + + +@generated function muladd_unrolled_all!(Sc::TSize{sc}, c::StaticMatrixLike, Sa::TSize{sa}, Sb::TSize{sb}, + a::StaticMatrixLike, b::StaticMatrixLike, _add::MulAddMul) where {sa, sb, sc} + if !check_dims(Size(sc),Size(sa),Size(sb)) + throw(DimensionMismatch("Tried to multiply arrays of size $sa and $sb and assign to array of size $sc")) + end + + if sa[2] != 0 + lhs = [:($(_lind(:c, Sc, k1, k2))) for k1 = 1:sa[1], k2 = 1:sb[2]] + ab = [:($(reduce((ex1,ex2) -> :(+($ex1,$ex2)), + [:($(_lind(:a, Sa, k1, j)) * $(_lind(:b, Sb, j, k2))) for j = 1:sa[2]] + ))) for k1 = 1:sa[1], k2 = 1:sb[2]] + exprs = _muladd_expr(lhs, ab, _add) + end + + return quote + @_inline_meta + # α = _add.alpha + # β = _add.beta + α = alpha(_add) + β = beta(_add) + @inbounds $(Expr(:block, exprs...)) + end +end + + +@generated function muladd_unrolled_chunks!(Sc::TSize{sc}, c::StaticMatrix, ::TSize{sa,tA}, Sb::TSize{sb,tB}, + a::StaticMatrix, b::StaticMatrix, _add::MulAddMul) where {sa, sb, sc, tA, tB} + if sb[1] != sa[2] || sa[1] != sc[1] || sb[2] != sc[2] + throw(DimensionMismatch("Tried to multiply arrays of size $sa and $sb and assign to array of size $sc")) + end + + #vect_exprs = [:($(Symbol("tmp_$k2")) = partly_unrolled_multiply(A, B[:, $k2])) for k2 = 1:sB[2]] + + # Do a custom b[:, k2] to return a SVector (an isbitstype type) rather than a mutable type. Avoids allocation == faster + tmp_type = SVector{sb[1], eltype(c)} + vect_exprs = [:($(Symbol("tmp_$k2")) = partly_unrolled_multiply($(TSize{sa,tA}()), $(TSize{(sb[1],),tB}()), + a, $(Expr(:call, tmp_type, [:($(_lind(:b, Sb, i, k2))) for i = 1:sb[1]]...)))) for k2 = 1:sb[2]] + + lhs = [:($(_lind(:c, Sc, k1, k2))) for k1 = 1:sa[1], k2 = 1:sb[2]] + # exprs = [:(c[$(LinearIndices(sc)[k1, k2])] = $(Symbol("tmp_$k2"))[$k1]) for k1 = 1:sa[1], k2 = 1:sb[2]] + rhs = [:($(Symbol("tmp_$k2"))[$k1]) for k1 = 1:sa[1], k2 = 1:sb[2]] + exprs = _muladd_expr(lhs, rhs, _add) + + return quote + @_inline_meta + # α = _add.alpha + # β = _add.beta + α = alpha(_add) + β = beta(_add) + @inbounds $(Expr(:block, vect_exprs...)) + @inbounds $(Expr(:block, exprs...)) + end +end + +# @inline partly_unrolled_multiply(Sa::Size, Sb::Size, a::StaticMatrix, b::StaticArray) where {sa, sb, Ta, Tb} = +# partly_unrolled_multiply(TSize(Sa), TSize(Sb), a, b) +@generated function partly_unrolled_multiply(Sa::TSize{sa}, ::TSize{sb}, a::StaticMatrix{<:Any, <:Any, Ta}, b::StaticArray{<:Tuple, Tb}) where {sa, sb, Ta, Tb} + if sa[2] != sb[1] + throw(DimensionMismatch("Tried to multiply arrays of size $sa and $sb")) + end + + if sa[2] != 0 + exprs = [reduce((ex1,ex2) -> :(+($ex1,$ex2)), [:($(_lind(:a,Sa,k,j))*b[$j]) for j = 1:sa[2]]) for k = 1:sa[1]] + else + exprs = [:(zero(promote_op(matprod,Ta,Tb))) for k = 1:sa[1]] + end + + return quote + $(Expr(:meta,:noinline)) + @inbounds return SVector(tuple($(exprs...))) + end +end + +@inline _get_raw_data(A::SizedArray) = A.data +@inline _get_raw_data(A::StaticArray) = A + +function mul_blas!(::TSize{<:Any,false}, c::StaticMatrix, ::TSize{<:Any,tA}, ::TSize{<:Any,tB}, + a::StaticMatrix, b::StaticMatrix, _add::MulAddMul) where {tA,tB} + mat_char(tA) = tA ? 'T' : 'N' + T = eltype(a) + A = _get_raw_data(a) + B = _get_raw_data(b) + C = _get_raw_data(c) + BLAS.gemm!(mat_char(tA), mat_char(tB), T(alpha(_add)), A, B, T(beta(_add)), C) +end + +# if C is transposed, transpose the entire expression +@inline mul_blas!(Sc::TSize{<:Any,true}, c::StaticMatrix, Sa::TSize, Sb::TSize, + a::StaticMatrix, b::StaticMatrix, _add::MulAddMul) = + mul_blas!(transpose(Sc), c, transpose(Sb), transpose(Sa), b, a, _add) diff --git a/test/matrix_multiply_add.jl b/test/matrix_multiply_add.jl new file mode 100644 index 00000000..cf6cf3c5 --- /dev/null +++ b/test/matrix_multiply_add.jl @@ -0,0 +1,178 @@ +using StaticArrays +using LinearAlgebra +using BenchmarkTools +using Test + +macro test_noalloc(ex) + esc(quote + $ex + @test(@allocated($ex) == 0) + end) +end + +# check_dims +@test StaticArrays.check_dims(Size(4,), Size(4,3), Size(3,)) +@test !StaticArrays.check_dims(Size(4,), Size(4,3), Size(4,)) +@test !StaticArrays.check_dims(Size(4,), Size(3,4), Size(4,)) + +@test StaticArrays.check_dims(Size(4,), Size(4,3), Size(3,1)) +@test StaticArrays.check_dims(Size(4,1), Size(4,3), Size(3,)) +@test StaticArrays.check_dims(Size(4,1), Size(4,3), Size(3,1)) +@test !StaticArrays.check_dims(Size(4,2), Size(4,3), Size(3,)) +@test !StaticArrays.check_dims(Size(4,), Size(4,3), Size(3,2)) +@test StaticArrays.check_dims(Size(4,2), Size(4,3), Size(3,2)) +@test !StaticArrays.check_dims(Size(4,2), Size(4,3), Size(3,3)) + +function test_multiply_add(N1,N2,ArrayType=MArray) + if ArrayType <: MArray + Mat = MMatrix + Vec = MVector + elseif ArrayType <: SizedArray + Mat = SizedMatrix + Vec = SizedVector + end + α,β = 2.0, 1.0 + + A = rand(Mat{N1,N2}) + At = Transpose(A) + b = rand(Vec{N2}) + c = rand(Vec{N1}) + + # Parent + @test parent(A) === A + @test parent(At) === A + @test size(parent(At)) == (N1,N2) + @test parent(b') === b + @test size(parent(b')) == (N2,) + + # TSize + ta = StaticArrays.TSize(A) + @test !StaticArrays.istranpose(ta) + @test size(ta) == (N1,N2) + @test Size(ta) == Size(N1,N2) + ta = StaticArrays.TSize(At) + @test StaticArrays.istranpose(ta) + @test size(ta) == (N2,N1) + @test Size(ta) == Size(N2,N1) + tb = StaticArrays.TSize(b') + @test StaticArrays.istranpose(tb) + ta = transpose(ta) + @test !StaticArrays.istranpose(ta) + @test size(ta) == (N1,N2) + @test Size(ta) == Size(N1,N2) + + # A*b + mul!(c,A,b) + @test c ≈ A*b + mul!(c,A,b,1.0,0.0) + @test c ≈ A*b + mul!(c,A,b,1.0,1.0) + @test c ≈ 2A*b + + # matrix-transpose × vector + mul!(b,At,c) + @test b ≈ A'c + mul!(b,At,c,2.0,0.0) + @test b ≈ 2A'c + mul!(b,At,c,1.0,2.0) + @test b ≈ 5A'c + + @test_noalloc mul!(c,A,b) + bmark = @benchmark mul!($c,$A,$b,$α,$β) samples=10 evals=10 + @test minimum(bmark).allocs == 0 + # @test_noalloc mul!(c, A, b, α, β) # records 32 bytes + bmark = @benchmark mul!($b,Transpose($A),$c) samples=10 evals=10 + @test minimum(bmark).allocs == 0 + # @test_noalloc mul!(b, Transpose(A), c) # records 16 bytes + bmark = @benchmark mul!($b,Transpose($A),$c,$α,$β) samples=10 evals=10 + @test minimum(bmark).allocs == 0 + # @test_noalloc mul!(b, Transpose(A), c, α, β) # records 48 bytes + + # outer product + C = rand(Mat{N1,N2}) + a = rand(Vec{N1}) + b = rand(Vec{N2}) + mul!(C,a,b') + @test C ≈ a*b' + mul!(C,a,b',2.,0.) + @test C ≈ 2a*b' + mul!(C,a,b',1.,1.) + @test C ≈ 3a*b' + + b = @benchmark mul!($C,$a,$b') samples=10 evals=10 + @test minimum(b).allocs == 0 + # @test_noalloc mul!(C, a, b') # records 16 bytes + + # A × B + A = rand(Mat{N1,N2}) + B = rand(Mat{N2,N2}) + C = rand(Mat{N1,N2}) + mul!(C,A,B) + @test C ≈ A*B + mul!(C,A,B,2.0,0.0) + @test C ≈ 2A*B + mul!(C,A,B,2.0,1.0) + @test C ≈ 4A*B + + b = @benchmark mul!($C,$A,$B,$α,$β) samples=10 evals=10 + @test minimum(b).allocs == 0 + # @test_noalloc mul!(C, A, B, α, β) # records 32 bytes + + # A'B + At = Transpose(A) + mul!(B,At,C) + @test B ≈ A'C + mul!(B,At,C,2.0,0.0) + @test B ≈ 2A'C + mul!(B,At,C,2.0,1.0) + @test B ≈ 4A'C + + b = @benchmark mul!($B,Transpose($A),$C,$α,$β) samples=10 evals=10 + @test minimum(b).allocs == 0 + # @test_noalloc mul!(B, Transpose(A), C, α, β) # records 48 bytes + + # A*B' + Bt = Transpose(B) + mul!(C,A,Bt) + @test C ≈ A*B' + mul!(C,A,Bt,2.0,0.0) + @test C ≈ 2A*B' + mul!(C,A,Bt,2.0,1.0) + @test C ≈ 4A*B' + + b = @benchmark mul!($C,$A,Transpose($B),$α,$β) samples=10 evals=10 + @test minimum(b).allocs == 0 + # @test_noalloc mul!(C, A, Transpose(B), α, β) # records 48 bytes + + # A'B' + B = rand(Mat{N1,N1}) + C = rand(Mat{N2,N1}) + mul!(C,Transpose(A),Transpose(B)) + @test C ≈ A'B' + mul!(C,Transpose(A),Transpose(B),2.0,0.0) + @test C ≈ 2A'B' + mul!(C,Transpose(A),Transpose(B),2.0,1.0) + @test C ≈ 4A'B' + + b = @benchmark mul!($C,Transpose($A),Transpose($B),$α,$β) samples=10 evals=10 + @test minimum(b).allocs == 0 + # @test_noalloc mul!(C, Transpose(A), Transpose(B), α, β) # records 64 bytes + + # Transpose Output + C = rand(Mat{N1,N2}) + mul!(Transpose(C),Transpose(A),Transpose(B)) + @test C' ≈ A'B' + b = @benchmark mul!(Transpose($C),Transpose($A),Transpose($B),$α,$β) samples=10 evals=10 + @test minimum(b).allocs == 0 + # @test_noalloc mul!(Transpose(C), Transpose(A), Transpose(B), α, β) # records 80 bytes +end + +# Test the three different +@testset "matrix multiply-add" begin + test_multiply_add(3,4) + test_multiply_add(5,6) + test_multiply_add(15,16) + test_multiply_add(3,4,SizedArray) + test_multiply_add(5,6,SizedArray) + test_multiply_add(15,16,SizedArray) +end diff --git a/test/runtests.jl b/test/runtests.jl index ecb2ee49..7be5165b 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -49,6 +49,7 @@ addtests("arraymath.jl") addtests("broadcast.jl") addtests("linalg.jl") addtests("matrix_multiply.jl") +addtests("matrix_multiply_add.jl") addtests("triangular.jl") addtests("det.jl") addtests("inv.jl")