diff --git a/Project.toml b/Project.toml
index fc56beb8..2e977db4 100644
--- a/Project.toml
+++ b/Project.toml
@@ -13,6 +13,7 @@ julia = "1"
 [extras]
 InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
 
 [targets]
-test = ["InteractiveUtils", "Test"]
+test = ["InteractiveUtils", "Test", "BenchmarkTools"]
diff --git a/benchmark/bench_matrix_ops.jl b/benchmark/bench_matrix_ops.jl
index 2795c1e2..e484aabd 100644
--- a/benchmark/bench_matrix_ops.jl
+++ b/benchmark/bench_matrix_ops.jl
@@ -41,5 +41,50 @@ for f in [*, \]
     end
 end
 
+# Multiply-add
+function benchmark_matmul(s,N1,N2,ArrayType)
+    if ArrayType <: MArray
+        Mat = MMatrix
+        A = rand(Mat{N1,N2})
+        B = rand(Mat{N2,N2})
+        C = rand(Mat{N1,N2})
+        label = "MMatrix"
+    elseif ArrayType <: SizedArray
+        Mat = SizedMatrix
+        A = rand(Mat{N1,N2})
+        B = rand(Mat{N2,N2})
+        C = rand(Mat{N1,N2})
+        label = "SizedMatrix"
+    elseif ArrayType <: Array
+        A = rand(N1,N2)
+        B = rand(N2,N2)
+        C = rand(N1,N2)
+        label = "Matrix"
+    end
+    α,β = 1.0, 1.0
+    s1 = s["mul!(C,A,B)"][string(N1, pad=2) * string(N2, pad=2)] = BenchmarkGroup()
+    s2 = s["mul!(C,A,B,α,β)"][string(N1, pad=2) * string(N2, pad=2)] = BenchmarkGroup()
+    s3 = s["mul!(B,A',C)"][string(N1, pad=2) * string(N2, pad=2)] = BenchmarkGroup()
+    s4 = s["mul!(B,A',C,α,β)"][string(N1, pad=2) * string(N2, pad=2)] = BenchmarkGroup()
+
+    s1[label] = @benchmarkable mul!($C,$A,$B)
+    s2[label] = @benchmarkable mul!($C,$A,$B,$α,$β)
+    s3[label] = @benchmarkable mul!($B,Transpose($A),$C)
+    s4[label] = @benchmarkable mul!($B,Transpose($A),$C,$α,$β)
+end
+
+begin
+    suite["mul!(C,A,B)"] = BenchmarkGroup(["inplace", "multiply-add"])
+    suite["mul!(C,A,B,α,β)"] = BenchmarkGroup(["inplace", "multiply-add"])
+    suite["mul!(B,A',C)"] = BenchmarkGroup(["inplace", "multiply-add"])
+    suite["mul!(B,A',C,α,β)"] = BenchmarkGroup(["inplace", "multiply-add"])
+    for N in matrix_sizes
+        for Mat in (MMatrix, SizedMatrix, Matrix)
+            benchmark_matmul(suite, N+1, N, Mat)
+        end
+    end
+end
+
+
 end  # module
 BenchMatrixOps.suite
diff --git a/src/StaticArrays.jl b/src/StaticArrays.jl
index fa520eb9..5502a61e 100644
--- a/src/StaticArrays.jl
+++ b/src/StaticArrays.jl
@@ -126,6 +126,7 @@ include("mapreduce.jl")
 include("sort.jl")
 include("arraymath.jl")
 include("linalg.jl")
+include("matrix_multiply_add.jl")
 include("matrix_multiply.jl")
 include("det.jl")
 include("inv.jl")
diff --git a/src/matrix_multiply.jl b/src/matrix_multiply.jl
index 15996b05..b9bbb284 100644
--- a/src/matrix_multiply.jl
+++ b/src/matrix_multiply.jl
@@ -13,14 +13,6 @@ import LinearAlgebra: BlasFloat, matprod, mul!
 @inline *(A::StaticArray{Tuple{N,1},<:Any,2}, B::Adjoint{<:Any,<:StaticVector}) where {N} = vec(A) * B
 @inline *(A::StaticArray{Tuple{N,1},<:Any,2}, B::Transpose{<:Any,<:StaticVector}) where {N} = vec(A) * B
 
-@inline mul!(dest::StaticVecOrMat, A::StaticMatrix, B::StaticVector) = _mul!(Size(dest), dest, Size(A), Size(B), A, B)
-@inline mul!(dest::StaticVecOrMat, A::StaticMatrix, B::StaticMatrix) = _mul!(Size(dest), dest, Size(A), Size(B), A, B)
-@inline mul!(dest::StaticVecOrMat, A::StaticVector, B::StaticMatrix) = mul!(dest, reshape(A, Size(Size(A)[1], 1)), B)
-@inline mul!(dest::StaticVecOrMat, A::StaticVector, B::Transpose{<:Any, <:StaticVector}) = _mul!(Size(dest), dest, Size(A), Size(B), A, B)
-@inline mul!(dest::StaticVecOrMat, A::StaticVector, B::Adjoint{<:Any, <:StaticVector}) = _mul!(Size(dest), dest, Size(A), Size(B), A, B)
-#@inline *{TA<:LinearAlgebra.BlasFloat,Tb}(A::StaticMatrix{TA}, b::StaticVector{Tb})
-
-
 
 # Implementations
 
@@ -97,10 +89,14 @@ end
 
     # Heuristic choice between BLAS and explicit unrolling (or chunk-based unrolling)
     if sa[1]*sa[2]*sb[2] >= 14*14*14
+        Sa = TSize{size(S),false}()
+        Sb = TSize{sa,false}()
+        Sc = TSize{sb,false}()
+        _add = MulAddMul(true,false)
         return quote
             @_inline_meta
             C = similar(a, T, $S)
-            mul_blas!($S, C, Sa, Sb, a, b)
+            mul_blas!($Sa, C, $Sa, $Sb, a, b, $_add)
             return C
         end
     elseif sa[1]*sa[2]*sb[2] < 8*8*8
@@ -177,7 +173,7 @@ end
     # Do a custom b[:, k2] to return a SVector (an isbitstype type) rather than (possibly) a mutable type. Avoids allocation == faster
     tmp_type_in = :(SVector{$(sb[1]), T})
     tmp_type_out = :(SVector{$(sa[1]), T})
-    vect_exprs = [:($(Symbol("tmp_$k2"))::$tmp_type_out = partly_unrolled_multiply(Size(a), Size($(sb[1])), a,
+    vect_exprs = [:($(Symbol("tmp_$k2"))::$tmp_type_out = partly_unrolled_multiply(TSize(a), TSize($(sb[1])), a,
                     $(Expr(:call, tmp_type_in, [Expr(:ref, :b, LinearIndices(sb)[i, k2]) for i = 1:sb[1]]...)))::$tmp_type_out)
                   for k2 = 1:sb[2]]
 
@@ -193,201 +189,4 @@ end
     end
 end
 
-@generated function partly_unrolled_multiply(::Size{sa}, ::Size{sb}, a::StaticMatrix{<:Any, <:Any, Ta}, b::StaticArray{<:Tuple, Tb}) where {sa, sb, Ta, Tb}
-    if sa[2] != sb[1]
-        throw(DimensionMismatch("Tried to multiply arrays of size $sa and $sb"))
-    end
-
-    if sa[2] != 0
-        exprs = [reduce((ex1,ex2) -> :(+($ex1,$ex2)), [:(a[$(LinearIndices(sa)[k, j])]*b[$j]) for j = 1:sa[2]]) for k = 1:sa[1]]
-    else
-        exprs = [:(zero(promote_op(matprod,Ta,Tb))) for k = 1:sa[1]]
-    end
-
-    return quote
-        $(Expr(:meta,:noinline))
-        @inbounds return SVector(tuple($(exprs...)))
-    end
-end
-
-# TODO aliasing problems if c === b?
-@generated function _mul!(::Size{sc}, c::StaticVector, ::Size{sa}, ::Size{sb}, a::StaticMatrix, b::StaticVector) where {sa, sb, sc}
-    if sb[1] != sa[2] || sc[1] != sa[1]
-        throw(DimensionMismatch("Tried to multiply arrays of size $sa and $sb and assign to array of size $sc"))
-    end
-
-    if sa[2] != 0
-        exprs = [:(c[$k] = $(reduce((ex1,ex2) -> :(+($ex1,$ex2)), [:(a[$(LinearIndices(sa)[k, j])]*b[$j]) for j = 1:sa[2]]))) for k = 1:sa[1]]
-    else
-        exprs = [:(c[$k] = zero(eltype(c))) for k = 1:sa[1]]
-    end
-
-    return quote
-        @_inline_meta
-        @inbounds $(Expr(:block, exprs...))
-        return c
-    end
-end
-
-@generated function _mul!(::Size{sc}, c::StaticMatrix, ::Size{sa}, ::Size{sb}, a::StaticVector,
-        b::Union{Transpose{<:Any, <:StaticVector}, Adjoint{<:Any, <:StaticVector}}) where {sa, sb, sc}
-    if sa[1] != sc[1] || sb[2] != sc[2]
-        throw(DimensionMismatch("Tried to multiply arrays of size $sa and $sb and assign to array of size $sc"))
-    end
-
-    exprs = [:(c[$(LinearIndices(sc)[i, j])] = a[$i] * b[$j]) for i = 1:sa[1], j = 1:sb[2]]
-
-    return quote
-        @_inline_meta
-        @inbounds $(Expr(:block, exprs...))
-        return c
-    end
-end
-
-@generated function _mul!(Sc::Size{sc}, c::StaticMatrix{<:Any, <:Any, Tc}, Sa::Size{sa}, Sb::Size{sb}, a::StaticMatrix{<:Any, <:Any, Ta}, b::StaticMatrix{<:Any, <:Any, Tb}) where {sa, sb, sc, Ta, Tb, Tc}
-    can_blas = Tc == Ta && Tc == Tb && Tc <: BlasFloat
-
-    if can_blas
-        if sa[1] * sa[2] * sb[2] < 4*4*4
-            return quote
-                @_inline_meta
-                mul_unrolled!(Sc, c, Sa, Sb, a, b)
-                return c
-            end
-        elseif sa[1] * sa[2] * sb[2] < 14*14*14 # Something seems broken for this one with large matrices (becomes allocating)
-            return quote
-                @_inline_meta
-                mul_unrolled_chunks!(Sc, c, Sa, Sb, a, b)
-                return c
-            end
-        else
-            return quote
-                @_inline_meta
-                mul_blas!(Sc, c, Sa, Sb, a, b)
-                return c
-            end
-        end
-    else
-        if sa[1] * sa[2] * sb[2] < 4*4*4
-            return quote
-                @_inline_meta
-                mul_unrolled!(Sc, c, Sa, Sb, a, b)
-                return c
-            end
-        else
-            return quote
-                @_inline_meta
-                mul_unrolled_chunks!(Sc, c, Sa, Sb, a, b)
-                return c
-            end
-        end
-    end
-end
-
-
-@generated function mul_blas!(::Size{s}, c::StaticMatrix{<:Any, <:Any, T}, ::Size{sa}, ::Size{sb}, a::StaticMatrix{<:Any, <:Any, T}, b::StaticMatrix{<:Any, <:Any, T}) where {s,sa,sb, T <: BlasFloat}
-    if sb[1] != sa[2] || sa[1] != s[1] || sb[2] != s[2]
-        throw(DimensionMismatch("Tried to multiply arrays of size $sa and $sb and assign to array of size $s"))
-    end
-
-    if sa[1] > 0 && sa[2] > 0 && sb[2] > 0
-        # This code adapted from `gemm!()` in base/linalg/blas.jl
-
-        if T == Float64
-            gemm = :dgemm_
-        elseif T == Float32
-            gemm = :sgemm_
-        elseif T == Complex{Float64}
-            gemm = :zgemm_
-        else # T == Complex{Float32}
-            gemm = :cgemm_
-        end
-
-        blascall = quote
-             ccall((LinearAlgebra.BLAS.@blasfunc($gemm), LinearAlgebra.BLAS.libblas), Nothing,
-                 (Ref{UInt8}, Ref{UInt8}, Ref{LinearAlgebra.BLAS.BlasInt}, Ref{LinearAlgebra.BLAS.BlasInt},
-                  Ref{LinearAlgebra.BLAS.BlasInt}, Ref{$T}, Ptr{$T}, Ref{LinearAlgebra.BLAS.BlasInt},
-                  Ptr{$T}, Ref{LinearAlgebra.BLAS.BlasInt}, Ref{$T}, Ptr{$T},
-                  Ref{LinearAlgebra.BLAS.BlasInt}),
-                  transA, transB, m, n,
-                  ka, alpha, a, strideA,
-                  b, strideB, beta, c,
-                  strideC)
-        end
-
-        return quote
-            alpha = one(T)
-            beta = zero(T)
-            transA = 'N'
-            transB = 'N'
-            m = $(sa[1])
-            ka = $(sa[2])
-            kb = $(sb[1])
-            n = $(sb[2])
-            strideA = $(sa[1])
-            strideB = $(sb[1])
-            strideC = $(s[1])
-
-            $blascall
-
-            return c
-        end
-    else
-        throw(DimensionMismatch("Cannot call BLAS gemm with zero-dimension arrays, attempted $sa * $sb -> $s."))
-    end
-end
-
-
-@generated function mul_unrolled!(::Size{sc}, c::StaticMatrix, ::Size{sa}, ::Size{sb}, a::StaticMatrix, b::StaticMatrix) where {sa, sb, sc}
-    if sb[1] != sa[2] || sa[1] != sc[1] || sb[2] != sc[2]
-        throw(DimensionMismatch("Tried to multiply arrays of size $sa and $sb and assign to array of size $sc"))
-    end
-
-    if sa[2] != 0
-        exprs = [:(c[$(LinearIndices(sc)[k1, k2])] = $(reduce((ex1,ex2) -> :(+($ex1,$ex2)), [:(a[$(LinearIndices(sa)[k1, j])]*b[$(LinearIndices(sb)[j, k2])]) for j = 1:sa[2]]))) for k1 = 1:sa[1], k2 = 1:sb[2]]
-    else
-        exprs = [:(c[$(LinearIndices(sc)[k1, k2])] = zero(eltype(c))) for k1 = 1:sa[1], k2 = 1:sb[2]]
-    end
-
-    return quote
-        @_inline_meta
-        @inbounds $(Expr(:block, exprs...))
-    end
-end
-
-@generated function mul_unrolled_chunks!(::Size{sc}, c::StaticMatrix, ::Size{sa}, ::Size{sb}, a::StaticMatrix, b::StaticMatrix) where {sa, sb, sc}
-    if sb[1] != sa[2] || sa[1] != sc[1] || sb[2] != sc[2]
-        throw(DimensionMismatch("Tried to multiply arrays of size $sa and $sb and assign to array of size $sc"))
-    end
-
-    #vect_exprs = [:($(Symbol("tmp_$k2")) = partly_unrolled_multiply(A, B[:, $k2])) for k2 = 1:sB[2]]
-
-    # Do a custom b[:, k2] to return a SVector (an isbitstype type) rather than a mutable type. Avoids allocation == faster
-    tmp_type = SVector{sb[1], eltype(c)}
-    vect_exprs = [:($(Symbol("tmp_$k2")) = partly_unrolled_multiply($(Size(sa)), $(Size(sb[1])), a, $(Expr(:call, tmp_type, [Expr(:ref, :b, LinearIndices(sb)[i, k2]) for i = 1:sb[1]]...)))) for k2 = 1:sb[2]]
-
-    exprs = [:(c[$(LinearIndices(sc)[k1, k2])] = $(Symbol("tmp_$k2"))[$k1]) for k1 = 1:sa[1], k2 = 1:sb[2]]
-
-    return quote
-        @_inline_meta
-        @inbounds $(Expr(:block, vect_exprs...))
-        @inbounds $(Expr(:block, exprs...))
-    end
-end
-
-#function mul_blas(a, b, c, A, B)
-#q
-#end
-
-# The idea here is to get pointers to stack variables and call BLAS.
-# This saves an aweful lot of time compared to copying SArray's to Ref{SArray{...}}
-# and using BLAS should be fastest for (very) large SArrays
-
-# Here is an LLVM function that gets the pointer to its input, %x
-# After this we would make the ccall above.
 #
-# define i8* @f(i32 %x) #0 {
-#     %1 = alloca i32, align 4
-#     store i32 %x, i32* %1, align 4
-#     ret i32* %1
-# }
diff --git a/src/matrix_multiply_add.jl b/src/matrix_multiply_add.jl
new file mode 100644
index 00000000..1802b8bb
--- /dev/null
+++ b/src/matrix_multiply_add.jl
@@ -0,0 +1,283 @@
+# import LinearAlgebra.MulAddMul
+
+abstract type MulAddMul{T} end
+
+struct AlphaBeta{T} <: MulAddMul{T}
+    α::T
+    β::T
+    function AlphaBeta{T}(α,β) where T <: Real
+        new{T}(α,β)
+    end
+end
+@inline AlphaBeta(α::A,β::B) where {A,B} = AlphaBeta{promote_type(A,B)}(α,β)
+@inline alpha(ab::AlphaBeta) = ab.α
+@inline beta(ab::AlphaBeta) = ab.β
+
+struct NoMulAdd{T} <: MulAddMul{T} end
+@inline alpha(ma::NoMulAdd{T}) where T = one(T)
+@inline beta(ma::NoMulAdd{T}) where T = zero(T)
+
+""" Size that stores whether a Matrix is a Transpose
+Useful when selecting multiplication methods, and avoiding allocations when dealing with
+the `Transpose` type by passing around the original matrix.
+Should pair with `parent`.
+"""
+struct TSize{S,T}
+    function TSize{S,T}() where {S,T}
+        new{S::Tuple{Vararg{StaticDimension}},T::Bool}()
+    end
+end
+TSize(A::Type{<:Transpose{<:Any,<:StaticArray}}) = TSize{size(A),true}()
+TSize(A::Type{<:Adjoint{<:Real,<:StaticArray}}) = TSize{size(A),true}()  # can't handle complex adjoints yet
+TSize(A::Type{<:StaticArray}) = TSize{size(A),false}()
+TSize(A::StaticArrayLike) = TSize(typeof(A))
+TSize(S::Size{s}, T=false) where s = TSize{s,T}()
+TSize(s::Number) = TSize(Size(s))
+istranpose(::TSize{<:Any,T}) where T = T
+size(::TSize{S}) where S = S
+Size(::TSize{S}) where S = Size{S}()
+Base.transpose(::TSize{S,T}) where {S,T} = TSize{reverse(S),!T}()
+
+# Get the parent of transposed arrays, or the array itself if it has no parent
+#   QUESTION: maybe call this something else?
+Base.parent(A::Union{<:Transpose{<:Any,<:StaticArray}, <:Adjoint{<:Any,<:StaticArray}}) = A.parent
+Base.parent(A::StaticArray) = A
+
+# 5-argument matrix multiplication
+#    To avoid allocations, strip away Transpose type and store tranpose info in Size
+@inline LinearAlgebra.mul!(dest::StaticVecOrMatLike, A::StaticVecOrMatLike, B::StaticVecOrMatLike,
+    α::Real, β::Real) = _mul!(TSize(dest), parent(dest), TSize(A), TSize(B), parent(A), parent(B),
+    AlphaBeta(α,β))
+
+@inline LinearAlgebra.mul!(dest::StaticVecOrMatLike, A::StaticVecOrMatLike{T},
+        B::StaticVecOrMatLike{T}) where T =
+    _mul!(TSize(dest), parent(dest), TSize(A), TSize(B), parent(A), parent(B), NoMulAdd{T}())
+
+
+"Calculate the product of the dimensions being multiplied. Useful as a heuristic for unrolling."
+@inline multiplied_dimension(A::Type{<:StaticVecOrMatLike}, B::Type{<:StaticVecOrMatLike}) =
+    prod(size(A)) * size(B,2)
+
+"Validate the dimensions of a matrix multiplication, including matrix-vector products"
+function check_dims(::Size{sc}, ::Size{sa}, ::Size{sb}) where {sa,sb,sc}
+    if sb[1] != sa[2] || sc[1] != sa[1]
+        return false
+    elseif length(sc) == 2 || length(sb) == 2
+        sc2 = length(sc) == 1 ? 1 : sc[2]
+        sb2 = length(sb) == 1 ? 1 : sb[2]
+        if sc2 != sb2
+            return false
+        end
+    end
+    return true
+end
+
+""" Combine left and right sides of an assignment expression, short-cutting
+        lhs = α * rhs + β * lhs,
+    element-wise.
+If α = 1, the multiplication by α is removed. If β = 0, the second rhs term is removed.
+"""
+function _muladd_expr(lhs::Array{Expr}, rhs::Array{Expr}, ::Type{<:AlphaBeta})
+    @assert length(lhs) == length(rhs)
+    n = length(rhs)
+    rhs = [:(α * $(expr)) for expr in rhs]
+    rhs = [:($(lhs[k]) * β + $(rhs[k])) for k = 1:n]
+    exprs = [:($(lhs[k]) = $(rhs[k])) for k = 1:n]
+    _assign(lhs, rhs)
+    return exprs
+end
+
+@inline _muladd_expr(lhs::Array{Expr}, rhs::Array{Expr}, ::Type{<:MulAddMul}) = _assign(lhs, rhs)
+
+@inline function _assign(lhs::Array{Expr}, rhs::Array{Expr})
+    @assert length(lhs) == length(rhs)
+    [:($(lhs[k]) = $(rhs[k])) for k = 1:length(lhs)]
+end
+
+"Obtain an expression for the linear index of var[k,j], taking transposes into account"
+@inline _lind(A::Type{<:TSize}, k::Int, j::Int) = _lind(:a, A, k, j)
+function _lind(var::Symbol, A::Type{TSize{sa,tA}}, k::Int, j::Int) where {sa,tA}
+    if tA
+        return :($var[$(LinearIndices(reverse(sa))[j, k])])
+    else
+        return :($var[$(LinearIndices(sa)[k, j])])
+    end
+end
+
+# Matrix-vector multiplication
+@generated function _mul!(Sc::TSize{sc}, c::StaticVecOrMatLike, Sa::TSize{sa}, Sb::TSize{sb},
+        a::StaticMatrix, b::StaticVector, _add::MulAddMul,
+        ::Val{col}=Val(1)) where {sa, sb, sc, col}
+    if sa[2] != sb[1] || sc[1] != sa[1]
+        throw(DimensionMismatch("Tried to multiply arrays of size $sa and $sb and assign to array of size $sc"))
+    end
+
+    if sa[2] != 0
+        lhs = [:($(_lind(:c,Sc,k,col))) for k = 1:sa[1]]
+        ab = [:($(reduce((ex1,ex2) -> :(+($ex1,$ex2)),
+            [:($(_lind(Sa,k,j))*b[$j]) for j = 1:sa[2]]))) for k = 1:sa[1]]
+        exprs = _muladd_expr(lhs, ab, _add)
+    else
+        exprs = [:(c[$k] = zero(eltype(c))) for k = 1:sa[1]]
+    end
+
+    return quote
+        # @_inline_meta
+        # α = _add.alpha
+        # β = _add.beta
+        α = alpha(_add)
+        β = beta(_add)
+        @inbounds $(Expr(:block, exprs...))
+        return c
+    end
+end
+
+# Outer product
+@generated function _mul!(::TSize{sc}, c::StaticMatrix, ::TSize{sa,false}, ::TSize{sb,true},
+        a::StaticVector, b::StaticVector, _add::MulAddMul) where {sa, sb, sc}
+    if sc[1] != sa[1] || sc[2] != sb[2]
+        throw(DimensionMismatch("Tried to multiply arrays of size $sa and $sb and assign to array of size $sc"))
+    end
+
+    lhs = [:(c[$(LinearIndices(sc)[i,j])]) for i = 1:sa[1], j = 1:sb[2]]
+    ab = [:(a[$i] * b[$j]) for i = 1:sa[1], j = 1:sb[2]]
+    exprs = _muladd_expr(lhs, ab, _add)
+
+    return quote
+        @_inline_meta
+        α = alpha(_add)
+        β = beta(_add)
+        @inbounds $(Expr(:block, exprs...))
+        return c
+    end
+end
+
+# Matrix-matrix multiplication
+@generated function _mul!(Sc::TSize{sc}, c::StaticMatrixLike,
+        Sa::TSize{sa}, Sb::TSize{sb},
+        a::StaticMatrixLike, b::StaticMatrixLike,
+        _add::MulAddMul) where {sa, sb, sc}
+    Ta,Tb,Tc = eltype(a), eltype(b), eltype(c)
+    can_blas = Tc == Ta && Tc == Tb && Tc <: BlasFloat
+
+    mult_dim = multiplied_dimension(a,b)
+    if mult_dim < 4*4*4
+        return quote
+            @_inline_meta
+            muladd_unrolled_all!(Sc, c, Sa, Sb, a, b, _add)
+            return c
+        end
+    elseif mult_dim < 14*14*14 # Something seems broken for this one with large matrices (becomes allocating)
+        return quote
+            @_inline_meta
+            muladd_unrolled_chunks!(Sc, c, Sa, Sb, a, b, _add)
+            return c
+        end
+    else
+        if can_blas
+            return quote
+                @_inline_meta
+                mul_blas!(Sc, c, Sa, Sb, a, b, _add)
+                return c
+            end
+        else
+            return quote
+                @_inline_meta
+                muladd_unrolled_chunks!(Sc, c, Sa, Sb, a, b, _add)
+                return c
+            end
+        end
+    end
+end
+
+
+@generated function muladd_unrolled_all!(Sc::TSize{sc}, c::StaticMatrixLike, Sa::TSize{sa}, Sb::TSize{sb},
+        a::StaticMatrixLike, b::StaticMatrixLike, _add::MulAddMul) where {sa, sb, sc}
+    if !check_dims(Size(sc),Size(sa),Size(sb))
+        throw(DimensionMismatch("Tried to multiply arrays of size $sa and $sb and assign to array of size $sc"))
+    end
+
+    if sa[2] != 0
+        lhs = [:($(_lind(:c, Sc, k1, k2))) for k1 = 1:sa[1], k2 = 1:sb[2]]
+        ab = [:($(reduce((ex1,ex2) -> :(+($ex1,$ex2)),
+                [:($(_lind(:a, Sa, k1, j)) * $(_lind(:b, Sb, j, k2))) for j = 1:sa[2]]
+            ))) for k1 = 1:sa[1], k2 = 1:sb[2]]
+        exprs = _muladd_expr(lhs, ab, _add)
+    end
+
+    return quote
+        @_inline_meta
+        # α = _add.alpha
+        # β = _add.beta
+        α = alpha(_add)
+        β = beta(_add)
+        @inbounds $(Expr(:block, exprs...))
+    end
+end
+
+
+@generated function muladd_unrolled_chunks!(Sc::TSize{sc}, c::StaticMatrix, ::TSize{sa,tA}, Sb::TSize{sb,tB},
+        a::StaticMatrix, b::StaticMatrix, _add::MulAddMul) where {sa, sb, sc, tA, tB}
+    if sb[1] != sa[2] || sa[1] != sc[1] || sb[2] != sc[2]
+        throw(DimensionMismatch("Tried to multiply arrays of size $sa and $sb and assign to array of size $sc"))
+    end
+
+    #vect_exprs = [:($(Symbol("tmp_$k2")) = partly_unrolled_multiply(A, B[:, $k2])) for k2 = 1:sB[2]]
+
+    # Do a custom b[:, k2] to return a SVector (an isbitstype type) rather than a mutable type. Avoids allocation == faster
+    tmp_type = SVector{sb[1], eltype(c)}
+    vect_exprs = [:($(Symbol("tmp_$k2")) = partly_unrolled_multiply($(TSize{sa,tA}()), $(TSize{(sb[1],),tB}()),
+        a, $(Expr(:call, tmp_type, [:($(_lind(:b, Sb, i, k2))) for i = 1:sb[1]]...)))) for k2 = 1:sb[2]]
+
+    lhs = [:($(_lind(:c, Sc, k1, k2))) for k1 = 1:sa[1], k2 = 1:sb[2]]
+    # exprs = [:(c[$(LinearIndices(sc)[k1, k2])] = $(Symbol("tmp_$k2"))[$k1]) for k1 = 1:sa[1], k2 = 1:sb[2]]
+    rhs = [:($(Symbol("tmp_$k2"))[$k1]) for k1 = 1:sa[1], k2 = 1:sb[2]]
+    exprs = _muladd_expr(lhs, rhs, _add)
+
+    return quote
+        @_inline_meta
+        # α = _add.alpha
+        # β = _add.beta
+        α = alpha(_add)
+        β = beta(_add)
+        @inbounds $(Expr(:block, vect_exprs...))
+        @inbounds $(Expr(:block, exprs...))
+    end
+end
+
+# @inline partly_unrolled_multiply(Sa::Size, Sb::Size, a::StaticMatrix, b::StaticArray) where {sa, sb, Ta, Tb} =
+#     partly_unrolled_multiply(TSize(Sa), TSize(Sb), a, b)
+@generated function partly_unrolled_multiply(Sa::TSize{sa}, ::TSize{sb}, a::StaticMatrix{<:Any, <:Any, Ta}, b::StaticArray{<:Tuple, Tb}) where {sa, sb, Ta, Tb}
+    if sa[2] != sb[1]
+        throw(DimensionMismatch("Tried to multiply arrays of size $sa and $sb"))
+    end
+
+    if sa[2] != 0
+        exprs = [reduce((ex1,ex2) -> :(+($ex1,$ex2)), [:($(_lind(:a,Sa,k,j))*b[$j]) for j = 1:sa[2]]) for k = 1:sa[1]]
+    else
+        exprs = [:(zero(promote_op(matprod,Ta,Tb))) for k = 1:sa[1]]
+    end
+
+    return quote
+        $(Expr(:meta,:noinline))
+        @inbounds return SVector(tuple($(exprs...)))
+    end
+end
+
+@inline _get_raw_data(A::SizedArray) = A.data
+@inline _get_raw_data(A::StaticArray) = A
+
+function mul_blas!(::TSize{<:Any,false}, c::StaticMatrix, ::TSize{<:Any,tA}, ::TSize{<:Any,tB},
+        a::StaticMatrix, b::StaticMatrix, _add::MulAddMul) where {tA,tB}
+    mat_char(tA) = tA ? 'T' : 'N'
+    T = eltype(a)
+    A = _get_raw_data(a)
+    B = _get_raw_data(b)
+    C = _get_raw_data(c)
+    BLAS.gemm!(mat_char(tA), mat_char(tB), T(alpha(_add)), A, B, T(beta(_add)), C)
+end
+
+# if C is transposed, transpose the entire expression
+@inline mul_blas!(Sc::TSize{<:Any,true}, c::StaticMatrix, Sa::TSize, Sb::TSize,
+        a::StaticMatrix, b::StaticMatrix, _add::MulAddMul) =
+    mul_blas!(transpose(Sc), c, transpose(Sb), transpose(Sa), b, a, _add)
diff --git a/test/matrix_multiply_add.jl b/test/matrix_multiply_add.jl
new file mode 100644
index 00000000..cf6cf3c5
--- /dev/null
+++ b/test/matrix_multiply_add.jl
@@ -0,0 +1,178 @@
+using StaticArrays
+using LinearAlgebra
+using BenchmarkTools
+using Test
+
+macro test_noalloc(ex)
+    esc(quote
+        $ex
+        @test(@allocated($ex) == 0)
+    end)
+end
+
+# check_dims
+@test StaticArrays.check_dims(Size(4,), Size(4,3), Size(3,))
+@test !StaticArrays.check_dims(Size(4,), Size(4,3), Size(4,))
+@test !StaticArrays.check_dims(Size(4,), Size(3,4), Size(4,))
+
+@test StaticArrays.check_dims(Size(4,), Size(4,3), Size(3,1))
+@test StaticArrays.check_dims(Size(4,1), Size(4,3), Size(3,))
+@test StaticArrays.check_dims(Size(4,1), Size(4,3), Size(3,1))
+@test !StaticArrays.check_dims(Size(4,2), Size(4,3), Size(3,))
+@test !StaticArrays.check_dims(Size(4,), Size(4,3), Size(3,2))
+@test StaticArrays.check_dims(Size(4,2), Size(4,3), Size(3,2))
+@test !StaticArrays.check_dims(Size(4,2), Size(4,3), Size(3,3))
+
+function test_multiply_add(N1,N2,ArrayType=MArray)
+    if ArrayType <: MArray
+        Mat = MMatrix
+        Vec = MVector
+    elseif ArrayType <: SizedArray
+        Mat = SizedMatrix
+        Vec = SizedVector
+    end
+    α,β = 2.0, 1.0
+
+    A = rand(Mat{N1,N2})
+    At = Transpose(A)
+    b = rand(Vec{N2})
+    c = rand(Vec{N1})
+
+    # Parent
+    @test parent(A) === A
+    @test parent(At) === A
+    @test size(parent(At)) == (N1,N2)
+    @test parent(b') === b
+    @test size(parent(b')) == (N2,)
+
+    # TSize
+    ta = StaticArrays.TSize(A)
+    @test !StaticArrays.istranpose(ta)
+    @test size(ta) == (N1,N2)
+    @test Size(ta) == Size(N1,N2)
+    ta = StaticArrays.TSize(At)
+    @test StaticArrays.istranpose(ta)
+    @test size(ta) == (N2,N1)
+    @test Size(ta) == Size(N2,N1)
+    tb = StaticArrays.TSize(b')
+    @test StaticArrays.istranpose(tb)
+    ta = transpose(ta)
+    @test !StaticArrays.istranpose(ta)
+    @test size(ta) == (N1,N2)
+    @test Size(ta) == Size(N1,N2)
+
+    # A*b
+    mul!(c,A,b)
+    @test c ≈ A*b
+    mul!(c,A,b,1.0,0.0)
+    @test c ≈ A*b
+    mul!(c,A,b,1.0,1.0)
+    @test c ≈ 2A*b
+
+    # matrix-transpose × vector
+    mul!(b,At,c)
+    @test b ≈ A'c
+    mul!(b,At,c,2.0,0.0)
+    @test b ≈ 2A'c
+    mul!(b,At,c,1.0,2.0)
+    @test b ≈ 5A'c
+
+    @test_noalloc mul!(c,A,b)
+    bmark = @benchmark mul!($c,$A,$b,$α,$β) samples=10 evals=10
+    @test minimum(bmark).allocs == 0
+    # @test_noalloc mul!(c, A, b, α, β)  # records 32 bytes
+    bmark = @benchmark mul!($b,Transpose($A),$c) samples=10 evals=10
+    @test minimum(bmark).allocs == 0
+    # @test_noalloc mul!(b, Transpose(A), c)  # records 16 bytes
+    bmark = @benchmark mul!($b,Transpose($A),$c,$α,$β) samples=10 evals=10
+    @test minimum(bmark).allocs == 0
+    # @test_noalloc mul!(b, Transpose(A), c, α, β)  # records 48 bytes
+
+    # outer product
+    C = rand(Mat{N1,N2})
+    a = rand(Vec{N1})
+    b = rand(Vec{N2})
+    mul!(C,a,b')
+    @test C ≈ a*b'
+    mul!(C,a,b',2.,0.)
+    @test C ≈ 2a*b'
+    mul!(C,a,b',1.,1.)
+    @test C ≈ 3a*b'
+
+    b = @benchmark mul!($C,$a,$b') samples=10 evals=10
+    @test minimum(b).allocs == 0
+    # @test_noalloc mul!(C, a, b')  # records 16 bytes
+
+    # A × B
+    A = rand(Mat{N1,N2})
+    B = rand(Mat{N2,N2})
+    C = rand(Mat{N1,N2})
+    mul!(C,A,B)
+    @test C ≈ A*B
+    mul!(C,A,B,2.0,0.0)
+    @test C ≈ 2A*B
+    mul!(C,A,B,2.0,1.0)
+    @test C ≈ 4A*B
+
+    b = @benchmark mul!($C,$A,$B,$α,$β) samples=10 evals=10
+    @test minimum(b).allocs == 0
+    # @test_noalloc mul!(C, A, B, α, β)  # records 32 bytes
+
+    # A'B
+    At = Transpose(A)
+    mul!(B,At,C)
+    @test B ≈ A'C
+    mul!(B,At,C,2.0,0.0)
+    @test B ≈ 2A'C
+    mul!(B,At,C,2.0,1.0)
+    @test B ≈ 4A'C
+
+    b = @benchmark mul!($B,Transpose($A),$C,$α,$β) samples=10 evals=10
+    @test minimum(b).allocs == 0
+    # @test_noalloc mul!(B, Transpose(A), C, α, β)  # records 48 bytes
+
+    # A*B'
+    Bt = Transpose(B)
+    mul!(C,A,Bt)
+    @test C ≈ A*B'
+    mul!(C,A,Bt,2.0,0.0)
+    @test C ≈ 2A*B'
+    mul!(C,A,Bt,2.0,1.0)
+    @test C ≈ 4A*B'
+
+    b = @benchmark mul!($C,$A,Transpose($B),$α,$β) samples=10 evals=10
+    @test minimum(b).allocs == 0
+    # @test_noalloc mul!(C, A, Transpose(B), α, β)  # records 48 bytes
+
+    # A'B'
+    B = rand(Mat{N1,N1})
+    C = rand(Mat{N2,N1})
+    mul!(C,Transpose(A),Transpose(B))
+    @test C ≈ A'B'
+    mul!(C,Transpose(A),Transpose(B),2.0,0.0)
+    @test C ≈ 2A'B'
+    mul!(C,Transpose(A),Transpose(B),2.0,1.0)
+    @test C ≈ 4A'B'
+
+    b = @benchmark mul!($C,Transpose($A),Transpose($B),$α,$β) samples=10 evals=10
+    @test minimum(b).allocs == 0
+    # @test_noalloc mul!(C, Transpose(A), Transpose(B), α, β)  # records 64 bytes
+
+    # Transpose Output
+    C = rand(Mat{N1,N2})
+    mul!(Transpose(C),Transpose(A),Transpose(B))
+    @test C' ≈ A'B'
+    b = @benchmark mul!(Transpose($C),Transpose($A),Transpose($B),$α,$β) samples=10 evals=10
+    @test minimum(b).allocs == 0
+    # @test_noalloc mul!(Transpose(C), Transpose(A), Transpose(B), α, β)  # records 80 bytes
+end
+
+# Test the three different
+@testset "matrix multiply-add" begin
+    test_multiply_add(3,4)
+    test_multiply_add(5,6)
+    test_multiply_add(15,16)
+    test_multiply_add(3,4,SizedArray)
+    test_multiply_add(5,6,SizedArray)
+    test_multiply_add(15,16,SizedArray)
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index ecb2ee49..7be5165b 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -49,6 +49,7 @@ addtests("arraymath.jl")
 addtests("broadcast.jl")
 addtests("linalg.jl")
 addtests("matrix_multiply.jl")
+addtests("matrix_multiply_add.jl")
 addtests("triangular.jl")
 addtests("det.jl")
 addtests("inv.jl")