Skip to content

Commit e508e50

Browse files
committed
Added a couple tests, and tweaked determinestrategy.jl.
1 parent 1f3e9d2 commit e508e50

File tree

2 files changed

+59
-27
lines changed

2 files changed

+59
-27
lines changed

src/determinestrategy.jl

Lines changed: 34 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -408,41 +408,55 @@ function maybedemotesize(uβ‚‚::Int, N::Int, U::Int, Uloop::Loop, maxuβ‚‚base::In
408408
end
409409
uβ‚‚
410410
end
411+
411412
function solve_unroll(
412-
ls::LoopSet, u₁loopsym::Symbol, tiled::Symbol,
413+
ls::LoopSet, u₁loopsym::Symbol, uβ‚‚loopsym::Symbol,
413414
cost_vec::AbstractVector{Float64},
414415
reg_pressure::AbstractVector{Float64},
415416
W::Int, vectorized::Symbol
417+
)
418+
u₁loop = getloop(ls, u₁loopsym)
419+
uβ‚‚loop = getloop(ls, uβ‚‚loopsym)
420+
solve_unroll(
421+
u₁loopsym, uβ‚‚loopsym, cost_vec, reg_pressure, W, vectorized, u₁loop, uβ‚‚loop
422+
)
423+
end
424+
425+
function solve_unroll(
426+
u₁loopsym::Symbol, uβ‚‚loopsym::Symbol,
427+
cost_vec::AbstractVector{Float64},
428+
reg_pressure::AbstractVector{Float64},
429+
W::Int, vectorized::Symbol,
430+
u₁loop::Loop, uβ‚‚loop::Loop
416431
)
417432
maxuβ‚‚base = maxu₁base = VectorizationBase.REGISTER_COUNT == 32 ? 6 : 4#8
418433
maxuβ‚‚ = maxuβ‚‚base#8
419434
maxu₁ = maxu₁base#8
420-
tiledloop = getloop(ls, tiled)
421-
unrolledloop = getloop(ls, u₁loopsym)
422-
if isstaticloop(tiledloop)
423-
if length(tiledloop) ≀ 4
424-
uβ‚‚ = length(tiledloop)
425-
u₁ = max(1, solve_unroll_constT(cost_vec, reg_pressure, uβ‚‚))
426-
return u₁, uβ‚‚, unroll_cost(cost_vec, u₁, uβ‚‚, length(unrolledloop), uβ‚‚)
435+
if isstaticloop(uβ‚‚loop)
436+
uβ‚‚L = length(uβ‚‚loop)
437+
if uβ‚‚loopsym !== vectorized && uβ‚‚L ≀ 4
438+
u₁ = max(1, solve_unroll_constT(cost_vec, reg_pressure, uβ‚‚L))
439+
return u₁, uβ‚‚L, unroll_cost(cost_vec, u₁, uβ‚‚L, length(u₁loop), uβ‚‚L)
427440
end
428-
maxuβ‚‚ = min(4maxuβ‚‚, length(tiledloop))
441+
uβ‚‚L = uβ‚‚loopsym === vectorized ? cld(uβ‚‚L,W) : uβ‚‚L
442+
maxuβ‚‚ = min(4maxuβ‚‚, uβ‚‚L)
429443
end
430-
if isstaticloop(unrolledloop)
431-
u₁L = length(unrolledloop)
444+
if isstaticloop(u₁loop)
445+
u₁L = length(u₁loop)
432446
if u₁loopsym !== vectorized && u₁L ≀ 4
433447
uβ‚‚ = max(1, solve_unroll_constU(cost_vec, reg_pressure, u₁L))
434-
return u₁L, uβ‚‚, unroll_cost(cost_vec, u₁L, uβ‚‚, u₁L, length(tiledloop))
448+
return u₁L, uβ‚‚, unroll_cost(cost_vec, u₁L, uβ‚‚, u₁L, length(uβ‚‚loop))
435449
end
436450
u₁L = u₁loopsym === vectorized ? cld(u₁L,W) : u₁L
437451
maxu₁ = min(4maxu₁, u₁L)
438452
end
439-
u₁, uβ‚‚, cost = solve_unroll(cost_vec, reg_pressure, maxu₁, maxuβ‚‚, length(unrolledloop), length(tiledloop))
453+
u₁, uβ‚‚, cost = solve_unroll(cost_vec, reg_pressure, maxu₁, maxuβ‚‚, length(u₁loop), length(uβ‚‚loop))
440454
# heuristic to more evenly divide small numbers of iterations
441-
if isstaticloop(tiledloop)
442-
uβ‚‚ = maybedemotesize(uβ‚‚, length(tiledloop), u₁, unrolledloop, maxuβ‚‚base)
455+
if isstaticloop(uβ‚‚loop)
456+
uβ‚‚ = maybedemotesize(uβ‚‚, length(uβ‚‚loop), u₁, u₁loop, maxuβ‚‚base)
443457
end
444-
if isstaticloop(unrolledloop)
445-
u₁ = maybedemotesize(u₁, length(unrolledloop), uβ‚‚, tiledloop, maxu₁base)
458+
if isstaticloop(u₁loop)
459+
u₁ = maybedemotesize(u₁, length(u₁loop), uβ‚‚, uβ‚‚loop, maxu₁base)
446460
end
447461
u₁, uβ‚‚, cost
448462
end
@@ -641,8 +655,8 @@ function evaluate_cost_tile(
641655
size_T = biggest_type_size(ls)
642656
W, Wshift = VectorizationBase.pick_vector_width_shift(length(ls, vectorized), size_T)::Tuple{Int,Int}
643657
# costs =
644-
# cost_mat[1] / ( unrolled * tiled)
645-
# cost_mat[2] / ( tiled)
658+
# cost_mat[1] / ( unrolled * u₂loopsym)
659+
# cost_mat[2] / ( u₂loopsym)
646660
# cost_mat[3] / ( unrolled)
647661
# cost_mat[4]
648662
# @show order
@@ -681,7 +695,7 @@ function evaluate_cost_tile(
681695
# cost is reduced by unrolling u₁ if it is interior to u₁loop (true if either u₁reached, or if depends on u₂ [or u₁]) and doesn't depend on u₁
682696
reduced_by_unrolling[1,id] = (u₁reached | depends_on_uβ‚‚) & !depends_on_u₁
683697
reduced_by_unrolling[2,id] = (uβ‚‚reached | depends_on_u₁) & !depends_on_uβ‚‚
684-
# @show op iter, unrolledtiled[:,id]
698+
# @show op iter, unrolledu₂loopsym[:,id]
685699
iters[id] = iter
686700
innerloop ∈ loopdependencies(op) && set_upstream_family!(descendentsininnerloop, op, true)
687701
end

test/gemm.jl

Lines changed: 25 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -87,36 +87,46 @@
8787
# end
8888
# C[m,n] += ΔCₘₙ * factor
8989
# end;
90-
function AmuladdBavx!(C, A, B, factor = 1)
90+
function AmuladdBavx!(C, A, B, Ξ± = one(eltype(C)))
9191
@avx for m ∈ 1:size(A,1), n ∈ 1:size(B,2)
9292
Ξ”Cβ‚˜β‚™ = zero(eltype(C))
9393
for k ∈ 1:size(A,2)
9494
Ξ”Cβ‚˜β‚™ += A[m,k] * B[k,n]
9595
end
96-
C[m,n] += Ξ”Cβ‚˜β‚™ * factor
96+
C[m,n] += Ξ± * Ξ”Cβ‚˜β‚™
97+
end
98+
end
99+
function AmuladdBavx!(C, A, B, Ξ±, Ξ²)# = zero(eltype(C)))
100+
@avx for m ∈ 1:size(A,1), n ∈ 1:size(B,2)
101+
Ξ”Cβ‚˜β‚™ = zero(eltype(C))
102+
for k ∈ 1:size(A,2)
103+
Ξ”Cβ‚˜β‚™ += A[m,k] * B[k,n]
104+
end
105+
C[m,n] = Ξ± * Ξ”Cβ‚˜β‚™ + Ξ² * C[m,n]
97106
end
98107
end
99108
Amuladdq = :(for m ∈ 1:size(A,1), n ∈ 1:size(B,2)
100109
Ξ”Cβ‚˜β‚™ = zero(eltype(C))
101110
for k ∈ 1:size(A,2)
102111
Ξ”Cβ‚˜β‚™ += A[m,k] * B[k,n]
103112
end
104-
C[m,n] += Ξ”Cβ‚˜β‚™ * factor
113+
C[m,n] = Ξ± * Ξ”Cβ‚˜β‚™ + Ξ² * C[m,n]
105114
end);
106115
lsAmuladd = LoopVectorization.LoopSet(Amuladdq);
116+
@test LoopVectorization.choose_order(lsAmuladd) == (Symbol[:n,:m,:k], :n, :m, :m, Unum, Tnum)
107117
Atmuladdq = :(for m ∈ 1:size(A,2), n ∈ 1:size(B,2)
108118
Ξ”Cβ‚˜β‚™ = zero(eltype(C))
109119
for k ∈ 1:size(A,1)
110120
Ξ”Cβ‚˜β‚™ += A[k,m] * B[k,n]
111-
end
112-
C[m,n] += Ξ”Cβ‚˜β‚™ * factor
121+
end
122+
C[m,n] += Ξ± * Ξ”Cβ‚˜β‚™
113123
end);
114124
lsAtmuladd = LoopVectorization.LoopSet(Atmuladdq);
115-
LoopVectorization.lower(lsAtmuladd, 2, 2)
125+
# LoopVectorization.lower(lsAtmuladd, 2, 2)
116126
# lsAmuladd.operations
117127
# LoopVectorization.loopdependencies.(lsAmuladd.operations)
118128
# LoopVectorization.reduceddependencies.(lsAmuladd.operations)
119-
@test LoopVectorization.choose_order(lsAmuladd) == (Symbol[:n,:m,:k], :n, :m, :m, Unum, Tnum)
129+
@test LoopVectorization.choose_order(lsAtmuladd) == (Symbol[:n,:m,:k], :n, :m, :k, Unum, Tnum)
120130

121131
function AmulB_avx1!(C, A, B)
122132
@_avx for m ∈ 1:size(A,1), n ∈ 1:size(B,2)
@@ -615,6 +625,14 @@
615625
@test C β‰ˆ C2
616626
AmuladdBavx!(C, At', B, -2)
617627
@test C β‰ˆ -C2
628+
AmuladdBavx!(C, At', B, 3, 2)
629+
@test C β‰ˆ C2
630+
# How much of this can I do before rounding errors are likely to cause test failures?
631+
# Setting back to zero here...
632+
AmuladdBavx!(C, At', B, 1, 0)
633+
@test C β‰ˆ C2
634+
AmuladdBavx!(C, At', Bt', 2, -1)
635+
@test C β‰ˆ C2
618636
fill!(C, 9999.999); AmulB2x2avx!(C, A, B);
619637
@test C β‰ˆ C2
620638
fill!(C, 9999.999); AmulB2x2avx!(C, At', B);

0 commit comments

Comments
Β (0)