@@ -48,15 +48,15 @@ T = Float64
4848 # end);
4949 # lsq2d = LoopVectorization.LoopSet(q2d); LoopVectorization.choose_order(lsq2d)
5050
51- oq2 = :(for j in rng2, i in rng1
52- tmp = zero (eltype (out))
53- for jk in - 1 : 1 , ik in - 1 : 1
54- tmp += A[i+ ik,j+ jk]* kern[ik,jk]
55- end
56- out[i,j] = tmp
57- end );
58- lsoq = LoopVectorization. LoopSet (oq2);
59- LoopVectorization. choose_order (lsoq)
51+ # oq2 = :(for j in rng2, i in rng1
52+ # tmp = zero(eltype(out))
53+ # for jk in -1:1, ik in -1:1
54+ # tmp += A[i+ik,j+jk]*kern[ik,jk]
55+ # end
56+ # out[i,j] = tmp
57+ # end);
58+ # lsoq = LoopVectorization.LoopSet(oq2);
59+ # LoopVectorization.choose_order(lsoq)
6060
6161 function avx2d! (out:: AbstractMatrix , A:: AbstractMatrix , kern)
6262 rng1k, rng2k = axes (kern)
@@ -141,19 +141,20 @@ T = Float64
141141 # lsuq = LoopVectorization.LoopSet(macroexpand(Base, uq));
142142 # LoopVectorization.choose_order(lsuq)
143143
144- # out = out1;
145- # z = zero(eltype(out));
146- # R=CartesianIndices(out);
147- # Rk = CartesianIndices(kern);
148- # lsgeneric = LoopVectorization.@avx_debug for I in R
149- # tmp = z
150- # for J in Rk
151- # tmp += A[I+J]*kern[J]
152- # end
153- # out[I] = tmp
154- # end;
155- # LoopVectorization.choose_order(lsgeneric)
156- # out = out1;
144+ # using LoopVectorization, OffsetArrays
145+ # T = Float64
146+ # A = rand(T, 100, 100);
147+ # kern = OffsetArray(rand(T, 3, 3), -1:1, -1:1);
148+ # out = OffsetArray(similar(A, size(A).-2), 1, 1); # stay away from the edges of A
149+ # lsgeneric = LoopVectorization.@avx_debug for I in CartesianIndices(out)
150+ # tmp = zero(eltype(out))
151+ # for J in CartesianIndices(kern)
152+ # tmp += A[I+J]*kern[J]
153+ # end
154+ # out[I] = tmp
155+ # end;
156+ # LoopVectorization.choose_order(lsgeneric)
157+ # # out = out1;
157158# lsgenerics = LoopVectorization.@avx_debug for I in CartesianIndices(out)
158159# tmp = zero(eltype(out))
159160# for J in CartesianIndices(skern)
@@ -194,7 +195,7 @@ T = Float64
194195
195196 for T ∈ (Float32, Float64)
196197 @show T, @__LINE__
197- A = rand (T, 100 , 100 );
198+ A = rand (T, 100 , 100 ); At = copy (A ' );
198199 kern = OffsetArray (rand (T, 3 , 3 ), - 1 : 1 , - 1 : 1 );
199200 skern = SizedOffsetMatrix {T,-1,1,-1,1} (parent (kern));
200201 out1 = OffsetArray (similar (A, size (A).- 2 ), 1 , 1 ); # stay away from the edges of A
@@ -210,6 +211,15 @@ T = Float64
210211 fill! (out2, NaN ); avx2d! (out2, A, skern);
211212 @test out1 ≈ out2
212213
214+ fill! (out2, NaN ); avx2d! (out2, At' , skern);
215+ @test out1 ≈ out2
216+
217+ fill! (out2, NaN ); avx2d! (out2' , A, skern);
218+ @test out1 ≈ out2'
219+
220+ fill! (out2, NaN ); avx2d! (out2' , At' , skern);
221+ @test out1 ≈ out2'
222+
213223 fill! (out3, NaN ); avx2douter! (out3, A, skern);
214224 @test out1 ≈ out3
215225
0 commit comments