Need to use opoffsets for symlicms.

chriselrod · chriselrod · commit e390c17d533d · 2020-03-30T01:59:31.000-04:00
diff --git a/Manifest.toml b/Manifest.toml
@@ -32,19 +32,19 @@ deps = ["Base64"]
 uuid = "d6f4376e-aef5-505a-96c1-9c027394607a"
 
 [[OffsetArrays]]
-git-tree-sha1 = "6a35d9446b40ae5004cd7bd0f1ae3505528c7fd6"
+git-tree-sha1 = "930db8ef90483570107f2396b1ffc6680f08e8b7"
 uuid = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
-version = "1.0.3"
+version = "1.0.4"
 
 [[Random]]
 deps = ["Serialization"]
 uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 
 [[SIMDPirates]]
 deps = ["VectorizationBase"]
-git-tree-sha1 = "8f89aa38f5e4e89f2a474ffdc850fc21d6ab9ed4"
+git-tree-sha1 = "53c43af0172c24b0783bd93650bd8b78afb3e57b"
 uuid = "21efa798-c60a-11e8-04d3-e1a92915a26a"
-version = "0.7.4"
+version = "0.7.5"
 
 [[SLEEFPirates]]
 deps = ["Libdl", "SIMDPirates", "VectorizationBase"]
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "LoopVectorization"
 uuid = "bdcacae8-1622-11e9-2a5c-532679323890"
 authors = ["Chris Elrod <elrodc@gmail.com>"]
-version = "0.6.24"
+version = "0.6.25"
 
 [deps]
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
@@ -13,7 +13,7 @@ VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"
 
 [compat]
 OffsetArrays = "1"
-SIMDPirates = "0.7.4"
+SIMDPirates = "0.7.5"
 SLEEFPirates = "0.4"
 UnPack = "0"
 VectorizationBase = "0.9.5"
diff --git a/src/determinestrategy.jl b/src/determinestrategy.jl
@@ -245,6 +245,19 @@ function tile_cost(X, U, T, UL, TL)
     # X[1]*Tfactor*Ufactor + X[4] + X[2] * Tfactor + X[3] * Ufactor
     X[1] + X[4] + X[2] * Tfactor + X[3] * Ufactor
 end
+# function itertilesize(X, UL, TL)
+#     cb = Inf
+#     Ub = 1; Tb = 1
+#     for U ∈ 1:4, T ∈ 1:4
+#         c = tile_cost(X, U, T, UL, TL)
+#         @show U, T, c
+#         if cb > c
+#             cb = c
+#             Ub = U; Tb = T
+#         end
+#     end
+#     Ub, Tb, cb
+# end
 function solve_tilesize(X, R, UL, TL)
     # @inbounds any(iszero, (R[1],R[2],R[3])) && return -1,-1,Inf #solve_smalltilesize(X, R, Umax, Tmax)
     first(iszero(R)) && return -1,-1,Inf #solve_smalltilesize(X, R, Umax, Tmax)
@@ -253,14 +266,17 @@ function solve_tilesize(X, R, UL, TL)
     # first solving for U via quadratic formula
     # X is vector of costs, and R is of register pressures
     RR = REGISTER_COUNT - R[3] - R[4] # RR ≡ RemainingRegisters
+    R[1] + R[2] > 0.5RR && return 1,1, tile_cost(X, 1, 1, UL, TL)
     a = (R[1])^2*X[2] - (R[2])^2*R[1]*X[3]/RR
     b = 2*R[1]*R[2]*X[3]
     c = -RR*R[1]*X[3]
-    Ufloat = (sqrt(b^2 - 4a*c) - b) / (2a)
-    Tfloat = (RR - Ufloat*R[2])/(Ufloat*R[1])
-    # @show Ufloat, Tfloat
+    discriminant = b^2 - 4a*c
+    discriminant < 0 && return -1,-1,Inf
+    Ufloat = (sqrt(discriminant) - b) / (2a)
+    Tfloat = (RR - max(1.0,Ufloat)*R[2])/(max(1.0,Ufloat)*R[1])
     if !(isfinite(Tfloat) && isfinite(Ufloat))
         return 4, 4, tile_cost(X, 4, 4, UL, TL)
+        # return itertilesize(X, UL, TL)
     end
     Ulow = max(1, floor(Int, Ufloat)) # must be at least 1
     Tlow = max(1, floor(Int, Tfloat)) # must be at least 1
diff --git a/src/graphs.jl b/src/graphs.jl
@@ -469,7 +469,7 @@ function add_operation!(
     elseif RHS.head === :if
         add_if!(ls, LHS, RHS, elementbytes, position)
     else
-        throw("Expression not recognized:\n$x")
+        throw("Expression not recognized:\n$RHS")
     end
 end
 add_operation!(ls::LoopSet, RHS::Expr, elementbytes::Int, position::Int) = add_operation!(ls, gensym(:LHS), RHS, elementbytes, position)
diff --git a/src/lower_compute.jl b/src/lower_compute.jl
@@ -28,7 +28,7 @@ function lower_compute!(
     end
     parentsunrolled = isunrolled_sym.(parents_op, unrolled, tiled)
     if instr.instr === :identity && name(first(parents_op)) === var && isone(length(parents_op))
-        if (opunrolled == first(parentsunrolled)) && ((!isnothing(suffix)) == first(parentstiled))
+        if (opunrolled == first(parentsunrolled)) && ((!isnothing(suffix)) == parentstiled[1])
             return
         end
     end
@@ -58,6 +58,7 @@ function lower_compute!(
                 for u ∈ 0:U-1
                     push!(q.args, Expr(:(=), Symbol(newparentname, u), Symbol(parentname, u)))
                 end
+                @show parentop
                 reduce_expr!(q, newparentname, Instruction(reduction_to_single_vector(instruction(newparentop))), U)
                 push!(q.args, Expr(:(=), newparentname, Symbol(newparentname, 0)))
             end
diff --git a/src/reconstruct_loopset.jl b/src/reconstruct_loopset.jl
@@ -169,7 +169,7 @@ function process_metadata!(ls::LoopSet, AM, num_arrays::Int)
     for (i,si) ∈ enumerate(AM[3].parameters)
         sii = si::Int
         s = gensym(:symlicm)
-        push!(ls.preamble_symsym, (si, s))
+        push!(ls.preamble_symsym, (opoffsets[sii] + 1, s))
         pushpreamble!(ls, Expr(:(=), s, Expr(:macrocall, Symbol("@inbounds"), LineNumberNode(@__LINE__,Symbol(@__FILE__)), Expr(:ref, :vargs, num_arrays + i))))
     end
     expandbyoffset!(ls.preamble_symint, AM[4].parameters, opoffsets)
@@ -343,6 +343,7 @@ end
 # elbytes(::VectorizationBase.AbstractPointer{T}) where {T} = sizeof(T)::Int
 typeeltype(::Type{P}) where {T,P<:VectorizationBase.AbstractPointer{T}} = T
 typeeltype(::Type{<:AbstractRange{T}}) where {T} = T
+# typeeltype(::Any) = Int8
 
 function add_array_symbols!(ls::LoopSet, arraysymbolinds::Vector{Symbol}, offset::Int)
     for (i,as) ∈ enumerate(arraysymbolinds)