Add ARM64 NEON implementations of slice sums.

Review fixes applied to this revision of the patch:

* Build constraints: "//go:build !purego || <arch>" was wrong. The
  _amd64/_arm64 filename suffix already supplies the GOARCH constraint,
  and the "|| <arch>" term made the assembly (and its Go stubs) build
  even under -tags purego, colliding with the purego fallback. The
  constraint stays "//go:build !purego", so the sums_amd64.{go,s} files
  and the generator's ConstraintExpr("!purego") need no change at all.
* sums_default.go: bare "//go:build purego" dropped the generic
  implementation for every architecture other than amd64/arm64; it is
  now "purego || (!amd64 && !arm64)".
* sums_arm64.s: in Go assembly "CMP R1, R3" sets flags from R3-R1, so
  picking the shorter length requires "CSEL LT, R3, R1, R4"; the
  previous operand order selected the *longer* length and walked off
  the end of the shorter slice when len(x) != len(y).
* sumUint16 tail loop: MOVW moves 32 bits (two uint16 elements),
  double-adding neighbours and letting carries cross element
  boundaries; it now uses MOVHU loads and a MOVH store.

diff --git a/build/slices/sums_asm.go b/build/slices/sums_asm.go
index 6548bbe7..79a17c07 100644
--- a/build/slices/sums_asm.go
+++ b/build/slices/sums_asm.go
@@ -1,4 +1,4 @@
-// +build ignore
+//go:build ignore
 
 package main
 
diff --git a/slices/sums.go b/slices/sums.go
index 9269c521..7b44cb0d 100644
--- a/slices/sums.go
+++ b/slices/sums.go
@@ -3,7 +3,7 @@ package slices
 import _ "github.com/segmentio/asm/cpu"
 
 // SumUint64 sums pairs of by index from x and y, similar to python's zip routine.
-// If available AVX instructions will be used to operate on many uint64s simultaneously.
+// If available SIMD instructions will be used to operate on many uint64s simultaneously.
 //
 // Results are returned in the x slice and y is left unaltered. If x and y differ in size
 // only len(x) elements will be processed.
@@ -18,7 +18,7 @@ func sumUint64Generic(x, y []uint64) {
 }
 
 // SumUint32 sums pairs of by index from x and y, similar to python's zip routine.
-// If available AVX instructions will be used to operate on many uint32s simultaneously.
+// If available SIMD instructions will be used to operate on many uint32s simultaneously.
 //
 // Results are returned in the x slice and y is left unaltered. If x and y differ in size
 // only len(x) elements will be processed.
@@ -33,7 +33,7 @@ func sumUint32Generic(x, y []uint32) {
 }
 
 // SumUint16 sums pairs of by index from x and y, similar to python's zip routine.
-// If available AVX instructions will be used to operate on many uint16s simultaneously.
+// If available SIMD instructions will be used to operate on many uint16s simultaneously.
 //
 // Results are returned in the x slice and y is left unaltered. If x and y differ in size
 // only len(x) elements will be processed.
@@ -48,7 +48,7 @@ func sumUint16Generic(x, y []uint16) {
 }
 
 // SumUint8 sums pairs of by index from x and y, similar to python's zip routine.
-// If available AVX instructions will be used to operate on many uint8s simultaneously.
+// If available SIMD instructions will be used to operate on many uint8s simultaneously.
 //
 // Results are returned in the x slice and y is left unaltered. If x and y differ in size
 // only len(x) elements will be processed.
diff --git a/slices/sums_arm64.go b/slices/sums_arm64.go
new file mode 100644
index 00000000..d952ecd7
--- /dev/null
+++ b/slices/sums_arm64.go
@@ -0,0 +1,15 @@
+//go:build !purego
+
+package slices
+
+// Sum uint64s using NEON instructions, results stored in x
+func sumUint64(x []uint64, y []uint64)
+
+// Sum uint32s using NEON instructions, results stored in x
+func sumUint32(x []uint32, y []uint32)
+
+// Sum uint16s using NEON instructions, results stored in x
+func sumUint16(x []uint16, y []uint16)
+
+// Sum uint8s using NEON instructions, results stored in x
+func sumUint8(x []uint8, y []uint8)
diff --git a/slices/sums_arm64.s b/slices/sums_arm64.s
new file mode 100644
index 00000000..156f2b59
--- /dev/null
+++ b/slices/sums_arm64.s
@@ -0,0 +1,240 @@
+//go:build !purego
+
+#include "textflag.h"
+
+// func sumUint64(x []uint64, y []uint64)
+TEXT ·sumUint64(SB), NOSPLIT, $0-48
+	MOVD x_base+0(FP), R0
+	MOVD x_len+8(FP), R1
+	MOVD y_base+24(FP), R2
+	MOVD y_len+32(FP), R3
+
+	CMP R1, R3
+	CSEL LT, R3, R1, R4
+
+	CBZ R4, done
+
+	LSR $2, R4, R5
+	AND $3, R4, R6
+
+	CBZ R5, remainder
+simd_loop:
+	VLD1 (R0), [V0.D2, V1.D2]
+	VLD1 (R2), [V2.D2, V3.D2]
+
+	VADD V2.D2, V0.D2, V0.D2
+	VADD V3.D2, V1.D2, V1.D2
+
+	VST1.P [V0.D2, V1.D2], 32(R0)
+	ADD $32, R2, R2
+
+	SUB $1, R5, R5
+	CBNZ R5, simd_loop
+
+remainder:
+	CBZ R6, done
+
+	LSR $1, R6, R7
+	CBZ R7, small_remainder
+
+	VLD1 (R0), [V0.D2]
+	VLD1 (R2), [V1.D2]
+	VADD V1.D2, V0.D2, V0.D2
+	VST1.P [V0.D2], 16(R0)
+	ADD $16, R2, R2
+	AND $1, R6, R6
+
+small_remainder:
+	CBZ R6, done
+	MOVD ZR, R5
+rem_loop:
+	LSL $3, R5, R7
+	MOVD (R0)(R7), R8
+	MOVD (R2)(R7), R9
+	ADD R8, R9, R8
+	MOVD R8, (R0)(R7)
+
+	ADD $1, R5, R5
+	CMP R6, R5
+	BLT rem_loop
+
+done:
+	RET
+
+// func sumUint32(x []uint32, y []uint32)
+TEXT ·sumUint32(SB), NOSPLIT, $0-48
+	MOVD x_base+0(FP), R0
+	MOVD x_len+8(FP), R1
+	MOVD y_base+24(FP), R2
+	MOVD y_len+32(FP), R3
+
+	CMP R1, R3
+	CSEL LT, R3, R1, R4
+
+	CBZ R4, done
+
+	LSR $3, R4, R5
+	AND $7, R4, R6
+
+	CBZ R5, remainder
+simd_loop:
+	VLD1 (R0), [V0.S4, V1.S4]
+	VLD1 (R2), [V2.S4, V3.S4]
+
+	VADD V2.S4, V0.S4, V0.S4
+	VADD V3.S4, V1.S4, V1.S4
+
+	VST1.P [V0.S4, V1.S4], 32(R0)
+	ADD $32, R2, R2
+
+	SUB $1, R5, R5
+	CBNZ R5, simd_loop
+
+remainder:
+	CBZ R6, done
+
+	LSR $2, R6, R7
+	CBZ R7, small_remainder
+
+	VLD1 (R0), [V0.S4]
+	VLD1 (R2), [V1.S4]
+	VADD V1.S4, V0.S4, V0.S4
+	VST1.P [V0.S4], 16(R0)
+	ADD $16, R2, R2
+	AND $3, R6, R6
+
+small_remainder:
+	CBZ R6, done
+	MOVD ZR, R5
+rem_loop:
+	ADD R5, R5, R5
+	ADD R5, R5, R5
+	MOVW (R0)(R5), R7
+	MOVW (R2)(R5), R8
+	ADD R7, R8, R7
+	MOVW R7, (R0)(R5)
+	LSR $2, R5, R5
+
+	ADD $1, R5, R5
+	CMP R6, R5
+	BLT rem_loop
+
+done:
+	RET
+
+// func sumUint16(x []uint16, y []uint16)
+TEXT ·sumUint16(SB), NOSPLIT, $0-48
+	MOVD x_base+0(FP), R0
+	MOVD x_len+8(FP), R1
+	MOVD y_base+24(FP), R2
+	MOVD y_len+32(FP), R3
+
+	CMP R1, R3
+	CSEL LT, R3, R1, R4
+
+	CBZ R4, done
+
+	LSR $4, R4, R5
+	AND $15, R4, R6
+
+	CBZ R5, remainder
+simd_loop:
+	VLD1 (R0), [V0.H8, V1.H8]
+	VLD1 (R2), [V2.H8, V3.H8]
+
+	VADD V2.H8, V0.H8, V0.H8
+	VADD V3.H8, V1.H8, V1.H8
+
+	VST1.P [V0.H8, V1.H8], 32(R0)
+	ADD $32, R2, R2
+
+	SUB $1, R5, R5
+	CBNZ R5, simd_loop
+
+remainder:
+	CBZ R6, done
+
+	LSR $3, R6, R7
+	CBZ R7, small_remainder
+
+	VLD1 (R0), [V0.H8]
+	VLD1 (R2), [V1.H8]
+	VADD V1.H8, V0.H8, V0.H8
+	VST1.P [V0.H8], 16(R0)
+	ADD $16, R2, R2
+	AND $7, R6, R6
+
+small_remainder:
+	CBZ R6, done
+	MOVD ZR, R5
+rem_loop:
+	ADD R5, R5, R5
+	MOVHU (R0)(R5), R7
+	MOVHU (R2)(R5), R8
+	ADD R7, R8, R7
+	MOVH R7, (R0)(R5)
+	LSR $1, R5, R5
+
+	ADD $1, R5, R5
+	CMP R6, R5
+	BLT rem_loop
+
+done:
+	RET
+
+// func sumUint8(x []uint8, y []uint8)
+TEXT ·sumUint8(SB), NOSPLIT, $0-48
+	MOVD x_base+0(FP), R0
+	MOVD x_len+8(FP), R1
+	MOVD y_base+24(FP), R2
+	MOVD y_len+32(FP), R3
+
+	CMP R1, R3
+	CSEL LT, R3, R1, R4
+
+	CBZ R4, done
+
+	LSR $5, R4, R5
+	AND $31, R4, R6
+
+	CBZ R5, remainder
+simd_loop:
+	VLD1 (R0), [V0.B16, V1.B16]
+	VLD1 (R2), [V2.B16, V3.B16]
+
+	VADD V2.B16, V0.B16, V0.B16
+	VADD V3.B16, V1.B16, V1.B16
+
+	VST1.P [V0.B16, V1.B16], 32(R0)
+	ADD $32, R2, R2
+
+	SUB $1, R5, R5
+	CBNZ R5, simd_loop
+
+remainder:
+	CBZ R6, done
+
+	LSR $4, R6, R7
+	CBZ R7, small_remainder
+
+	VLD1 (R0), [V0.B16]
+	VLD1 (R2), [V1.B16]
+	VADD V1.B16, V0.B16, V0.B16
+	VST1.P [V0.B16], 16(R0)
+	ADD $16, R2, R2
+	AND $15, R6, R6
+
+small_remainder:
+	CBZ R6, done
+	MOVD ZR, R5
+rem_loop:
+	MOVB (R0)(R5), R7
+	MOVB (R2)(R5), R8
+	ADD R7, R8, R7
+	MOVB R7, (R0)(R5)
+
+	ADD $1, R5, R5
+	CMP R6, R5
+	BLT rem_loop
+done:
+	RET
diff --git a/slices/sums_default.go b/slices/sums_default.go
index 35c2951e..e310a82a 100644
--- a/slices/sums_default.go
+++ b/slices/sums_default.go
@@ -1,5 +1,5 @@
-//go:build purego || !amd64
-// +build purego !amd64
+//go:build purego || (!amd64 && !arm64)
+// +build purego !amd64,!arm64
 
 package slices