Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions build/slices/sums_asm.go
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// +build ignore
//go:build ignore

package main

Expand Down Expand Up @@ -27,7 +27,7 @@ type Processor struct {
}

// init registers the build constraints that are emitted at the top of the
// generated assembly and stub files.
func init() {
	ConstraintExpr("!purego")
	// The expression is written in "// +build" syntax, where a space means OR
	// and a comma means AND. "!purego amd64" therefore produced
	// "!purego || amd64", which is always satisfied on amd64 and makes the
	// assembly collide with the purego fallback. Use a comma so both terms
	// are required.
	ConstraintExpr("!purego,amd64")
}

func main() {
Expand Down
8 changes: 4 additions & 4 deletions slices/sums.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ package slices
import _ "github.com/segmentio/asm/cpu"

// SumUint64 sums pairs of elements by index from x and y, similar to Python's zip routine.
// If available AVX instructions will be used to operate on many uint64s simultaneously.
// If available SIMD instructions will be used to operate on many uint64s simultaneously.
//
// Results are returned in the x slice and y is left unaltered. If x and y differ in size
// only len(x) elements will be processed.
Expand All @@ -18,7 +18,7 @@ func sumUint64Generic(x, y []uint64) {
}

// SumUint32 sums pairs of elements by index from x and y, similar to Python's zip routine.
// If available AVX instructions will be used to operate on many uint32s simultaneously.
// If available SIMD instructions will be used to operate on many uint32s simultaneously.
//
// Results are returned in the x slice and y is left unaltered. If x and y differ in size
// only len(x) elements will be processed.
Expand All @@ -33,7 +33,7 @@ func sumUint32Generic(x, y []uint32) {
}

// SumUint16 sums pairs of elements by index from x and y, similar to Python's zip routine.
// If available AVX instructions will be used to operate on many uint16s simultaneously.
// If available SIMD instructions will be used to operate on many uint16s simultaneously.
//
// Results are returned in the x slice and y is left unaltered. If x and y differ in size
// only len(x) elements will be processed.
Expand All @@ -48,7 +48,7 @@ func sumUint16Generic(x, y []uint16) {
}

// SumUint8 sums pairs of elements by index from x and y, similar to Python's zip routine.
// If available AVX instructions will be used to operate on many uint8s simultaneously.
// If available SIMD instructions will be used to operate on many uint8s simultaneously.
//
// Results are returned in the x slice and y is left unaltered. If x and y differ in size
// only len(x) elements will be processed.
Expand Down
2 changes: 1 addition & 1 deletion slices/sums_amd64.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion slices/sums_amd64.s
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
// Code generated by command: go run sums_asm.go -pkg slices -out ../slices/sums_amd64.s -stubs ../slices/sums_amd64.go. DO NOT EDIT.

//go:build !purego
//go:build !purego && amd64

#include "textflag.h"

Expand Down
15 changes: 15 additions & 0 deletions slices/sums_arm64.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
//go:build !purego || arm64

package slices

// Sum uint64s using NEON instructions, results stored in x
func sumUint64(x []uint64, y []uint64)

// Sum uint32s using NEON instructions, results stored in x
func sumUint32(x []uint32, y []uint32)

// Sum uint16s using NEON instructions, results stored in x
func sumUint16(x []uint16, y []uint16)

// Sum uint8s using NEON instructions, results stored in x
func sumUint8(x []uint8, y []uint8)
240 changes: 240 additions & 0 deletions slices/sums_arm64.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,240 @@
//go:build !purego && arm64
// +build !purego,arm64

#include "textflag.h"

// func sumUint64(x []uint64, y []uint64)
//
// Adds y[i] to x[i] in place for every i < min(len(x), len(y)).
// Four elements (32 bytes) are handled per vector iteration; a single
// two-element vector step and a scalar loop finish whatever is left.
//
// Register use:
//   R0  cursor into x (advanced as results are stored)
//   R2  cursor into y
//   R4  number of elements to process
//   R5  vector-loop counter, later the scalar index
//   R6  elements still to process after the vector loop
TEXT ·sumUint64(SB), NOSPLIT, $0-48
	MOVD x_base+0(FP), R0
	MOVD x_len+8(FP), R1
	MOVD y_base+24(FP), R2
	MOVD y_len+32(FP), R3

	// R4 = min(len(x), len(y)). CMP R1, R3 sets the flags from R3 - R1, so
	// LT means len(y) < len(x); in that case R3 is the shorter length.
	CMP  R1, R3
	CSEL LT, R3, R1, R4

	CBZ R4, done

	LSR $2, R4, R5 // R5 = number of full 4-element groups
	AND $3, R4, R6 // R6 = leftover elements (0..3)

	CBZ R5, remainder

simd_loop:
	VLD1 (R0), [V0.D2, V1.D2]
	VLD1 (R2), [V2.D2, V3.D2]

	VADD V2.D2, V0.D2, V0.D2
	VADD V3.D2, V1.D2, V1.D2

	VST1.P [V0.D2, V1.D2], 32(R0) // store, then advance x by 32 bytes
	ADD    $32, R2, R2

	SUB  $1, R5, R5
	CBNZ R5, simd_loop

remainder:
	CBZ R6, done

	LSR $1, R6, R7 // non-zero when at least 2 elements remain
	CBZ R7, small_remainder

	VLD1   (R0), [V0.D2]
	VLD1   (R2), [V1.D2]
	VADD   V1.D2, V0.D2, V0.D2
	VST1.P [V0.D2], 16(R0)
	ADD    $16, R2, R2
	AND    $1, R6, R6

small_remainder:
	CBZ  R6, done
	MOVD ZR, R5

rem_loop:
	LSL  $3, R5, R7 // R7 = byte offset of element R5
	MOVD (R0)(R7), R8
	MOVD (R2)(R7), R9
	ADD  R8, R9, R8
	MOVD R8, (R0)(R7)

	ADD $1, R5, R5
	CMP R6, R5
	BLT rem_loop // loop while R5 < R6

done:
	RET

// func sumUint32(x []uint32, y []uint32)
//
// Adds y[i] to x[i] in place for every i < min(len(x), len(y)).
// Eight elements (32 bytes) are handled per vector iteration; a single
// four-element vector step and a scalar loop finish whatever is left.
//
// Register use:
//   R0  cursor into x (advanced as results are stored)
//   R2  cursor into y
//   R4  number of elements to process
//   R5  vector-loop counter, later the scalar index
//   R6  elements still to process after the vector loop
TEXT ·sumUint32(SB), NOSPLIT, $0-48
	MOVD x_base+0(FP), R0
	MOVD x_len+8(FP), R1
	MOVD y_base+24(FP), R2
	MOVD y_len+32(FP), R3

	// R4 = min(len(x), len(y)). CMP R1, R3 sets the flags from R3 - R1, so
	// LT means len(y) < len(x); in that case R3 is the shorter length.
	CMP  R1, R3
	CSEL LT, R3, R1, R4

	CBZ R4, done

	LSR $3, R4, R5 // R5 = number of full 8-element groups
	AND $7, R4, R6 // R6 = leftover elements (0..7)

	CBZ R5, remainder

simd_loop:
	VLD1 (R0), [V0.S4, V1.S4]
	VLD1 (R2), [V2.S4, V3.S4]

	VADD V2.S4, V0.S4, V0.S4
	VADD V3.S4, V1.S4, V1.S4

	VST1.P [V0.S4, V1.S4], 32(R0) // store, then advance x by 32 bytes
	ADD    $32, R2, R2

	SUB  $1, R5, R5
	CBNZ R5, simd_loop

remainder:
	CBZ R6, done

	LSR $2, R6, R7 // non-zero when at least 4 elements remain
	CBZ R7, small_remainder

	VLD1   (R0), [V0.S4]
	VLD1   (R2), [V1.S4]
	VADD   V1.S4, V0.S4, V0.S4
	VST1.P [V0.S4], 16(R0)
	ADD    $16, R2, R2
	AND    $3, R6, R6

small_remainder:
	CBZ  R6, done
	MOVD ZR, R5

rem_loop:
	// Keep the element index in R5 and the byte offset in R7 rather than
	// scaling and un-scaling the index in place.
	LSL   $2, R5, R7
	MOVWU (R0)(R7), R8
	MOVWU (R2)(R7), R9
	ADD   R8, R9, R8
	MOVW  R8, (R0)(R7) // stores the low 32 bits of the sum

	ADD $1, R5, R5
	CMP R6, R5
	BLT rem_loop // loop while R5 < R6

done:
	RET

// func sumUint16(x []uint16, y []uint16)
//
// Adds y[i] to x[i] in place for every i < min(len(x), len(y)).
// Sixteen elements (32 bytes) are handled per vector iteration; a single
// eight-element vector step and a scalar loop finish whatever is left.
//
// Register use:
//   R0  cursor into x (advanced as results are stored)
//   R2  cursor into y
//   R4  number of elements to process
//   R5  vector-loop counter, later the scalar index
//   R6  elements still to process after the vector loop
TEXT ·sumUint16(SB), NOSPLIT, $0-48
	MOVD x_base+0(FP), R0
	MOVD x_len+8(FP), R1
	MOVD y_base+24(FP), R2
	MOVD y_len+32(FP), R3

	// R4 = min(len(x), len(y)). CMP R1, R3 sets the flags from R3 - R1, so
	// LT means len(y) < len(x); in that case R3 is the shorter length.
	CMP  R1, R3
	CSEL LT, R3, R1, R4

	CBZ R4, done

	LSR $4, R4, R5  // R5 = number of full 16-element groups
	AND $15, R4, R6 // R6 = leftover elements (0..15)

	CBZ R5, remainder

simd_loop:
	VLD1 (R0), [V0.H8, V1.H8]
	VLD1 (R2), [V2.H8, V3.H8]

	VADD V2.H8, V0.H8, V0.H8
	VADD V3.H8, V1.H8, V1.H8

	VST1.P [V0.H8, V1.H8], 32(R0) // store, then advance x by 32 bytes
	ADD    $32, R2, R2

	SUB  $1, R5, R5
	CBNZ R5, simd_loop

remainder:
	CBZ R6, done

	LSR $3, R6, R7 // non-zero when at least 8 elements remain
	CBZ R7, small_remainder

	VLD1   (R0), [V0.H8]
	VLD1   (R2), [V1.H8]
	VADD   V1.H8, V0.H8, V0.H8
	VST1.P [V0.H8], 16(R0)
	ADD    $16, R2, R2
	AND    $7, R6, R6

small_remainder:
	CBZ  R6, done
	MOVD ZR, R5

rem_loop:
	// Elements are 2 bytes wide, so use halfword loads and stores. A 32-bit
	// access here would also read and overwrite the following element (and
	// two bytes past the slice on the last iteration).
	LSL   $1, R5, R7 // R7 = byte offset of element R5
	MOVHU (R0)(R7), R8
	MOVHU (R2)(R7), R9
	ADD   R8, R9, R8
	MOVH  R8, (R0)(R7) // stores the low 16 bits of the sum

	ADD $1, R5, R5
	CMP R6, R5
	BLT rem_loop // loop while R5 < R6

done:
	RET

// func sumUint8(x []uint8, y []uint8)
//
// Adds y[i] to x[i] in place for every i < min(len(x), len(y)).
// Thirty-two elements (32 bytes) are handled per vector iteration; a single
// sixteen-element vector step and a scalar loop finish whatever is left.
//
// Register use:
//   R0  cursor into x (advanced as results are stored)
//   R2  cursor into y
//   R4  number of elements to process
//   R5  vector-loop counter, later the scalar index
//   R6  elements still to process after the vector loop
TEXT ·sumUint8(SB), NOSPLIT, $0-48
	MOVD x_base+0(FP), R0
	MOVD x_len+8(FP), R1
	MOVD y_base+24(FP), R2
	MOVD y_len+32(FP), R3

	// R4 = min(len(x), len(y)). CMP R1, R3 sets the flags from R3 - R1, so
	// LT means len(y) < len(x); in that case R3 is the shorter length.
	CMP  R1, R3
	CSEL LT, R3, R1, R4

	CBZ R4, done

	LSR $5, R4, R5  // R5 = number of full 32-element groups
	AND $31, R4, R6 // R6 = leftover elements (0..31)

	CBZ R5, remainder

simd_loop:
	VLD1 (R0), [V0.B16, V1.B16]
	VLD1 (R2), [V2.B16, V3.B16]

	VADD V2.B16, V0.B16, V0.B16
	VADD V3.B16, V1.B16, V1.B16

	VST1.P [V0.B16, V1.B16], 32(R0) // store, then advance x by 32 bytes
	ADD    $32, R2, R2

	SUB  $1, R5, R5
	CBNZ R5, simd_loop

remainder:
	CBZ R6, done

	LSR $4, R6, R7 // non-zero when at least 16 elements remain
	CBZ R7, small_remainder

	VLD1   (R0), [V0.B16]
	VLD1   (R2), [V1.B16]
	VADD   V1.B16, V0.B16, V0.B16
	VST1.P [V0.B16], 16(R0)
	ADD    $16, R2, R2
	AND    $15, R6, R6

small_remainder:
	CBZ  R6, done
	MOVD ZR, R5

rem_loop:
	// Elements are one byte wide, so the index is also the byte offset.
	MOVBU (R0)(R5), R7
	MOVBU (R2)(R5), R8
	ADD   R7, R8, R7
	MOVB  R7, (R0)(R5) // stores the low 8 bits of the sum

	ADD $1, R5, R5
	CMP R6, R5
	BLT rem_loop // loop while R5 < R6

done:
	RET
4 changes: 2 additions & 2 deletions slices/sums_default.go
Original file line number Diff line number Diff line change
@@ -1,20 +1,20 @@
//go:build purego || !amd64
// +build purego !amd64
//go:build purego
// +build purego

package slices

// sumUint64 is the pure-Go entry point; it forwards x and y unchanged to
// sumUint64Generic.
func sumUint64(x, y []uint64) {

Check failure on line 6 in slices/sums_default.go

View workflow job for this annotation

GitHub Actions / test (1.18.x, ubuntu-latest)

sumUint64 redeclared in this block
sumUint64Generic(x, y)
}

// sumUint32 is the pure-Go entry point; it forwards x and y unchanged to
// sumUint32Generic.
func sumUint32(x, y []uint32) {

Check failure on line 10 in slices/sums_default.go

View workflow job for this annotation

GitHub Actions / test (1.18.x, ubuntu-latest)

sumUint32 redeclared in this block
sumUint32Generic(x, y)
}

// sumUint16 is the pure-Go entry point; it forwards x and y unchanged to
// sumUint16Generic.
func sumUint16(x, y []uint16) {

Check failure on line 14 in slices/sums_default.go

View workflow job for this annotation

GitHub Actions / test (1.18.x, ubuntu-latest)

sumUint16 redeclared in this block
sumUint16Generic(x, y)
}

// sumUint8 is the pure-Go entry point; it forwards x and y unchanged to
// sumUint8Generic.
func sumUint8(x, y []uint8) {

Check failure on line 18 in slices/sums_default.go

View workflow job for this annotation

GitHub Actions / test (1.18.x, ubuntu-latest)

sumUint8 redeclared in this block
sumUint8Generic(x, y)
}
Loading