Add ARM64 NEON implementations of slice sums.

Review fixes applied to this revision of the patch:

* Build constraints: "//go:build !purego || <arch>" was wrong. The
  _amd64/_arm64 filename suffix already supplies the GOARCH constraint,
  and the "|| <arch>" term made the assembly (and its Go stubs) build
  even under -tags purego, colliding with the purego fallback. The
  constraint stays "//go:build !purego", so the sums_amd64.{go,s} files
  and the generator's ConstraintExpr("!purego") need no change at all.
* sums_default.go: bare "//go:build purego" dropped the generic
  implementation for every architecture other than amd64/arm64; it is
  now "purego || (!amd64 && !arm64)".
* sums_arm64.s: in Go assembly "CMP R1, R3" sets flags from R3-R1, so
  picking the shorter length requires "CSEL LT, R3, R1, R4"; the
  previous operand order selected the *longer* length and walked off
  the end of the shorter slice when len(x) != len(y).
* sumUint16 tail loop: MOVW moves 32 bits (two uint16 elements),
  double-adding neighbours and letting carries cross element
  boundaries; it now uses MOVHU loads and a MOVH store.

diff --git a/build/slices/sums_asm.go b/build/slices/sums_asm.go
index 6548bbe7..79a17c07 100644
--- a/build/slices/sums_asm.go
+++ b/build/slices/sums_asm.go
@@ -1,4 +1,4 @@
-// +build ignore
+//go:build ignore
 
 package main
 
diff --git a/slices/sums.go b/slices/sums.go
index 9269c521..7b44cb0d 100644
--- a/slices/sums.go
+++ b/slices/sums.go
@@ -3,7 +3,7 @@ package slices
 import _ "github.com/segmentio/asm/cpu"
 
 // SumUint64 sums pairs of by index from x and y, similar to python's zip routine.
-// If available AVX instructions will be used to operate on many uint64s simultaneously.
+// If available SIMD instructions will be used to operate on many uint64s simultaneously.
 //
 // Results are returned in the x slice and y is left unaltered. If x and y differ in size
 // only len(x) elements will be processed.
@@ -18,7 +18,7 @@ func sumUint64Generic(x, y []uint64) {
 }
 
 // SumUint32 sums pairs of by index from x and y, similar to python's zip routine.
-// If available AVX instructions will be used to operate on many uint32s simultaneously.
+// If available SIMD instructions will be used to operate on many uint32s simultaneously.
 //
 // Results are returned in the x slice and y is left unaltered. If x and y differ in size
 // only len(x) elements will be processed.
@@ -33,7 +33,7 @@ func sumUint32Generic(x, y []uint32) {
 }
 
 // SumUint16 sums pairs of by index from x and y, similar to python's zip routine.
-// If available AVX instructions will be used to operate on many uint16s simultaneously.
+// If available SIMD instructions will be used to operate on many uint16s simultaneously.
 //
 // Results are returned in the x slice and y is left unaltered. If x and y differ in size
 // only len(x) elements will be processed.
@@ -48,7 +48,7 @@ func sumUint16Generic(x, y []uint16) {
 }
 
 // SumUint8 sums pairs of by index from x and y, similar to python's zip routine.
-// If available AVX instructions will be used to operate on many uint8s simultaneously.
+// If available SIMD instructions will be used to operate on many uint8s simultaneously.
 //
 // Results are returned in the x slice and y is left unaltered. If x and y differ in size
 // only len(x) elements will be processed.
diff --git a/slices/sums_arm64.go b/slices/sums_arm64.go
new file mode 100644
index 00000000..d952ecd7
--- /dev/null
+++ b/slices/sums_arm64.go
@@ -0,0 +1,15 @@
+//go:build !purego
+
+package slices
+
+// Sum uint64s using NEON instructions, results stored in x
+func sumUint64(x []uint64, y []uint64)
+
+// Sum uint32s using NEON instructions, results stored in x
+func sumUint32(x []uint32, y []uint32)
+
+// Sum uint16s using NEON instructions, results stored in x
+func sumUint16(x []uint16, y []uint16)
+
+// Sum uint8s using NEON instructions, results stored in x
+func sumUint8(x []uint8, y []uint8)
diff --git a/slices/sums_arm64.s b/slices/sums_arm64.s
new file mode 100644
index 00000000..156f2b59
--- /dev/null
+++ b/slices/sums_arm64.s
@@ -0,0 +1,240 @@
+//go:build !purego
+
+#include "textflag.h"
+
+// func sumUint64(x []uint64, y []uint64)
+TEXT ·sumUint64(SB), NOSPLIT, $0-48
+	MOVD x_base+0(FP), R0
+	MOVD x_len+8(FP), R1
+	MOVD y_base+24(FP), R2
+	MOVD y_len+32(FP), R3
+
+	CMP R1, R3
+	CSEL LT, R3, R1, R4
+
+	CBZ R4, done
+
+	LSR $2, R4, R5
+	AND $3, R4, R6
+
+	CBZ R5, remainder
+simd_loop:
+	VLD1 (R0), [V0.D2, V1.D2]
+	VLD1 (R2), [V2.D2, V3.D2]
+
+	VADD V2.D2, V0.D2, V0.D2
+	VADD V3.D2, V1.D2, V1.D2
+
+	VST1.P [V0.D2, V1.D2], 32(R0)
+	ADD $32, R2, R2
+
+	SUB $1, R5, R5
+	CBNZ R5, simd_loop
+
+remainder:
+	CBZ R6, done
+
+	LSR $1, R6, R7
+	CBZ R7, small_remainder
+
+	VLD1 (R0), [V0.D2]
+	VLD1 (R2), [V1.D2]
+	VADD V1.D2, V0.D2, V0.D2
+	VST1.P [V0.D2], 16(R0)
+	ADD $16, R2, R2
+	AND $1, R6, R6
+
+small_remainder:
+	CBZ R6, done
+	MOVD ZR, R5
+rem_loop:
+	LSL $3, R5, R7
+	MOVD (R0)(R7), R8
+	MOVD (R2)(R7), R9
+	ADD R8, R9, R8
+	MOVD R8, (R0)(R7)
+
+	ADD $1, R5, R5
+	CMP R6, R5
+	BLT rem_loop
+
+done:
+	RET
+
+// func sumUint32(x []uint32, y []uint32)
+TEXT ·sumUint32(SB), NOSPLIT, $0-48
+	MOVD x_base+0(FP), R0
+	MOVD x_len+8(FP), R1
+	MOVD y_base+24(FP), R2
+	MOVD y_len+32(FP), R3
+
+	CMP R1, R3
+	CSEL LT, R3, R1, R4
+
+	CBZ R4, done
+
+	LSR $3, R4, R5
+	AND $7, R4, R6
+
+	CBZ R5, remainder
+simd_loop:
+	VLD1 (R0), [V0.S4, V1.S4]
+	VLD1 (R2), [V2.S4, V3.S4]
+
+	VADD V2.S4, V0.S4, V0.S4
+	VADD V3.S4, V1.S4, V1.S4
+
+	VST1.P [V0.S4, V1.S4], 32(R0)
+	ADD $32, R2, R2
+
+	SUB $1, R5, R5
+	CBNZ R5, simd_loop
+
+remainder:
+	CBZ R6, done
+
+	LSR $2, R6, R7
+	CBZ R7, small_remainder
+
+	VLD1 (R0), [V0.S4]
+	VLD1 (R2), [V1.S4]
+	VADD V1.S4, V0.S4, V0.S4
+	VST1.P [V0.S4], 16(R0)
+	ADD $16, R2, R2
+	AND $3, R6, R6
+
+small_remainder:
+	CBZ R6, done
+	MOVD ZR, R5
+rem_loop:
+	ADD R5, R5, R5
+	ADD R5, R5, R5
+	MOVW (R0)(R5), R7
+	MOVW (R2)(R5), R8
+	ADD R7, R8, R7
+	MOVW R7, (R0)(R5)
+	LSR $2, R5, R5
+
+	ADD $1, R5, R5
+	CMP R6, R5
+	BLT rem_loop
+
+done:
+	RET
+
+// func sumUint16(x []uint16, y []uint16)
+TEXT ·sumUint16(SB), NOSPLIT, $0-48
+	MOVD x_base+0(FP), R0
+	MOVD x_len+8(FP), R1
+	MOVD y_base+24(FP), R2
+	MOVD y_len+32(FP), R3
+
+	CMP R1, R3
+	CSEL LT, R3, R1, R4
+
+	CBZ R4, done
+
+	LSR $4, R4, R5
+	AND $15, R4, R6
+
+	CBZ R5, remainder
+simd_loop:
+	VLD1 (R0), [V0.H8, V1.H8]
+	VLD1 (R2), [V2.H8, V3.H8]
+
+	VADD V2.H8, V0.H8, V0.H8
+	VADD V3.H8, V1.H8, V1.H8
+
+	VST1.P [V0.H8, V1.H8], 32(R0)
+	ADD $32, R2, R2
+
+	SUB $1, R5, R5
+	CBNZ R5, simd_loop
+
+remainder:
+	CBZ R6, done
+
+	LSR $3, R6, R7
+	CBZ R7, small_remainder
+
+	VLD1 (R0), [V0.H8]
+	VLD1 (R2), [V1.H8]
+	VADD V1.H8, V0.H8, V0.H8
+	VST1.P [V0.H8], 16(R0)
+	ADD $16, R2, R2
+	AND $7, R6, R6
+
+small_remainder:
+	CBZ R6, done
+	MOVD ZR, R5
+rem_loop:
+	ADD R5, R5, R5
+	MOVHU (R0)(R5), R7
+	MOVHU (R2)(R5), R8
+	ADD R7, R8, R7
+	MOVH R7, (R0)(R5)
+	LSR $1, R5, R5
+
+	ADD $1, R5, R5
+	CMP R6, R5
+	BLT rem_loop
+
+done:
+	RET
+
+// func sumUint8(x []uint8, y []uint8)
+TEXT ·sumUint8(SB), NOSPLIT, $0-48
+	MOVD x_base+0(FP), R0
+	MOVD x_len+8(FP), R1
+	MOVD y_base+24(FP), R2
+	MOVD y_len+32(FP), R3
+
+	CMP R1, R3
+	CSEL LT, R3, R1, R4
+
+	CBZ R4, done
+
+	LSR $5, R4, R5
+	AND $31, R4, R6
+
+	CBZ R5, remainder
+simd_loop:
+	VLD1 (R0), [V0.B16, V1.B16]
+	VLD1 (R2), [V2.B16, V3.B16]
+
+	VADD V2.B16, V0.B16, V0.B16
+	VADD V3.B16, V1.B16, V1.B16
+
+	VST1.P [V0.B16, V1.B16], 32(R0)
+	ADD $32, R2, R2
+
+	SUB $1, R5, R5
+	CBNZ R5, simd_loop
+
+remainder:
+	CBZ R6, done
+
+	LSR $4, R6, R7
+	CBZ R7, small_remainder
+
+	VLD1 (R0), [V0.B16]
+	VLD1 (R2), [V1.B16]
+	VADD V1.B16, V0.B16, V0.B16
+	VST1.P [V0.B16], 16(R0)
+	ADD $16, R2, R2
+	AND $15, R6, R6
+
+small_remainder:
+	CBZ R6, done
+	MOVD ZR, R5
+rem_loop:
+	MOVB (R0)(R5), R7
+	MOVB (R2)(R5), R8
+	ADD R7, R8, R7
+	MOVB R7, (R0)(R5)
+
+	ADD $1, R5, R5
+	CMP R6, R5
+	BLT rem_loop
+done:
+	RET
diff --git a/slices/sums_default.go b/slices/sums_default.go
index 35c2951e..e310a82a 100644
--- a/slices/sums_default.go
+++ b/slices/sums_default.go
@@ -1,5 +1,5 @@
-//go:build purego || !amd64
-// +build purego !amd64
+//go:build purego || (!amd64 && !arm64)
+// +build purego !amd64,!arm64
 
 package slices