Skip to content

Commit ff7e9f8

Browse files
authored
Make sve_128 portable and support true march=native builds (#504)
1 parent 631fb04 commit ff7e9f8

File tree

6 files changed

+59
-12
lines changed

6 files changed

+59
-12
lines changed

Docs/ChangeLog-4x.md

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -11,14 +11,21 @@ clocked at 4.2 GHz, running `astcenc` using AVX2 and 6 threads.
1111

1212
**Status:** In development
1313

14-
The 4.9.0 release is a minor maintenance release.
14+
The 4.9.0 release is a small release adding support for Arm Scalable Vector
15+
Extensions SIMD, as well as some minor bug fixes.
1516

1617
* **General:**
1718
* **Bug fix:** Fixed incorrect return type in "None" vector library
1819
reference implementation.
1920
* **Bug fix:** Fixed sincos table index under/overflow.
20-
* **Feature:** Added backend for Arm SVE fixed-width 256-bit builds.
21-
* **Feature:** Added backend for Arm SVE fixed-width 128-bit builds.
21+
* **Feature:** Changed `ASTCENC_ISA_NATIVE` builds to use `-march=native` and
22+
`-mcpu=native`.
23+
* **Feature:** Added backend for Arm SVE fixed-width 256-bit builds. These
24+
can only run on hardware implementing 256-bit SVE.
25+
* **Feature:** Added backend for Arm SVE 128-bit builds. These are portable
26+
builds and can run on hardware implementing any SVE vector length, but the
27+
explicit SVE use only augments the NEON code and will only use the bottom 128 bits of
28+
each SVE vector.
2229
* **Feature:** Optimized NEON mask `any()` and `all()` functions.
2330
* **Feature:** Migrated build and test to GitHub Actions pipelines.
2431

@@ -36,8 +43,9 @@ The 4.8.0 release is a minor maintenance release.
3643
language behavior, to improve support for deployment using Emscripten.
3744
* **Feature:** Builds using Clang can now build with undefined behavior
3845
sanitizer by setting `-DASTCENC_UBSAN=ON` on the CMake configure line.
39-
* **Feature:** Updated to Wuffs library 0.3.4, which ignores tRNS alpha chunks
40-
for type 4 (LA) and 6 (RGBA) PNGs, to improve compatibility with libpng.
46+
* **Feature:** Updated to Wuffs library 0.3.4, which ignores tRNS alpha
47+
chunks for type 4 (LA) and 6 (RGBA) PNGs, to improve compatibility with
48+
libpng.
4149

4250
<!-- ---------------------------------------------------------------------- -->
4351
## 4.7.0
@@ -49,8 +57,8 @@ the decompressor to match the Khronos specification. This fix includes the
4957
addition of explicit support for optimizing for `decode_unorm8` rounding.
5058

5159
Reminder - the codec library API is not designed to be binary compatible across
52-
versions. We always recommend rebuilding your client-side code using the updated
53-
`astcenc.h` header.
60+
versions. We always recommend rebuilding your client-side code using the
61+
updated `astcenc.h` header.
5462

5563
* **General:**
5664
* **Bug fix:** sRGB LDR decompression now uses the correct endpoint expansion

Source/UnitTest/cmake_core.cmake

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,7 @@ elseif(${ASTCENC_ISA_SIMD} MATCHES "sve_128")
117117
# Enable SVE
118118
target_compile_options(${ASTCENC_TEST}
119119
PRIVATE
120-
-march=armv8-a+sve -msve-vector-bits=128)
120+
-march=armv8-a+sve)
121121

122122
elseif(${ASTCENC_ISA_SIMD} MATCHES "sse2")
123123
target_compile_definitions(${ASTCENC_TEST}

Source/astcenc_mathlib.h

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,18 @@
7474
#endif
7575

7676
#ifndef ASTCENC_SVE
77-
#define ASTCENC_SVE 0
77+
#if defined(__ARM_FEATURE_SVE)
78+
#if defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS == 256
79+
#define ASTCENC_SVE 8
80+
// Auto-detected SVE can only assume vector width of 4 is available, but
81+
// must also allow for hardware being longer and so all use of intrinsics
82+
// must explicitly use predicate masks to limit to 4-wide.
83+
#else
84+
#define ASTCENC_SVE 4
85+
#endif
86+
#else
87+
#define ASTCENC_SVE 0
88+
#endif
7889
#endif
7990

8091
// Force vector-sized SIMD alignment

Source/astcenccli_entry2.cpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -57,8 +57,10 @@ int astcenc_main_veneer(
5757
int argc,
5858
char **argv
5959
) {
60-
#if ASTCENC_SVE != 0
61-
// svcntw() return compile-time length if used with -msve-vector-bits
60+
// We don't need this check for 128-bit SVE, because that is compiled as
61+
// VLA code, using predicate masks in the augmented NEON.
62+
#if ASTCENC_SVE > 4
63+
// svcntw() returns compile-time length if used with -msve-vector-bits
6264
if (svcntw() != ASTCENC_SVE)
6365
{
6466
int bits = ASTCENC_SVE * 32;

Source/astcenccli_toplevel_help.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -585,6 +585,14 @@ void astcenc_print_header()
585585
unsigned int bits = static_cast<unsigned int>(sizeof(void*) * 8);
586586
printf(astcenc_copyright_string,
587587
VERSION_STRING, bits, simdtype, pcnttype, f16ctype, YEAR_STRING);
588+
589+
// If possible, print hint that 8-wide SVE could be used
590+
#if ASTCENC_SVE == 4
591+
if (svcntw() == 8)
592+
{
593+
printf("Note: This CPU can support 256-bit SVE builds.\n");
594+
}
595+
#endif
588596
}
589597

590598
/* See header for documentation. */

Source/cmake_core.cmake

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -336,10 +336,13 @@ macro(astcenc_set_properties ASTCENC_TARGET_NAME ASTCENC_VENEER_TYPE)
336336
ASTCENC_F16C=0)
337337

338338
# Enable SVE in the core library
339+
# Note that for 128-bit SVE the generated code is actually
340+
# vector-length agnostic, but any manual intrinsics used in the
341+
# enhanced-NEON library use 128-bit data width predicates
339342
if (NOT ${ASTCENC_VENEER_TYPE})
340343
target_compile_options(${ASTCENC_TARGET_NAME}
341344
PRIVATE
342-
-march=armv8-a+sve -msve-vector-bits=128)
345+
-march=armv8-a+sve)
343346

344347
# Enable SVE without fixed vector length in the veneer
345348
elseif (${ASTCENC_VENEER_TYPE} EQUAL 2)
@@ -429,6 +432,21 @@ macro(astcenc_set_properties ASTCENC_TARGET_NAME ASTCENC_VENEER_TYPE)
429432
$<${is_gnu_fe}:-mfma>)
430433
endif()
431434

435+
elseif(${ASTCENC_ISA_SIMD} MATCHES "native")
436+
target_compile_definitions(${ASTCENC_TARGET_NAME}
437+
PRIVATE)
438+
439+
if (${ASTCENC_VENEER_TYPE} GREATER 0)
440+
target_compile_options(${ASTCENC_TARGET_NAME}
441+
PRIVATE
442+
$<${is_gnu_fe}:-Wno-unused-command-line-argument>)
443+
else()
444+
target_compile_options(${ASTCENC_TARGET_NAME}
445+
PRIVATE
446+
$<${is_clangcl}:-mcpu=native -march=native>
447+
$<${is_gnu_fe}:-mcpu=native -march=native>
448+
$<${is_gnu_fe}:-Wno-unused-command-line-argument>)
449+
endif()
432450
endif()
433451

434452
endmacro()

0 commit comments

Comments
 (0)