Conversation
Greptile Summary: This PR performs a large-scale refactoring of SIMD math kernel files, splitting monolithic per-type files into per-architecture variants (march-based reorganization). Key issues found:
Confidence Score: 2/5
Last reviewed commit: a353f42 |
src/ailego/CMakeLists.txt
Outdated
| file(GLOB_RECURSE MATH_FILES_AVX512 | ||
| ${CMAKE_CURRENT_SOURCE_DIR}/math/*_dispatch.cc | ||
| ${CMAKE_CURRENT_SOURCE_DIR}/math/*_dispatch.c | ||
| ${CMAKE_CURRENT_SOURCE_DIR}/math/*_avx512.cc | ||
| ${CMAKE_CURRENT_SOURCE_DIR}/math/*_avx512.c | ||
| ${CMAKE_CURRENT_SOURCE_DIR}/math_batch/*_avx512.cc | ||
| ${CMAKE_CURRENT_SOURCE_DIR}/math_batch/*_avx512.c | ||
| ) |
There was a problem hiding this comment.
Dispatch files should NOT be compiled with AVX512 flags. They contain runtime CPU detection logic and must be compiled with baseline flags (like the default broadwell) to run safely on all CPUs. Currently, these dispatch files will crash with illegal instruction errors on non-AVX512 CPUs.
| file(GLOB_RECURSE MATH_FILES_AVX512 | |
| ${CMAKE_CURRENT_SOURCE_DIR}/math/*_dispatch.cc | |
| ${CMAKE_CURRENT_SOURCE_DIR}/math/*_dispatch.c | |
| ${CMAKE_CURRENT_SOURCE_DIR}/math/*_avx512.cc | |
| ${CMAKE_CURRENT_SOURCE_DIR}/math/*_avx512.c | |
| ${CMAKE_CURRENT_SOURCE_DIR}/math_batch/*_avx512.cc | |
| ${CMAKE_CURRENT_SOURCE_DIR}/math_batch/*_avx512.c | |
| ) | |
| file(GLOB_RECURSE MATH_FILES_AVX512 | |
| ${CMAKE_CURRENT_SOURCE_DIR}/math/*_avx512.cc | |
| ${CMAKE_CURRENT_SOURCE_DIR}/math/*_avx512.c | |
| ${CMAKE_CURRENT_SOURCE_DIR}/math_batch/*_avx512.cc | |
| ${CMAKE_CURRENT_SOURCE_DIR}/math_batch/*_avx512.c | |
| ) | |
| file(GLOB_RECURSE MATH_FILES_DISPATCH | |
| ${CMAKE_CURRENT_SOURCE_DIR}/math/*_dispatch.cc | |
| ${CMAKE_CURRENT_SOURCE_DIR}/math/*_dispatch.c | |
| ) |
src/ailego/CMakeLists.txt
Outdated
| foreach(MATH_FILE ${MATH_FILES_AVX512}) | ||
| set_source_files_properties( | ||
| ${MATH_FILE} | ||
| PROPERTIES | ||
| COMPILE_FLAGS "${MATH_MARCH_FLAG_AVX512}" | ||
| ) | ||
| endforeach() |
There was a problem hiding this comment.
Need separate loop for dispatch files with baseline flags
| foreach(MATH_FILE ${MATH_FILES_AVX512}) | |
| set_source_files_properties( | |
| ${MATH_FILE} | |
| PROPERTIES | |
| COMPILE_FLAGS "${MATH_MARCH_FLAG_AVX512}" | |
| ) | |
| endforeach() | |
| foreach(MATH_FILE ${MATH_FILES_AVX512}) | |
| set_source_files_properties( | |
| ${MATH_FILE} | |
| PROPERTIES | |
| COMPILE_FLAGS "${MATH_MARCH_FLAG_AVX512}" | |
| ) | |
| endforeach() | |
| # Dispatch files should use baseline flags for runtime CPU detection | |
| foreach(MATH_FILE ${MATH_FILES_DISPATCH}) | |
| set_source_files_properties( | |
| ${MATH_FILE} | |
| PROPERTIES | |
| COMPILE_FLAGS "-march=broadwell" | |
| ) | |
| endforeach() |
|
@greptile |
| message(WARNING "No ARMv8 architecture flag supported by compiler.") | ||
| function(_setup_armv8_march) | ||
| set(_arch "armv8") | ||
| check_c_compiler_flag("-march=${_ver}" _COMP_SUPP_${_arch}) |
There was a problem hiding this comment.
Undefined variable _ver breaks ARM architecture check
_ver is undefined in this scope — it was the loop variable from the old _detect_armv8_best function. The call therefore expands to check_c_compiler_flag("-march=" ...), which tests an empty flag rather than -march=armv8. Most compilers will accept an empty flag and succeed, so _COMP_SUPP_armv8 will be TRUE regardless of whether armv8 is actually supported — masking any real incompatibility.
| check_c_compiler_flag("-march=${_ver}" _COMP_SUPP_${_arch}) | |
| check_c_compiler_flag("-march=${_arch}" _COMP_SUPP_${_arch}) |
| elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64|ARM64") | ||
| file(GLOB_RECURSE MATH_FILES_NEON | ||
| ${CMAKE_CURRENT_SOURCE_DIR}/math/*_dispatch.cc | ||
| ${CMAKE_CURRENT_SOURCE_DIR}/math/*_dispatch.c | ||
| ${CMAKE_CURRENT_SOURCE_DIR}/math_batch/*_dispatch.cc | ||
| ${CMAKE_CURRENT_SOURCE_DIR}/math_batch/*_dispatch.c | ||
| ${CMAKE_CURRENT_SOURCE_DIR}/math/*_neon.cc | ||
| ${CMAKE_CURRENT_SOURCE_DIR}/math/*_neon.c | ||
| ${CMAKE_CURRENT_SOURCE_DIR}/math_batch/*_neon.cc | ||
| ${CMAKE_CURRENT_SOURCE_DIR}/math_batch/*_neon.c | ||
| ) | ||
|
|
||
| foreach(MATH_FILE ${MATH_FILES_NEON}) | ||
| set_source_files_properties( | ||
| ${MATH_FILE} | ||
| PROPERTIES | ||
| COMPILE_FLAGS "${MATH_MARCH_FLAG_NEON}" | ||
| ) | ||
| endforeach() | ||
| endif() |
There was a problem hiding this comment.
MATH_MARCH_FLAG_NEON is never set — ARM per-file flags are silently empty
For ARM builds, MATH_MARCH_FLAG_NEON is never assigned anywhere (the x86 path has setup_compiler_march_for_x86(...) which sets the three flag variables, but there is no equivalent call for ARM). set_source_files_properties(...COMPILE_FLAGS "${MATH_MARCH_FLAG_NEON}") therefore expands to an empty string, so all NEON/dispatch source files are compiled with whatever the global default flags happen to be instead of the intended -march=armv8-a+simd (or similar).
You need to define the variable before using it. For example:
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64|ARM64")
set(MATH_MARCH_FLAG_NEON "-march=armv8-a+simd") # or armv8.2-a+fp16, etc.
file(GLOB_RECURSE MATH_FILES_NEON
...
)
| #if defined(__AVX__) | ||
| return InnerProductSparseInSegmentAVX(m_sparse_count, m_sparse_index, | ||
| m_sparse_value, q_sparse_count, | ||
| q_sparse_index, q_sparse_value); | ||
| #elif defined(__AVX512FP16__) | ||
| return InnerProductSparseInSegmentAVX512FP16(m_sparse_count, m_sparse_index, | ||
| m_sparse_value, q_sparse_count, | ||
| q_sparse_index, q_sparse_value); | ||
| #else | ||
| return InnerProductSparseInSegment(m_sparse_count, m_sparse_index, | ||
| m_sparse_value, q_sparse_count, | ||
| q_sparse_index, q_sparse_value); | ||
| #endif |
There was a problem hiding this comment.
AVX512FP16 sparse path is dead code — __AVX__ is always defined when __AVX512FP16__ is
The preprocessor checks __AVX__ before __AVX512FP16__. Because AVX512FP16 implies AVX (compiling with -march=sapphirerapids or similar also defines __AVX__), the #elif defined(__AVX512FP16__) branch is unreachable — the more optimal InnerProductSparseInSegmentAVX512FP16 path will never be taken. The higher-capability ISA must be checked first:
| #if defined(__AVX__) | |
| return InnerProductSparseInSegmentAVX(m_sparse_count, m_sparse_index, | |
| m_sparse_value, q_sparse_count, | |
| q_sparse_index, q_sparse_value); | |
| #elif defined(__AVX512FP16__) | |
| return InnerProductSparseInSegmentAVX512FP16(m_sparse_count, m_sparse_index, | |
| m_sparse_value, q_sparse_count, | |
| q_sparse_index, q_sparse_value); | |
| #else | |
| return InnerProductSparseInSegment(m_sparse_count, m_sparse_index, | |
| m_sparse_value, q_sparse_count, | |
| q_sparse_index, q_sparse_value); | |
| #endif | |
| #if defined(__AVX512FP16__) | |
| return InnerProductSparseInSegmentAVX512FP16(m_sparse_count, m_sparse_index, | |
| m_sparse_value, q_sparse_count, | |
| q_sparse_index, q_sparse_value); | |
| #elif defined(__AVX__) | |
| return InnerProductSparseInSegmentAVX(m_sparse_count, m_sparse_index, | |
| m_sparse_value, q_sparse_count, | |
| q_sparse_index, q_sparse_value); | |
| #else | |
| return InnerProductSparseInSegment(m_sparse_count, m_sparse_index, | |
| m_sparse_value, q_sparse_count, | |
| q_sparse_index, q_sparse_value); | |
| #endif |
| } | ||
| } | ||
|
|
||
| if (dim + 8 < dimensionality) { |
There was a problem hiding this comment.
Off-by-one boundary check skips the 8-wide SIMD path for exactly 8 remaining elements
After the main 16-at-a-time loop, when exactly 8 elements remain (dim + 8 == dimensionality), the strict < makes this condition false. Those 8 elements fall through to the scalar loop instead of being handled by the _mm256_cvtph_ps + _mm256_fmadd_ps path. Use <= to capture that case:
| if (dim + 8 < dimensionality) { | |
| if (dim + 8 <= dimensionality) { |
| message(WARNING "No ARMv8 architecture flag supported by compiler.") | ||
| function(_setup_armv8_march) | ||
| set(_arch "armv8") | ||
| check_c_compiler_flag("-march=${_ver}" _COMP_SUPP_${_arch}) |
There was a problem hiding this comment.
Undefined variable _ver breaks ARM architecture check.
_ver is undefined in this scope. The function defines _arch = "armv8" on line 78, but the flag check references _ver, causing the check to expand to -march= (empty flag). Most compilers will accept an empty flag and succeed, so the ARM march support test becomes vacuous.
| check_c_compiler_flag("-march=${_ver}" _COMP_SUPP_${_arch}) | |
| check_c_compiler_flag("-march=${_arch}" _COMP_SUPP_${_arch}) |
| set_source_files_properties( | ||
| ${MATH_FILE} | ||
| PROPERTIES | ||
| COMPILE_FLAGS "${MATH_MARCH_FLAG_NEON}" |
There was a problem hiding this comment.
MATH_MARCH_FLAG_NEON is never set — ARM per-file SIMD flags are silently empty.
The x86 path calls setup_compiler_march_for_x86(...) to set MATH_MARCH_FLAG_SSE, MATH_MARCH_FLAG_AVX2, and MATH_MARCH_FLAG_AVX512 (lines 22–23), but the ARM path does not set MATH_MARCH_FLAG_NEON before using it on line 98. As a result, all NEON/dispatch source files are compiled with empty per-file COMPILE_FLAGS instead of the intended -march=armv8-a+simd (or similar).
Define the variable before using it:
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64|ARM64")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=armv8-a")  # append, don't clobber existing flags
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=armv8-a")
set(MATH_MARCH_FLAG_NEON "-march=armv8-a")
| #if defined(__AVX__) | ||
| return InnerProductSparseInSegmentAVX(m_sparse_count, m_sparse_index, | ||
| m_sparse_value, q_sparse_count, | ||
| q_sparse_index, q_sparse_value); | ||
| #elif defined(__AVX512FP16__) |
There was a problem hiding this comment.
AVX512FP16 sparse path is dead code — __AVX__ is always defined when __AVX512FP16__ is available.
When compiling with AVX512FP16 support (e.g., -march=sapphirerapids), the preprocessor defines both __AVX512FP16__ and __AVX__. Since __AVX__ is checked first, the #elif defined(__AVX512FP16__) branch is unreachable, and the more optimal InnerProductSparseInSegmentAVX512FP16 path is never taken. Check for the more specific ISA first:
| #if defined(__AVX__) | |
| return InnerProductSparseInSegmentAVX(m_sparse_count, m_sparse_index, | |
| m_sparse_value, q_sparse_count, | |
| q_sparse_index, q_sparse_value); | |
| #elif defined(__AVX512FP16__) | |
| #if defined(__AVX512FP16__) | |
| return InnerProductSparseInSegmentAVX512FP16(m_sparse_count, m_sparse_index, | |
| m_sparse_value, q_sparse_count, | |
| q_sparse_index, q_sparse_value); | |
| #elif defined(__AVX__) | |
| return InnerProductSparseInSegmentAVX(m_sparse_count, m_sparse_index, | |
| m_sparse_value, q_sparse_count, | |
| q_sparse_index, q_sparse_value); | |
| #else | |
| return InnerProductSparseInSegment(m_sparse_count, m_sparse_index, | |
| m_sparse_value, q_sparse_count, | |
| q_sparse_index, q_sparse_value); | |
| #endif |
| } | ||
| } | ||
|
|
||
| if (dim + 8 < dimensionality) { |
There was a problem hiding this comment.
Off-by-one boundary check skips the 8-wide SIMD path for exactly 8 remaining elements.
The main loop processes 16 elements at a time (line 38). After it exits, when exactly 8 elements remain (e.g., dim + 8 == dimensionality), the strict < makes this condition false, and those 8 elements fall through to the scalar loop (lines 85–89) instead of being handled by the optimized _mm256_cvtph_ps + _mm256_fmadd_ps path (lines 68–75). Use <= to include that boundary case:
| if (dim + 8 < dimensionality) { | |
| if (dim + 8 <= dimensionality) { |
march-based reorganization