Conversation

gchatelet
Contributor

Reverts #107540

The reverted change showed improvements on micro-benchmarks, but the gains did not appear to materialize in production. We are reverting it for now to gather more data; the change may be reintegrated later once we are more confident in its effects.

@llvmbot llvmbot added the libc label Oct 10, 2025
@llvmbot
Member

llvmbot commented Oct 10, 2025

@llvm/pr-subscribers-libc

Author: Guillaume Chatelet (gchatelet)

Changes

Reverts llvm/llvm-project#107540

The reverted change showed improvements on micro-benchmarks, but the gains did not appear to materialize in production. We are reverting it for now to gather more data; the change may be reintegrated later once we are more confident in its effects.


Full diff: https://github.com/llvm/llvm-project/pull/162859.diff

2 Files Affected:

  • (modified) libc/src/string/memory_utils/op_x86.h (+20-66)
  • (modified) libc/src/string/memory_utils/x86_64/inline_bcmp.h (+21-11)
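
For context before the diff: the reverted change had introduced a branchless head/tail comparison, which compares the first and last sizeof(T)-byte chunks of both buffers and ORs the results, trading potentially overlapping (redundant) loads for fewer branches. Below is a simplified, standalone scalar sketch of that idea; it mirrors the removed branchless_head_tail_neq helper but is illustrative only, not the actual libc code. The diff then removes this helper and its SIMD specializations (__m128i, __m256i, __m512i).

#include <cstddef>
#include <cstdint>
#include <cstring>

// Non-zero iff the first or last sizeof(T) bytes of the two buffers differ.
// Valid for sizeof(T) <= count <= 2 * sizeof(T); the head and tail loads may
// overlap, so more data than strictly necessary can be read.
template <typename T>
static std::uint32_t branchless_head_tail_neq(const char *p1, const char *p2,
                                              std::size_t count) {
  T head1, head2, tail1, tail2;
  std::memcpy(&head1, p1, sizeof(T));
  std::memcpy(&head2, p2, sizeof(T));
  std::memcpy(&tail1, p1 + count - sizeof(T), sizeof(T));
  std::memcpy(&tail2, p2 + count - sizeof(T), sizeof(T));
  return static_cast<std::uint32_t>((head1 != head2) | (tail1 != tail2));
}
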
diff --git a/libc/src/string/memory_utils/op_x86.h b/libc/src/string/memory_utils/op_x86.h
index 1b4052747552d..215cafb9fcfeb 100644
--- a/libc/src/string/memory_utils/op_x86.h
+++ b/libc/src/string/memory_utils/op_x86.h
@@ -73,15 +73,6 @@ struct Memcpy {
 namespace LIBC_NAMESPACE_DECL {
 namespace generic {
 
-// Not equals: returns non-zero iff values at head or tail differ.
-// This function typically loads more data than necessary when the two buffer
-// differs.
-template <typename T>
-LIBC_INLINE uint32_t branchless_head_tail_neq(CPtr p1, CPtr p2, size_t count) {
-  static_assert(cpp::is_integral_v<T>);
-  return neq<T>(p1, p2, 0) | neq<T>(p1, p2, count - sizeof(T));
-}
-
 ///////////////////////////////////////////////////////////////////////////////
 // Specializations for uint16_t
 template <> struct cmp_is_expensive<uint16_t> : public cpp::false_type {};
@@ -154,11 +145,6 @@ LIBC_INLINE MemcmpReturnType cmp_neq<uint64_t>(CPtr p1, CPtr p2,
 #if defined(__SSE4_1__)
 template <> struct is_vector<__m128i> : cpp::true_type {};
 template <> struct cmp_is_expensive<__m128i> : cpp::true_type {};
-LIBC_INLINE __m128i load_and_xor_m128i(CPtr p1, CPtr p2, size_t offset) {
-  const auto a = load<__m128i>(p1, offset);
-  const auto b = load<__m128i>(p2, offset);
-  return _mm_xor_si128(a, b);
-}
 LIBC_INLINE __m128i bytewise_max(__m128i a, __m128i b) {
   return _mm_max_epu8(a, b);
 }
@@ -170,21 +156,17 @@ LIBC_INLINE uint16_t big_endian_cmp_mask(__m128i max, __m128i value) {
   return static_cast<uint16_t>(
       _mm_movemask_epi8(bytewise_reverse(_mm_cmpeq_epi8(max, value))));
 }
-LIBC_INLINE bool is_zero(__m128i value) {
-  return _mm_testz_si128(value, value) == 1;
-}
 template <> LIBC_INLINE bool eq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
-  return is_zero(load_and_xor_m128i(p1, p2, offset));
+  const auto a = load<__m128i>(p1, offset);
+  const auto b = load<__m128i>(p2, offset);
+  const auto xored = _mm_xor_si128(a, b);
+  return _mm_testz_si128(xored, xored) == 1; // 1 iff xored == 0
 }
 template <> LIBC_INLINE uint32_t neq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
-  return !is_zero(load_and_xor_m128i(p1, p2, offset));
-}
-template <>
-LIBC_INLINE uint32_t branchless_head_tail_neq<__m128i>(CPtr p1, CPtr p2,
-                                                       size_t count) {
-  const __m128i head = load_and_xor_m128i(p1, p2, 0);
-  const __m128i tail = load_and_xor_m128i(p1, p2, count - sizeof(__m128i));
-  return !is_zero(_mm_or_si128(head, tail));
+  const auto a = load<__m128i>(p1, offset);
+  const auto b = load<__m128i>(p2, offset);
+  const auto xored = _mm_xor_si128(a, b);
+  return _mm_testz_si128(xored, xored) == 0; // 0 iff xored != 0
 }
 template <>
 LIBC_INLINE MemcmpReturnType cmp_neq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
@@ -203,34 +185,19 @@ LIBC_INLINE MemcmpReturnType cmp_neq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
 #if defined(__AVX__)
 template <> struct is_vector<__m256i> : cpp::true_type {};
 template <> struct cmp_is_expensive<__m256i> : cpp::true_type {};
-LIBC_INLINE __m256i xor_m256i(__m256i a, __m256i b) {
-  return _mm256_castps_si256(
-      _mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
-}
-LIBC_INLINE __m256i or_m256i(__m256i a, __m256i b) {
-  return _mm256_castps_si256(
-      _mm256_or_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
-}
-LIBC_INLINE __m256i load_and_xor_m256i(CPtr p1, CPtr p2, size_t offset) {
+template <> LIBC_INLINE bool eq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
   const auto a = load<__m256i>(p1, offset);
   const auto b = load<__m256i>(p2, offset);
-  return xor_m256i(a, b);
-}
-LIBC_INLINE bool is_zero(__m256i value) {
-  return _mm256_testz_si256(value, value) == 1;
-}
-template <> LIBC_INLINE bool eq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
-  return is_zero(load_and_xor_m256i(p1, p2, offset));
+  const auto xored = _mm256_castps_si256(
+      _mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
+  return _mm256_testz_si256(xored, xored) == 1; // 1 iff xored == 0
 }
 template <> LIBC_INLINE uint32_t neq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
-  return !is_zero(load_and_xor_m256i(p1, p2, offset));
-}
-template <>
-LIBC_INLINE uint32_t branchless_head_tail_neq<__m256i>(CPtr p1, CPtr p2,
-                                                       size_t count) {
-  const __m256i head = load_and_xor_m256i(p1, p2, 0);
-  const __m256i tail = load_and_xor_m256i(p1, p2, count - sizeof(__m256i));
-  return !is_zero(or_m256i(head, tail));
+  const auto a = load<__m256i>(p1, offset);
+  const auto b = load<__m256i>(p2, offset);
+  const auto xored = _mm256_castps_si256(
+      _mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
+  return _mm256_testz_si256(xored, xored) == 0; // 0 iff xored != 0
 }
 #endif // __AVX__
 
@@ -345,22 +312,9 @@ template <> LIBC_INLINE bool eq<__m512i>(CPtr p1, CPtr p2, size_t offset) {
 template <> LIBC_INLINE uint32_t neq<__m512i>(CPtr p1, CPtr p2, size_t offset) {
   const auto a = load<__m512i>(p1, offset);
   const auto b = load<__m512i>(p2, offset);
-  return _mm512_cmpneq_epi8_mask(a, b) != 0;
-}
-LIBC_INLINE __m512i load_and_xor_m512i(CPtr p1, CPtr p2, size_t offset) {
-  const auto a = load<__m512i>(p1, offset);
-  const auto b = load<__m512i>(p2, offset);
-  return _mm512_xor_epi64(a, b);
-}
-LIBC_INLINE bool is_zero(__m512i value) {
-  return _mm512_test_epi32_mask(value, value) == 0;
-}
-template <>
-LIBC_INLINE uint32_t branchless_head_tail_neq<__m512i>(CPtr p1, CPtr p2,
-                                                       size_t count) {
-  const __m512i head = load_and_xor_m512i(p1, p2, 0);
-  const __m512i tail = load_and_xor_m512i(p1, p2, count - sizeof(__m512i));
-  return !is_zero(_mm512_or_epi64(head, tail));
+  const uint64_t xored = _mm512_cmpneq_epi8_mask(a, b);
+  return static_cast<uint32_t>(xored >> 32) |
+         static_cast<uint32_t>(xored & 0xFFFFFFFF);
 }
 template <>
 LIBC_INLINE MemcmpReturnType cmp_neq<__m512i>(CPtr p1, CPtr p2, size_t offset) {
diff --git a/libc/src/string/memory_utils/x86_64/inline_bcmp.h b/libc/src/string/memory_utils/x86_64/inline_bcmp.h
index 8be391b428910..0eaf968f2f78d 100644
--- a/libc/src/string/memory_utils/x86_64/inline_bcmp.h
+++ b/libc/src/string/memory_utils/x86_64/inline_bcmp.h
@@ -27,7 +27,7 @@ inline_bcmp_generic_gt16(CPtr p1, CPtr p2, size_t count) {
 [[maybe_unused]] LIBC_INLINE BcmpReturnType
 inline_bcmp_x86_sse41_gt16(CPtr p1, CPtr p2, size_t count) {
   if (count <= 32)
-    return generic::branchless_head_tail_neq<__m128i>(p1, p2, count);
+    return generic::Bcmp<__m128i>::head_tail(p1, p2, count);
   return generic::Bcmp<__m128i>::loop_and_tail_align_above(256, p1, p2, count);
 }
 #endif // __SSE4_1__
@@ -36,9 +36,9 @@ inline_bcmp_x86_sse41_gt16(CPtr p1, CPtr p2, size_t count) {
 [[maybe_unused]] LIBC_INLINE BcmpReturnType
 inline_bcmp_x86_avx_gt16(CPtr p1, CPtr p2, size_t count) {
   if (count <= 32)
-    return generic::branchless_head_tail_neq<__m128i>(p1, p2, count);
+    return generic::Bcmp<__m128i>::head_tail(p1, p2, count);
   if (count <= 64)
-    return generic::branchless_head_tail_neq<__m256i>(p1, p2, count);
+    return generic::Bcmp<__m256i>::head_tail(p1, p2, count);
   return generic::Bcmp<__m256i>::loop_and_tail_align_above(256, p1, p2, count);
 }
 #endif // __AVX__
@@ -47,11 +47,11 @@ inline_bcmp_x86_avx_gt16(CPtr p1, CPtr p2, size_t count) {
 [[maybe_unused]] LIBC_INLINE BcmpReturnType
 inline_bcmp_x86_avx512bw_gt16(CPtr p1, CPtr p2, size_t count) {
   if (count <= 32)
-    return generic::branchless_head_tail_neq<__m128i>(p1, p2, count);
+    return generic::Bcmp<__m128i>::head_tail(p1, p2, count);
   if (count <= 64)
-    return generic::branchless_head_tail_neq<__m256i>(p1, p2, count);
+    return generic::Bcmp<__m256i>::head_tail(p1, p2, count);
   if (count <= 128)
-    return generic::branchless_head_tail_neq<__m512i>(p1, p2, count);
+    return generic::Bcmp<__m512i>::head_tail(p1, p2, count);
   return generic::Bcmp<__m512i>::loop_and_tail_align_above(256, p1, p2, count);
 }
 #endif // __AVX512BW__
@@ -62,12 +62,22 @@ inline_bcmp_x86_avx512bw_gt16(CPtr p1, CPtr p2, size_t count) {
     return BcmpReturnType::zero();
   if (count == 1)
     return generic::Bcmp<uint8_t>::block(p1, p2);
-  if (count <= 4)
-    return generic::branchless_head_tail_neq<uint16_t>(p1, p2, count);
-  if (count <= 8)
-    return generic::branchless_head_tail_neq<uint32_t>(p1, p2, count);
+  if (count == 2)
+    return generic::Bcmp<uint16_t>::block(p1, p2);
+  if (count == 3)
+    return generic::BcmpSequence<uint16_t, uint8_t>::block(p1, p2);
+  if (count == 4)
+    return generic::Bcmp<uint32_t>::block(p1, p2);
+  if (count == 5)
+    return generic::BcmpSequence<uint32_t, uint8_t>::block(p1, p2);
+  if (count == 6)
+    return generic::BcmpSequence<uint32_t, uint16_t>::block(p1, p2);
+  if (count == 7)
+    return generic::BcmpSequence<uint32_t, uint16_t, uint8_t>::block(p1, p2);
+  if (count == 8)
+    return generic::Bcmp<uint64_t>::block(p1, p2);
   if (count <= 16)
-    return generic::branchless_head_tail_neq<uint64_t>(p1, p2, count);
+    return generic::Bcmp<uint64_t>::head_tail(p1, p2, count);
 #if defined(__AVX512BW__)
   return inline_bcmp_x86_avx512bw_gt16(p1, p2, count);
 #elif defined(__AVX__)
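
The inline_bcmp.h hunk above restores exact-size comparisons for counts 2 through 8 using generic::Bcmp<T>::block and generic::BcmpSequence<...>::block. As a hedged sketch of the idea suggested by the BcmpSequence type lists in the diff (illustrative only, not the actual libc implementation), an odd count is covered by a sequence of fixed-width integer comparisons, e.g. 7 bytes compared as 4 + 2 + 1:

#include <cstdint>
#include <cstring>

// Non-zero iff the sizeof(T) bytes at p1 and p2 differ.
template <typename T>
static std::uint32_t neq_block(const char *p1, const char *p2) {
  T a, b;
  std::memcpy(&a, p1, sizeof(T));
  std::memcpy(&b, p2, sizeof(T));
  return a != b;
}

// count == 7: compare 4 bytes, then 2, then 1, OR-ing the results
// (the shape suggested by BcmpSequence<uint32_t, uint16_t, uint8_t>).
static std::uint32_t bcmp7(const char *p1, const char *p2) {
  return neq_block<std::uint32_t>(p1, p2) |
         neq_block<std::uint16_t>(p1 + 4, p2 + 4) |
         neq_block<std::uint8_t>(p1 + 6, p2 + 6);
}

Compared with the branchless head/tail path sketched earlier, this reads exactly count bytes from each buffer for every small size, at the cost of a longer dispatch chain.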

@gchatelet gchatelet merged commit 57726bd into main Oct 13, 2025
22 checks passed
@gchatelet gchatelet deleted the revert-107540-main branch October 13, 2025 21:25
akadutta pushed a commit to akadutta/llvm-project that referenced this pull request Oct 14, 2025
…lvm#162859)

Reverts llvm#107540

This PR demonstrated improvements on micro-benchmarks but the gains did
not seem to materialize in production. We are reverting this change for
now to get more data. This PR might be reintegrated later once we're
more confident in its effects.