Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
86 changes: 20 additions & 66 deletions libc/src/string/memory_utils/op_x86.h
Original file line number Diff line number Diff line change
Expand Up @@ -73,15 +73,6 @@ struct Memcpy {
namespace LIBC_NAMESPACE_DECL {
namespace generic {

// Not equals: returns non-zero iff values at head or tail differ.
// This function typically loads more data than necessary when the two buffer
// differs.
template <typename T>
LIBC_INLINE uint32_t branchless_head_tail_neq(CPtr p1, CPtr p2, size_t count) {
static_assert(cpp::is_integral_v<T>);
return neq<T>(p1, p2, 0) | neq<T>(p1, p2, count - sizeof(T));
}

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint16_t
template <> struct cmp_is_expensive<uint16_t> : public cpp::false_type {};
Expand Down Expand Up @@ -154,11 +145,6 @@ LIBC_INLINE MemcmpReturnType cmp_neq<uint64_t>(CPtr p1, CPtr p2,
#if defined(__SSE4_1__)
template <> struct is_vector<__m128i> : cpp::true_type {};
template <> struct cmp_is_expensive<__m128i> : cpp::true_type {};
LIBC_INLINE __m128i load_and_xor_m128i(CPtr p1, CPtr p2, size_t offset) {
const auto a = load<__m128i>(p1, offset);
const auto b = load<__m128i>(p2, offset);
return _mm_xor_si128(a, b);
}
LIBC_INLINE __m128i bytewise_max(__m128i a, __m128i b) {
return _mm_max_epu8(a, b);
}
Expand All @@ -170,21 +156,17 @@ LIBC_INLINE uint16_t big_endian_cmp_mask(__m128i max, __m128i value) {
return static_cast<uint16_t>(
_mm_movemask_epi8(bytewise_reverse(_mm_cmpeq_epi8(max, value))));
}
LIBC_INLINE bool is_zero(__m128i value) {
return _mm_testz_si128(value, value) == 1;
}
template <> LIBC_INLINE bool eq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
return is_zero(load_and_xor_m128i(p1, p2, offset));
const auto a = load<__m128i>(p1, offset);
const auto b = load<__m128i>(p2, offset);
const auto xored = _mm_xor_si128(a, b);
return _mm_testz_si128(xored, xored) == 1; // 1 iff xored == 0
}
template <> LIBC_INLINE uint32_t neq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
return !is_zero(load_and_xor_m128i(p1, p2, offset));
}
template <>
LIBC_INLINE uint32_t branchless_head_tail_neq<__m128i>(CPtr p1, CPtr p2,
size_t count) {
const __m128i head = load_and_xor_m128i(p1, p2, 0);
const __m128i tail = load_and_xor_m128i(p1, p2, count - sizeof(__m128i));
return !is_zero(_mm_or_si128(head, tail));
const auto a = load<__m128i>(p1, offset);
const auto b = load<__m128i>(p2, offset);
const auto xored = _mm_xor_si128(a, b);
return _mm_testz_si128(xored, xored) == 0; // 0 iff xored != 0
}
template <>
LIBC_INLINE MemcmpReturnType cmp_neq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
Expand All @@ -203,34 +185,19 @@ LIBC_INLINE MemcmpReturnType cmp_neq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
#if defined(__AVX__)
template <> struct is_vector<__m256i> : cpp::true_type {};
template <> struct cmp_is_expensive<__m256i> : cpp::true_type {};
LIBC_INLINE __m256i xor_m256i(__m256i a, __m256i b) {
return _mm256_castps_si256(
_mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
}
LIBC_INLINE __m256i or_m256i(__m256i a, __m256i b) {
return _mm256_castps_si256(
_mm256_or_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
}
LIBC_INLINE __m256i load_and_xor_m256i(CPtr p1, CPtr p2, size_t offset) {
template <> LIBC_INLINE bool eq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
const auto a = load<__m256i>(p1, offset);
const auto b = load<__m256i>(p2, offset);
return xor_m256i(a, b);
}
LIBC_INLINE bool is_zero(__m256i value) {
return _mm256_testz_si256(value, value) == 1;
}
template <> LIBC_INLINE bool eq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
return is_zero(load_and_xor_m256i(p1, p2, offset));
const auto xored = _mm256_castps_si256(
_mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
return _mm256_testz_si256(xored, xored) == 1; // 1 iff xored == 0
}
template <> LIBC_INLINE uint32_t neq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
return !is_zero(load_and_xor_m256i(p1, p2, offset));
}
template <>
LIBC_INLINE uint32_t branchless_head_tail_neq<__m256i>(CPtr p1, CPtr p2,
size_t count) {
const __m256i head = load_and_xor_m256i(p1, p2, 0);
const __m256i tail = load_and_xor_m256i(p1, p2, count - sizeof(__m256i));
return !is_zero(or_m256i(head, tail));
const auto a = load<__m256i>(p1, offset);
const auto b = load<__m256i>(p2, offset);
const auto xored = _mm256_castps_si256(
_mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
return _mm256_testz_si256(xored, xored) == 0; // 0 iff xored != 0
}
#endif // __AVX__

Expand Down Expand Up @@ -345,22 +312,9 @@ template <> LIBC_INLINE bool eq<__m512i>(CPtr p1, CPtr p2, size_t offset) {
template <> LIBC_INLINE uint32_t neq<__m512i>(CPtr p1, CPtr p2, size_t offset) {
const auto a = load<__m512i>(p1, offset);
const auto b = load<__m512i>(p2, offset);
return _mm512_cmpneq_epi8_mask(a, b) != 0;
}
LIBC_INLINE __m512i load_and_xor_m512i(CPtr p1, CPtr p2, size_t offset) {
const auto a = load<__m512i>(p1, offset);
const auto b = load<__m512i>(p2, offset);
return _mm512_xor_epi64(a, b);
}
LIBC_INLINE bool is_zero(__m512i value) {
return _mm512_test_epi32_mask(value, value) == 0;
}
template <>
LIBC_INLINE uint32_t branchless_head_tail_neq<__m512i>(CPtr p1, CPtr p2,
size_t count) {
const __m512i head = load_and_xor_m512i(p1, p2, 0);
const __m512i tail = load_and_xor_m512i(p1, p2, count - sizeof(__m512i));
return !is_zero(_mm512_or_epi64(head, tail));
const uint64_t xored = _mm512_cmpneq_epi8_mask(a, b);
return static_cast<uint32_t>(xored >> 32) |
static_cast<uint32_t>(xored & 0xFFFFFFFF);
}
template <>
LIBC_INLINE MemcmpReturnType cmp_neq<__m512i>(CPtr p1, CPtr p2, size_t offset) {
Expand Down
32 changes: 21 additions & 11 deletions libc/src/string/memory_utils/x86_64/inline_bcmp.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ inline_bcmp_generic_gt16(CPtr p1, CPtr p2, size_t count) {
[[maybe_unused]] LIBC_INLINE BcmpReturnType
inline_bcmp_x86_sse41_gt16(CPtr p1, CPtr p2, size_t count) {
if (count <= 32)
return generic::branchless_head_tail_neq<__m128i>(p1, p2, count);
return generic::Bcmp<__m128i>::head_tail(p1, p2, count);
return generic::Bcmp<__m128i>::loop_and_tail_align_above(256, p1, p2, count);
}
#endif // __SSE4_1__
Expand All @@ -36,9 +36,9 @@ inline_bcmp_x86_sse41_gt16(CPtr p1, CPtr p2, size_t count) {
[[maybe_unused]] LIBC_INLINE BcmpReturnType
inline_bcmp_x86_avx_gt16(CPtr p1, CPtr p2, size_t count) {
if (count <= 32)
return generic::branchless_head_tail_neq<__m128i>(p1, p2, count);
return generic::Bcmp<__m128i>::head_tail(p1, p2, count);
if (count <= 64)
return generic::branchless_head_tail_neq<__m256i>(p1, p2, count);
return generic::Bcmp<__m256i>::head_tail(p1, p2, count);
return generic::Bcmp<__m256i>::loop_and_tail_align_above(256, p1, p2, count);
}
#endif // __AVX__
Expand All @@ -47,11 +47,11 @@ inline_bcmp_x86_avx_gt16(CPtr p1, CPtr p2, size_t count) {
[[maybe_unused]] LIBC_INLINE BcmpReturnType
inline_bcmp_x86_avx512bw_gt16(CPtr p1, CPtr p2, size_t count) {
if (count <= 32)
return generic::branchless_head_tail_neq<__m128i>(p1, p2, count);
return generic::Bcmp<__m128i>::head_tail(p1, p2, count);
if (count <= 64)
return generic::branchless_head_tail_neq<__m256i>(p1, p2, count);
return generic::Bcmp<__m256i>::head_tail(p1, p2, count);
if (count <= 128)
return generic::branchless_head_tail_neq<__m512i>(p1, p2, count);
return generic::Bcmp<__m512i>::head_tail(p1, p2, count);
return generic::Bcmp<__m512i>::loop_and_tail_align_above(256, p1, p2, count);
}
#endif // __AVX512BW__
Expand All @@ -62,12 +62,22 @@ inline_bcmp_x86_avx512bw_gt16(CPtr p1, CPtr p2, size_t count) {
return BcmpReturnType::zero();
if (count == 1)
return generic::Bcmp<uint8_t>::block(p1, p2);
if (count <= 4)
return generic::branchless_head_tail_neq<uint16_t>(p1, p2, count);
if (count <= 8)
return generic::branchless_head_tail_neq<uint32_t>(p1, p2, count);
if (count == 2)
return generic::Bcmp<uint16_t>::block(p1, p2);
if (count == 3)
return generic::BcmpSequence<uint16_t, uint8_t>::block(p1, p2);
if (count == 4)
return generic::Bcmp<uint32_t>::block(p1, p2);
if (count == 5)
return generic::BcmpSequence<uint32_t, uint8_t>::block(p1, p2);
if (count == 6)
return generic::BcmpSequence<uint32_t, uint16_t>::block(p1, p2);
if (count == 7)
return generic::BcmpSequence<uint32_t, uint16_t, uint8_t>::block(p1, p2);
if (count == 8)
return generic::Bcmp<uint64_t>::block(p1, p2);
if (count <= 16)
return generic::branchless_head_tail_neq<uint64_t>(p1, p2, count);
return generic::Bcmp<uint64_t>::head_tail(p1, p2, count);
#if defined(__AVX512BW__)
return inline_bcmp_x86_avx512bw_gt16(p1, p2, count);
#elif defined(__AVX__)
Expand Down
Loading