Skip to content

Commit e278a3d

Browse files
authored
Implement intersperse using SSE2 (#310)
1 parent 37b4da8 commit e278a3d

File tree

2 files changed, with 23 additions and 1 deletion.

bench/BenchAll.hs

Lines changed: 4 additions & 0 deletions
@@ -361,6 +361,10 @@ main = do
       , benchFE "floatHexFixed" $ fromIntegral >$< P.floatHexFixed
       , benchFE "doubleHexFixed" $ fromIntegral >$< P.doubleHexFixed
       ]
+    , bgroup "intersperse"
+      [ bench "intersperse" $ whnf (S.intersperse 32) byteStringData
+      , bench "intersperse (unaligned)" $ whnf (S.intersperse 32) (S.drop 1 byteStringData)
+      ]
     , bgroup "partition"
       [
         bgroup "strict"

cbits/fpstring.c

Lines changed: 19 additions & 1 deletion
@@ -30,6 +30,10 @@
  */
 
 #include "fpstring.h"
+#if defined(__x86_64__)
+#include <emmintrin.h>
+#include <xmmintrin.h>
+#endif
 
 /* copy a string in reverse */
 void fps_reverse(unsigned char *q, unsigned char *p, size_t n) {
@@ -44,7 +48,21 @@ void fps_intersperse(unsigned char *q,
                      unsigned char *p,
                      size_t n,
                      unsigned char c) {
-
+#if defined(__x86_64__)
+    {
+      const __m128i separator = _mm_set1_epi8(c);
+      const unsigned char *const p_begin = p;
+      const unsigned char *const p_end = p_begin + n - 9;
+      while (p < p_end) {
+        const __m128i eight_src_bytes = _mm_loadl_epi64((__m128i *)p);
+        const __m128i sixteen_dst_bytes = _mm_unpacklo_epi8(eight_src_bytes, separator);
+        _mm_storeu_si128((__m128i *)q, sixteen_dst_bytes);
+        p += 8;
+        q += 16;
+      }
+      n -= p - p_begin;
+    }
+#endif
     while (n > 1) {
         *q++ = *p++;
         *q++ = c;

0 commit comments

Comments
 (0)