Commit f9ed152

Unrolled the load/store of data from/to the ringbuf

1 parent: c903e1d

1 file changed: inc/fast/ringbuf.h (+102 additions, -20 deletions)
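The core of the change is a bulk copy that is unrolled four ways, with a switch to mop up the 0-3 leftover slots, taken only when the burst does not wrap around the end of the ring. Below is a minimal standalone sketch of that pattern; copy4 and the test in main are illustrative names, not code from this commit.

/* Sketch: unroll-by-4 copy with a remainder switch (illustrative, not from ringbuf.h) */
#include <assert.h>

static void
copy4(void **dst, void * const *src, unsigned n)
{
    unsigned i;
    unsigned loops = n & ~0x3;          /* count of elements copied 4 at a time */

    for (i = 0; i < loops; i += 4) {
        dst[i+0] = src[i+0];
        dst[i+1] = src[i+1];
        dst[i+2] = src[i+2];
        dst[i+3] = src[i+3];
    }

    switch (n & 0x3) {                  /* mop up the 0..3 remaining elements */
        case 3:
            dst[i+0] = src[i+0];
            dst[i+1] = src[i+1];
            dst[i+2] = src[i+2];
            break;
        case 2:
            dst[i+0] = src[i+0];
            dst[i+1] = src[i+1];
            break;
        case 1:
            dst[i+0] = src[i+0];
            break;
    }
}

int
main(void)
{
    void *src[7], *dst[7];
    unsigned i;

    for (i = 0; i < 7; i++)
        src[i] = &src[i];

    copy4(dst, src, 7);                 /* one unrolled pass plus a 3-element tail */
    for (i = 0; i < 7; i++)
        assert(dst[i] == src[i]);
    return 0;
}

The same structure appears twice in the commit: once for stores into the ring and once for loads out of it.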
@@ -149,7 +149,7 @@ enum rte_ring_queue_behavior {
  * values in a modulo-32bit base: that's why the overflow of the indexes is not
  * a problem.
  */
-struct __CACHELINE_ALIGNED __rte_index {
+struct __rte_index {
     atomic_uint_fast32_t head;
     atomic_uint_fast32_t tail;
 };
@@ -163,8 +163,8 @@ struct __CACHELINE_ALIGNED rte_ring {
     };
 
     /** Ring producer & consumer status. */
-    struct __rte_index prod;
-    struct __rte_index cons;
+    struct __rte_index prod __CACHELINE_ALIGNED;
+    struct __rte_index cons __CACHELINE_ALIGNED;
 
     void * volatile ring[0] __CACHELINE_ALIGNED;
 };
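Moving __CACHELINE_ALIGNED from the struct __rte_index definition onto the prod and cons members keeps the producer and consumer index pairs on separate cache lines, so producers updating prod.head/tail do not bounce the line that consumers poll. A C11 sketch of that layout, assuming a 64-byte line and using stand-in names (idx, ring_hdr, CACHELINE):

/* Sketch of the per-member cache-line alignment (stand-in names, not from ringbuf.h) */
#include <assert.h>
#include <stdalign.h>
#include <stdatomic.h>
#include <stddef.h>

#define CACHELINE 64    /* assumed line size; the real header defines its own constant */

struct idx {            /* stand-in for struct __rte_index */
    atomic_uint_fast32_t head;
    atomic_uint_fast32_t tail;
};

struct ring_hdr {       /* stand-in for the head of struct rte_ring */
    alignas(CACHELINE) struct idx prod;   /* touched by producers */
    alignas(CACHELINE) struct idx cons;   /* touched by consumers */
};

int
main(void)
{
    /* prod and cons start on different cache lines, so a producer writing
     * prod.head does not invalidate the line holding cons.head/cons.tail */
    assert(offsetof(struct ring_hdr, cons) % CACHELINE == 0);
    assert(offsetof(struct ring_hdr, cons) - offsetof(struct ring_hdr, prod) >= CACHELINE);
    return 0;
}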
@@ -230,6 +230,99 @@ rte_ring_destroy(struct rte_ring *r)
 }
 
 
+/*
+ * Unrolled store of data to the ring
+ */
+static inline void
+__store_ring(struct rte_ring *r, uint_fast32_t head, void * const* obj, unsigned n)
+{
+    unsigned i;
+    unsigned loops = n & ~0x3;
+    unsigned idx   = head & r->mask;
+
+
+    // If we know we won't wrap around, we unroll 4 times
+    if (likely((idx + n) <= r->mask)) {
+        for (i = 0; i < loops; i += 4, idx += 4) {
+            r->ring[idx+0] = obj[i+0];
+            r->ring[idx+1] = obj[i+1];
+            r->ring[idx+2] = obj[i+2];
+            r->ring[idx+3] = obj[i+3];
+        }
+
+        // mop up the remainder
+        switch (n & 0x3) {
+            case 3:
+                r->ring[idx+0] = obj[i+0];
+                r->ring[idx+1] = obj[i+1];
+                r->ring[idx+2] = obj[i+2];
+                break;
+
+            case 2:
+                r->ring[idx+0] = obj[i+0];
+                r->ring[idx+1] = obj[i+1];
+                break;
+
+            case 1:
+                r->ring[idx+0] = obj[i+0];
+                break;
+        }
+    } else {
+        const uint32_t mask = r->mask;
+
+        for (i = 0; i < n; i++, idx++) {
+            r->ring[idx & mask] = obj[i];
+        }
+    }
+}
+
+
+/*
+ * Unrolled load of data from the ring
+ */
+static inline void
+__load_ring(struct rte_ring *r, uint_fast32_t head, void **obj, unsigned n)
+{
+    unsigned i;
+    unsigned loops = n & ~0x3;
+    unsigned idx   = head & r->mask;
+
+
+    // If we know we won't wrap around, we unroll 4 times
+    if (likely((idx + n) <= r->mask)) {
+        for (i = 0; i < loops; i += 4, idx += 4) {
+            obj[i+0] = r->ring[idx+0];
+            obj[i+1] = r->ring[idx+1];
+            obj[i+2] = r->ring[idx+2];
+            obj[i+3] = r->ring[idx+3];
+        }
+
+        // mop up the remainder
+        switch (n & 0x3) {
+            case 3:
+                obj[i+0] = r->ring[idx+0];
+                obj[i+1] = r->ring[idx+1];
+                obj[i+2] = r->ring[idx+2];
+                break;
+
+            case 2:
+                obj[i+0] = r->ring[idx+0];
+                obj[i+1] = r->ring[idx+1];
+                break;
+
+            case 1:
+                obj[i+0] = r->ring[idx+0];
+                break;
+        }
+    } else {
+        const uint32_t mask = r->mask;
+        for (i = 0; i < n; i++, idx++) {
+            obj[i] = r->ring[idx & mask];
+        }
+    }
+}
+
+
 /**
  * @internal Enqueue several objects on the ring (multi-producers safe).
  *
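The guard likely((idx + n) <= r->mask) is what lets the unrolled path drop the per-slot & mask: with a power-of-two ring and mask = size - 1, the whole burst stays inside the array, and only a wrapping burst falls back to the masked scalar loop. (The test is conservative by one slot: a burst ending exactly on the last slot also takes the fallback.) A standalone sketch of that split, with illustrative names:

/* Sketch: skip masking when the burst cannot wrap (illustrative, not from ringbuf.h) */
#include <assert.h>
#include <stdint.h>

static void
ring_copy_out(void **dst, void * const *ring, uint32_t mask,
              uint_fast32_t head, unsigned n)
{
    unsigned idx = head & mask;

    if (idx + n <= mask) {                 /* burst does not wrap */
        for (unsigned i = 0; i < n; i++)
            dst[i] = ring[idx + i];
    } else {                               /* wraps: mask every slot index */
        for (unsigned i = 0; i < n; i++, idx++)
            dst[i] = ring[idx & mask];
    }
}

int
main(void)
{
    void *ring[8], *out[5];
    uint32_t mask = 7;                     /* 8-slot ring */

    for (unsigned i = 0; i < 8; i++)
        ring[i] = &ring[i];

    ring_copy_out(out, ring, mask, /*head=*/6, /*n=*/5);   /* wraps: slots 6,7,0,1,2 */
    assert(out[0] == ring[6] && out[2] == ring[0] && out[4] == ring[2]);
    return 0;
}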
@@ -265,7 +358,6 @@ __rte_ring_mp_do_enqueue(struct rte_ring *r, void * const *obj_table,
     uint_fast32_t prod_head, prod_next;
     uint_fast32_t cons_tail, free_entries;
     int success;
-    unsigned i;
 
     /* move prod.head atomically */
     do {
@@ -298,8 +390,7 @@ __rte_ring_mp_do_enqueue(struct rte_ring *r, void * const *obj_table,
     } while (unlikely(success == 0));
 
     /* write entries in ring */
-    for (i = 0; likely(i < n); i++)
-        r->ring[(prod_head + i) & mask] = obj_table[i];
+    __store_ring(r, prod_head, obj_table, n);
 
     /*
      * If there are other enqueues in progress that preceeded us,
@@ -342,7 +433,6 @@ __rte_ring_sp_do_enqueue(struct rte_ring *r, void * const *obj_table,
     const uint32_t mask = r->mask;
     uint_fast32_t prod_head, cons_tail;
     uint_fast32_t prod_next, free_entries;
-    unsigned i;
 
     prod_head = atomic_load_explicit(&r->prod.head, memory_order_acquire);
     cons_tail = atomic_load_explicit(&r->cons.tail, memory_order_acquire);
@@ -367,9 +457,9 @@ __rte_ring_sp_do_enqueue(struct rte_ring *r, void * const *obj_table,
     atomic_store_explicit(&r->prod.head, prod_next, memory_order_release);
 
     /* write entries in ring */
-    for (i = 0; likely(i < n); i++)
-        r->ring[(prod_head + i) & mask] = obj_table[i];
+    __store_ring(r, prod_head, obj_table, n);
 
+    assert(atomic_load(&r->prod.tail) == prod_head);
     atomic_store_explicit(&r->prod.tail, prod_next, memory_order_release);
 
     return n;
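The new assert documents the single-producer invariant: between reserving space (advancing prod.head) and publishing it (advancing prod.tail), no other producer can touch the tail, so it must still equal the old head. A reduced C11-atomics sketch of that reserve/copy/publish sequence follows; sp_publish and prod_idx are illustrative names, and the real code also checks free space and copies the entries into the ring.

/* Sketch: single-producer reserve/copy/publish with the tail invariant asserted */
#include <assert.h>
#include <stdatomic.h>
#include <stdint.h>

struct prod_idx {
    atomic_uint_fast32_t head;
    atomic_uint_fast32_t tail;
};

static void
sp_publish(struct prod_idx *p, unsigned n)
{
    uint_fast32_t old  = atomic_load_explicit(&p->head, memory_order_acquire);
    uint_fast32_t next = old + n;

    atomic_store_explicit(&p->head, next, memory_order_release);   /* reserve */

    /* ... copy the n entries into the ring here (see __store_ring) ... */

    /* no other producer exists, so nobody advanced tail in the meantime */
    assert(atomic_load(&p->tail) == old);
    atomic_store_explicit(&p->tail, next, memory_order_release);   /* publish */
}

int
main(void)
{
    struct prod_idx p;

    atomic_init(&p.head, 0);
    atomic_init(&p.tail, 0);

    sp_publish(&p, 4);
    sp_publish(&p, 3);
    assert(atomic_load(&p.tail) == 7);
    return 0;
}

The assert added to __rte_ring_sc_do_dequeue later in this diff checks the mirrored invariant for cons.tail on the single-consumer path.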
@@ -404,11 +494,9 @@ __rte_ring_mc_do_dequeue(struct rte_ring *r, void **obj_table,
                          unsigned n, enum rte_ring_queue_behavior behavior)
 {
     const unsigned max = n;
-    const uint32_t mask = r->mask;
     uint_fast32_t cons_head, prod_tail;
     uint_fast32_t cons_next, entries;
     int success;
-    unsigned i;
 
     /* move cons.head atomically */
     do {
@@ -438,8 +526,7 @@ __rte_ring_mc_do_dequeue(struct rte_ring *r, void **obj_table,
     } while (likely(success == 0));
 
 
-    for (i = 0; likely(i < n); i++)
-        obj_table[i] = r->ring[(cons_head + i) & mask];
+    __load_ring(r, cons_head, obj_table, n);
 
     /*
      * If there are other dequeues in progress that preceded us,
@@ -480,10 +567,8 @@ static inline int
 __rte_ring_sc_do_dequeue(struct rte_ring *r, void **obj_table,
                          unsigned n, enum rte_ring_queue_behavior behavior)
 {
-    const uint32_t mask = r->mask;
     uint_fast32_t cons_head, prod_tail;
     uint_fast32_t cons_next, entries;
-    unsigned i;
 
     cons_head = atomic_load_explicit(&r->cons.head, memory_order_acquire);
     prod_tail = atomic_load_explicit(&r->prod.tail, memory_order_acquire);
@@ -503,12 +588,9 @@ __rte_ring_sc_do_dequeue(struct rte_ring *r, void **obj_table,
     cons_next = cons_head + n;
     atomic_store_explicit(&r->cons.head, cons_next, memory_order_release);
 
-    for (i = 0; likely(i < n); i++) {
-        /* WTF??? WHY DOES THIS CODE GIVE STRICT-ALIASING WARNINGS
-         * ON SOME GCC. THEY ARE FREAKING VOID* !!! */
-        obj_table[i] = r->ring[(cons_head + i) & mask];
-    }
+    __load_ring(r, cons_head, obj_table, n);
 
+    assert(atomic_load(&r->cons.tail) == cons_head);
     atomic_store_explicit(&r->cons.tail, cons_next, memory_order_release);
     return n;
 }
