Skip to content

Commit df10c10

Browse files
ckennelly authored and copybara-github committed
Leverage asm goto for delete fast path.
When building with LLVM, exclude builds with SLH (speculative load hardening), as this does not support asm goto. This was previously fb48655, reverted as 629bd65. PiperOrigin-RevId: 317120677 Change-Id: I7947d29a0ba54ffef36a396502883519010b7d97
1 parent 0e804fc commit df10c10

File tree

1 file changed

+69
-25
lines changed

1 file changed

+69
-25
lines changed

tcmalloc/internal/percpu_tcmalloc.h

Lines changed: 69 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,24 @@
2525
#include "tcmalloc/internal/mincore.h"
2626
#include "tcmalloc/internal/percpu.h"
2727

28+
#if defined(PERCPU_USE_RSEQ)
29+
#if !defined(__clang__)
30+
#define PERCPU_USE_RSEQ_ASM_GOTO 1
31+
#elif __clang_major__ >= 9 && !__has_feature(speculative_load_hardening)
32+
// asm goto requires the use of Clang 9 or newer:
33+
// https://releases.llvm.org/9.0.0/tools/clang/docs/ReleaseNotes.html#c-language-changes-in-clang
34+
//
35+
// SLH (Speculative Load Hardening) builds do not support asm goto. We can
36+
// detect these compilation modes since
37+
// https://github.com/llvm/llvm-project/commit/379e68a763097bed55556c6dc7453e4b732e3d68.
38+
#define PERCPU_USE_RSEQ_ASM_GOTO 1
39+
#else
40+
#define PERCPU_USE_RSEQ_ASM_GOTO 0
41+
#endif
42+
#else
43+
#define PERCPU_USE_RSEQ_ASM_GOTO 0
44+
#endif
45+
2846
namespace tcmalloc {
2947

3048
struct PerCPUMetadataState {
@@ -229,10 +247,12 @@ template <size_t Shift, size_t NumClasses>
229247
static inline ABSL_ATTRIBUTE_ALWAYS_INLINE int TcmallocSlab_Push(
230248
typename TcmallocSlab<Shift, NumClasses>::Slabs* slabs, size_t cl,
231249
void* item, OverflowHandler f) {
232-
// TODO(b/149467541): Move this to asm goto.
233-
uint64_t scratch, current;
250+
#ifdef PERCPU_USE_RSEQ_ASM_GOTO
251+
asm goto(
252+
#else
234253
bool overflow;
235254
asm volatile(
255+
#endif
236256
// TODO(b/141629158): __rseq_cs only needs to be writeable to allow for
237257
// relocations, but could be read-only for non-PIE builds.
238258
".pushsection __rseq_cs, \"aw?\"\n"
@@ -267,45 +287,69 @@ static inline ABSL_ATTRIBUTE_ALWAYS_INLINE int TcmallocSlab_Push(
267287
"jmp 3f\n"
268288
".popsection\n"
269289
// Prepare
290+
//
291+
// TODO(b/151503411): Pending widespread availability of LLVM's asm
292+
// goto with output constraints
293+
// (https://github.com/llvm/llvm-project/commit/23c2a5ce33f0), we can
294+
// return the register allocations to the compiler rather than using
295+
// explicit clobbers. Prior to this, blocks which use asm goto cannot
296+
// also specify outputs.
297+
//
298+
// r10: Scratch
299+
// r11: Current
270300
"3:\n"
271-
"lea __rseq_cs_TcmallocSlab_Push_%=(%%rip), %[scratch]\n"
272-
"mov %[scratch], %c[rseq_cs_offset](%[rseq_abi])\n"
301+
"lea __rseq_cs_TcmallocSlab_Push_%=(%%rip), %%r10\n"
302+
"mov %%r10, %c[rseq_cs_offset](%[rseq_abi])\n"
273303
// Start
274304
"4:\n"
275-
// scratch = __rseq_abi.cpu_id;
276-
"mov %c[rseq_cpu_offset](%[rseq_abi]), %k[scratch]\n"
277-
// scratch = slabs + scratch
278-
"shl %[shift], %[scratch]\n"
279-
"add %[slabs], %[scratch]\n"
305+
// r10 = __rseq_abi.cpu_id;
306+
"mov %c[rseq_cpu_offset](%[rseq_abi]), %%r10d\n"
307+
// r10 = slabs + r10
308+
"shl %[shift], %%r10\n"
309+
"add %[slabs], %%r10\n"
280310
// r11 = slabs->current;
281-
"movzwq (%[scratch], %[cl], 8), %[current]\n"
311+
"movzwq (%%r10, %[cl], 8), %%r11\n"
282312
// if (ABSL_PREDICT_FALSE(r11 >= slabs->end)) { goto overflow; }
283-
"cmp 6(%[scratch], %[cl], 8), %w[current]\n"
313+
"cmp 6(%%r10, %[cl], 8), %%r11w\n"
314+
#ifdef PERCPU_USE_RSEQ_ASM_GOTO
315+
"jae %l[overflow_label]\n"
316+
#else
284317
"jae 5f\n"
285-
// Important! code below this must not affect any flags (i.e.: ccae)
286-
// If so, the above code needs to explicitly set a ccae return value.
287-
"mov %[item], (%[scratch], %[current], 8)\n"
288-
"lea 1(%[current]), %[current]\n"
289-
"mov %w[current], (%[scratch], %[cl], 8)\n"
318+
// Important! code below this must not affect any flags (i.e.: ccae)
319+
// If so, the above code needs to explicitly set a ccae return value.
320+
#endif
321+
"mov %[item], (%%r10, %%r11, 8)\n"
322+
"lea 1(%%r11), %%r11\n"
323+
"mov %%r11w, (%%r10, %[cl], 8)\n"
290324
// Commit
291325
"5:\n"
292-
: [current] "=&r"(current), [scratch] "=&r"(scratch),
293-
[overflow] "=@ccae"(overflow)
326+
:
327+
#ifndef PERCPU_USE_RSEQ_ASM_GOTO
328+
[overflow] "=@ccae"(overflow)
329+
#endif
294330
: [rseq_abi] "r"(&__rseq_abi),
295331
[rseq_cs_offset] "n"(offsetof(kernel_rseq, rseq_cs)),
296332
// TODO(b/130894622): When using virtual CPU IDs, this will be dynamic.
297333
[rseq_cpu_offset] "n"(offsetof(kernel_rseq, cpu_id)),
298334
[rseq_sig] "in"(PERCPU_RSEQ_SIGNATURE), [shift] "in"(Shift),
299335
[slabs] "r"(slabs), [cl] "r"(cl), [item] "r"(item)
300-
: "cc", "memory");
301-
// Undo transformation of cpu_id to the value of scratch.
302-
int cpu = reinterpret_cast<typename TcmallocSlab<Shift, NumClasses>::Slabs*>(
303-
scratch) -
304-
slabs;
336+
: "cc", "memory", "r10", "r11"
337+
#ifdef PERCPU_USE_RSEQ_ASM_GOTO
338+
: overflow_label
339+
#endif
340+
);
341+
#ifndef PERCPU_USE_RSEQ_ASM_GOTO
305342
if (ABSL_PREDICT_FALSE(overflow)) {
306-
return f(cpu, cl, item);
343+
goto overflow_label;
307344
}
308-
return cpu;
345+
#endif
346+
return 0;
347+
overflow_label:
348+
// As of 3/2020, LLVM's asm goto (even with output constraints) only provides
349+
// values for the fallthrough path. The values on the taken branches are
350+
// undefined.
351+
int cpu = __rseq_abi.cpu_id;
352+
return f(cpu, cl, item);
309353
}
310354
#endif // defined(__x86_64__)
311355

0 commit comments

Comments
 (0)