|
25 | 25 | #include "tcmalloc/internal/mincore.h"
|
26 | 26 | #include "tcmalloc/internal/percpu.h"
|
27 | 27 |
|
// Determine whether the rseq fast paths may use "asm goto".
//
// NOTE: PERCPU_USE_RSEQ_ASM_GOTO is deliberately left *undefined* when asm
// goto is unavailable, rather than being defined to 0.  Consumers test it
// with "#ifdef"/"#ifndef", which treat even a 0-valued definition as
// enabled; leaving the macro undefined keeps "#ifdef", "#ifndef", and "#if"
// tests all consistent.
#if defined(PERCPU_USE_RSEQ)
#if !defined(__clang__)
// GCC supports asm goto.
#define PERCPU_USE_RSEQ_ASM_GOTO 1
#elif __clang_major__ >= 9 && !__has_feature(speculative_load_hardening)
// asm goto requires the use of Clang 9 or newer:
// https://releases.llvm.org/9.0.0/tools/clang/docs/ReleaseNotes.html#c-language-changes-in-clang
//
// SLH (Speculative Load Hardening) builds do not support asm goto.  We can
// detect these compilation modes since
// https://github.com/llvm/llvm-project/commit/379e68a763097bed55556c6dc7453e4b732e3d68.
#define PERCPU_USE_RSEQ_ASM_GOTO 1
#endif
#endif  // defined(PERCPU_USE_RSEQ)
| 45 | + |
28 | 46 | namespace tcmalloc {
|
29 | 47 |
|
30 | 48 | struct PerCPUMetadataState {
|
@@ -229,10 +247,12 @@ template <size_t Shift, size_t NumClasses>
|
// TcmallocSlab_Push: pushes `item` onto the current CPU's freelist for size
// class `cl` inside a restartable-sequence (rseq) critical section.  Returns
// 0 on success; if the per-CPU list is full, calls the OverflowHandler `f`
// and returns its result.
229 | 247 | static inline ABSL_ATTRIBUTE_ALWAYS_INLINE int TcmallocSlab_Push(
|
230 | 248 | typename TcmallocSlab<Shift, NumClasses>::Slabs* slabs, size_t cl,
|
231 | 249 | void* item, OverflowHandler f) {
|
232 |
| - // TODO(b/149467541): Move this to asm goto. |
233 |
| - uint64_t scratch, current; |
// NOTE(review): if PERCPU_USE_RSEQ_ASM_GOTO can be defined as 0 when the
// asm-goto path is disabled, "#ifdef" here (and the "#ifdef"/"#ifndef" uses
// below) is still true for a 0-valued definition and would wrongly select
// the asm-goto path.  Prefer "#if PERCPU_USE_RSEQ_ASM_GOTO", or ensure the
// macro is only defined when asm goto is actually usable.
| 250 | +#ifdef PERCPU_USE_RSEQ_ASM_GOTO |
| 251 | + asm goto( |
| 252 | +#else |
234 | 253 | bool overflow;
|
235 | 254 | asm volatile(
|
| 255 | +#endif |
236 | 256 | // TODO(b/141629158): __rseq_cs only needs to be writeable to allow for
|
237 | 257 | // relocations, but could be read-only for non-PIE builds.
|
238 | 258 | ".pushsection __rseq_cs, \"aw?\"\n"
|
@@ -267,45 +287,69 @@ static inline ABSL_ATTRIBUTE_ALWAYS_INLINE int TcmallocSlab_Push(
|
267 | 287 | "jmp 3f\n"
|
268 | 288 | ".popsection\n"
|
269 | 289 | // Prepare
|
| 290 | + // |
| 291 | + // TODO(b/151503411): Pending widespread availability of LLVM's asm |
| 292 | + // goto with output constraints |
| 293 | + // (https://github.com/llvm/llvm-project/commit/23c2a5ce33f0), we can |
| 294 | + // return the register allocations to the compiler rather than using |
| 295 | + // explicit clobbers. Prior to this, blocks which use asm goto cannot |
| 296 | + // also specify outputs. |
| 297 | + // |
| 298 | + // r10: Scratch |
| 299 | + // r11: Current |
270 | 300 | "3:\n"
|
271 |
| - "lea __rseq_cs_TcmallocSlab_Push_%=(%%rip), %[scratch]\n" |
272 |
| - "mov %[scratch], %c[rseq_cs_offset](%[rseq_abi])\n" |
// Register __rseq_cs_TcmallocSlab_Push as the active critical section by
// storing its address into __rseq_abi.rseq_cs.
| 301 | + "lea __rseq_cs_TcmallocSlab_Push_%=(%%rip), %%r10\n" |
| 302 | + "mov %%r10, %c[rseq_cs_offset](%[rseq_abi])\n" |
273 | 303 | // Start
|
274 | 304 | "4:\n"
|
275 |
| - // scratch = __rseq_abi.cpu_id; |
276 |
| - "mov %c[rseq_cpu_offset](%[rseq_abi]), %k[scratch]\n" |
277 |
| - // scratch = slabs + scratch |
278 |
| - "shl %[shift], %[scratch]\n" |
279 |
| - "add %[slabs], %[scratch]\n" |
| 305 | + // r10 = __rseq_abi.cpu_id; |
| 306 | + "mov %c[rseq_cpu_offset](%[rseq_abi]), %%r10d\n" |
| 307 | + // r10 = slabs + r10 |
| 308 | + "shl %[shift], %%r10\n" |
| 309 | + "add %[slabs], %%r10\n" |
280 | 310 | // r11 = slabs->current;
|
281 |
| - "movzwq (%[scratch], %[cl], 8), %[current]\n" |
| 311 | + "movzwq (%%r10, %[cl], 8), %%r11\n" |
282 | 312 | // if (ABSL_PREDICT_FALSE(r11 >= slabs->end)) { goto overflow; }
|
283 |
| - "cmp 6(%[scratch], %[cl], 8), %w[current]\n" |
| 313 | + "cmp 6(%%r10, %[cl], 8), %%r11w\n" |
| 314 | +#ifdef PERCPU_USE_RSEQ_ASM_GOTO |
| 315 | + "jae %l[overflow_label]\n" |
| 316 | +#else |
284 | 317 | "jae 5f\n"
|
285 |
| - // Important! code below this must not affect any flags (i.e.: ccae) |
286 |
| - // If so, the above code needs to explicitly set a ccae return value. |
287 |
| - "mov %[item], (%[scratch], %[current], 8)\n" |
288 |
| - "lea 1(%[current]), %[current]\n" |
289 |
| - "mov %w[current], (%[scratch], %[cl], 8)\n" |
| 318 | + // Important! code below this must not affect any flags (i.e.: ccae) |
| 319 | + // If so, the above code needs to explicitly set a ccae return value. |
| 320 | +#endif |
| 321 | + "mov %[item], (%%r10, %%r11, 8)\n" |
| 322 | + "lea 1(%%r11), %%r11\n" |
| 323 | + "mov %%r11w, (%%r10, %[cl], 8)\n" |
290 | 324 | // Commit
|
291 | 325 | "5:\n"
|
292 |
| - : [current] "=&r"(current), [scratch] "=&r"(scratch), |
293 |
| - [overflow] "=@ccae"(overflow) |
// Output operands: asm goto cannot have outputs (see TODO above), so the
// asm-goto build leaves this list empty and signals overflow via the taken
// branch instead of the "=@ccae" condition-code output.
| 326 | + : |
| 327 | +#ifndef PERCPU_USE_RSEQ_ASM_GOTO |
| 328 | + [overflow] "=@ccae"(overflow) |
| 329 | +#endif |
294 | 330 | : [rseq_abi] "r"(&__rseq_abi),
|
295 | 331 | [rseq_cs_offset] "n"(offsetof(kernel_rseq, rseq_cs)),
|
296 | 332 | // TODO(b/130894622): When using virtual CPU IDs, this will be dynamic.
|
297 | 333 | [rseq_cpu_offset] "n"(offsetof(kernel_rseq, cpu_id)),
|
298 | 334 | [rseq_sig] "in"(PERCPU_RSEQ_SIGNATURE), [shift] "in"(Shift),
|
299 | 335 | [slabs] "r"(slabs), [cl] "r"(cl), [item] "r"(item)
|
300 |
| - : "cc", "memory"); |
301 |
| - // Undo transformation of cpu_id to the value of scratch. |
302 |
| - int cpu = reinterpret_cast<typename TcmallocSlab<Shift, NumClasses>::Slabs*>( |
303 |
| - scratch) - |
304 |
| - slabs; |
// r10/r11 are hand-allocated scratch registers (see the r10/r11 comment
// above), so they must be listed as clobbers here.
| 336 | + : "cc", "memory", "r10", "r11" |
| 337 | +#ifdef PERCPU_USE_RSEQ_ASM_GOTO |
| 338 | + : overflow_label |
| 339 | +#endif |
| 340 | + ); |
| 341 | +#ifndef PERCPU_USE_RSEQ_ASM_GOTO |
305 | 342 | if (ABSL_PREDICT_FALSE(overflow)) {
|
306 |
| - return f(cpu, cl, item); |
| 343 | + goto overflow_label; |
307 | 344 | }
|
308 |
| - return cpu; |
| 345 | +#endif |
// NOTE(review): the success path now returns 0 where the previous version
// returned the cpu id (see the removed "return cpu" above) -- confirm that
// all callers were updated to not rely on the return value.
| 346 | + return 0; |
| 347 | +overflow_label: |
| 348 | + // As of 3/2020, LLVM's asm goto (even with output constraints) only provides |
| 349 | + // values for the fallthrough path. The values on the taken branches are |
| 350 | + // undefined. |
| 351 | + int cpu = __rseq_abi.cpu_id; |
| 352 | + return f(cpu, cl, item); |
309 | 353 | }
|
310 | 354 | #endif // defined(__x86_64__)
|
311 | 355 |
|
|
0 commit comments