Skip to content

Commit 8e7ed1d

Browse files
authored
[Profile] further improvements for Windows-only (#59877)
- Instead of preventing profiling entirely (which might deadlock if DllInit accesses Julia in any way), use a suggestion by @gbaraldi to abort the profile if it takes too long (100ms), allowing the stopped thread to release any locks and free up both threads to continue all. Many, many other projects have encountered this same bug, including dotNet (closed as won't fix) and Firefox (uses a single stepping debugger to find the memory address of the offending locks), but those seem difficult responses for us. - Define correct ordering for taking profile-related locks - Profile all threads instead of just main thread - Refactor needsSymRefreshModuleList to use a modern API for doing so reliably. The API does not appear to implement the documented behaviors, but it seems to do what we need. - Make stack unwinding on Windows thread-safe so that using multiple threads that throw exceptions won't cause memory corruption anymore - Use condition variables for correctly controlling profiler start/stop, instead of volatile
1 parent 76c10f5 commit 8e7ed1d

File tree

9 files changed

+254
-111
lines changed

9 files changed

+254
-111
lines changed

doc/src/devdocs/locks.md

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,12 +51,11 @@ exception frames, and taking/releasing locks.
5151
`jl_ast_context_list_t` pool. Likewise, the `ResourcePool<?>::mutexes`
5252
just protect the associated resource pool.
5353

54-
* `jl_in_stackwalk` (`uv_mutex_t`, Win32 only)
5554
* `ResourcePool<?>.mutex` (`std::mutex`)
5655
* `RLST_mutex` (`std::mutex`)
5756
* `llvm_printing_mutex` (`std::mutex`)
5857
* `jl_locked_stream.mutex` (`std::mutex`)
59-
* `debuginfo_asyncsafe` (`uv_rwlock_t`)
58+
* `debuginfo_asyncsafe` (`uv_rwlock_t`) (can still acquire `jl_in_stackwalk` (`uv_mutex_t`, Win32 only))
6059
* `profile_show_peek_cond_lock` (`jl_mutex_t`)
6160
* `trampoline_lock` (`uv_mutex_t`)
6261
* `bt_data_prof_lock` (`uv_mutex_t`)

src/debuginfo.cpp

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1076,7 +1076,6 @@ bool jl_dylib_DI_for_fptr(size_t pointer, object::SectionRef *Section, int64_t *
10761076
IMAGEHLP_MODULE64 ModuleInfo;
10771077
ModuleInfo.SizeOfStruct = sizeof(IMAGEHLP_MODULE64);
10781078
uv_mutex_lock(&jl_in_stackwalk);
1079-
jl_refresh_dbg_module_list();
10801079
bool isvalid = SymGetModuleInfo64(GetCurrentProcess(), (DWORD64)pointer, &ModuleInfo);
10811080
uv_mutex_unlock(&jl_in_stackwalk);
10821081
if (!isvalid)
@@ -1178,7 +1177,6 @@ static int jl_getDylibFunctionInfo(jl_frame_t **frames, size_t pointer, int skip
11781177
static IMAGEHLP_LINE64 frame_info_line;
11791178
DWORD dwDisplacement = 0;
11801179
uv_mutex_lock(&jl_in_stackwalk);
1181-
jl_refresh_dbg_module_list();
11821180
DWORD64 dwAddress = pointer;
11831181
frame_info_line.SizeOfStruct = sizeof(IMAGEHLP_LINE64);
11841182
if (SymGetLineFromAddr64(GetCurrentProcess(), dwAddress, &dwDisplacement, &frame_info_line)) {

src/dlload.c

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -162,8 +162,6 @@ JL_DLLEXPORT void *jl_dlopen(const char *filename, unsigned flags) JL_NOTSAFEPOI
162162
}
163163
else {
164164
lib = LoadLibraryExW(wfilename, NULL, LOAD_WITH_ALTERED_SEARCH_PATH);
165-
if (lib)
166-
needsSymRefreshModuleList = 1;
167165
}
168166
return lib;
169167
}

src/init.c

Lines changed: 6 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -37,8 +37,8 @@ extern "C" {
3737
#endif
3838

3939
#ifdef _OS_WINDOWS_
40-
extern int needsSymRefreshModuleList;
41-
extern BOOL (WINAPI *hSymRefreshModuleList)(HANDLE);
40+
extern void jl_init_stackwalk(void);
41+
extern void jl_fin_stackwalk(void);
4242
#else
4343
#include <sys/resource.h>
4444
#include <unistd.h>
@@ -349,6 +349,9 @@ JL_DLLEXPORT void jl_atexit_hook(int exitcode) JL_NOTSAFEPOINT_ENTER
349349
jl_print_timings();
350350
#endif
351351
jl_teardown_codegen(); // prints stats
352+
#ifdef _OS_WINDOWS_
353+
jl_fin_stackwalk();
354+
#endif
352355
}
353356

354357
JL_DLLEXPORT void jl_postoutput_hook(void)
@@ -685,12 +688,7 @@ JL_DLLEXPORT void jl_init_(jl_image_buf_t sysimage)
685688
// initialize backtraces
686689
jl_init_profile_lock();
687690
#ifdef _OS_WINDOWS_
688-
uv_mutex_init(&jl_in_stackwalk);
689-
SymSetOptions(SYMOPT_UNDNAME | SYMOPT_DEFERRED_LOADS | SYMOPT_LOAD_LINES | SYMOPT_IGNORE_CVREC);
690-
if (!SymInitialize(GetCurrentProcess(), "", 1)) {
691-
jl_safe_printf("WARNING: failed to initialize stack walk info\n");
692-
}
693-
needsSymRefreshModuleList = 0;
691+
jl_init_stackwalk();
694692
#else
695693
// nongnu libunwind initialization is only threadsafe on architecture where the
696694
// author could access TSAN, per https://github.com/libunwind/libunwind/pull/109
@@ -736,10 +734,6 @@ JL_DLLEXPORT void jl_init_(jl_image_buf_t sysimage)
736734
jl_kernel32_handle = jl_dlopen("kernel32.dll", JL_RTLD_NOLOAD);
737735
jl_crtdll_handle = jl_dlopen(jl_crtdll_name, JL_RTLD_NOLOAD);
738736
jl_winsock_handle = jl_dlopen("ws2_32.dll", JL_RTLD_NOLOAD);
739-
HMODULE jl_dbghelp = (HMODULE) jl_dlopen("dbghelp.dll", JL_RTLD_NOLOAD);
740-
needsSymRefreshModuleList = 0;
741-
if (jl_dbghelp)
742-
jl_dlsym(jl_dbghelp, "SymRefreshModuleList", (void **)&hSymRefreshModuleList, 1, 0);
743737
#else
744738
/* macOS dlopen(3): If path is NULL and the option RTLD_FIRST is used, the
745739
handle returned will only search the main executable. */

src/julia_internal.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -271,6 +271,9 @@ extern uv_mutex_t bt_data_prof_lock;
271271
#define PROFILE_STATE_THREAD_SLEEPING (2)
272272
#define PROFILE_STATE_WALL_TIME_PROFILING (3)
273273
void jl_profile_task(void);
274+
#if defined(_OS_WINDOWS_) && defined(_CPU_X86_64_)
275+
JL_DLLEXPORT void jl_set_profile_abort_ptr(_Atomic(int) *abort_ptr) JL_NOTSAFEPOINT;
276+
#endif
274277

275278
// number of cycles since power-on
276279
static inline uint64_t cycleclock(void) JL_NOTSAFEPOINT
@@ -1529,9 +1532,6 @@ JL_DLLEXPORT void jl_gdblookup(void* ip) JL_NOTSAFEPOINT;
15291532
JL_DLLEXPORT void jl_print_task_backtraces(int show_done) JL_NOTSAFEPOINT;
15301533
void jl_fprint_native_codeloc(ios_t *s, uintptr_t ip) JL_NOTSAFEPOINT;
15311534
void jl_fprint_bt_entry_codeloc(ios_t *s, jl_bt_element_t *bt_data) JL_NOTSAFEPOINT;
1532-
#ifdef _OS_WINDOWS_
1533-
JL_DLLEXPORT void jl_refresh_dbg_module_list(void);
1534-
#endif
15351535
void jl_thread_resume(int tid) JL_NOTSAFEPOINT;
15361536
int jl_thread_suspend(int16_t tid, bt_context_t *ctx) JL_NOTSAFEPOINT;
15371537

src/signal-handling.c

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -178,7 +178,6 @@ void jl_unlock_profile_wr(void)
178178
}
179179

180180

181-
#ifndef _OS_WINDOWS_
182181
static uint64_t profile_cong_rng_seed = 0;
183182
static int *profile_round_robin_thread_order = NULL;
184183
static int profile_round_robin_thread_order_size = 0;
@@ -209,7 +208,6 @@ static int *profile_get_randperm(int size)
209208
jl_shuffle_int_array_inplace(profile_round_robin_thread_order, size, &profile_cong_rng_seed);
210209
return profile_round_robin_thread_order;
211210
}
212-
#endif
213211

214212

215213
JL_DLLEXPORT int jl_profile_is_buffer_full(void)

src/signals-win.c

Lines changed: 107 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -409,9 +409,33 @@ JL_DLLEXPORT void jl_install_sigint_handler(void)
409409
SetConsoleCtrlHandler((PHANDLER_ROUTINE)sigint_handler,1);
410410
}
411411

412-
static volatile HANDLE hBtThread = 0;
412+
static TIMECAPS timecaps;
413+
static HANDLE hBtThread = 0;
414+
static uv_cond_t bt_data_prof_cond = CONDITION_VARIABLE_INIT;
415+
416+
#ifdef _CPU_X86_64_
417+
// Callback data structure for profile timeout
418+
typedef struct {
419+
_Atomic(int) *abort_ptr;
420+
int tid;
421+
} profile_timeout_data_t;
422+
423+
static void CALLBACK profile_timeout_cb(PVOID lpParam, BOOLEAN TimerOrWaitFired)
424+
{
425+
profile_timeout_data_t *data = (profile_timeout_data_t*)lpParam;
426+
if (TimerOrWaitFired && data != NULL && data->abort_ptr != NULL) {
427+
// Timeout reached, signal an abort should occur
428+
// jl_safe_fprintf(ios_safe_stderr, "profile_timeout_cb called.\n");
429+
if (jl_atomic_exchange(data->abort_ptr, 2) == 1) {
430+
// jl_safe_fprintf(ios_safe_stderr, "profile_timeout_cb jl_thread_resume.\n");
431+
jl_thread_resume(data->tid);
432+
data->tid = -1;
433+
}
434+
}
435+
}
436+
#endif
413437

414-
int jl_thread_suspend_and_get_state(int tid, int timeout, bt_context_t *ctx)
438+
static int jl_thread_suspend_and_get_state(int tid, int timeout, bt_context_t *ctx)
415439
{
416440
(void)timeout;
417441
jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[tid];
@@ -421,8 +445,10 @@ int jl_thread_suspend_and_get_state(int tid, int timeout, bt_context_t *ctx)
421445
if (ct2 == NULL) // this thread is already dead
422446
return 0;
423447
HANDLE hThread = ptls2->system_id;
424-
if ((DWORD)-1 == SuspendThread(hThread))
448+
if ((DWORD)-1 == SuspendThread(hThread)) {
449+
// jl_safe_fprintf(ios_safe_stderr, "failed to suspend thread %d: %lu\n", tid, GetLastError());
425450
return 0;
451+
}
426452
assert(sizeof(*ctx) == sizeof(CONTEXT));
427453
memset(ctx, 0, sizeof(CONTEXT));
428454
ctx->ContextFlags = CONTEXT_CONTROL | CONTEXT_INTEGER;
@@ -439,90 +465,127 @@ void jl_thread_resume(int tid)
439465
jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[tid];
440466
HANDLE hThread = ptls2->system_id;
441467
if ((DWORD)-1 == ResumeThread(hThread)) {
442-
fputs("failed to resume main thread! aborting.", stderr);
468+
jl_safe_fprintf(ios_safe_stderr, "failed to resume main thread! aborting.\n");
443469
abort();
444470
}
445471
}
446472

447473
int jl_thread_suspend(int16_t tid, bt_context_t *ctx)
448474
{
449-
uv_mutex_lock(&jl_in_stackwalk);
450-
jl_lock_profile();
451-
ULONG_PTR lock_cookie = 0;
452-
LdrLockLoaderLock(0x1, NULL, &lock_cookie);
475+
jl_lock_profile(); // prevent concurrent mutation
476+
uv_mutex_lock(&jl_in_stackwalk); // prevent multi-threaded dbghelp calls
453477
int success = jl_thread_suspend_and_get_state(tid, 0, ctx);
454-
LdrUnlockLoaderLock(0x1, lock_cookie);
455-
jl_unlock_profile();
456478
uv_mutex_unlock(&jl_in_stackwalk);
479+
jl_unlock_profile();
457480
return success;
458481
}
459482

460483
static DWORD WINAPI profile_bt( LPVOID lparam )
461484
{
462485
// Note: illegal to use jl_* functions from this thread except for profiling-specific functions
486+
// Dummy event for RegisterWaitForSingleObject (to use timeout callback)
487+
HANDLE hProfileEvent = CreateEvent(NULL, TRUE, FALSE, NULL);
488+
if (hProfileEvent == NULL) {
489+
jl_safe_fprintf(ios_safe_stderr, "failed to create profile event.\n");
490+
abort();
491+
}
463492
while (1) {
464493
DWORD timeout_ms = nsecprof / (GIGA / 1000);
465494
Sleep(timeout_ms > 0 ? timeout_ms : 1);
466-
if (profile_running) {
467-
if (jl_profile_is_buffer_full()) {
468-
jl_profile_stop_timer(); // does not change the thread state
469-
SuspendThread(GetCurrentThread());
470-
continue;
471-
}
472-
else if (profile_all_tasks) {
473-
// Don't take the stackwalk lock here since it's already taken in `jl_rec_backtrace`
474-
jl_profile_task();
475-
}
476-
else {
477-
// TODO: bring this up to parity with other OS by adding loop over tid here
478-
bt_context_t c;
479-
if (!jl_thread_suspend(0, &c)) {
480-
fputs("failed to suspend main thread. aborting profiling.", stderr);
495+
if (jl_profile_is_buffer_full())
496+
jl_profile_stop_timer(); // does not change the thread state
497+
if (!profile_running) {
498+
uv_mutex_lock(&bt_data_prof_lock);
499+
while (!profile_running)
500+
uv_cond_wait(&bt_data_prof_cond, &bt_data_prof_lock);
501+
uv_mutex_unlock(&bt_data_prof_lock);
502+
}
503+
else if (profile_all_tasks) {
504+
// Don't take the stackwalk lock here since it's already taken in `jl_rec_backtrace`
505+
jl_profile_task();
506+
}
507+
else {
508+
// Profile all threads, similar to Unix implementation
509+
bt_context_t c;
510+
int nthreads = jl_atomic_load_acquire(&jl_n_threads);
511+
int *randperm = profile_get_randperm(nthreads);
512+
for (int idx = nthreads; idx-- > 0; ) {
513+
int tid = randperm[idx];
514+
if (!profile_running)
515+
break;
516+
if (jl_profile_is_buffer_full()) {
481517
jl_profile_stop_timer();
482518
break;
483519
}
520+
if (!jl_thread_suspend(tid, &c))
521+
continue;
522+
jl_ptls_t ptls = jl_atomic_load_relaxed(&jl_all_tls_states)[tid];
523+
jl_task_t *t2 = jl_atomic_load_relaxed(&ptls->current_task);
524+
int state = jl_atomic_load_relaxed(&ptls->sleep_check_state) == 0 ? PROFILE_STATE_THREAD_NOT_SLEEPING : PROFILE_STATE_THREAD_SLEEPING;
525+
526+
// Set up timeout handler for stackwalk
527+
#ifdef _CPU_X86_64_
528+
_Atomic(int) abort_profiling = 0;
529+
profile_timeout_data_t timeout_data;
530+
timeout_data.abort_ptr = &abort_profiling;
531+
timeout_data.tid = tid;
532+
jl_set_profile_abort_ptr(&abort_profiling);
533+
HANDLE hWaitHandle = NULL;
534+
if (!RegisterWaitForSingleObject(&hWaitHandle, hProfileEvent, profile_timeout_cb,
535+
&timeout_data, 100, WT_EXECUTEONLYONCE | WT_EXECUTEINWAITTHREAD)) {
536+
// Failed to register wait, proceed without timeout protection
537+
hWaitHandle = NULL;
538+
}
539+
#endif
484540
// Get backtrace data
485541
profile_bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)profile_bt_data_prof + profile_bt_size_cur,
486542
profile_bt_size_max - profile_bt_size_cur - 1, &c, NULL);
487-
488-
jl_ptls_t ptls = jl_atomic_load_relaxed(&jl_all_tls_states)[0]; // given only profiling hMainThread
543+
#ifdef _CPU_X86_64_
544+
// Clear abort pointer from TLS
545+
jl_set_profile_abort_ptr(NULL);
546+
// Wait for callback to complete or cancel before continuing
547+
if (hWaitHandle != NULL)
548+
UnregisterWaitEx(hWaitHandle, INVALID_HANDLE_VALUE);
549+
if (timeout_data.tid != -1)
550+
#endif
551+
jl_thread_resume(tid);
489552

490553
// META_OFFSET_THREADID store threadid but add 1 as 0 is preserved to indicate end of block
491-
profile_bt_data_prof[profile_bt_size_cur++].uintptr = ptls->tid + 1;
554+
profile_bt_data_prof[profile_bt_size_cur++].uintptr = tid + 1;
492555

493556
// META_OFFSET_TASKID store task id (never null)
494-
profile_bt_data_prof[profile_bt_size_cur++].jlvalue = (jl_value_t*)jl_atomic_load_relaxed(&ptls->current_task);
557+
profile_bt_data_prof[profile_bt_size_cur++].jlvalue = (jl_value_t*)t2;
495558

496559
// META_OFFSET_CPUCYCLECLOCK store cpu cycle clock
497560
profile_bt_data_prof[profile_bt_size_cur++].uintptr = cycleclock();
498561

499562
// store whether thread is sleeping (don't ever encode a state as `0` since is preserved to indicate end of block)
500-
int state = jl_atomic_load_relaxed(&ptls->sleep_check_state) == 0 ? PROFILE_STATE_THREAD_NOT_SLEEPING : PROFILE_STATE_THREAD_SLEEPING;
501563
profile_bt_data_prof[profile_bt_size_cur++].uintptr = state;
502564

503565
// Mark the end of this block with two 0's
504566
profile_bt_data_prof[profile_bt_size_cur++].uintptr = 0;
505567
profile_bt_data_prof[profile_bt_size_cur++].uintptr = 0;
506-
jl_thread_resume(0);
507-
jl_check_profile_autostop();
508568
}
569+
jl_check_profile_autostop();
509570
}
510571
}
511-
uv_mutex_unlock(&jl_in_stackwalk);
512-
jl_profile_stop_timer();
572+
// this is unreachable, but would be the relevant cleanup
573+
uv_mutex_lock(&bt_data_prof_lock);
513574
hBtThread = NULL;
575+
uv_mutex_unlock(&bt_data_prof_lock);
576+
jl_profile_stop_timer();
577+
CloseHandle(hProfileEvent);
514578
return 0;
515579
}
516580

517-
static volatile TIMECAPS timecaps;
518-
519581
JL_DLLEXPORT int jl_profile_start_timer(uint8_t all_tasks)
520582
{
583+
uv_mutex_lock(&bt_data_prof_lock);
521584
if (hBtThread == NULL) {
522-
523585
TIMECAPS _timecaps;
524586
if (MMSYSERR_NOERROR != timeGetDevCaps(&_timecaps, sizeof(_timecaps))) {
525-
fputs("failed to get timer resolution", stderr);
587+
uv_mutex_unlock(&bt_data_prof_lock);
588+
jl_safe_fprintf(ios_safe_stderr, "failed to get timer resolution.\n");
526589
return -2;
527590
}
528591
timecaps = _timecaps;
@@ -534,15 +597,12 @@ JL_DLLEXPORT int jl_profile_start_timer(uint8_t all_tasks)
534597
0, // argument to thread function
535598
0, // use default creation flags
536599
0); // returns the thread identifier
537-
if (hBtThread == NULL)
600+
if (hBtThread == NULL) {
601+
uv_mutex_unlock(&bt_data_prof_lock);
602+
jl_safe_fprintf(ios_safe_stderr, "failed to allocate profile thread.\n");
538603
return -1;
539-
(void)SetThreadPriority(hBtThread, THREAD_PRIORITY_ABOVE_NORMAL);
540-
}
541-
else {
542-
if ((DWORD)-1 == ResumeThread(hBtThread)) {
543-
fputs("failed to resume profiling thread.", stderr);
544-
return -2;
545604
}
605+
(void)SetThreadPriority(hBtThread, THREAD_PRIORITY_ABOVE_NORMAL);
546606
}
547607
if (profile_running == 0) {
548608
// Failure to change the timer resolution is not fatal. However, it is important to
@@ -552,6 +612,8 @@ JL_DLLEXPORT int jl_profile_start_timer(uint8_t all_tasks)
552612
}
553613
profile_all_tasks = all_tasks;
554614
profile_running = 1; // set `profile_running` finally
615+
uv_cond_broadcast(&bt_data_prof_cond);
616+
uv_mutex_unlock(&bt_data_prof_lock);
555617
return 0;
556618
}
557619
JL_DLLEXPORT void jl_profile_stop_timer(void)

0 commit comments

Comments
 (0)