diff --git a/.gitignore b/.gitignore index 126e84655d6..1d2cf964ea1 100644 --- a/.gitignore +++ b/.gitignore @@ -57,3 +57,5 @@ lib/*.so # Benchmark result files /benchmarks/*.json + +debug_runtime_callback.rb diff --git a/ext/datadog_profiling_native_extension/datadog_ruby_common.h b/ext/datadog_profiling_native_extension/datadog_ruby_common.h index d55c2bb479f..9fa8c5ed1cc 100644 --- a/ext/datadog_profiling_native_extension/datadog_ruby_common.h +++ b/ext/datadog_profiling_native_extension/datadog_ruby_common.h @@ -41,6 +41,13 @@ static inline ddog_CharSlice char_slice_from_ruby_string(VALUE string) { return char_slice; } +static inline ddog_CharSlice char_slice_from_cstr(const char *cstr) { + if (cstr == NULL) { + return (ddog_CharSlice){.ptr = NULL, .len = 0}; + } + return (ddog_CharSlice){.ptr = cstr, .len = strlen(cstr)}; +} + static inline VALUE log_warning(VALUE warning) { VALUE datadog_module = rb_const_get(rb_cObject, rb_intern("Datadog")); VALUE logger = rb_funcall(datadog_module, rb_intern("logger"), 0); diff --git a/ext/libdatadog_api/crashtracker.c b/ext/libdatadog_api/crashtracker.c index bd63e657304..25b87366345 100644 --- a/ext/libdatadog_api/crashtracker.c +++ b/ext/libdatadog_api/crashtracker.c @@ -1,7 +1,49 @@ -#include -#include +#include "extconf.h" + +#ifdef RUBY_MJIT_HEADER + // Pick up internal structures from the private Ruby MJIT header file + #include RUBY_MJIT_HEADER +#else + // The MJIT header was introduced on 2.6 and removed on 3.3; for other Rubies we rely on + // the datadog-ruby_core_source gem to get access to private VM headers. + + // We can't do anything about warnings in VM headers, so we just use this technique to suppress them. + // See https://nelkinda.com/blog/suppress-warnings-in-gcc-and-clang/#d11e364 for details. 
+ #pragma GCC diagnostic push + #pragma GCC diagnostic ignored "-Wunused-parameter" + #pragma GCC diagnostic ignored "-Wattributes" + #pragma GCC diagnostic ignored "-Wpragmas" + #pragma GCC diagnostic ignored "-Wexpansion-to-defined" + #include + #pragma GCC diagnostic pop + + #pragma GCC diagnostic push + #pragma GCC diagnostic ignored "-Wunused-parameter" + #include + #pragma GCC diagnostic pop + + #include + + #ifndef NO_RACTOR_HEADER_INCLUDE + #pragma GCC diagnostic push + #pragma GCC diagnostic ignored "-Wunused-parameter" + #include + #pragma GCC diagnostic pop + #endif +#endif +#include #include "datadog_ruby_common.h" +#include "datadog_runtime_stack.h" +#include +#include +#include +#include + +// This was renamed in Ruby 3.2 +#if !defined(ccan_list_for_each) && defined(list_for_each) + #define ccan_list_for_each list_for_each +#endif static VALUE _native_start_or_update_on_fork(int argc, VALUE *argv, DDTRACE_UNUSED VALUE _self); static VALUE _native_stop(DDTRACE_UNUSED VALUE _self); @@ -17,6 +59,7 @@ void crashtracker_init(VALUE core_module) { rb_define_singleton_method(crashtracker_class, "_native_start_or_update_on_fork", _native_start_or_update_on_fork, -1); rb_define_singleton_method(crashtracker_class, "_native_stop", _native_stop, 0); + datadog_runtime_stack_init(crashtracker_class); } static VALUE _native_start_or_update_on_fork(int argc, VALUE *argv, DDTRACE_UNUSED VALUE _self) { @@ -123,3 +166,4 @@ static VALUE _native_stop(DDTRACE_UNUSED VALUE _self) { return Qtrue; } + diff --git a/ext/libdatadog_api/datadog_ruby_common.h b/ext/libdatadog_api/datadog_ruby_common.h index d55c2bb479f..9fa8c5ed1cc 100644 --- a/ext/libdatadog_api/datadog_ruby_common.h +++ b/ext/libdatadog_api/datadog_ruby_common.h @@ -41,6 +41,13 @@ static inline ddog_CharSlice char_slice_from_ruby_string(VALUE string) { return char_slice; } +static inline ddog_CharSlice char_slice_from_cstr(const char *cstr) { + if (cstr == NULL) { + return (ddog_CharSlice){.ptr = NULL, .len = 
0}; + } + return (ddog_CharSlice){.ptr = cstr, .len = strlen(cstr)}; +} + static inline VALUE log_warning(VALUE warning) { VALUE datadog_module = rb_const_get(rb_cObject, rb_intern("Datadog")); VALUE logger = rb_funcall(datadog_module, rb_intern("logger"), 0); diff --git a/ext/libdatadog_api/datadog_runtime_stack.c b/ext/libdatadog_api/datadog_runtime_stack.c new file mode 100644 index 00000000000..600c5a2b5ac --- /dev/null +++ b/ext/libdatadog_api/datadog_runtime_stack.c @@ -0,0 +1,509 @@ +#include "extconf.h" + +#ifdef RUBY_MJIT_HEADER + // Pick up internal structures from the private Ruby MJIT header file + #include RUBY_MJIT_HEADER +#else + // The MJIT header was introduced on 2.6 and removed on 3.3; for other Rubies we rely on + // the datadog-ruby_core_source gem to get access to private VM headers. + + // We can't do anything about warnings in VM headers, so we just use this technique to suppress them. + // See https://nelkinda.com/blog/suppress-warnings-in-gcc-and-clang/#d11e364 for details. 
+ #pragma GCC diagnostic push + #pragma GCC diagnostic ignored "-Wunused-parameter" + #pragma GCC diagnostic ignored "-Wattributes" + #pragma GCC diagnostic ignored "-Wpragmas" + #pragma GCC diagnostic ignored "-Wexpansion-to-defined" + #include + #pragma GCC diagnostic pop + + #pragma GCC diagnostic push + #pragma GCC diagnostic ignored "-Wunused-parameter" + #include + #pragma GCC diagnostic pop + + #include + + #ifndef NO_RACTOR_HEADER_INCLUDE + #pragma GCC diagnostic push + #pragma GCC diagnostic ignored "-Wunused-parameter" + #include + #pragma GCC diagnostic pop + #endif +#endif + +#include +#include "datadog_runtime_stack.h" +#include "datadog_ruby_common.h" +#include +#include +#include +#include + +// This was renamed in Ruby 3.2 +#if !defined(ccan_list_for_each) && defined(list_for_each) + #define ccan_list_for_each list_for_each +#endif + +static VALUE _native_register_runtime_stack_callback(VALUE _self); +static VALUE _native_is_runtime_callback_registered(DDTRACE_UNUSED VALUE _self); + +static void ruby_runtime_stack_callback( + void (*emit_frame)(const ddog_crasht_RuntimeStackFrame*) +); + +#if defined(__x86_64__) +# define SYS_MINCORE 0x1B +#elif defined(__aarch64__) +# define SYS_MINCORE 0xE8 +#endif + +long syscall(long number, ...); + +// align down to power of two +static inline uintptr_t align_down(uintptr_t x, uintptr_t align) { + return x & ~(align - 1u); +} + +static inline bool is_pointer_readable(const void *ptr, size_t size) { + if (!ptr || size == 0) return false; + + uintptr_t page_size = (uintptr_t)sysconf(_SC_PAGESIZE); + // fallback for weird value; 0 or not a power of two + if (page_size == 0 || (page_size & (page_size - 1u))) { + page_size = 4096; + } + + const uintptr_t start = align_down((uintptr_t)ptr, page_size); + const uintptr_t end = ((uintptr_t)ptr + size - 1u); + const uintptr_t last = align_down(end, page_size); + + // Number of pages spanned + size_t pages = 1u + (last != start); + if (pages > 2u) pages = 2u; + + unsigned 
char vec[2];
+
+  int retries = 5;
+  for (;;) {
+    size_t len = pages * (size_t)page_size;
+    long rc = syscall(SYS_MINCORE, (void*)start, len, vec);
+
+    if (rc == 0) {
+      return true;
+    }
+
+    int e = errno;
+    if (e == ENOMEM || e == EFAULT) {
+      return false;
+    }
+
+    if (e == EAGAIN && retries-- > 0) {
+      continue;
+    }
+
+    // Unknown errno, we assume mapped to avoid cascading faults in crash path
+    return true;
+  }
+}
+
+static bool is_safe_string_encoding(const char *ptr, long len) {
+  if (!ptr || len <= 0) return false;
+
+  // Sanity check to scan the first 128 bytes to check
+  // for control characters and high bytes
+  for (long i = 0; i < len && i < 128; i++) {
+    unsigned char c = (unsigned char)ptr[i];
+
+    if (c == 0 && i < len - 1) return false;
+
+    // Control characters (except tab, newline, return) are suspicious
+    if (c < 0x20 && c != 0x09 && c != 0x0A && c != 0x0D) return false;
+
+    // High bytes
+    if (c >= 0xF8) return false;
+  }
+
+  return true;
+}
+
+static bool is_reasonable_string_size(VALUE str) {
+  if (RB_SPECIAL_CONST_P(str)) return false; // nil/false/immediates are not heap objects, so never a T_STRING
+
+  // Check that the heap object is readable BEFORE RB_TYPE_P, which reads the object's RBasic flags
+  if (!is_pointer_readable((const void *)str, sizeof(struct RBasic))) return false;
+  if (!RB_TYPE_P(str, T_STRING)) return false;
+
+  // For strings, we need to check the full RString structure
+  if (!is_pointer_readable(RSTRING(str), sizeof(struct RString))) return false;
+
+  long len = RSTRING_LEN(str);
+
+  if (len < 0) return false; // Negative length, probably corrupted
+  if (len > 1024) return false; // > 1KB path/function name, suspicious
+
+  return true;
+}
+
+static const char* safe_string_ptr(VALUE str) {
+  // Immediates (nil, false, Fixnums, ...) carry no heap object; reject them without touching memory
+  if (RB_SPECIAL_CONST_P(str)) return "";
+
+  // Validate the VALUE first before touching any of its internals
+  if (!is_reasonable_string_size(str)) return "";
+
+  long len = RSTRING_LEN(str);
+  const char *ptr = RSTRING_PTR(str);
+
+  if (!ptr) return "";
+
+  if (!is_pointer_readable(ptr, len > 0 ?
len : 1)) return ""; + if (!is_safe_string_encoding(ptr, len)) return ""; + + return ptr; +} + +static bool is_valid_control_frame(const rb_control_frame_t *cfp, + const rb_execution_context_t *ec) { + if (!cfp) return false; + + void *stack_start = ec->vm_stack; + void *stack_end = (char*)stack_start + ec->vm_stack_size * sizeof(VALUE); + if ((void*)cfp < stack_start || (void*)cfp >= stack_end) { + return false; + } + + if (!is_pointer_readable(cfp, sizeof(rb_control_frame_t))) { + return false; + } + + return true; +} + +static bool is_valid_iseq(const rb_iseq_t *iseq) { + if (!iseq) return false; + if (!is_pointer_readable(iseq, sizeof(rb_iseq_t))) return false; + + // Check iseq body + if (!iseq->body) return false; + if (!is_pointer_readable(iseq->body, sizeof(*iseq->body))) return false; + + // Validate iseq size + if (iseq->body->iseq_size > 100000) return false; // > 100K instructions, suspicious + + return true; +} + +static void ruby_runtime_stack_callback( + void (*emit_frame)(const ddog_crasht_RuntimeStackFrame*) +) { + + VALUE current_thread = rb_thread_current(); + if (current_thread == Qnil) return; + + static const rb_data_type_t *thread_data_type = NULL; + if (thread_data_type == NULL) { + thread_data_type = RTYPEDDATA_TYPE(current_thread); + if (!thread_data_type) return; + } + + rb_thread_t *th = (rb_thread_t *) rb_check_typeddata(current_thread, thread_data_type); + if (!th) return; + + const rb_execution_context_t *ec = th->ec; + if (!ec) return; + + if (th->status == THREAD_KILLED) return; + if (!ec->vm_stack || ec->vm_stack_size == 0) return; + + const rb_control_frame_t *cfp = ec->cfp; + const rb_control_frame_t *end_cfp = RUBY_VM_END_CONTROL_FRAME(ec); + + if (!cfp || !end_cfp) return; + + // Skip dummy frame, `thread_profile_frames` does this too + end_cfp = RUBY_VM_NEXT_CONTROL_FRAME(end_cfp); + if (end_cfp <= cfp) return; + + end_cfp = RUBY_VM_NEXT_CONTROL_FRAME(end_cfp); + + int frame_count = 0; + const int MAX_FRAMES = 400; + + // 
Traverse from current frame backwards to older frames, so that we get the crash point at the top + for (; frame_count < MAX_FRAMES && cfp != end_cfp; cfp = RUBY_VM_PREVIOUS_CONTROL_FRAME(cfp)) { + if (!is_valid_control_frame(cfp, ec)) { + continue; + } + + + if (VM_FRAME_RUBYFRAME_P(cfp) && cfp->iseq) { + // Handle Ruby frames + const rb_iseq_t *iseq = cfp->iseq; + + if (!is_valid_iseq(iseq)) { + continue; + } + + VALUE name = rb_iseq_base_label(iseq); + const char *function_name = ""; + if (name != Qnil) { + function_name = safe_string_ptr(name); + } + + VALUE filename = rb_iseq_path(iseq); + const char *file_name = ""; + if (filename != Qnil) { + file_name = safe_string_ptr(filename); + } + + int line_no = 0; + if (iseq && iseq->body) { + if (!cfp->pc) { + // Handle case where PC is NULL; using first line number like private_vm_api_access.c + if (iseq->body->type == ISEQ_TYPE_TOP) { + // For TOP type iseqs, line number should be 0 + line_no = 0; + } else { + // Use first line number for other types + # ifndef NO_INT_FIRST_LINENO // Ruby 3.2+ + line_no = iseq->body->location.first_lineno; + # else + line_no = FIX2INT(iseq->body->location.first_lineno); + #endif + } + } else { + // Handle case where PC is available - mirror calc_pos logic + if (is_pointer_readable(iseq->body->iseq_encoded, iseq->body->iseq_size * sizeof(*iseq->body->iseq_encoded)) && + iseq->body->iseq_size > 0) { + ptrdiff_t pc_offset = cfp->pc - iseq->body->iseq_encoded; + + // bounds checking like private_vm_api_access.c PROF-11475 fix + // to prevent crashes when calling rb_iseq_line_no + if (pc_offset >= 0 && pc_offset <= iseq->body->iseq_size) { + size_t pos = (size_t)pc_offset; + if (pos > 0) { + // Use pos-1 because PC points to next instruction + pos--; + } + + // Additional safety check before calling rb_iseq_line_no (PROF-11475 fix) + if (pos < iseq->body->iseq_size) { + line_no = rb_iseq_line_no(iseq, pos); + } + } + } + } + } + + ddog_crasht_RuntimeStackFrame frame = { + .type_name = 
char_slice_from_cstr(NULL), + .function = char_slice_from_cstr(function_name), + .file = char_slice_from_cstr(file_name), + .line = line_no, + .column = 0 + }; + + emit_frame(&frame); + frame_count++; + } else if (VM_FRAME_CFRAME_P(cfp)) { + const char *function_name = ""; + const char *file_name = ""; + +#ifdef RUBY_MJIT_HEADER + // Only attempt method entry resolution on Ruby versions with MJIT header + // where rb_vm_frame_method_entry is guaranteed to be available + const rb_callable_method_entry_t *me = rb_vm_frame_method_entry(cfp); + if (me && is_pointer_readable(me, sizeof(rb_callable_method_entry_t))) { + if (me->def && is_pointer_readable(me->def, sizeof(*me->def))) { + if (me->def->original_id) { + const char *method_name = rb_id2name(me->def->original_id); + if (method_name && is_pointer_readable(method_name, strlen(method_name))) { + size_t method_name_len = strlen(method_name); + if (method_name_len > 0 && method_name_len < 256) { + function_name = method_name; + } + } + } + + if (me->def->type == VM_METHOD_TYPE_CFUNC && me->owner) { + // Try to get the full class/module path + VALUE owner_name = Qnil; + VALUE actual_owner = me->owner; + + // If this is a singleton class (like Fiddle's singleton class for module methods), + // try to get the attached object which should be the actual module or else we will + // just get `Module` which is not that useful to us + if (RB_TYPE_P(me->owner, T_CLASS) && FL_TEST(me->owner, FL_SINGLETON)) { + VALUE attached = rb_ivar_get(me->owner, rb_intern("__attached__")); + if (attached != Qnil) { + actual_owner = attached; + } + } + + // Get the class/module path + if (RB_TYPE_P(actual_owner, T_CLASS) || RB_TYPE_P(actual_owner, T_MODULE)) { + owner_name = rb_class_path(actual_owner); + } + + // Fallback to rb_class_name if rb_class_path fails + if (owner_name == Qnil) { + owner_name = rb_class_name(actual_owner); + } + + if (owner_name != Qnil) { + const char *owner_str = safe_string_ptr(owner_name); + static char 
file_buffer[256]; + snprintf(file_buffer, sizeof(file_buffer), "<%s (C extension)>", owner_str); + file_name = file_buffer; + } + } + } + } +#else + // For Ruby versions without MJIT header, use our own rb_vm_frame_method_entry implementation + const rb_callable_method_entry_t *me = rb_vm_frame_method_entry(cfp); + if (me && is_pointer_readable(me, sizeof(rb_callable_method_entry_t))) { + if (me->def && is_pointer_readable(me->def, sizeof(*me->def))) { + if (me->def->original_id) { + const char *method_name = rb_id2name(me->def->original_id); + if (method_name && is_pointer_readable(method_name, strlen(method_name))) { + size_t method_name_len = strlen(method_name); + if (method_name_len > 0 && method_name_len < 256) { + function_name = method_name; + } + } + } + + if (me->def->type == VM_METHOD_TYPE_CFUNC && me->owner) { + // Try to get the full class/module path + VALUE owner_name = Qnil; + VALUE actual_owner = me->owner; + + // If this is a singleton class (like Fiddle's singleton class for module methods), + // try to get the attached object which should be the actual module + if (RB_TYPE_P(me->owner, T_CLASS) && FL_TEST(me->owner, FL_SINGLETON)) { + VALUE attached = rb_ivar_get(me->owner, rb_intern("__attached__")); + if (attached != Qnil) { + actual_owner = attached; + } + } + + // Get the class/module path + if (RB_TYPE_P(actual_owner, T_CLASS) || RB_TYPE_P(actual_owner, T_MODULE)) { + owner_name = rb_class_path(actual_owner); + } + + // Fallback to rb_class_name if rb_class_path fails + if (owner_name == Qnil) { + owner_name = rb_class_name(actual_owner); + } + + if (owner_name != Qnil) { + const char *owner_str = safe_string_ptr(owner_name); + static char file_buffer[256]; + snprintf(file_buffer, sizeof(file_buffer), "<%s (C extension)>", owner_str); + file_name = file_buffer; + } + } + } + } +#endif + + ddog_crasht_RuntimeStackFrame frame = { + .type_name = char_slice_from_cstr(NULL), + .function = char_slice_from_cstr(function_name), + .file = 
char_slice_from_cstr(file_name), + .line = 0, + .column = 0 + }; + + emit_frame(&frame); + frame_count++; + } + } +} + +// Support code for Ruby versions without MJIT header (copied from private_vm_api_access.c) +#ifndef RUBY_MJIT_HEADER + +#define MJIT_STATIC // No-op on older Rubies + +#ifndef FALSE +# define FALSE false +#elif FALSE +# error FALSE must be false +#endif + +#ifndef TRUE +# define TRUE true +#elif ! TRUE +# error TRUE must be true +#endif + +static rb_callable_method_entry_t * +check_method_entry(VALUE obj, int can_be_svar) +{ + if (obj == Qfalse) return NULL; + + switch (imemo_type(obj)) { + case imemo_ment: + return (rb_callable_method_entry_t *)obj; + case imemo_cref: + return NULL; + case imemo_svar: + if (can_be_svar) { + return check_method_entry(((struct vm_svar *)obj)->cref_or_me, FALSE); + } + // fallthrough + default: + return NULL; + } +} + +MJIT_STATIC const rb_callable_method_entry_t * +rb_vm_frame_method_entry(const rb_control_frame_t *cfp) +{ + const VALUE *ep = cfp->ep; + rb_callable_method_entry_t *me; + + while (!VM_ENV_LOCAL_P(ep)) { + if ((me = check_method_entry(ep[VM_ENV_DATA_INDEX_ME_CREF], FALSE)) != NULL) return me; + ep = VM_ENV_PREV_EP(ep); + } + + return check_method_entry(ep[VM_ENV_DATA_INDEX_ME_CREF], TRUE); +} +#endif // RUBY_MJIT_HEADER + +static VALUE _native_register_runtime_stack_callback(DDTRACE_UNUSED VALUE _self) { + enum ddog_crasht_CallbackResult result = ddog_crasht_register_runtime_frame_callback( + ruby_runtime_stack_callback + ); + + switch (result) { + case DDOG_CRASHT_CALLBACK_RESULT_OK: + return Qtrue; + default: + return Qfalse; + } + + return Qfalse; +} + +static VALUE _native_is_runtime_callback_registered(DDTRACE_UNUSED VALUE _self) { + return ddog_crasht_is_runtime_callback_registered() ? 
Qtrue : Qfalse; +} + +void datadog_runtime_stack_init(VALUE crashtracker_class) { + rb_define_singleton_method(crashtracker_class, "_native_register_runtime_stack_callback", _native_register_runtime_stack_callback, 0); + rb_define_singleton_method(crashtracker_class, "_native_is_runtime_callback_registered", _native_is_runtime_callback_registered, 0); +} + +VALUE datadog_runtime_stack_register_callback(void) { + return _native_register_runtime_stack_callback(Qnil); +} + +VALUE datadog_runtime_stack_is_callback_registered(void) { + return _native_is_runtime_callback_registered(Qnil); +} diff --git a/ext/libdatadog_api/datadog_runtime_stack.h b/ext/libdatadog_api/datadog_runtime_stack.h new file mode 100644 index 00000000000..56397609c59 --- /dev/null +++ b/ext/libdatadog_api/datadog_runtime_stack.h @@ -0,0 +1,10 @@ +#pragma once + +#include "datadog_ruby_common.h" +#include + +void datadog_runtime_stack_init(VALUE crashtracker_class); + +VALUE datadog_runtime_stack_register_callback(void); + +VALUE datadog_runtime_stack_is_callback_registered(void); \ No newline at end of file diff --git a/ext/libdatadog_api/extconf.rb b/ext/libdatadog_api/extconf.rb index 41549a8ea8c..10108045d35 100644 --- a/ext/libdatadog_api/extconf.rb +++ b/ext/libdatadog_api/extconf.rb @@ -42,6 +42,9 @@ def skip_building_extension!(reason) # (https://github.com/msgpack/msgpack-ruby/blob/18ce08f6d612fe973843c366ac9a0b74c4e50599/ext/msgpack/extconf.rb#L8) append_cflags '-std=gnu99' +# Gets really noisy when we include the MJIT header, let's omit it (TODO: Use #pragma GCC diagnostic instead?) 
+append_cflags "-Wno-unused-function" + # Allow defining variables at any point in a function append_cflags '-Wno-declaration-after-statement' @@ -98,13 +101,153 @@ def skip_building_extension!(reason) extra_relative_rpaths.each { |folder| $LDFLAGS += " -Wl,-rpath,$$$\\\\{ORIGIN\\}/#{folder.to_str}" } Logging.message("[datadog] After pkg-config $LDFLAGS were set to: #{$LDFLAGS.inspect}\n") +# Enable access to Ruby VM internal headers for crashtracker stack walking +# Ruby version compatibility definitions similar to profiling extension + +# On Ruby 3.5, we can't ask the object_id from IMEMOs (https://github.com/ruby/ruby/pull/13347) +$defs << "-DNO_IMEMO_OBJECT_ID" unless RUBY_VERSION < "3.5" + +# On Ruby 2.5 and 3.3, this symbol was not visible. It is on 2.6 to 3.2, as well as 3.4+ +$defs << "-DNO_RB_OBJ_INFO" if RUBY_VERSION.start_with?("2.5", "3.3") + +# On older Rubies, rb_postponed_job_preregister/rb_postponed_job_trigger did not exist +$defs << "-DNO_POSTPONED_TRIGGER" if RUBY_VERSION < "3.3" + +# On older Rubies, M:N threads were not available +$defs << "-DNO_MN_THREADS_AVAILABLE" if RUBY_VERSION < "3.3" + +# On older Rubies, we did not need to include the ractor header (this was built into the MJIT header) +$defs << "-DNO_RACTOR_HEADER_INCLUDE" if RUBY_VERSION < "3.3" + +# On older Rubies, some of the Ractor internal APIs were directly accessible +$defs << "-DUSE_RACTOR_INTERNAL_APIS_DIRECTLY" if RUBY_VERSION < "3.3" + +# On older Rubies, there was no GVL instrumentation API and APIs created to support it +$defs << "-DNO_GVL_INSTRUMENTATION" if RUBY_VERSION < "3.2" + +# Supporting GVL instrumentation on 3.2 needs some workarounds +$defs << "-DUSE_GVL_PROFILING_3_2_WORKAROUNDS" if RUBY_VERSION.start_with?("3.2") + +# On older Rubies, there was no struct rb_native_thread. See private_vm_api_access.c for details.
+$defs << "-DNO_RB_NATIVE_THREAD" if RUBY_VERSION < "3.2" + +# On older Rubies, there was no struct rb_thread_sched (it was struct rb_global_vm_lock_struct) +$defs << "-DNO_RB_THREAD_SCHED" if RUBY_VERSION < "3.2" + +# On older Rubies, the first_lineno inside a location was a VALUE and not a int (https://github.com/ruby/ruby/pull/6430) +$defs << "-DNO_INT_FIRST_LINENO" if RUBY_VERSION < "3.2" + +# On older Rubies, there was no tid member in the internal thread structure +$defs << "-DNO_THREAD_TID" if RUBY_VERSION < "3.1" + +# On older Rubies, there was no jit_return member on the rb_control_frame_t struct +$defs << "-DNO_JIT_RETURN" if RUBY_VERSION < "3.1" + +# On older Rubies, there are no Ractors +$defs << "-DNO_RACTORS" if RUBY_VERSION < "3" + +# On older Rubies, rb_imemo_name did not exist +$defs << "-DNO_IMEMO_NAME" if RUBY_VERSION < "3" + +# On older Rubies, objects would not move +$defs << "-DNO_T_MOVED" if RUBY_VERSION < "2.7" + +# On older Rubies, rb_global_vm_lock_struct did not include the owner field +$defs << "-DNO_GVL_OWNER" if RUBY_VERSION < "2.6" + +# On older Rubies, there was no thread->invoke_arg +$defs << "-DNO_THREAD_INVOKE_ARG" if RUBY_VERSION < "2.6" + # Tag the native extension library with the Ruby version and Ruby platform. # This makes it easier for development (avoids "oops I forgot to rebuild when I switched my Ruby") and ensures that # the wrong library is never loaded. # When requiring, we need to use the exact same string, including the version and the platform. EXTENSION_NAME = "libdatadog_api.#{RUBY_VERSION[/\d+.\d+/]}_#{RUBY_PLATFORM}".freeze -create_makefile(EXTENSION_NAME) +# Setup Ruby VM private headers access +CAN_USE_MJIT_HEADER = RUBY_VERSION.start_with?("2.6", "2.7", "3.0.", "3.1.", "3.2.") + +if CAN_USE_MJIT_HEADER + mjit_header_file_name = "rb_mjit_min_header-#{RUBY_VERSION}.h" + + # Validate that the mjit header can actually be compiled on this system. 
We learned via + # https://github.com/DataDog/dd-trace-rb/issues/1799 and https://github.com/DataDog/dd-trace-rb/issues/1792 + # that even if the header seems to exist, it may not even compile. + # `have_macro` actually tries to compile a file that mentions the given macro, so if this passes, we should be good to + # use the MJIT header. + # Finally, the `COMMON_HEADERS` conflict with the MJIT header so we need to temporarily disable them for this check. + original_common_headers = MakeMakefile::COMMON_HEADERS + MakeMakefile::COMMON_HEADERS = "".freeze + unless have_macro("RUBY_MJIT_H", mjit_header_file_name) + skip_building_extension!('MJIT header compilation failed - required for crashtracker stack walking') + end + MakeMakefile::COMMON_HEADERS = original_common_headers + + $defs << "-DRUBY_MJIT_HEADER='\"#{mjit_header_file_name}\"'" + + # NOTE: This needs to come after all changes to $defs + create_header + + # Warn on unused parameters to functions. Use `DDTRACE_UNUSED` to mark things as known-to-not-be-used. + # This is added as late as possible because in some Rubies we support (e.g. 3.3), adding this flag before + # checking if internal VM headers are available causes those checks to fail because of this warning (and not + # because the headers are not available.) + append_cflags "-Wunused-parameter" + + create_makefile(EXTENSION_NAME) +else + # The MJIT header was introduced on 2.6 and removed on 3.3; for other Rubies we rely on + # the datadog-ruby_core_source gem to get access to private VM headers. + # This gem ships source code copies of these VM headers for the different Ruby VM versions; + # see https://github.com/DataDog/datadog-ruby_core_source for details + + create_header + + require "datadog/ruby_core_source" + dir_config("ruby") # allow user to pass in non-standard core include directory + + # This is a workaround for a weird issue... + # + # The mkmf tool defines a `with_cppflags` helper that datadog-ruby_core_source uses. 
This helper temporarily + # replaces `$CPPFLAGS` (aka the C pre-processor [not c++!] flags) with a different set when doing something. + # + # The datadog-ruby_core_source gem uses `with_cppflags` during makefile generation to inject extra headers into the + # path. But because `with_cppflags` replaces `$CPPFLAGS`, well, the default `$CPPFLAGS` are not included in the + # makefile. + # + # This is a problem because the default `$CPPFLAGS` carries configuration that was set when Ruby was being built. + # Thus, if we ignore it, we don't compile the profiler with the exact same configuration as Ruby. + # In practice, this can generate crashes and weird bugs if the Ruby configuration is tweaked in a manner that + # changes some of the internal structures that the profiler relies on. Concretely, setting for instance + # `VM_CHECK_MODE=1` when building Ruby will trigger this issue (because some things in the structures the profiler reads + # are ifdef'd out using this setting). + # + # To work around this issue, we override `with_cppflags` for datadog-ruby_core_source to still include `$CPPFLAGS`. + Datadog::RubyCoreSource.define_singleton_method(:with_cppflags) do |newflags, &block| + super("#{newflags} #{$CPPFLAGS}", &block) + end + + Datadog::RubyCoreSource + .create_makefile_with_core( + proc do + headers_available = + have_header("vm_core.h") && + have_header("iseq.h") && + (RUBY_VERSION < "3.3" || have_header("ractor_core.h")) + + if headers_available + # Warn on unused parameters to functions. Use `DDTRACE_UNUSED` to mark things as known-to-not-be-used. + # This is added as late as possible because in some Rubies we support (e.g. 3.3), adding this flag before + # checking if internal VM headers are available causes those checks to fail because of this warning (and not + # because the headers are not available.)
+ append_cflags "-Wunused-parameter" + end + + headers_available + end, + EXTENSION_NAME + ) +end # rubocop:enable Style/GlobalVars # rubocop:enable Style/StderrPuts diff --git a/lib/datadog/core/crashtracking/component.rb b/lib/datadog/core/crashtracking/component.rb index 1d39858d517..55db43ed771 100644 --- a/lib/datadog/core/crashtracking/component.rb +++ b/lib/datadog/core/crashtracking/component.rb @@ -57,6 +57,7 @@ def start Utils::AtForkMonkeyPatch.apply! start_or_update_on_fork(action: :start, tags: tags) + register_runtime_stack_callback ONLY_ONCE.run do Utils::AtForkMonkeyPatch.at_fork(:child) do @@ -81,8 +82,29 @@ def stop logger.error("Failed to stop crash tracking: #{e.message}") end + def runtime_callback_registered? + self.class._native_is_runtime_callback_registered + rescue => e + logger.error("Failed to check runtime callback registration status: #{e.message}") + false + end + private + def register_runtime_stack_callback + # Always use frame-based callback since that's the only type we support + success = self.class._native_register_runtime_stack_callback + + unless success + error_message = 'Failed to register runtime stack callback: registration returned false (may be already registered or unsupported)' + logger.error(error_message) + raise StandardError, error_message + end + rescue => e + logger.error("Failed to register runtime stack callback: #{e.message}") + raise + end + attr_reader :tags, :agent_base_url, :ld_library_path, :path_to_crashtracking_receiver_binary, :logger def start_or_update_on_fork(action:, tags:) diff --git a/sig/datadog/core/crashtracking.rbs b/sig/datadog/core/crashtracking.rbs new file mode 100644 index 00000000000..356434fbad2 --- /dev/null +++ b/sig/datadog/core/crashtracking.rbs @@ -0,0 +1,32 @@ +module Datadog + module Core + module Crashtracking + class Component + def self.build: (untyped settings, untyped agent_settings, logger: untyped) -> Component? 
+ + def initialize: ( + tags: Hash[String, String], + agent_base_url: String, + ld_library_path: String, + path_to_crashtracking_receiver_binary: String, + logger: untyped + ) -> void + + def start: () -> void + def update_on_fork: (?settings: untyped) -> void + def stop: () -> void + def runtime_callback_registered?: () -> bool + + private + + def register_runtime_stack_callback: () -> void + + # Native method signatures + def self._native_start_or_update_on_fork: (*untyped) -> untyped + def self._native_stop: () -> untyped + def self._native_register_runtime_stack_callback: () -> bool + def self._native_is_runtime_callback_registered: () -> bool + end + end + end +end \ No newline at end of file diff --git a/spec/datadog/core/crashtracking/component_spec.rb b/spec/datadog/core/crashtracking/component_spec.rb index a165161bd5e..4c946eda2dc 100644 --- a/spec/datadog/core/crashtracking/component_spec.rb +++ b/spec/datadog/core/crashtracking/component_spec.rb @@ -110,6 +110,29 @@ crashtracker.start end end + + it 'registers the runtime stack callback automatically' do + crashtracker = build_crashtracker(logger: logger) + + expect(described_class).to receive(:_native_start_or_update_on_fork) + expect(described_class).to receive(:_native_register_runtime_stack_callback).and_return(true) + + crashtracker.start + end + + context 'when runtime stack callback registration fails' do + it 'logs the error and re-raises' do + crashtracker = build_crashtracker(logger: logger) + error = StandardError.new('Callback registration failed') + + expect(described_class).to receive(:_native_start_or_update_on_fork) + expect(described_class).to receive(:_native_register_runtime_stack_callback).and_raise(error) + allow(logger).to receive(:debug) # Allow other debug messages + expect(logger).to receive(:error).with('Failed to register runtime stack callback: Callback registration failed') + + expect { crashtracker.start }.to raise_error(error) + end + end end describe '#stop' do @@ -198,14 
+221,16 @@ let(:parsed_request) { JSON.parse(request.body, symbolize_names: true) } let(:crash_report) { parsed_request.fetch(:payload).first } let(:crash_report_message) { JSON.parse(crash_report.fetch(:message), symbolize_names: true) } + let(:crash_report_experimental) { crash_report_message.fetch(:experimental) } + let(:log_messages) { crash_report_message.fetch(:log_messages) } let(:stack_trace) { crash_report_message.fetch(:error).fetch(:stack).fetch(:frames) } # NOTE: If any of these tests seem flaky, the `upload_timeout_seconds` may need to be raised (or otherwise # we need to tweak libdatadog to not need such high timeouts). [ - [:fiddle, "rb_fiddle_free", proc { Fiddle.free(42) }], - [:signal, "rb_f_kill", proc { Process.kill("SEGV", Process.pid) }], + [:fiddle, 'rb_fiddle_free', proc { Fiddle.free(42) }], + [:signal, 'rb_f_kill', proc { Process.kill('SEGV', Process.pid) }], ].each do |trigger_name, function, trigger| it "reports crashes via http when app crashes with #{trigger_name}" do expect_in_fork(fork_expectations: fork_expectations, timeout_seconds: 15) do @@ -213,10 +238,8 @@ crash_tracker.start trigger.call end - expect(stack_trace).to match(array_including(hash_including(function: function))) expect(stack_trace.size).to be > 10 - expect(crash_report[:tags]).to include('si_signo:11', 'si_signo_human_readable:SIGSEGV') expect(crash_report_message[:metadata]).to include( @@ -318,6 +341,88 @@ end end end + + describe 'Ruby and C method runtime stack capture' do + let(:runtime_stack) { crash_report_experimental[:runtime_stack] } + + it 'captures both Ruby and C method frames in mixed stacks' do + # Create standalone Ruby script to completely avoid RSpec context + # We do this because RSpec context frames show up if we crash within + # top level RSpec context + test_script_content = <<~RUBY + require 'fiddle' + require 'datadog' + + # Configure crashtracker + Datadog.configure do |c| + c.agent.host = '127.0.0.1' + c.agent.port = 
#{agent_base_url.match(/:(\d+)/)[1]} + end + + def top_level_ruby_method + ruby_method_with_c_calls + end + + def ruby_method_with_c_calls + "hello world".gsub(/world/) do |match| + {a: 1, b: 2}.each do |key, value| + if key == :a + Fiddle.free(42) + end + end + end + end + + top_level_ruby_method + RUBY + + temp_script = Tempfile.new(['crash_test_script', '.rb']) + temp_script.write(test_script_content) + temp_script.close + + expect_in_fork(fork_expectations: fork_expectations, timeout_seconds: 15) do + exec(RbConfig.ruby, temp_script.path) + end + + frames = runtime_stack[:frames] + # Check that the crashing function is captured + expect(frames).to include( + hash_including( + function: 'free' + ) + ) + + temp_script.unlink + end + end + end + + describe '#runtime_callback_registered?' do + it 'returns true when callback is registered' do + crashtracker = build_crashtracker(logger: logger) + + expect(described_class).to receive(:_native_is_runtime_callback_registered).and_return(true) + + expect(crashtracker.runtime_callback_registered?).to be true + end + + it 'returns false when callback is not registered' do + crashtracker = build_crashtracker(logger: logger) + + expect(described_class).to receive(:_native_is_runtime_callback_registered).and_return(false) + + expect(crashtracker.runtime_callback_registered?).to be false + end + + it 'returns false and logs errors when native method raises exception' do + crashtracker = build_crashtracker(logger: logger) + error = StandardError.new('Native error') + + expect(described_class).to receive(:_native_is_runtime_callback_registered).and_raise(error) + expect(logger).to receive(:error).with('Failed to check runtime callback registration status: Native error') + + expect(crashtracker.runtime_callback_registered?).to be false + end end end