Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 57 additions & 17 deletions lib/llm/src/block_manager/storage/cuda.rs
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,8 @@ use std::{
};

use cudarc::driver::{CudaContext, sys};
use cudarc::runtime::sys::{cudaError, cudaHostRegister};
use nix::libc::{MAP_ANONYMOUS, MAP_FAILED, MAP_HUGETLB, MAP_PRIVATE, PROT_READ, PROT_WRITE};

use crate::block_manager::numa_allocator;

Expand Down Expand Up @@ -178,25 +180,63 @@ impl PinnedStorage {
unsafe {
ctx.bind_to_thread().map_err(StorageError::Cuda)?;

// Try NUMA-aware allocation if enabled, otherwise use direct allocation
let ptr = if numa_allocator::is_numa_enabled() {
let device_id = ctx.cu_device() as u32;
match numa_allocator::worker_pool::NumaWorkerPool::global()
.allocate_pinned_for_gpu(size, device_id)
{
Ok(ptr) => ptr,
Err(e) => {
tracing::warn!("NUMA allocation failed: {}, using direct allocation", e);
cudarc::driver::result::malloc_host(
size,
sys::CU_MEMHOSTALLOC_WRITECOMBINED,
)
.map_err(StorageError::Cuda)? as *mut u8
let numa_aware = numa_allocator::is_numa_enabled();
let use_huge_pages =
std::env::var("DYN_KVBM_USE_HUGE_PAGES").unwrap_or("0".to_string()) == "1";

let ptr = match (numa_aware, use_huge_pages) {
(true, true) => {
return Err(StorageError::InvalidConfig(
"NUMA-aware and huge pages are not supported together".into(),
));
}
(true, false) => {
let device_id = ctx.cu_device() as u32;
match numa_allocator::worker_pool::NumaWorkerPool::global()
.allocate_pinned_for_gpu(size, device_id)
{
Ok(ptr) => ptr,
Err(e) => {
tracing::warn!(
"NUMA allocation failed: {}, using direct allocation",
e
);
cudarc::driver::result::malloc_host(
size,
sys::CU_MEMHOSTALLOC_WRITECOMBINED,
)
.map_err(StorageError::Cuda)? as *mut u8
}
}
}
(false, true) => {
let ptr = nix::libc::mmap(
std::ptr::null_mut(),
size,
PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
-1,
0,
);

if ptr == MAP_FAILED {
return Err(StorageError::AllocationFailed(
"Failed to allocate pinned memory".into(),
));
}

if cudaHostRegister(ptr, size, 0) != cudaError::cudaSuccess {
return Err(StorageError::AllocationFailed(
"Failed to register memory".into(),
));
}

ptr as *mut u8
}
Comment on lines +228 to +235
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical

Wrong teardown for huge-page allocations

In the huge-page branch we mmap anonymous pages and register them with cudaHostRegister, but Drop still calls cudarc::driver::result::free_host, which is only valid for pointers returned by cuMemHostAlloc. Doing that on an mmapd region is undefined behaviour and leaks the huge-page reservation. We also skip cudaHostUnregister / munmap on both the success case and the error path where cudaHostRegister fails, so the mapping lives forever.

Please track whether the allocation came from the huge-page path, call cudaHostUnregister + munmap on drop, and unwind the mapping whenever registration fails. Something along these lines should address it:

@@
-use cudarc::runtime::sys::{cudaError, cudaHostRegister};
+use cudarc::runtime::sys::{cudaError, cudaHostRegister, cudaHostUnregister};
@@
 pub struct PinnedStorage {
     ptr: u64,
     size: usize,
     handles: RegistrationHandles,
     ctx: Arc<CudaContext>,
+    huge_page_alloc: bool,
 }
@@
-            let ptr = match (numa_aware, use_huge_pages) {
+            let mut huge_page_alloc = false;
+            let ptr = match (numa_aware, use_huge_pages) {
@@
                 (false, true) => {
                     let ptr = nix::libc::mmap(
@@
                     if ptr == MAP_FAILED {
                         return Err(StorageError::AllocationFailed(
                             "Failed to allocate pinned memory".into(),
                         ));
                     }
 
-                    if cudaHostRegister(ptr, size, 0) != cudaError::cudaSuccess {
-                        return Err(StorageError::AllocationFailed(
-                            "Failed to register memory".into(),
-                        ));
+                    if cudaHostRegister(ptr, size, 0) != cudaError::cudaSuccess {
+                        unsafe { nix::libc::munmap(ptr, size) };
+                        return Err(StorageError::AllocationFailed(
+                            "Failed to register memory".into(),
+                        ));
                     }
 
+                    huge_page_alloc = true;
                     ptr as *mut u8
                 }
@@
             Ok(Self {
                 ptr,
                 size,
                 handles: RegistrationHandles::new(),
                 ctx: ctx.clone(),
+                huge_page_alloc,
             })
@@
     fn drop(&mut self) {
         self.handles.release();
-        unsafe { cudarc::driver::result::free_host(self.ptr as _) }.unwrap();
+        unsafe {
+            if self.huge_page_alloc {
+                cudaHostUnregister(self.ptr as _);
+                nix::libc::munmap(self.ptr as *mut _, self.size);
+            } else {
+                cudarc::driver::result::free_host(self.ptr as _).unwrap();
+            }
+        }
     }

Also applies to: 259-261

🤖 Prompt for AI Agents
In lib/llm/src/block_manager/storage/cuda.rs around lines 228-235 (and similarly
lines 259-261), the huge-page allocation branch mmaps anonymous memory and
registers it with cudaHostRegister but the Drop implementation and error paths
still call cudarc::driver::result::free_host (valid only for cuMemHostAlloc),
and they omit cudaHostUnregister/munmap, causing UB and leaks; change the
allocation tracking to record whether the pointer came from the huge-page (mmap)
path, and on Drop call cudaHostUnregister then munmap for huge-page allocations
(instead of free_host), ensure that if cudaHostRegister fails you immediately
call munmap to unwind the mapping before returning Err, and keep using free_host
only for cuMemHostAlloc-backed allocations so both success and error paths
properly clean up.

(false, false) => {
cudarc::driver::result::malloc_host(size, sys::CU_MEMHOSTALLOC_WRITECOMBINED)
.map_err(StorageError::Cuda)? as *mut u8
}
} else {
cudarc::driver::result::malloc_host(size, sys::CU_MEMHOSTALLOC_WRITECOMBINED)
.map_err(StorageError::Cuda)? as *mut u8
};

assert!(!ptr.is_null(), "Failed to allocate pinned memory");
Expand Down
Loading