From 666a688ab5c140a7c9435ecad717f32a0e3bdd47 Mon Sep 17 00:00:00 2001 From: jinsolp Date: Wed, 18 Mar 2026 01:41:49 +0000 Subject: [PATCH 1/5] keeping input data mem location --- c/include/cuvs/neighbors/nn_descent.h | 20 +- c/src/neighbors/nn_descent.cpp | 6 +- cpp/include/cuvs/neighbors/nn_descent.hpp | 20 +- cpp/src/neighbors/detail/nn_descent.cuh | 374 ++++++++++-------- cpp/src/neighbors/detail/nn_descent_gnnd.hpp | 15 +- .../cuvs/neighbors/nn_descent/nn_descent.pxd | 9 +- .../cuvs/neighbors/nn_descent/nn_descent.pyx | 25 +- python/cuvs/cuvs/tests/test_nn_descent.py | 14 +- 8 files changed, 251 insertions(+), 232 deletions(-) diff --git a/c/include/cuvs/neighbors/nn_descent.h b/c/include/cuvs/neighbors/nn_descent.h index 0c7102e3e1..1db0eea139 100644 --- a/c/include/cuvs/neighbors/nn_descent.h +++ b/c/include/cuvs/neighbors/nn_descent.h @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ @@ -15,18 +15,6 @@ extern "C" { #endif -/** - * @brief Dtype to use for distance computation - * - `NND_DIST_COMP_AUTO`: Automatically determine the best dtype for distance computation based on the dataset dimensions. - * - `NND_DIST_COMP_FP32`: Use fp32 distance computation for better precision at the cost of performance and memory usage. - * - `NND_DIST_COMP_FP16`: Use fp16 distance computation. - */ -typedef enum { - NND_DIST_COMP_AUTO = 0, - NND_DIST_COMP_FP32 = 1, - NND_DIST_COMP_FP16 = 2 -} cuvsNNDescentDistCompDtype; - /** * @defgroup nn_descent_c_index_params The nn-descent algorithm parameters. * @{ @@ -47,7 +35,9 @@ typedef enum { * the graph for. 
More iterations produce a better quality graph at cost of performance * `termination_threshold`: The delta at which nn-descent will terminate its iterations * `return_distances`: Boolean to decide whether to return distances array - * `dist_comp_dtype`: dtype to use for distance computation. Defaults to `NND_DIST_COMP_AUTO` which automatically determines the best dtype for distance computation based on the dataset dimensions. Use `NND_DIST_COMP_FP32` for better precision at the cost of performance and memory usage. This option is only valid when data type is fp32. Use `NND_DIST_COMP_FP16` for better performance and memory usage at the cost of precision. + * `compress_to_fp16`: When true and the input data is fp32, distance computation is done in + * fp16 for better performance and lower memory usage at the cost of precision. Has no effect on + * non-fp32 input types. */ struct cuvsNNDescentIndexParams { cuvsDistanceType metric; @@ -57,7 +47,7 @@ struct cuvsNNDescentIndexParams { size_t max_iterations; float termination_threshold; bool return_distances; - cuvsNNDescentDistCompDtype dist_comp_dtype; + bool compress_to_fp16; }; typedef struct cuvsNNDescentIndexParams* cuvsNNDescentIndexParams_t; diff --git a/c/src/neighbors/nn_descent.cpp b/c/src/neighbors/nn_descent.cpp index 708056144a..d3753e925a 100644 --- a/c/src/neighbors/nn_descent.cpp +++ b/c/src/neighbors/nn_descent.cpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. 
* SPDX-License-Identifier: Apache-2.0 */ @@ -43,7 +43,7 @@ void* _build(cuvsResources_t res, build_params.max_iterations = params.max_iterations; build_params.termination_threshold = params.termination_threshold; build_params.return_distances = params.return_distances; - build_params.dist_comp_dtype = static_cast(static_cast(params.dist_comp_dtype)); + build_params.compress_to_fp16 = params.compress_to_fp16; using graph_type = raft::host_matrix_view; std::optional graph; @@ -179,7 +179,7 @@ extern "C" cuvsError_t cuvsNNDescentIndexParamsCreate(cuvsNNDescentIndexParams_t .max_iterations = cpp_params.max_iterations, .termination_threshold = cpp_params.termination_threshold, .return_distances = cpp_params.return_distances, - .dist_comp_dtype = static_cast(static_cast(cpp_params.dist_comp_dtype))}; + .compress_to_fp16 = cpp_params.compress_to_fp16}; }); } diff --git a/cpp/include/cuvs/neighbors/nn_descent.hpp b/cpp/include/cuvs/neighbors/nn_descent.hpp index 8e94edfaa5..dc84e398c7 100644 --- a/cpp/include/cuvs/neighbors/nn_descent.hpp +++ b/cpp/include/cuvs/neighbors/nn_descent.hpp @@ -24,16 +24,6 @@ namespace cuvs::neighbors::nn_descent { * @{ */ -/** - * @brief Dtype to use for distance computation - * - `AUTO`: Automatically determine the best dtype for distance computation based on the dataset - * dimensions. - * - `FP32`: Use fp32 distance computation for better precision at the cost of performance and - * memory usage. - * - `FP16`: Use fp16 distance computation. - */ -enum class DIST_COMP_DTYPE { AUTO = 0, FP32 = 1, FP16 = 2 }; - /** * @brief Parameters used to build an nn-descent index * - `graph_degree`: For an input dataset of dimensions (N, D), @@ -47,11 +37,9 @@ enum class DIST_COMP_DTYPE { AUTO = 0, FP32 = 1, FP16 = 2 }; * the graph for. 
More iterations produce a better quality graph at cost of performance * - `termination_threshold`: The delta at which nn-descent will terminate its iterations * - `return_distances`: Boolean to decide whether to return distances array - * - `dist_comp_dtype`: dtype to use for distance computation. Defaults to `AUTO` which - * automatically determines the best dtype for distance computation based on the dataset dimensions. - * Use `FP32` for better precision at the cost of performance and memory usage. This option is only - * valid when data type is fp32. Use `FP16` for better performance and memory usage at the cost of - * precision. + * - `compress_to_fp16`: When true and the input data is fp32, distance computation is done in + * fp16 for better performance and lower memory usage at the cost of precision. Has no effect on + * non-fp32 input types (fp16, int8, uint8) which always use fp16 distance computation. */ struct index_params : cuvs::neighbors::index_params { size_t graph_degree = 64; @@ -59,7 +47,7 @@ struct index_params : cuvs::neighbors::index_params { size_t max_iterations = 20; float termination_threshold = 0.0001; bool return_distances = true; - DIST_COMP_DTYPE dist_comp_dtype = DIST_COMP_DTYPE::AUTO; + bool compress_to_fp16 = false; /** @brief Construct NN descent parameters for a specific kNN graph degree * diff --git a/cpp/src/neighbors/detail/nn_descent.cuh b/cpp/src/neighbors/detail/nn_descent.cuh index 2b6e5459f2..67b165dc47 100644 --- a/cpp/src/neighbors/detail/nn_descent.cuh +++ b/cpp/src/neighbors/detail/nn_descent.cuh @@ -141,6 +141,22 @@ struct dtype_traits<__half> { static __device__ __forceinline__ float to_float(__half v) { return __half2float(v); } }; +template <> +struct dtype_traits { + static constexpr int APAD = 16; + static constexpr int BPAD = 16; + static constexpr int TILE_COL_WIDTH = 128; + static __device__ __forceinline__ float to_float(uint8_t v) { return static_cast(v); } +}; + +template <> +struct dtype_traits { + static 
constexpr int APAD = 16; + static constexpr int BPAD = 16; + static constexpr int TILE_COL_WIDTH = 128; + static __device__ __forceinline__ float to_float(int8_t v) { return static_cast(v); } +}; + template __device__ __forceinline__ ResultItem xor_swap(ResultItem x, int mask, int dir) { @@ -244,69 +260,64 @@ __device__ __forceinline__ void load_vec(Data_t* vec_buffer, } } -// TODO: Replace with RAFT utilities https://github.com/rapidsai/raft/issues/1827 -/** Calculate L2 norm, and cast data to Output_t */ -template -RAFT_KERNEL preprocess_data_kernel( - const Data_t* input_data, - Output_t* output_data, - int dim, - DistData_t* l2_norms, - size_t list_offset = 0, - cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2Expanded) +/** Converting load: loads Data_t from global memory into __half shared memory buffer. */ +template + requires(!std::is_same_v) +__device__ __forceinline__ void load_vec(__half* vec_buffer, + const Data_t* d_vec, + const int load_dims, + const int padding_dims, + const int lane_id) +{ + constexpr int num_load_elems_per_warp = raft::warp_size(); + __half half_0 = __float2half(0.0f); + for (int step = 0; step < raft::ceildiv(padding_dims, num_load_elems_per_warp); step++) { + int idx = step * num_load_elems_per_warp + lane_id; + if (idx < load_dims) { + vec_buffer[idx] = __float2half(static_cast(d_vec[idx])); + } else if (idx < padding_dims) { + vec_buffer[idx] = half_0; + } + } +} + +/** One warp per block. Computes squared L2 norm for each row. 
*/ +template +RAFT_KERNEL compute_l2_norms_kernel(const Data_t* data, int dim, DistData_t* l2_norms) { extern __shared__ char buffer[]; __shared__ float l2_norm; Data_t* s_vec = (Data_t*)buffer; - size_t list_id = list_offset + blockIdx.x; + size_t list_id = blockIdx.x; + int lane_id = threadIdx.x % raft::warp_size(); - load_vec(s_vec, - input_data + static_cast(blockIdx.x) * dim, - dim, - dim, - threadIdx.x % raft::warp_size()); + load_vec(s_vec, data + static_cast(blockIdx.x) * dim, dim, dim, lane_id); if (threadIdx.x == 0) { l2_norm = 0; } __syncthreads(); - if (metric == cuvs::distance::DistanceType::L2Expanded || - metric == cuvs::distance::DistanceType::L2SqrtExpanded || - metric == cuvs::distance::DistanceType::CosineExpanded) { - int lane_id = threadIdx.x % raft::warp_size(); - for (int step = 0; step < raft::ceildiv(dim, raft::warp_size()); step++) { - int idx = step * raft::warp_size() + lane_id; - float part_dist = 0; - if (idx < dim) { - part_dist = s_vec[idx]; - part_dist = part_dist * part_dist; - } - __syncwarp(); - for (int offset = raft::warp_size() >> 1; offset >= 1; offset >>= 1) { - part_dist += __shfl_down_sync(raft::warp_full_mask(), part_dist, offset); - } - if (lane_id == 0) { l2_norm += part_dist; } - __syncwarp(); - } - } - for (int step = 0; step < raft::ceildiv(dim, raft::warp_size()); step++) { - int idx = step * raft::warp_size() + threadIdx.x; + int idx = step * raft::warp_size() + lane_id; + float part_dist = 0; if (idx < dim) { - if (metric == cuvs::distance::DistanceType::InnerProduct || - metric == cuvs::distance::DistanceType::L1) { - output_data[list_id * dim + idx] = input_data[(size_t)blockIdx.x * dim + idx]; - } else if (metric == cuvs::distance::DistanceType::CosineExpanded) { - output_data[list_id * dim + idx] = - (float)input_data[(size_t)blockIdx.x * dim + idx] / sqrt(l2_norm); - } else if (metric == cuvs::distance::DistanceType::BitwiseHamming) { - int idx_for_byte = list_id * dim + idx; // uint8 or int8 data - uint8_t* 
output_bytes = reinterpret_cast(output_data); - output_bytes[idx_for_byte] = input_data[(size_t)blockIdx.x * dim + idx]; - } else { // L2Expanded or L2SqrtExpanded - output_data[list_id * dim + idx] = input_data[(size_t)blockIdx.x * dim + idx]; - if (idx == 0) { l2_norms[list_id] = l2_norm; } - } + part_dist = static_cast(s_vec[idx]); + part_dist = part_dist * part_dist; + } + __syncwarp(); + for (int offset = raft::warp_size() >> 1; offset >= 1; offset >>= 1) { + part_dist += __shfl_down_sync(raft::warp_full_mask(), part_dist, offset); } + if (lane_id == 0) { l2_norm += part_dist; } + __syncwarp(); } + + if (lane_id == 0) { l2_norms[list_id] = l2_norm; } +} + +template +RAFT_KERNEL convert_copy_kernel(const Src_t* src, Dst_t* dst, size_t n) +{ + size_t idx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (idx < n) { dst[idx] = static_cast(src[idx]); } } template @@ -527,7 +538,9 @@ __device__ __forceinline__ void calculate_metric(float* s_distances, if (metric == cuvs::distance::DistanceType::InnerProduct && can_postprocess_dist) { s_distances[i] = -s_distances[i]; } else if (metric == cuvs::distance::DistanceType::CosineExpanded) { - s_distances[i] = 1.0 - s_distances[i]; + float norm_product = l2_norms[row_neighbors[row_id]] * l2_norms[col_neighbors[col_id]]; + s_distances[i] = + (norm_product > 0.0f) ? (1.0f - s_distances[i] / sqrtf(norm_product)) : 0.0f; } else if (metric == cuvs::distance::DistanceType::BitwiseHamming) { s_distances[i] = 0.0; int n1 = row_neighbors[row_id]; @@ -573,12 +586,12 @@ struct DistAccumulator { // For architectures 750 and 860 (890), the values for MAX_RESIDENT_THREAD_PER_SM // is 1024 and 1536 respectively, which means the bounds don't work anymore // SIMT kernel: scalar element-wise distance computation. -// Used for fp32 data (all metrics) and fp16 data with L1 distance (which cannot use tensor cores). +// Used for fp32 data (all metrics) and L1 distance computation for all dtypes (which cannot use +// tensor cores). 
template , typename DistEpilogue_t> - requires(std::is_same_v || std::is_same_v) RAFT_KERNEL #ifdef __CUDA_ARCH__ // Use minBlocksPerMultiprocessor = 4 on specific arches @@ -689,6 +702,7 @@ __launch_bounds__(BLOCK_SIZE) if (idx < list_new_size) { size_t neighbor_id = new_neighbors[idx]; size_t idx_in_data = neighbor_id * data_dim; + // loaded to shared memory while keeping the original dtype load_vec(s_nv[idx], data + idx_in_data + step * TILE_COL_WIDTH, num_load_elems, @@ -706,6 +720,7 @@ __launch_bounds__(BLOCK_SIZE) if (tmp_row < list_new_size && tmp_col < list_new_size) { float acc = 0.0f; for (int d = 0; d < num_load_elems; d++) { + // converted to float for distance computation float a = dtype_traits::to_float(s_nv[tmp_row][d]); float b = dtype_traits::to_float(s_nv[tmp_col][d]); acc += dist_acc(a, b); @@ -844,7 +859,11 @@ __launch_bounds__(BLOCK_SIZE) // MAX_RESIDENT_THREAD_PER_SM = BLOCK_SIZE * BLOCKS_PER_SM = 2048 // For architectures 750 and 860 (890), the values for MAX_RESIDENT_THREAD_PER_SM // is 1024 and 1536 respectively, which means the bounds don't work anymore -template , typename DistEpilogue_t> +// Used for fp32 data compressed to fp16, and all types using non-L1 distance metric. 
+template , + typename DistEpilogue_t> RAFT_KERNEL #ifdef __CUDA_ARCH__ // Use minBlocksPerMultiprocessor = 4 on specific arches @@ -862,7 +881,7 @@ __launch_bounds__(BLOCK_SIZE) const Index_t* rev_graph_old, const int2* sizes_old, const int width, - const __half* data, + const Data_t* data, const int data_dim, ID_t* graph, DistData_t* dists, @@ -958,6 +977,7 @@ __launch_bounds__(BLOCK_SIZE) if (idx < list_new_size) { size_t neighbor_id = new_neighbors[idx]; size_t idx_in_data = neighbor_id * data_dim; + // converted to fp16 on-the-fly while loading load_vec(s_nv[idx], data + idx_in_data + step * TILE_COL_WIDTH, num_load_elems, @@ -1353,20 +1373,12 @@ GNND::GNND(raft::resources const& res, const BuildConfig& build static_assert(NUM_SAMPLES <= 32); using input_t = typename std::remove_const::type; - if (std::is_same_v && - (build_config.dist_comp_dtype == cuvs::neighbors::nn_descent::DIST_COMP_DTYPE::FP32 || - (build_config.dist_comp_dtype == cuvs::neighbors::nn_descent::DIST_COMP_DTYPE::AUTO && - build_config.dataset_dim <= 16))) { - // use fp32 distance computation for better precision with smaller dimension - d_data_float_.emplace( - raft::make_device_matrix(res, nrow_, ndim_)); - } else { - d_data_half_.emplace(raft::make_device_matrix( - res, - nrow_, - build_config.metric == cuvs::distance::DistanceType::BitwiseHamming - ? (build_config.dataset_dim + 1) / 2 - : build_config.dataset_dim)); + if (build_config.compress_to_fp16 && build_config.dataset_dim <= 16 && + std::is_same_v) { + RAFT_LOG_WARN( + "Using fp16 for distance computation for data in fp32 with small dimensions (%zu) <= 16 may " + "result in low quality results. 
Consider setting compress_to_fp16 = false.", + build_config.dataset_dim); } raft::matrix::fill(res, dists_buffer_.view(), std::numeric_limits::max()); @@ -1376,7 +1388,11 @@ GNND::GNND(raft::resources const& res, const BuildConfig& build raft::matrix::fill(res, d_locks_.view(), 0); if (build_config.metric == cuvs::distance::DistanceType::L2Expanded || - build_config.metric == cuvs::distance::DistanceType::L2SqrtExpanded) { + build_config.metric == cuvs::distance::DistanceType::L2SqrtExpanded || + build_config.metric == cuvs::distance::DistanceType::CosineExpanded) { + // for device memory efficiency, we do not allocate a separate array for the data + // to normalize the data when using CosineExpanded metric. Instead, we use the l2_norms_ vector + // and compute inside the calculate_metric kernel. l2_norms_ = raft::make_device_vector(res, nrow_); } }; @@ -1414,62 +1430,62 @@ template void GNND::local_join(cudaStream_t stream, DistEpilogue_t dist_epilogue) { raft::matrix::fill(res, dists_buffer_.view(), std::numeric_limits::max()); - // Kernel dispatch logic: - // fp32 data -> SIMT (metric resolved at runtime inside the kernel) - // fp16 data + L1 distance -> SIMT (L1 needs element-wise ops, cannot use tensor cores) - // fp16 data + other metrics -> WMMA (tensor-core accelerated dot product) - if (d_data_float_.has_value()) { - local_join_kernel_simt<<>>(graph_.h_graph_new.data_handle(), - h_rev_graph_new_.data_handle(), - d_list_sizes_new_.data_handle(), - h_graph_old_.data_handle(), - h_rev_graph_old_.data_handle(), - d_list_sizes_old_.data_handle(), - NUM_SAMPLES, - d_data_float_->data_handle(), - ndim_, - graph_buffer_.data_handle(), - dists_buffer_.data_handle(), - DEGREE_ON_DEVICE, - d_locks_.data_handle(), - l2_norms_.data_handle(), - build_config_.metric, - dist_epilogue); - } else if (build_config_.metric == cuvs::distance::DistanceType::L1) { - local_join_kernel_simt<<>>(graph_.h_graph_new.data_handle(), - h_rev_graph_new_.data_handle(), - 
d_list_sizes_new_.data_handle(), - h_graph_old_.data_handle(), - h_rev_graph_old_.data_handle(), - d_list_sizes_old_.data_handle(), - NUM_SAMPLES, - d_data_half_.value().data_handle(), - ndim_, - graph_buffer_.data_handle(), - dists_buffer_.data_handle(), - DEGREE_ON_DEVICE, - d_locks_.data_handle(), - l2_norms_.data_handle(), - build_config_.metric, - dist_epilogue); + // fp32 data can have an effective type of fp32 OR fp16 (when compress_to_fp16 flag = True for + // wmma usage) Based on EFFECTIVE dtype: + // fp32 data || L1 distance -> SIMT: internally converted to fp32 for distance computation + // on-the-fly dypte <= fp16 && non-L1 metrics -> WMMA (tensor-core accelerated dot product): + // internally converted to fp16 for distance computation on-the-fly + + bool use_simt = (std::is_same_v && !build_config_.compress_to_fp16) || + build_config_.metric == cuvs::distance::DistanceType::L1; + + auto launch_kernel = [&](auto* typed_ptr) { + if (use_simt) { + std::cout << "using simt kernel" << std::endl; + local_join_kernel_simt<<>>(graph_.h_graph_new.data_handle(), + h_rev_graph_new_.data_handle(), + d_list_sizes_new_.data_handle(), + h_graph_old_.data_handle(), + h_rev_graph_old_.data_handle(), + d_list_sizes_old_.data_handle(), + NUM_SAMPLES, + typed_ptr, + ndim_, + graph_buffer_.data_handle(), + dists_buffer_.data_handle(), + DEGREE_ON_DEVICE, + d_locks_.data_handle(), + l2_norms_.data_handle(), + build_config_.metric, + dist_epilogue); + } else { + std::cout << "using wmma kernel" << std::endl; + local_join_kernel_wmma<<>>(graph_.h_graph_new.data_handle(), + h_rev_graph_new_.data_handle(), + d_list_sizes_new_.data_handle(), + h_graph_old_.data_handle(), + h_rev_graph_old_.data_handle(), + d_list_sizes_old_.data_handle(), + NUM_SAMPLES, + typed_ptr, + ndim_, + graph_buffer_.data_handle(), + dists_buffer_.data_handle(), + DEGREE_ON_DEVICE, + d_locks_.data_handle(), + l2_norms_.data_handle(), + build_config_.metric, + dist_epilogue); + } + }; + + if 
(d_data_half_.has_value()) { + // Host fp32 input compressed to fp16 via compress_to_fp16. + launch_kernel(static_cast(d_data_ptr_)); } else { - local_join_kernel_wmma<<>>(graph_.h_graph_new.data_handle(), - h_rev_graph_new_.data_handle(), - d_list_sizes_new_.data_handle(), - h_graph_old_.data_handle(), - h_rev_graph_old_.data_handle(), - d_list_sizes_old_.data_handle(), - NUM_SAMPLES, - d_data_half_.value().data_handle(), - ndim_, - graph_buffer_.data_handle(), - dists_buffer_.data_handle(), - DEGREE_ON_DEVICE, - d_locks_.data_handle(), - l2_norms_.data_handle(), - build_config_.metric, - dist_epilogue); + // Data stored as input_t: device data used directly, or host data copied as-is. + launch_kernel(static_cast(d_data_ptr_)); } } @@ -1497,45 +1513,81 @@ void GNND::build(Data_t* data, update_counter_ = 0; graph_.h_graph = (InternalID_t*)output_graph; - if (d_data_float_.has_value()) { - raft::matrix::fill(res, d_data_float_.value().view(), static_cast(0)); - } else { - raft::matrix::fill(res, d_data_half_.value().view(), static_cast(0)); - } + d_data_ptr_ = nullptr; cudaPointerAttributes data_ptr_attr; RAFT_CUDA_TRY(cudaPointerGetAttributes(&data_ptr_attr, data)); - size_t batch_size = (data_ptr_attr.devicePointer == nullptr) ? 
100000 : nrow_; - - cuvs::spatial::knn::detail::utils::batch_load_iterator vec_batches{ - data, static_cast(nrow_), build_config_.dataset_dim, batch_size, stream}; - for (auto const& batch : vec_batches) { - if (d_data_float_.has_value()) { - preprocess_data_kernel<<(raft::warp_size())) * - raft::warp_size(), - stream>>>(batch.data(), - d_data_float_.value().data_handle(), - build_config_.dataset_dim, - l2_norms_.data_handle(), - batch.offset(), - build_config_.metric); + bool data_on_device = (data_ptr_attr.type == cudaMemoryTypeDevice); + + bool needs_l2_norms = build_config_.metric == cuvs::distance::DistanceType::L2Expanded || + build_config_.metric == cuvs::distance::DistanceType::L2SqrtExpanded || + build_config_.metric == cuvs::distance::DistanceType::CosineExpanded; + + bool compress_host_data = + !data_on_device && std::is_same_v && build_config_.compress_to_fp16; + + if (data_on_device) { + // When user-given data is on device, we use it directly. This can be any type (fp32, fp16, + // int8, uint8) + d_data_ptr_ = data; + std::cout << "data is on device. using user-given data directly" << std::endl; + } else if (compress_host_data) { + // When user-given data is fp32 host data, and compress_to_fp16 is true, we allocate fp16 buffer + // to copy the data. This allows the wmma kernel to be used for distance computation instead of + // simt kernel. 
+ if (!d_data_half_.has_value()) { + d_data_half_.emplace(raft::make_device_matrix( + res, build_config_.max_dataset_size, build_config_.dataset_dim)); + } + size_t batch_size = 100000; + cuvs::spatial::knn::detail::utils::batch_load_iterator vec_batches{ + data, static_cast(nrow_), build_config_.dataset_dim, batch_size, stream}; + constexpr int TPB = 256; + for (auto const& batch : vec_batches) { + size_t n_elems = batch.size() * build_config_.dataset_dim; + int num_blocks = raft::ceildiv(n_elems, static_cast(TPB)); + size_t dst_offset = batch.offset() * build_config_.dataset_dim; + convert_copy_kernel<<>>( + batch.data(), d_data_half_.value().data_handle() + dst_offset, n_elems); + } + d_data_ptr_ = d_data_half_.value().data_handle(); + std::cout << "data is on host and fp32 and user decided to compress" << std::endl; + } else { + // In other cases where user-given data is not device-accessible, we allocate a device buffer to + // copy the data. The input type is kept as-is (fp32, fp16, int8, uint8). 
+ if (!d_data_direct_.has_value()) { + d_data_direct_.emplace(raft::make_device_matrix( + res, build_config_.max_dataset_size, build_config_.dataset_dim)); + } + raft::copy(d_data_direct_.value().data_handle(), + data, + static_cast(nrow_) * build_config_.dataset_dim, + stream); + d_data_ptr_ = d_data_direct_.value().data_handle(); + std::cout << "keeping data dtype and copying to device" << std::endl; + } + + if (needs_l2_norms) { + if (d_data_half_.has_value()) { + compute_l2_norms_kernel<<< + nrow_, + raft::warp_size(), + sizeof(half) * ceildiv(build_config_.dataset_dim, static_cast(raft::warp_size())) * + raft::warp_size(), + stream>>>( + static_cast(d_data_ptr_), build_config_.dataset_dim, l2_norms_.data_handle()); + raft::resource::sync_stream(res); } else { - preprocess_data_kernel<<(raft::warp_size())) * - raft::warp_size(), - stream>>>(batch.data(), - d_data_half_.value().data_handle(), - build_config_.dataset_dim, - l2_norms_.data_handle(), - batch.offset(), - build_config_.metric); + compute_l2_norms_kernel<<(raft::warp_size())) * + raft::warp_size(), + stream>>>(static_cast(d_data_ptr_), + build_config_.dataset_dim, + l2_norms_.data_handle()); + raft::resource::sync_stream(res); } } @@ -1587,7 +1639,7 @@ void GNND::build(Data_t* data, // __CUDA_ARCH__ >= 700. 
Since RAFT supports compilation for ARCH 600, // we need to ensure that `local_join_kernel` (which uses tensor) operations // is not only not compiled, but also a runtime error is presented to the user - auto kernel = preprocess_data_kernel; + auto kernel = compute_l2_norms_kernel; void* kernel_ptr = reinterpret_cast(kernel); auto runtime_arch = raft::util::arch::kernel_virtual_arch(kernel_ptr); auto wmma_range = diff --git a/cpp/src/neighbors/detail/nn_descent_gnnd.hpp b/cpp/src/neighbors/detail/nn_descent_gnnd.hpp index a2639e4f43..bc86ff014e 100644 --- a/cpp/src/neighbors/detail/nn_descent_gnnd.hpp +++ b/cpp/src/neighbors/detail/nn_descent_gnnd.hpp @@ -64,8 +64,7 @@ struct BuildConfig { float termination_threshold{0.0001}; size_t output_graph_degree{32}; cuvs::distance::DistanceType metric{cuvs::distance::DistanceType::L2Expanded}; - cuvs::neighbors::nn_descent::DIST_COMP_DTYPE dist_comp_dtype{ - cuvs::neighbors::nn_descent::DIST_COMP_DTYPE::AUTO}; + bool compress_to_fp16{false}; }; template @@ -228,8 +227,16 @@ class GNND { size_t nrow_; size_t ndim_; - std::optional> d_data_float_; + using input_t = std::remove_const_t; + + // d_data_half_ is used for a special case when input data is fp32 on host and compress_to_fp16 + // flag is True std::optional> d_data_half_; + // d_data_direct_ is used when input data is on host, and we need to copy it to device + std::optional> d_data_direct_; + + // d_data_ptr_ is used to store the general pointer to the input data + const void* d_data_ptr_{nullptr}; raft::device_vector l2_norms_; raft::device_matrix graph_buffer_; @@ -307,7 +314,7 @@ inline BuildConfig get_build_config(raft::resources const& res, .termination_threshold = params.termination_threshold, .output_graph_degree = params.graph_degree, .metric = params.metric, - .dist_comp_dtype = params.dist_comp_dtype}; + .compress_to_fp16 = params.compress_to_fp16}; return build_config; } diff --git a/python/cuvs/cuvs/neighbors/nn_descent/nn_descent.pxd 
b/python/cuvs/cuvs/neighbors/nn_descent/nn_descent.pxd index 9568c88082..8d26921784 100644 --- a/python/cuvs/cuvs/neighbors/nn_descent/nn_descent.pxd +++ b/python/cuvs/cuvs/neighbors/nn_descent/nn_descent.pxd @@ -1,5 +1,5 @@ # -# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 # # cython: language_level=3 @@ -13,11 +13,6 @@ from cuvs.distance_type cimport cuvsDistanceType cdef extern from "cuvs/neighbors/nn_descent.h" nogil: - enum cuvsNNDescentDistCompDtype: - NND_DIST_COMP_AUTO = 0, - NND_DIST_COMP_FP32 = 1, - NND_DIST_COMP_FP16 = 2 - ctypedef struct cuvsNNDescentIndexParams: cuvsDistanceType metric float metric_arg @@ -26,7 +21,7 @@ cdef extern from "cuvs/neighbors/nn_descent.h" nogil: size_t max_iterations float termination_threshold bool return_distances - cuvsNNDescentDistCompDtype dist_comp_dtype + bool compress_to_fp16 ctypedef cuvsNNDescentIndexParams* cuvsNNDescentIndexParams_t diff --git a/python/cuvs/cuvs/neighbors/nn_descent/nn_descent.pyx b/python/cuvs/cuvs/neighbors/nn_descent/nn_descent.pyx index 7cb7f59b60..658a2ff983 100644 --- a/python/cuvs/cuvs/neighbors/nn_descent/nn_descent.pyx +++ b/python/cuvs/cuvs/neighbors/nn_descent/nn_descent.pyx @@ -1,5 +1,5 @@ # -# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 # # cython: language_level=3 @@ -63,12 +63,10 @@ cdef class IndexParams: The delta at which nn-descent will terminate its iterations return_distances : bool Whether to return distances array - dist_comp_dtype : str, default = "auto" - Dtype to use for distance computation. - Supported dtypes are `auto`, `fp32`, and `fp16` - `auto` automatically determines the best dtype for distance computation based on the dataset dimensions. 
- `fp32` uses fp32 distance computation for better precision at the cost of performance and memory usage. This option is only valid when data type is fp32. - `fp16` uses fp16 distance computation for better performance and memory usage at the cost of precision. + compress_to_fp16 : bool, default = False + When True and the input data is fp32, distance computation is done in + fp16 for better performance and lower memory usage at the cost of + precision. Has no effect on non-fp32 input types. """ cdef cuvsNNDescentIndexParams* params @@ -88,7 +86,7 @@ cdef class IndexParams: max_iterations=None, termination_threshold=None, return_distances=None, - dist_comp_dtype="auto" + compress_to_fp16=None ): if metric is not None: self.params.metric = DISTANCE_TYPES[metric] @@ -102,15 +100,8 @@ cdef class IndexParams: self.params.termination_threshold = termination_threshold if return_distances is not None: self.params.return_distances = return_distances - - if dist_comp_dtype is "auto": - self.params.dist_comp_dtype = cuvsNNDescentDistCompDtype.NND_DIST_COMP_AUTO - elif dist_comp_dtype is "fp32": - self.params.dist_comp_dtype = cuvsNNDescentDistCompDtype.NND_DIST_COMP_FP32 - elif dist_comp_dtype is "fp16": - self.params.dist_comp_dtype = cuvsNNDescentDistCompDtype.NND_DIST_COMP_FP16 - else: - raise ValueError(f"Invalid dist_comp_dtype: {dist_comp_dtype}. Supported options are 'auto', 'fp32', and 'fp16'.") + if compress_to_fp16 is not None: + self.params.compress_to_fp16 = compress_to_fp16 @property def metric(self): diff --git a/python/cuvs/cuvs/tests/test_nn_descent.py b/python/cuvs/cuvs/tests/test_nn_descent.py index 4463142fcb..93b39727d3 100644 --- a/python/cuvs/cuvs/tests/test_nn_descent.py +++ b/python/cuvs/cuvs/tests/test_nn_descent.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 # @@ -61,9 +61,9 @@ def test_nn_descent( @pytest.mark.parametrize("n_cols", [2, 17, 32]) -@pytest.mark.parametrize("dist_comp_dtype", ["auto", "fp32", "fp16"]) +@pytest.mark.parametrize("compress_to_fp16", [False, True]) @pytest.mark.parametrize("dtype", [np.float32, np.float16]) -def test_nn_descent_dist_comp_dtype(n_cols, dist_comp_dtype, dtype): +def test_nn_descent_compress_to_fp16(n_cols, compress_to_fp16, dtype): metric = "sqeuclidean" graph_degree = 32 n_rows = 100_000 @@ -77,7 +77,7 @@ def test_nn_descent_dist_comp_dtype(n_cols, dist_comp_dtype, dtype): metric=metric, graph_degree=graph_degree, return_distances=True, - dist_comp_dtype=dist_comp_dtype, + compress_to_fp16=compress_to_fp16, ) index = nn_descent.build(params, X) @@ -88,8 +88,4 @@ def test_nn_descent_dist_comp_dtype(n_cols, dist_comp_dtype, dtype): _, bf_indices = brute_force.search(index, gpu_X, k=graph_degree) bf_indices = bf_indices.copy_to_host() - if n_cols <= 16 and dist_comp_dtype == "fp16" and dtype == np.float32: - # for small dim, if data is fp32 but dist_comp_dtype is fp16, the recall will be low - assert calc_recall(nnd_indices, bf_indices) < 0.7 - else: - assert calc_recall(nnd_indices, bf_indices) > 0.9 + assert calc_recall(nnd_indices, bf_indices) > 0.9 From 2855e585515cc52e735720d21d7feda98a9cbaac Mon Sep 17 00:00:00 2001 From: jinsolp Date: Wed, 18 Mar 2026 20:03:52 +0000 Subject: [PATCH 2/5] compute norms in fp32 if data is fp32 --- cpp/src/neighbors/detail/nn_descent.cuh | 42 ++++++++++++------------- 1 file changed, 20 insertions(+), 22 deletions(-) diff --git a/cpp/src/neighbors/detail/nn_descent.cuh b/cpp/src/neighbors/detail/nn_descent.cuh index 67b165dc47..78b6fed0c7 100644 --- a/cpp/src/neighbors/detail/nn_descent.cuh +++ b/cpp/src/neighbors/detail/nn_descent.cuh @@ -274,7 +274,7 @@ __device__ __forceinline__ void load_vec(__half* vec_buffer, for (int step = 0; step < raft::ceildiv(padding_dims, num_load_elems_per_warp); step++) { 
int idx = step * num_load_elems_per_warp + lane_id; if (idx < load_dims) { - vec_buffer[idx] = __float2half(static_cast(d_vec[idx])); + vec_buffer[idx] = d_vec[idx]; } else if (idx < padding_dims) { vec_buffer[idx] = half_0; } @@ -1547,6 +1547,17 @@ void GNND::build(Data_t* data, size_t n_elems = batch.size() * build_config_.dataset_dim; int num_blocks = raft::ceildiv(n_elems, static_cast(TPB)); size_t dst_offset = batch.offset() * build_config_.dataset_dim; + if (needs_l2_norms) { + // we compute l2 norms on the fp32 data directly. + compute_l2_norms_kernel<<(raft::warp_size())) * + raft::warp_size(), + stream>>>( + batch.data(), build_config_.dataset_dim, l2_norms_.data_handle() + batch.offset()); + } convert_copy_kernel<<>>( batch.data(), d_data_half_.value().data_handle() + dst_offset, n_elems); } @@ -1567,28 +1578,15 @@ void GNND::build(Data_t* data, std::cout << "keeping data dtype and copying to device" << std::endl; } - if (needs_l2_norms) { - if (d_data_half_.has_value()) { - compute_l2_norms_kernel<<< - nrow_, + if (needs_l2_norms && !compress_host_data) { + compute_l2_norms_kernel<<< + nrow_, + raft::warp_size(), + sizeof(input_t) * ceildiv(build_config_.dataset_dim, static_cast(raft::warp_size())) * raft::warp_size(), - sizeof(half) * ceildiv(build_config_.dataset_dim, static_cast(raft::warp_size())) * - raft::warp_size(), - stream>>>( - static_cast(d_data_ptr_), build_config_.dataset_dim, l2_norms_.data_handle()); - raft::resource::sync_stream(res); - } else { - compute_l2_norms_kernel<<(raft::warp_size())) * - raft::warp_size(), - stream>>>(static_cast(d_data_ptr_), - build_config_.dataset_dim, - l2_norms_.data_handle()); - raft::resource::sync_stream(res); - } + stream>>>( + static_cast(d_data_ptr_), build_config_.dataset_dim, l2_norms_.data_handle()); + raft::resource::sync_stream(res); } graph_.clear(); From fbb902306242d3de0247536c4240941af32720a8 Mon Sep 17 00:00:00 2001 From: jinsolp Date: Thu, 19 Mar 2026 02:35:30 +0000 Subject: [PATCH 3/5] 
change padding for uint8 and int8 --- cpp/src/neighbors/detail/nn_descent.cuh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/neighbors/detail/nn_descent.cuh b/cpp/src/neighbors/detail/nn_descent.cuh index 78b6fed0c7..c51cdc06f2 100644 --- a/cpp/src/neighbors/detail/nn_descent.cuh +++ b/cpp/src/neighbors/detail/nn_descent.cuh @@ -143,16 +143,16 @@ struct dtype_traits<__half> { template <> struct dtype_traits { - static constexpr int APAD = 16; - static constexpr int BPAD = 16; + static constexpr int APAD = 4; // 1 byte padding to avoid bank conflict + static constexpr int BPAD = 4; static constexpr int TILE_COL_WIDTH = 128; static __device__ __forceinline__ float to_float(uint8_t v) { return static_cast(v); } }; template <> struct dtype_traits { - static constexpr int APAD = 16; - static constexpr int BPAD = 16; + static constexpr int APAD = 4; // 1 byte padding to avoid bank conflict + static constexpr int BPAD = 4; static constexpr int TILE_COL_WIDTH = 128; static __device__ __forceinline__ float to_float(int8_t v) { return static_cast(v); } }; From 157fe30f6d9130be8acdeadb83d96a0ef91462ea Mon Sep 17 00:00:00 2001 From: jinsolp Date: Fri, 20 Mar 2026 21:47:24 +0000 Subject: [PATCH 4/5] rm print and revert test --- cpp/src/neighbors/detail/nn_descent.cuh | 2 -- python/cuvs/cuvs/tests/test_nn_descent.py | 6 +++++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/cpp/src/neighbors/detail/nn_descent.cuh b/cpp/src/neighbors/detail/nn_descent.cuh index a62c68afd9..90fca202e4 100644 --- a/cpp/src/neighbors/detail/nn_descent.cuh +++ b/cpp/src/neighbors/detail/nn_descent.cuh @@ -1442,7 +1442,6 @@ void GNND::local_join(cudaStream_t stream, DistEpilogue_t dist_ auto launch_kernel = [&](auto* typed_ptr) { if (use_simt) { - std::cout << "using simt kernel" << std::endl; local_join_kernel_simt<<>>(graph_.h_graph_new.data_handle(), h_rev_graph_new_.data_handle(), d_list_sizes_new_.data_handle(), @@ -1460,7 +1459,6 @@ void 
GNND::local_join(cudaStream_t stream, DistEpilogue_t dist_ build_config_.metric, dist_epilogue); } else { - std::cout << "using wmma kernel" << std::endl; local_join_kernel_wmma<<>>(graph_.h_graph_new.data_handle(), h_rev_graph_new_.data_handle(), d_list_sizes_new_.data_handle(), diff --git a/python/cuvs/cuvs/tests/test_nn_descent.py b/python/cuvs/cuvs/tests/test_nn_descent.py index 93b39727d3..27403ab70b 100644 --- a/python/cuvs/cuvs/tests/test_nn_descent.py +++ b/python/cuvs/cuvs/tests/test_nn_descent.py @@ -88,4 +88,8 @@ def test_nn_descent_compress_to_fp16(n_cols, compress_to_fp16, dtype): _, bf_indices = brute_force.search(index, gpu_X, k=graph_degree) bf_indices = bf_indices.copy_to_host() - assert calc_recall(nnd_indices, bf_indices) > 0.9 + if n_cols <= 16 and compress_to_fp16 and dtype == np.float32: + # for small dim, if data is fp32 but compress_to_fp16 is True, the recall will be low + assert calc_recall(nnd_indices, bf_indices) < 0.7 + else: + assert calc_recall(nnd_indices, bf_indices) > 0.9 From e59e6c7c8dd472f083cdf643da7f3ad1b827a882 Mon Sep 17 00:00:00 2001 From: jinsolp Date: Wed, 8 Apr 2026 00:07:45 +0000 Subject: [PATCH 5/5] fix c abi breakages --- c/include/cuvs/neighbors/nn_descent.h | 138 +++++++++++++++++- c/src/neighbors/nn_descent.cpp | 128 ++++++++++++---- .../cuvs/neighbors/nn_descent/nn_descent.pxd | 38 +++-- .../cuvs/neighbors/nn_descent/nn_descent.pyx | 10 +- 4 files changed, 265 insertions(+), 49 deletions(-) diff --git a/c/include/cuvs/neighbors/nn_descent.h b/c/include/cuvs/neighbors/nn_descent.h index 1db0eea139..1c04cfe263 100644 --- a/c/include/cuvs/neighbors/nn_descent.h +++ b/c/include/cuvs/neighbors/nn_descent.h @@ -15,6 +15,23 @@ extern "C" { #endif +/** + * @brief Dtype to use for distance computation + * - `NND_DIST_COMP_AUTO`: Automatically determine the best dtype for distance computation based on + * the dataset dimensions. 
+ * - `NND_DIST_COMP_FP32`: Use fp32 distance computation for better precision at the cost of + * performance and memory usage. + * - `NND_DIST_COMP_FP16`: Use fp16 distance computation. + * + * @deprecated To be removed in 26.08. Use cuvsNNDescentIndexParams_v6 with compress_to_fp16 + * instead. + */ +typedef enum { + NND_DIST_COMP_AUTO = 0, + NND_DIST_COMP_FP32 = 1, + NND_DIST_COMP_FP16 = 2 +} cuvsNNDescentDistCompDtype; + /** * @defgroup nn_descent_c_index_params The nn-descent algorithm parameters. * @{ @@ -35,11 +52,50 @@ extern "C" { * the graph for. More iterations produce a better quality graph at cost of performance * `termination_threshold`: The delta at which nn-descent will terminate its iterations * `return_distances`: Boolean to decide whether to return distances array + * `dist_comp_dtype`: dtype to use for distance computation. Defaults to `NND_DIST_COMP_AUTO` which + * automatically determines the best dtype for distance computation based on the dataset dimensions. + * Use `NND_DIST_COMP_FP32` for better precision at the cost of performance and memory usage. This + * option is only valid when data type is fp32. Use `NND_DIST_COMP_FP16` for better performance and + * memory usage at the cost of precision. + * + * @deprecated To be removed in 26.08 and replaced by cuvsNNDescentIndexParams_v6. 
+ */
+struct cuvsNNDescentIndexParams {
+  cuvsDistanceType metric;
+  float metric_arg;
+  size_t graph_degree;
+  size_t intermediate_graph_degree;
+  size_t max_iterations;
+  float termination_threshold;
+  bool return_distances;
+  cuvsNNDescentDistCompDtype dist_comp_dtype;
+};
+
+typedef struct cuvsNNDescentIndexParams* cuvsNNDescentIndexParams_t;
+
+/**
+ * @brief Parameters used to build an nn-descent index (v6)
+ *
+ * `metric`: The distance metric to use
+ * `metric_arg`: The argument used by distance metrics like Minkowski distance
+ * `graph_degree`: For an input dataset of dimensions (N, D),
+ * determines the final dimensions of the all-neighbors knn graph
+ * which turns out to be of dimensions (N, graph_degree)
+ * `intermediate_graph_degree`: Internally, nn-descent builds an
+ * all-neighbors knn graph of dimensions (N, intermediate_graph_degree)
+ * before selecting the final `graph_degree` neighbors. It's recommended
+ * that `intermediate_graph_degree` >= 1.5 * graph_degree
+ * `max_iterations`: The number of iterations that nn-descent will refine
+ * the graph for. More iterations produce a better quality graph at cost of performance
+ * `termination_threshold`: The delta at which nn-descent will terminate its iterations
+ * `return_distances`: Boolean to decide whether to return distances array
  * `compress_to_fp16`: When true and the input data is fp32, distance computation is done in
  * fp16 for better performance and lower memory usage at the cost of precision. Has no effect on
  * non-fp32 input types.
+ * + * @since 26.06 */ -struct cuvsNNDescentIndexParams { +struct cuvsNNDescentIndexParams_v6 { cuvsDistanceType metric; float metric_arg; size_t graph_degree; @@ -50,23 +106,47 @@ struct cuvsNNDescentIndexParams { bool compress_to_fp16; }; -typedef struct cuvsNNDescentIndexParams* cuvsNNDescentIndexParams_t; +typedef struct cuvsNNDescentIndexParams_v6* cuvsNNDescentIndexParams_v6_t; /** * @brief Allocate NN-Descent Index params, and populate with default values * + * @deprecated To be removed in 26.08 and replaced by cuvsNNDescentIndexParamsCreate_v6. + * * @param[in] index_params cuvsNNDescentIndexParams_t to allocate * @return cuvsError_t */ cuvsError_t cuvsNNDescentIndexParamsCreate(cuvsNNDescentIndexParams_t* index_params); +/** + * @brief Allocate NN-Descent Index params (v6), and populate with default values + * + * @since 26.06 + * + * @param[in] index_params cuvsNNDescentIndexParams_v6_t to allocate + * @return cuvsError_t + */ +cuvsError_t cuvsNNDescentIndexParamsCreate_v6(cuvsNNDescentIndexParams_v6_t* index_params); + /** * @brief De-allocate NN-Descent Index params * + * @deprecated To be removed in 26.08 and replaced by cuvsNNDescentIndexParamsDestroy_v6. + * * @param[in] index_params * @return cuvsError_t */ cuvsError_t cuvsNNDescentIndexParamsDestroy(cuvsNNDescentIndexParams_t index_params); + +/** + * @brief De-allocate NN-Descent Index params (v6) + * + * @since 26.06 + * + * @param[in] index_params + * @return cuvsError_t + */ +cuvsError_t cuvsNNDescentIndexParamsDestroy_v6(cuvsNNDescentIndexParams_v6_t index_params); /** * @} */ @@ -145,6 +225,8 @@ cuvsError_t cuvsNNDescentIndexDestroy(cuvsNNDescentIndex_t index); * cuvsError_t res_destroy_status = cuvsResourcesDestroy(res); * @endcode * + * @deprecated To be removed in 26.08 and replaced by cuvsNNDescentBuild_v6. 
+ * * @param[in] res cuvsResources_t opaque C handle * @param[in] index_params cuvsNNDescentIndexParams_t used to build NN-Descent index * @param[in] dataset DLManagedTensor* training dataset on host or device memory @@ -157,6 +239,58 @@ cuvsError_t cuvsNNDescentBuild(cuvsResources_t res, DLManagedTensor* dataset, DLManagedTensor* graph, cuvsNNDescentIndex_t index); + +/** + * @brief Build a NN-Descent index (v6) with a `DLManagedTensor` which has underlying + * `DLDeviceType` equal to `kDLCUDA`, `kDLCUDAHost`, `kDLCUDAManaged`, + * or `kDLCPU`. Also, acceptable underlying types are: + * 1. `kDLDataType.code == kDLFloat` and `kDLDataType.bits = 32` + * 2. `kDLDataType.code == kDLFloat` and `kDLDataType.bits = 16` + * 3. `kDLDataType.code == kDLInt` and `kDLDataType.bits = 8` + * 4. `kDLDataType.code == kDLUInt` and `kDLDataType.bits = 8` + * + * @code {.c} + * #include + * #include + * + * // Create cuvsResources_t + * cuvsResources_t res; + * cuvsError_t res_create_status = cuvsResourcesCreate(&res); + * + * // Assume a populated `DLManagedTensor` type here + * DLManagedTensor dataset; + * + * // Create default index params + * cuvsNNDescentIndexParams_v6_t index_params; + * cuvsError_t params_create_status = cuvsNNDescentIndexParamsCreate_v6(&index_params); + * + * // Create NN-Descent index + * cuvsNNDescentIndex_t index; + * cuvsError_t index_create_status = cuvsNNDescentIndexCreate(&index); + * + * // Build the NN-Descent Index + * cuvsError_t build_status = cuvsNNDescentBuild_v6(res, index_params, &dataset, NULL, index); + * + * // de-allocate `index_params`, `index` and `res` + * cuvsError_t params_destroy_status = cuvsNNDescentIndexParamsDestroy_v6(index_params); + * cuvsError_t index_destroy_status = cuvsNNDescentIndexDestroy(index); + * cuvsError_t res_destroy_status = cuvsResourcesDestroy(res); + * @endcode + * + * @since 26.06 + * + * @param[in] res cuvsResources_t opaque C handle + * @param[in] index_params cuvsNNDescentIndexParams_v6_t used to build 
NN-Descent index + * @param[in] dataset DLManagedTensor* training dataset on host or device memory + * @param[inout] graph Optional preallocated graph on host memory to store output + * @param[out] index cuvsNNDescentIndex_t Newly built NN-Descent index + * @return cuvsError_t + */ +cuvsError_t cuvsNNDescentBuild_v6(cuvsResources_t res, + cuvsNNDescentIndexParams_v6_t index_params, + DLManagedTensor* dataset, + DLManagedTensor* graph, + cuvsNNDescentIndex_t index); /** * @} */ diff --git a/c/src/neighbors/nn_descent.cpp b/c/src/neighbors/nn_descent.cpp index d3753e925a..a704bcd3eb 100644 --- a/c/src/neighbors/nn_descent.cpp +++ b/c/src/neighbors/nn_descent.cpp @@ -28,23 +28,13 @@ namespace { template void* _build(cuvsResources_t res, - cuvsNNDescentIndexParams params, + cuvs::neighbors::nn_descent::index_params build_params, DLManagedTensor* dataset_tensor, DLManagedTensor* graph_tensor) { auto res_ptr = reinterpret_cast(res); auto dataset = dataset_tensor->dl_tensor; - auto build_params = cuvs::neighbors::nn_descent::index_params(); - build_params.metric = static_cast((int)params.metric), - build_params.metric_arg = params.metric_arg; - build_params.graph_degree = params.graph_degree; - build_params.intermediate_graph_degree = params.intermediate_graph_degree; - build_params.max_iterations = params.max_iterations; - build_params.termination_threshold = params.termination_threshold; - build_params.return_distances = params.return_distances; - build_params.compress_to_fp16 = params.compress_to_fp16; - using graph_type = raft::host_matrix_view; std::optional graph; if (graph_tensor != NULL) { graph = cuvs::core::from_dlpack(graph_tensor); } @@ -64,6 +54,35 @@ void* _build(cuvsResources_t res, } } +cuvs::neighbors::nn_descent::index_params convert_params(cuvsNNDescentIndexParams const& params) +{ + auto build_params = cuvs::neighbors::nn_descent::index_params(); + build_params.metric = static_cast((int)params.metric); + build_params.metric_arg = params.metric_arg; + 
build_params.graph_degree = params.graph_degree; + build_params.intermediate_graph_degree = params.intermediate_graph_degree; + build_params.max_iterations = params.max_iterations; + build_params.termination_threshold = params.termination_threshold; + build_params.return_distances = params.return_distances; + build_params.compress_to_fp16 = (params.dist_comp_dtype == NND_DIST_COMP_FP16); + return build_params; +} + +cuvs::neighbors::nn_descent::index_params convert_params_v6( + cuvsNNDescentIndexParams_v6 const& params) +{ + auto build_params = cuvs::neighbors::nn_descent::index_params(); + build_params.metric = static_cast((int)params.metric); + build_params.metric_arg = params.metric_arg; + build_params.graph_degree = params.graph_degree; + build_params.intermediate_graph_degree = params.intermediate_graph_degree; + build_params.max_iterations = params.max_iterations; + build_params.termination_threshold = params.termination_threshold; + build_params.return_distances = params.return_distances; + build_params.compress_to_fp16 = params.compress_to_fp16; + return build_params; +} + template void _get_graph(cuvsResources_t res, cuvsNNDescentIndex_t index, DLManagedTensor* graph) { @@ -113,6 +132,37 @@ void _get_distances(cuvsResources_t res, cuvsNNDescentIndex_t index, DLManagedTe RAFT_FAIL("Unsupported nn-descent index dtype: %d and bits: %d", dtype.code, dtype.bits); } } + +cuvsError_t _nn_descent_build(cuvsResources_t res, + DLManagedTensor* dataset_tensor, + DLManagedTensor* graph_tensor, + cuvsNNDescentIndex_t index, + cuvs::neighbors::nn_descent::index_params build_params) +{ + return cuvs::core::translate_exceptions([=] { + index->dtype.code = kDLUInt; + index->dtype.bits = 32; + + auto dtype = dataset_tensor->dl_tensor.dtype; + + if ((dtype.code == kDLFloat) && (dtype.bits == 32)) { + index->addr = reinterpret_cast( + _build(res, build_params, dataset_tensor, graph_tensor)); + } else if ((dtype.code == kDLFloat) && (dtype.bits == 16)) { + index->addr = 
reinterpret_cast( + _build(res, build_params, dataset_tensor, graph_tensor)); + } else if ((dtype.code == kDLInt) && (dtype.bits == 8)) { + index->addr = reinterpret_cast( + _build(res, build_params, dataset_tensor, graph_tensor)); + } else if ((dtype.code == kDLUInt) && (dtype.bits == 8)) { + index->addr = reinterpret_cast( + _build(res, build_params, dataset_tensor, graph_tensor)); + } else { + RAFT_FAIL("Unsupported nn-descent dataset dtype: %d and bits: %d", dtype.code, dtype.bits); + } + }); +} + } // namespace extern "C" cuvsError_t cuvsNNDescentIndexCreate(cuvsNNDescentIndex_t* index) @@ -141,28 +191,18 @@ extern "C" cuvsError_t cuvsNNDescentBuild(cuvsResources_t res, DLManagedTensor* graph_tensor, cuvsNNDescentIndex_t index) { - return cuvs::core::translate_exceptions([=] { - index->dtype.code = kDLUInt; - index->dtype.bits = 32; - - auto dtype = dataset_tensor->dl_tensor.dtype; + auto build_params = convert_params(*params); + return _nn_descent_build(res, dataset_tensor, graph_tensor, index, build_params); +} - if ((dtype.code == kDLFloat) && (dtype.bits == 32)) { - index->addr = reinterpret_cast( - _build(res, *params, dataset_tensor, graph_tensor)); - } else if ((dtype.code == kDLFloat) && (dtype.bits == 16)) { - index->addr = reinterpret_cast( - _build(res, *params, dataset_tensor, graph_tensor)); - } else if ((dtype.code == kDLInt) && (dtype.bits == 8)) { - index->addr = reinterpret_cast( - _build(res, *params, dataset_tensor, graph_tensor)); - } else if ((dtype.code == kDLUInt) && (dtype.bits == 8)) { - index->addr = reinterpret_cast( - _build(res, *params, dataset_tensor, graph_tensor)); - } else { - RAFT_FAIL("Unsupported nn-descent dataset dtype: %d and bits: %d", dtype.code, dtype.bits); - } - }); +extern "C" cuvsError_t cuvsNNDescentBuild_v6(cuvsResources_t res, + cuvsNNDescentIndexParams_v6_t params, + DLManagedTensor* dataset_tensor, + DLManagedTensor* graph_tensor, + cuvsNNDescentIndex_t index) +{ + auto build_params = 
convert_params_v6(*params); + return _nn_descent_build(res, dataset_tensor, graph_tensor, index, build_params); } extern "C" cuvsError_t cuvsNNDescentIndexParamsCreate(cuvsNNDescentIndexParams_t* params) @@ -172,6 +212,25 @@ extern "C" cuvsError_t cuvsNNDescentIndexParamsCreate(cuvsNNDescentIndexParams_t cuvs::neighbors::nn_descent::index_params cpp_params; *params = new cuvsNNDescentIndexParams{ + .metric = static_cast((int)cpp_params.metric), + .metric_arg = cpp_params.metric_arg, + .graph_degree = cpp_params.graph_degree, + .intermediate_graph_degree = cpp_params.intermediate_graph_degree, + .max_iterations = cpp_params.max_iterations, + .termination_threshold = cpp_params.termination_threshold, + .return_distances = cpp_params.return_distances, + .dist_comp_dtype = cpp_params.compress_to_fp16 ? NND_DIST_COMP_FP16 + : NND_DIST_COMP_AUTO}; + }); +} + +extern "C" cuvsError_t cuvsNNDescentIndexParamsCreate_v6(cuvsNNDescentIndexParams_v6_t* params) +{ + return cuvs::core::translate_exceptions([=] { + // get defaults from cpp parameters struct + cuvs::neighbors::nn_descent::index_params cpp_params; + + *params = new cuvsNNDescentIndexParams_v6{ .metric = static_cast((int)cpp_params.metric), .metric_arg = cpp_params.metric_arg, .graph_degree = cpp_params.graph_degree, @@ -188,6 +247,11 @@ extern "C" cuvsError_t cuvsNNDescentIndexParamsDestroy(cuvsNNDescentIndexParams_ return cuvs::core::translate_exceptions([=] { delete params; }); } +extern "C" cuvsError_t cuvsNNDescentIndexParamsDestroy_v6(cuvsNNDescentIndexParams_v6_t params) +{ + return cuvs::core::translate_exceptions([=] { delete params; }); +} + extern "C" cuvsError_t cuvsNNDescentIndexGetGraph(cuvsResources_t res, cuvsNNDescentIndex_t index, DLManagedTensor* graph) diff --git a/python/cuvs/cuvs/neighbors/nn_descent/nn_descent.pxd b/python/cuvs/cuvs/neighbors/nn_descent/nn_descent.pxd index 8d26921784..aece64be8f 100644 --- a/python/cuvs/cuvs/neighbors/nn_descent/nn_descent.pxd +++ 
b/python/cuvs/cuvs/neighbors/nn_descent/nn_descent.pxd @@ -13,6 +13,12 @@ from cuvs.distance_type cimport cuvsDistanceType cdef extern from "cuvs/neighbors/nn_descent.h" nogil: + # Deprecated — to be removed in 26.08 and replaced by cuvsNNDescentIndexParams_v6. + ctypedef enum cuvsNNDescentDistCompDtype: + NND_DIST_COMP_AUTO + NND_DIST_COMP_FP32 + NND_DIST_COMP_FP16 + ctypedef struct cuvsNNDescentIndexParams: cuvsDistanceType metric float metric_arg @@ -21,21 +27,33 @@ cdef extern from "cuvs/neighbors/nn_descent.h" nogil: size_t max_iterations float termination_threshold bool return_distances - bool compress_to_fp16 + cuvsNNDescentDistCompDtype dist_comp_dtype ctypedef cuvsNNDescentIndexParams* cuvsNNDescentIndexParams_t + ctypedef struct cuvsNNDescentIndexParams_v6: + cuvsDistanceType metric + float metric_arg + size_t graph_degree + size_t intermediate_graph_degree + size_t max_iterations + float termination_threshold + bool return_distances + bool compress_to_fp16 + + ctypedef cuvsNNDescentIndexParams_v6* cuvsNNDescentIndexParams_v6_t + ctypedef struct cuvsNNDescentIndex: uintptr_t addr DLDataType dtype ctypedef cuvsNNDescentIndex* cuvsNNDescentIndex_t - cuvsError_t cuvsNNDescentIndexParamsCreate( - cuvsNNDescentIndexParams_t* params) + cuvsError_t cuvsNNDescentIndexParamsCreate_v6( + cuvsNNDescentIndexParams_v6_t* params) - cuvsError_t cuvsNNDescentIndexParamsDestroy( - cuvsNNDescentIndexParams_t index) + cuvsError_t cuvsNNDescentIndexParamsDestroy_v6( + cuvsNNDescentIndexParams_v6_t index) cuvsError_t cuvsNNDescentIndexCreate(cuvsNNDescentIndex_t* index) @@ -49,8 +67,8 @@ cdef extern from "cuvs/neighbors/nn_descent.h" nogil: cuvsNNDescentIndex_t index, DLManagedTensor * output) - cuvsError_t cuvsNNDescentBuild(cuvsResources_t res, - cuvsNNDescentIndexParams* params, - DLManagedTensor* dataset, - DLManagedTensor* graph, - cuvsNNDescentIndex_t index) except + + cuvsError_t cuvsNNDescentBuild_v6(cuvsResources_t res, + cuvsNNDescentIndexParams_v6* params, + 
DLManagedTensor* dataset, + DLManagedTensor* graph, + cuvsNNDescentIndex_t index) except + diff --git a/python/cuvs/cuvs/neighbors/nn_descent/nn_descent.pyx b/python/cuvs/cuvs/neighbors/nn_descent/nn_descent.pyx index 658a2ff983..2df9fb9699 100644 --- a/python/cuvs/cuvs/neighbors/nn_descent/nn_descent.pyx +++ b/python/cuvs/cuvs/neighbors/nn_descent/nn_descent.pyx @@ -69,14 +69,14 @@ cdef class IndexParams: precision. Has no effect on non-fp32 input types. """ - cdef cuvsNNDescentIndexParams* params + cdef cuvsNNDescentIndexParams_v6* params cdef object _metric def __cinit__(self): - cuvsNNDescentIndexParamsCreate(&self.params) + cuvsNNDescentIndexParamsCreate_v6(&self.params) def __dealloc__(self): - check_cuvs(cuvsNNDescentIndexParamsDestroy(self.params)) + check_cuvs(cuvsNNDescentIndexParamsDestroy_v6(self.params)) def __init__(self, *, metric=None, @@ -236,7 +236,7 @@ def build(IndexParams index_params, dataset, graph=None, resources=None): cdef Index idx = Index() cdef cydlpack.DLManagedTensor* dataset_dlpack = \ cydlpack.dlpack_c(dataset_ai) - cdef cuvsNNDescentIndexParams* params = index_params.params + cdef cuvsNNDescentIndexParams_v6* params = index_params.params cdef cuvsResources_t res = resources.get_c_obj() @@ -256,7 +256,7 @@ def build(IndexParams index_params, dataset, graph=None, resources=None): graph_dlpack = cydlpack.dlpack_c(graph_ai) with cuda_interruptible(): - check_cuvs(cuvsNNDescentBuild( + check_cuvs(cuvsNNDescentBuild_v6( res, params, dataset_dlpack,