From 666a688ab5c140a7c9435ecad717f32a0e3bdd47 Mon Sep 17 00:00:00 2001 From: jinsolp Date: Wed, 18 Mar 2026 01:41:49 +0000 Subject: [PATCH 1/5] keeping input data mem location --- c/include/cuvs/neighbors/nn_descent.h | 20 +- c/src/neighbors/nn_descent.cpp | 6 +- cpp/include/cuvs/neighbors/nn_descent.hpp | 20 +- cpp/src/neighbors/detail/nn_descent.cuh | 374 ++++++++++-------- cpp/src/neighbors/detail/nn_descent_gnnd.hpp | 15 +- .../cuvs/neighbors/nn_descent/nn_descent.pxd | 9 +- .../cuvs/neighbors/nn_descent/nn_descent.pyx | 25 +- python/cuvs/cuvs/tests/test_nn_descent.py | 14 +- 8 files changed, 251 insertions(+), 232 deletions(-) diff --git a/c/include/cuvs/neighbors/nn_descent.h b/c/include/cuvs/neighbors/nn_descent.h index 0c7102e3e1..1db0eea139 100644 --- a/c/include/cuvs/neighbors/nn_descent.h +++ b/c/include/cuvs/neighbors/nn_descent.h @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ @@ -15,18 +15,6 @@ extern "C" { #endif -/** - * @brief Dtype to use for distance computation - * - `NND_DIST_COMP_AUTO`: Automatically determine the best dtype for distance computation based on the dataset dimensions. - * - `NND_DIST_COMP_FP32`: Use fp32 distance computation for better precision at the cost of performance and memory usage. - * - `NND_DIST_COMP_FP16`: Use fp16 distance computation. - */ -typedef enum { - NND_DIST_COMP_AUTO = 0, - NND_DIST_COMP_FP32 = 1, - NND_DIST_COMP_FP16 = 2 -} cuvsNNDescentDistCompDtype; - /** * @defgroup nn_descent_c_index_params The nn-descent algorithm parameters. * @{ @@ -47,7 +35,9 @@ typedef enum { * the graph for. 
More iterations produce a better quality graph at cost of performance * `termination_threshold`: The delta at which nn-descent will terminate its iterations * `return_distances`: Boolean to decide whether to return distances array - * `dist_comp_dtype`: dtype to use for distance computation. Defaults to `NND_DIST_COMP_AUTO` which automatically determines the best dtype for distance computation based on the dataset dimensions. Use `NND_DIST_COMP_FP32` for better precision at the cost of performance and memory usage. This option is only valid when data type is fp32. Use `NND_DIST_COMP_FP16` for better performance and memory usage at the cost of precision. + * `compress_to_fp16`: When true and the input data is fp32, distance computation is done in + * fp16 for better performance and lower memory usage at the cost of precision. Has no effect on + * non-fp32 input types. */ struct cuvsNNDescentIndexParams { cuvsDistanceType metric; @@ -57,7 +47,7 @@ struct cuvsNNDescentIndexParams { size_t max_iterations; float termination_threshold; bool return_distances; - cuvsNNDescentDistCompDtype dist_comp_dtype; + bool compress_to_fp16; }; typedef struct cuvsNNDescentIndexParams* cuvsNNDescentIndexParams_t; diff --git a/c/src/neighbors/nn_descent.cpp b/c/src/neighbors/nn_descent.cpp index 708056144a..d3753e925a 100644 --- a/c/src/neighbors/nn_descent.cpp +++ b/c/src/neighbors/nn_descent.cpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. 
* SPDX-License-Identifier: Apache-2.0 */ @@ -43,7 +43,7 @@ void* _build(cuvsResources_t res, build_params.max_iterations = params.max_iterations; build_params.termination_threshold = params.termination_threshold; build_params.return_distances = params.return_distances; - build_params.dist_comp_dtype = static_cast(static_cast(params.dist_comp_dtype)); + build_params.compress_to_fp16 = params.compress_to_fp16; using graph_type = raft::host_matrix_view; std::optional graph; @@ -179,7 +179,7 @@ extern "C" cuvsError_t cuvsNNDescentIndexParamsCreate(cuvsNNDescentIndexParams_t .max_iterations = cpp_params.max_iterations, .termination_threshold = cpp_params.termination_threshold, .return_distances = cpp_params.return_distances, - .dist_comp_dtype = static_cast(static_cast(cpp_params.dist_comp_dtype))}; + .compress_to_fp16 = cpp_params.compress_to_fp16}; }); } diff --git a/cpp/include/cuvs/neighbors/nn_descent.hpp b/cpp/include/cuvs/neighbors/nn_descent.hpp index 8e94edfaa5..dc84e398c7 100644 --- a/cpp/include/cuvs/neighbors/nn_descent.hpp +++ b/cpp/include/cuvs/neighbors/nn_descent.hpp @@ -24,16 +24,6 @@ namespace cuvs::neighbors::nn_descent { * @{ */ -/** - * @brief Dtype to use for distance computation - * - `AUTO`: Automatically determine the best dtype for distance computation based on the dataset - * dimensions. - * - `FP32`: Use fp32 distance computation for better precision at the cost of performance and - * memory usage. - * - `FP16`: Use fp16 distance computation. - */ -enum class DIST_COMP_DTYPE { AUTO = 0, FP32 = 1, FP16 = 2 }; - /** * @brief Parameters used to build an nn-descent index * - `graph_degree`: For an input dataset of dimensions (N, D), @@ -47,11 +37,9 @@ enum class DIST_COMP_DTYPE { AUTO = 0, FP32 = 1, FP16 = 2 }; * the graph for. 
More iterations produce a better quality graph at cost of performance * - `termination_threshold`: The delta at which nn-descent will terminate its iterations * - `return_distances`: Boolean to decide whether to return distances array - * - `dist_comp_dtype`: dtype to use for distance computation. Defaults to `AUTO` which - * automatically determines the best dtype for distance computation based on the dataset dimensions. - * Use `FP32` for better precision at the cost of performance and memory usage. This option is only - * valid when data type is fp32. Use `FP16` for better performance and memory usage at the cost of - * precision. + * - `compress_to_fp16`: When true and the input data is fp32, distance computation is done in + * fp16 for better performance and lower memory usage at the cost of precision. Has no effect on + * non-fp32 input types (fp16, int8, uint8) which always use fp16 distance computation. */ struct index_params : cuvs::neighbors::index_params { size_t graph_degree = 64; @@ -59,7 +47,7 @@ struct index_params : cuvs::neighbors::index_params { size_t max_iterations = 20; float termination_threshold = 0.0001; bool return_distances = true; - DIST_COMP_DTYPE dist_comp_dtype = DIST_COMP_DTYPE::AUTO; + bool compress_to_fp16 = false; /** @brief Construct NN descent parameters for a specific kNN graph degree * diff --git a/cpp/src/neighbors/detail/nn_descent.cuh b/cpp/src/neighbors/detail/nn_descent.cuh index 2b6e5459f2..67b165dc47 100644 --- a/cpp/src/neighbors/detail/nn_descent.cuh +++ b/cpp/src/neighbors/detail/nn_descent.cuh @@ -141,6 +141,22 @@ struct dtype_traits<__half> { static __device__ __forceinline__ float to_float(__half v) { return __half2float(v); } }; +template <> +struct dtype_traits { + static constexpr int APAD = 16; + static constexpr int BPAD = 16; + static constexpr int TILE_COL_WIDTH = 128; + static __device__ __forceinline__ float to_float(uint8_t v) { return static_cast(v); } +}; + +template <> +struct dtype_traits { + static 
constexpr int APAD = 16; + static constexpr int BPAD = 16; + static constexpr int TILE_COL_WIDTH = 128; + static __device__ __forceinline__ float to_float(int8_t v) { return static_cast(v); } +}; + template __device__ __forceinline__ ResultItem xor_swap(ResultItem x, int mask, int dir) { @@ -244,69 +260,64 @@ __device__ __forceinline__ void load_vec(Data_t* vec_buffer, } } -// TODO: Replace with RAFT utilities https://github.com/rapidsai/raft/issues/1827 -/** Calculate L2 norm, and cast data to Output_t */ -template -RAFT_KERNEL preprocess_data_kernel( - const Data_t* input_data, - Output_t* output_data, - int dim, - DistData_t* l2_norms, - size_t list_offset = 0, - cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2Expanded) +/** Converting load: loads Data_t from global memory into __half shared memory buffer. */ +template + requires(!std::is_same_v) +__device__ __forceinline__ void load_vec(__half* vec_buffer, + const Data_t* d_vec, + const int load_dims, + const int padding_dims, + const int lane_id) +{ + constexpr int num_load_elems_per_warp = raft::warp_size(); + __half half_0 = __float2half(0.0f); + for (int step = 0; step < raft::ceildiv(padding_dims, num_load_elems_per_warp); step++) { + int idx = step * num_load_elems_per_warp + lane_id; + if (idx < load_dims) { + vec_buffer[idx] = __float2half(static_cast(d_vec[idx])); + } else if (idx < padding_dims) { + vec_buffer[idx] = half_0; + } + } +} + +/** One warp per block. Computes squared L2 norm for each row. 
*/ +template +RAFT_KERNEL compute_l2_norms_kernel(const Data_t* data, int dim, DistData_t* l2_norms) { extern __shared__ char buffer[]; __shared__ float l2_norm; Data_t* s_vec = (Data_t*)buffer; - size_t list_id = list_offset + blockIdx.x; + size_t list_id = blockIdx.x; + int lane_id = threadIdx.x % raft::warp_size(); - load_vec(s_vec, - input_data + static_cast(blockIdx.x) * dim, - dim, - dim, - threadIdx.x % raft::warp_size()); + load_vec(s_vec, data + static_cast(blockIdx.x) * dim, dim, dim, lane_id); if (threadIdx.x == 0) { l2_norm = 0; } __syncthreads(); - if (metric == cuvs::distance::DistanceType::L2Expanded || - metric == cuvs::distance::DistanceType::L2SqrtExpanded || - metric == cuvs::distance::DistanceType::CosineExpanded) { - int lane_id = threadIdx.x % raft::warp_size(); - for (int step = 0; step < raft::ceildiv(dim, raft::warp_size()); step++) { - int idx = step * raft::warp_size() + lane_id; - float part_dist = 0; - if (idx < dim) { - part_dist = s_vec[idx]; - part_dist = part_dist * part_dist; - } - __syncwarp(); - for (int offset = raft::warp_size() >> 1; offset >= 1; offset >>= 1) { - part_dist += __shfl_down_sync(raft::warp_full_mask(), part_dist, offset); - } - if (lane_id == 0) { l2_norm += part_dist; } - __syncwarp(); - } - } - for (int step = 0; step < raft::ceildiv(dim, raft::warp_size()); step++) { - int idx = step * raft::warp_size() + threadIdx.x; + int idx = step * raft::warp_size() + lane_id; + float part_dist = 0; if (idx < dim) { - if (metric == cuvs::distance::DistanceType::InnerProduct || - metric == cuvs::distance::DistanceType::L1) { - output_data[list_id * dim + idx] = input_data[(size_t)blockIdx.x * dim + idx]; - } else if (metric == cuvs::distance::DistanceType::CosineExpanded) { - output_data[list_id * dim + idx] = - (float)input_data[(size_t)blockIdx.x * dim + idx] / sqrt(l2_norm); - } else if (metric == cuvs::distance::DistanceType::BitwiseHamming) { - int idx_for_byte = list_id * dim + idx; // uint8 or int8 data - uint8_t* 
output_bytes = reinterpret_cast(output_data); - output_bytes[idx_for_byte] = input_data[(size_t)blockIdx.x * dim + idx]; - } else { // L2Expanded or L2SqrtExpanded - output_data[list_id * dim + idx] = input_data[(size_t)blockIdx.x * dim + idx]; - if (idx == 0) { l2_norms[list_id] = l2_norm; } - } + part_dist = static_cast(s_vec[idx]); + part_dist = part_dist * part_dist; + } + __syncwarp(); + for (int offset = raft::warp_size() >> 1; offset >= 1; offset >>= 1) { + part_dist += __shfl_down_sync(raft::warp_full_mask(), part_dist, offset); } + if (lane_id == 0) { l2_norm += part_dist; } + __syncwarp(); } + + if (lane_id == 0) { l2_norms[list_id] = l2_norm; } +} + +template +RAFT_KERNEL convert_copy_kernel(const Src_t* src, Dst_t* dst, size_t n) +{ + size_t idx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (idx < n) { dst[idx] = static_cast(src[idx]); } } template @@ -527,7 +538,9 @@ __device__ __forceinline__ void calculate_metric(float* s_distances, if (metric == cuvs::distance::DistanceType::InnerProduct && can_postprocess_dist) { s_distances[i] = -s_distances[i]; } else if (metric == cuvs::distance::DistanceType::CosineExpanded) { - s_distances[i] = 1.0 - s_distances[i]; + float norm_product = l2_norms[row_neighbors[row_id]] * l2_norms[col_neighbors[col_id]]; + s_distances[i] = + (norm_product > 0.0f) ? (1.0f - s_distances[i] / sqrtf(norm_product)) : 0.0f; } else if (metric == cuvs::distance::DistanceType::BitwiseHamming) { s_distances[i] = 0.0; int n1 = row_neighbors[row_id]; @@ -573,12 +586,12 @@ struct DistAccumulator { // For architectures 750 and 860 (890), the values for MAX_RESIDENT_THREAD_PER_SM // is 1024 and 1536 respectively, which means the bounds don't work anymore // SIMT kernel: scalar element-wise distance computation. -// Used for fp32 data (all metrics) and fp16 data with L1 distance (which cannot use tensor cores). +// Used for fp32 data (all metrics) and L1 distance computation for all dtypes (which cannot use +// tensor cores). 
template , typename DistEpilogue_t> - requires(std::is_same_v || std::is_same_v) RAFT_KERNEL #ifdef __CUDA_ARCH__ // Use minBlocksPerMultiprocessor = 4 on specific arches @@ -689,6 +702,7 @@ __launch_bounds__(BLOCK_SIZE) if (idx < list_new_size) { size_t neighbor_id = new_neighbors[idx]; size_t idx_in_data = neighbor_id * data_dim; + // loaded to shared memory while keeping the original dtype load_vec(s_nv[idx], data + idx_in_data + step * TILE_COL_WIDTH, num_load_elems, @@ -706,6 +720,7 @@ __launch_bounds__(BLOCK_SIZE) if (tmp_row < list_new_size && tmp_col < list_new_size) { float acc = 0.0f; for (int d = 0; d < num_load_elems; d++) { + // converted to float for distance computation float a = dtype_traits::to_float(s_nv[tmp_row][d]); float b = dtype_traits::to_float(s_nv[tmp_col][d]); acc += dist_acc(a, b); @@ -844,7 +859,11 @@ __launch_bounds__(BLOCK_SIZE) // MAX_RESIDENT_THREAD_PER_SM = BLOCK_SIZE * BLOCKS_PER_SM = 2048 // For architectures 750 and 860 (890), the values for MAX_RESIDENT_THREAD_PER_SM // is 1024 and 1536 respectively, which means the bounds don't work anymore -template , typename DistEpilogue_t> +// Used for fp32 data compressed to fp16, and all types using non-L1 distance metric. 
+template , + typename DistEpilogue_t> RAFT_KERNEL #ifdef __CUDA_ARCH__ // Use minBlocksPerMultiprocessor = 4 on specific arches @@ -862,7 +881,7 @@ __launch_bounds__(BLOCK_SIZE) const Index_t* rev_graph_old, const int2* sizes_old, const int width, - const __half* data, + const Data_t* data, const int data_dim, ID_t* graph, DistData_t* dists, @@ -958,6 +977,7 @@ __launch_bounds__(BLOCK_SIZE) if (idx < list_new_size) { size_t neighbor_id = new_neighbors[idx]; size_t idx_in_data = neighbor_id * data_dim; + // converted to fp16 on-the-fly while loading load_vec(s_nv[idx], data + idx_in_data + step * TILE_COL_WIDTH, num_load_elems, @@ -1353,20 +1373,12 @@ GNND::GNND(raft::resources const& res, const BuildConfig& build static_assert(NUM_SAMPLES <= 32); using input_t = typename std::remove_const::type; - if (std::is_same_v && - (build_config.dist_comp_dtype == cuvs::neighbors::nn_descent::DIST_COMP_DTYPE::FP32 || - (build_config.dist_comp_dtype == cuvs::neighbors::nn_descent::DIST_COMP_DTYPE::AUTO && - build_config.dataset_dim <= 16))) { - // use fp32 distance computation for better precision with smaller dimension - d_data_float_.emplace( - raft::make_device_matrix(res, nrow_, ndim_)); - } else { - d_data_half_.emplace(raft::make_device_matrix( - res, - nrow_, - build_config.metric == cuvs::distance::DistanceType::BitwiseHamming - ? (build_config.dataset_dim + 1) / 2 - : build_config.dataset_dim)); + if (build_config.compress_to_fp16 && build_config.dataset_dim <= 16 && + std::is_same_v) { + RAFT_LOG_WARN( + "Using fp16 for distance computation for data in fp32 with small dimensions (%zu) <= 16 may " + "result in low quality results. 
Consider setting compress_to_fp16 = false.", + build_config.dataset_dim); } raft::matrix::fill(res, dists_buffer_.view(), std::numeric_limits::max()); @@ -1376,7 +1388,11 @@ GNND::GNND(raft::resources const& res, const BuildConfig& build raft::matrix::fill(res, d_locks_.view(), 0); if (build_config.metric == cuvs::distance::DistanceType::L2Expanded || - build_config.metric == cuvs::distance::DistanceType::L2SqrtExpanded) { + build_config.metric == cuvs::distance::DistanceType::L2SqrtExpanded || + build_config.metric == cuvs::distance::DistanceType::CosineExpanded) { + // for device memory efficiency, we do not allocate a separate array for the data + // to normalize the data when using CosineExpanded metric. Instead, we use the l2_norms_ vector + // and compute inside the calculate_metric kernel. l2_norms_ = raft::make_device_vector(res, nrow_); } }; @@ -1414,62 +1430,62 @@ template void GNND::local_join(cudaStream_t stream, DistEpilogue_t dist_epilogue) { raft::matrix::fill(res, dists_buffer_.view(), std::numeric_limits::max()); - // Kernel dispatch logic: - // fp32 data -> SIMT (metric resolved at runtime inside the kernel) - // fp16 data + L1 distance -> SIMT (L1 needs element-wise ops, cannot use tensor cores) - // fp16 data + other metrics -> WMMA (tensor-core accelerated dot product) - if (d_data_float_.has_value()) { - local_join_kernel_simt<<>>(graph_.h_graph_new.data_handle(), - h_rev_graph_new_.data_handle(), - d_list_sizes_new_.data_handle(), - h_graph_old_.data_handle(), - h_rev_graph_old_.data_handle(), - d_list_sizes_old_.data_handle(), - NUM_SAMPLES, - d_data_float_->data_handle(), - ndim_, - graph_buffer_.data_handle(), - dists_buffer_.data_handle(), - DEGREE_ON_DEVICE, - d_locks_.data_handle(), - l2_norms_.data_handle(), - build_config_.metric, - dist_epilogue); - } else if (build_config_.metric == cuvs::distance::DistanceType::L1) { - local_join_kernel_simt<<>>(graph_.h_graph_new.data_handle(), - h_rev_graph_new_.data_handle(), - 
d_list_sizes_new_.data_handle(), - h_graph_old_.data_handle(), - h_rev_graph_old_.data_handle(), - d_list_sizes_old_.data_handle(), - NUM_SAMPLES, - d_data_half_.value().data_handle(), - ndim_, - graph_buffer_.data_handle(), - dists_buffer_.data_handle(), - DEGREE_ON_DEVICE, - d_locks_.data_handle(), - l2_norms_.data_handle(), - build_config_.metric, - dist_epilogue); + // fp32 data can have an effective type of fp32 OR fp16 (when compress_to_fp16 flag = True for + // wmma usage) Based on EFFECTIVE dtype: + // fp32 data || L1 distance -> SIMT: internally converted to fp32 for distance computation + // on-the-fly dypte <= fp16 && non-L1 metrics -> WMMA (tensor-core accelerated dot product): + // internally converted to fp16 for distance computation on-the-fly + + bool use_simt = (std::is_same_v && !build_config_.compress_to_fp16) || + build_config_.metric == cuvs::distance::DistanceType::L1; + + auto launch_kernel = [&](auto* typed_ptr) { + if (use_simt) { + std::cout << "using simt kernel" << std::endl; + local_join_kernel_simt<<>>(graph_.h_graph_new.data_handle(), + h_rev_graph_new_.data_handle(), + d_list_sizes_new_.data_handle(), + h_graph_old_.data_handle(), + h_rev_graph_old_.data_handle(), + d_list_sizes_old_.data_handle(), + NUM_SAMPLES, + typed_ptr, + ndim_, + graph_buffer_.data_handle(), + dists_buffer_.data_handle(), + DEGREE_ON_DEVICE, + d_locks_.data_handle(), + l2_norms_.data_handle(), + build_config_.metric, + dist_epilogue); + } else { + std::cout << "using wmma kernel" << std::endl; + local_join_kernel_wmma<<>>(graph_.h_graph_new.data_handle(), + h_rev_graph_new_.data_handle(), + d_list_sizes_new_.data_handle(), + h_graph_old_.data_handle(), + h_rev_graph_old_.data_handle(), + d_list_sizes_old_.data_handle(), + NUM_SAMPLES, + typed_ptr, + ndim_, + graph_buffer_.data_handle(), + dists_buffer_.data_handle(), + DEGREE_ON_DEVICE, + d_locks_.data_handle(), + l2_norms_.data_handle(), + build_config_.metric, + dist_epilogue); + } + }; + + if 
(d_data_half_.has_value()) { + // Host fp32 input compressed to fp16 via compress_to_fp16. + launch_kernel(static_cast(d_data_ptr_)); } else { - local_join_kernel_wmma<<>>(graph_.h_graph_new.data_handle(), - h_rev_graph_new_.data_handle(), - d_list_sizes_new_.data_handle(), - h_graph_old_.data_handle(), - h_rev_graph_old_.data_handle(), - d_list_sizes_old_.data_handle(), - NUM_SAMPLES, - d_data_half_.value().data_handle(), - ndim_, - graph_buffer_.data_handle(), - dists_buffer_.data_handle(), - DEGREE_ON_DEVICE, - d_locks_.data_handle(), - l2_norms_.data_handle(), - build_config_.metric, - dist_epilogue); + // Data stored as input_t: device data used directly, or host data copied as-is. + launch_kernel(static_cast(d_data_ptr_)); } } @@ -1497,45 +1513,81 @@ void GNND::build(Data_t* data, update_counter_ = 0; graph_.h_graph = (InternalID_t*)output_graph; - if (d_data_float_.has_value()) { - raft::matrix::fill(res, d_data_float_.value().view(), static_cast(0)); - } else { - raft::matrix::fill(res, d_data_half_.value().view(), static_cast(0)); - } + d_data_ptr_ = nullptr; cudaPointerAttributes data_ptr_attr; RAFT_CUDA_TRY(cudaPointerGetAttributes(&data_ptr_attr, data)); - size_t batch_size = (data_ptr_attr.devicePointer == nullptr) ? 
100000 : nrow_; - - cuvs::spatial::knn::detail::utils::batch_load_iterator vec_batches{ - data, static_cast(nrow_), build_config_.dataset_dim, batch_size, stream}; - for (auto const& batch : vec_batches) { - if (d_data_float_.has_value()) { - preprocess_data_kernel<<(raft::warp_size())) * - raft::warp_size(), - stream>>>(batch.data(), - d_data_float_.value().data_handle(), - build_config_.dataset_dim, - l2_norms_.data_handle(), - batch.offset(), - build_config_.metric); + bool data_on_device = (data_ptr_attr.type == cudaMemoryTypeDevice); + + bool needs_l2_norms = build_config_.metric == cuvs::distance::DistanceType::L2Expanded || + build_config_.metric == cuvs::distance::DistanceType::L2SqrtExpanded || + build_config_.metric == cuvs::distance::DistanceType::CosineExpanded; + + bool compress_host_data = + !data_on_device && std::is_same_v && build_config_.compress_to_fp16; + + if (data_on_device) { + // When user-given data is on device, we use it directly. This can be any type (fp32, fp16, + // int8, uint8) + d_data_ptr_ = data; + std::cout << "data is on device. using user-given data directly" << std::endl; + } else if (compress_host_data) { + // When user-given data is fp32 host data, and compress_to_fp16 is true, we allocate fp16 buffer + // to copy the data. This allows the wmma kernel to be used for distance computation instead of + // simt kernel. 
+ if (!d_data_half_.has_value()) { + d_data_half_.emplace(raft::make_device_matrix( + res, build_config_.max_dataset_size, build_config_.dataset_dim)); + } + size_t batch_size = 100000; + cuvs::spatial::knn::detail::utils::batch_load_iterator vec_batches{ + data, static_cast(nrow_), build_config_.dataset_dim, batch_size, stream}; + constexpr int TPB = 256; + for (auto const& batch : vec_batches) { + size_t n_elems = batch.size() * build_config_.dataset_dim; + int num_blocks = raft::ceildiv(n_elems, static_cast(TPB)); + size_t dst_offset = batch.offset() * build_config_.dataset_dim; + convert_copy_kernel<<>>( + batch.data(), d_data_half_.value().data_handle() + dst_offset, n_elems); + } + d_data_ptr_ = d_data_half_.value().data_handle(); + std::cout << "data is on host and fp32 and user decided to compress" << std::endl; + } else { + // In other cases where user-given data is not device-accessible, we allocate a device buffer to + // copy the data. The input type is kept as-is (fp32, fp16, int8, uint8). 
+ if (!d_data_direct_.has_value()) { + d_data_direct_.emplace(raft::make_device_matrix( + res, build_config_.max_dataset_size, build_config_.dataset_dim)); + } + raft::copy(d_data_direct_.value().data_handle(), + data, + static_cast(nrow_) * build_config_.dataset_dim, + stream); + d_data_ptr_ = d_data_direct_.value().data_handle(); + std::cout << "keeping data dtype and copying to device" << std::endl; + } + + if (needs_l2_norms) { + if (d_data_half_.has_value()) { + compute_l2_norms_kernel<<< + nrow_, + raft::warp_size(), + sizeof(half) * ceildiv(build_config_.dataset_dim, static_cast(raft::warp_size())) * + raft::warp_size(), + stream>>>( + static_cast(d_data_ptr_), build_config_.dataset_dim, l2_norms_.data_handle()); + raft::resource::sync_stream(res); } else { - preprocess_data_kernel<<(raft::warp_size())) * - raft::warp_size(), - stream>>>(batch.data(), - d_data_half_.value().data_handle(), - build_config_.dataset_dim, - l2_norms_.data_handle(), - batch.offset(), - build_config_.metric); + compute_l2_norms_kernel<<(raft::warp_size())) * + raft::warp_size(), + stream>>>(static_cast(d_data_ptr_), + build_config_.dataset_dim, + l2_norms_.data_handle()); + raft::resource::sync_stream(res); } } @@ -1587,7 +1639,7 @@ void GNND::build(Data_t* data, // __CUDA_ARCH__ >= 700. 
Since RAFT supports compilation for ARCH 600, // we need to ensure that `local_join_kernel` (which uses tensor) operations // is not only not compiled, but also a runtime error is presented to the user - auto kernel = preprocess_data_kernel; + auto kernel = compute_l2_norms_kernel; void* kernel_ptr = reinterpret_cast(kernel); auto runtime_arch = raft::util::arch::kernel_virtual_arch(kernel_ptr); auto wmma_range = diff --git a/cpp/src/neighbors/detail/nn_descent_gnnd.hpp b/cpp/src/neighbors/detail/nn_descent_gnnd.hpp index a2639e4f43..bc86ff014e 100644 --- a/cpp/src/neighbors/detail/nn_descent_gnnd.hpp +++ b/cpp/src/neighbors/detail/nn_descent_gnnd.hpp @@ -64,8 +64,7 @@ struct BuildConfig { float termination_threshold{0.0001}; size_t output_graph_degree{32}; cuvs::distance::DistanceType metric{cuvs::distance::DistanceType::L2Expanded}; - cuvs::neighbors::nn_descent::DIST_COMP_DTYPE dist_comp_dtype{ - cuvs::neighbors::nn_descent::DIST_COMP_DTYPE::AUTO}; + bool compress_to_fp16{false}; }; template @@ -228,8 +227,16 @@ class GNND { size_t nrow_; size_t ndim_; - std::optional> d_data_float_; + using input_t = std::remove_const_t; + + // d_data_half_ is used for a special case when input data is fp32 on host and compress_to_fp16 + // flag is True std::optional> d_data_half_; + // d_data_direct_ is used when input data is on host, and we need to copy it to device + std::optional> d_data_direct_; + + // d_data_ptr_ is used to store the general pointer to the input data + const void* d_data_ptr_{nullptr}; raft::device_vector l2_norms_; raft::device_matrix graph_buffer_; @@ -307,7 +314,7 @@ inline BuildConfig get_build_config(raft::resources const& res, .termination_threshold = params.termination_threshold, .output_graph_degree = params.graph_degree, .metric = params.metric, - .dist_comp_dtype = params.dist_comp_dtype}; + .compress_to_fp16 = params.compress_to_fp16}; return build_config; } diff --git a/python/cuvs/cuvs/neighbors/nn_descent/nn_descent.pxd 
b/python/cuvs/cuvs/neighbors/nn_descent/nn_descent.pxd index 9568c88082..8d26921784 100644 --- a/python/cuvs/cuvs/neighbors/nn_descent/nn_descent.pxd +++ b/python/cuvs/cuvs/neighbors/nn_descent/nn_descent.pxd @@ -1,5 +1,5 @@ # -# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 # # cython: language_level=3 @@ -13,11 +13,6 @@ from cuvs.distance_type cimport cuvsDistanceType cdef extern from "cuvs/neighbors/nn_descent.h" nogil: - enum cuvsNNDescentDistCompDtype: - NND_DIST_COMP_AUTO = 0, - NND_DIST_COMP_FP32 = 1, - NND_DIST_COMP_FP16 = 2 - ctypedef struct cuvsNNDescentIndexParams: cuvsDistanceType metric float metric_arg @@ -26,7 +21,7 @@ cdef extern from "cuvs/neighbors/nn_descent.h" nogil: size_t max_iterations float termination_threshold bool return_distances - cuvsNNDescentDistCompDtype dist_comp_dtype + bool compress_to_fp16 ctypedef cuvsNNDescentIndexParams* cuvsNNDescentIndexParams_t diff --git a/python/cuvs/cuvs/neighbors/nn_descent/nn_descent.pyx b/python/cuvs/cuvs/neighbors/nn_descent/nn_descent.pyx index 7cb7f59b60..658a2ff983 100644 --- a/python/cuvs/cuvs/neighbors/nn_descent/nn_descent.pyx +++ b/python/cuvs/cuvs/neighbors/nn_descent/nn_descent.pyx @@ -1,5 +1,5 @@ # -# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 # # cython: language_level=3 @@ -63,12 +63,10 @@ cdef class IndexParams: The delta at which nn-descent will terminate its iterations return_distances : bool Whether to return distances array - dist_comp_dtype : str, default = "auto" - Dtype to use for distance computation. - Supported dtypes are `auto`, `fp32`, and `fp16` - `auto` automatically determines the best dtype for distance computation based on the dataset dimensions. 
- `fp32` uses fp32 distance computation for better precision at the cost of performance and memory usage. This option is only valid when data type is fp32. - `fp16` uses fp16 distance computation for better performance and memory usage at the cost of precision. + compress_to_fp16 : bool, default = False + When True and the input data is fp32, distance computation is done in + fp16 for better performance and lower memory usage at the cost of + precision. Has no effect on non-fp32 input types. """ cdef cuvsNNDescentIndexParams* params @@ -88,7 +86,7 @@ cdef class IndexParams: max_iterations=None, termination_threshold=None, return_distances=None, - dist_comp_dtype="auto" + compress_to_fp16=None ): if metric is not None: self.params.metric = DISTANCE_TYPES[metric] @@ -102,15 +100,8 @@ cdef class IndexParams: self.params.termination_threshold = termination_threshold if return_distances is not None: self.params.return_distances = return_distances - - if dist_comp_dtype is "auto": - self.params.dist_comp_dtype = cuvsNNDescentDistCompDtype.NND_DIST_COMP_AUTO - elif dist_comp_dtype is "fp32": - self.params.dist_comp_dtype = cuvsNNDescentDistCompDtype.NND_DIST_COMP_FP32 - elif dist_comp_dtype is "fp16": - self.params.dist_comp_dtype = cuvsNNDescentDistCompDtype.NND_DIST_COMP_FP16 - else: - raise ValueError(f"Invalid dist_comp_dtype: {dist_comp_dtype}. Supported options are 'auto', 'fp32', and 'fp16'.") + if compress_to_fp16 is not None: + self.params.compress_to_fp16 = compress_to_fp16 @property def metric(self): diff --git a/python/cuvs/cuvs/tests/test_nn_descent.py b/python/cuvs/cuvs/tests/test_nn_descent.py index 4463142fcb..93b39727d3 100644 --- a/python/cuvs/cuvs/tests/test_nn_descent.py +++ b/python/cuvs/cuvs/tests/test_nn_descent.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 # @@ -61,9 +61,9 @@ def test_nn_descent( @pytest.mark.parametrize("n_cols", [2, 17, 32]) -@pytest.mark.parametrize("dist_comp_dtype", ["auto", "fp32", "fp16"]) +@pytest.mark.parametrize("compress_to_fp16", [False, True]) @pytest.mark.parametrize("dtype", [np.float32, np.float16]) -def test_nn_descent_dist_comp_dtype(n_cols, dist_comp_dtype, dtype): +def test_nn_descent_compress_to_fp16(n_cols, compress_to_fp16, dtype): metric = "sqeuclidean" graph_degree = 32 n_rows = 100_000 @@ -77,7 +77,7 @@ def test_nn_descent_dist_comp_dtype(n_cols, dist_comp_dtype, dtype): metric=metric, graph_degree=graph_degree, return_distances=True, - dist_comp_dtype=dist_comp_dtype, + compress_to_fp16=compress_to_fp16, ) index = nn_descent.build(params, X) @@ -88,8 +88,4 @@ def test_nn_descent_dist_comp_dtype(n_cols, dist_comp_dtype, dtype): _, bf_indices = brute_force.search(index, gpu_X, k=graph_degree) bf_indices = bf_indices.copy_to_host() - if n_cols <= 16 and dist_comp_dtype == "fp16" and dtype == np.float32: - # for small dim, if data is fp32 but dist_comp_dtype is fp16, the recall will be low - assert calc_recall(nnd_indices, bf_indices) < 0.7 - else: - assert calc_recall(nnd_indices, bf_indices) > 0.9 + assert calc_recall(nnd_indices, bf_indices) > 0.9 From 2855e585515cc52e735720d21d7feda98a9cbaac Mon Sep 17 00:00:00 2001 From: jinsolp Date: Wed, 18 Mar 2026 20:03:52 +0000 Subject: [PATCH 2/5] compute norms in fp32 if data is fp32 --- cpp/src/neighbors/detail/nn_descent.cuh | 42 ++++++++++++------------- 1 file changed, 20 insertions(+), 22 deletions(-) diff --git a/cpp/src/neighbors/detail/nn_descent.cuh b/cpp/src/neighbors/detail/nn_descent.cuh index 67b165dc47..78b6fed0c7 100644 --- a/cpp/src/neighbors/detail/nn_descent.cuh +++ b/cpp/src/neighbors/detail/nn_descent.cuh @@ -274,7 +274,7 @@ __device__ __forceinline__ void load_vec(__half* vec_buffer, for (int step = 0; step < raft::ceildiv(padding_dims, num_load_elems_per_warp); step++) { 
int idx = step * num_load_elems_per_warp + lane_id; if (idx < load_dims) { - vec_buffer[idx] = __float2half(static_cast(d_vec[idx])); + vec_buffer[idx] = d_vec[idx]; } else if (idx < padding_dims) { vec_buffer[idx] = half_0; } @@ -1547,6 +1547,17 @@ void GNND::build(Data_t* data, size_t n_elems = batch.size() * build_config_.dataset_dim; int num_blocks = raft::ceildiv(n_elems, static_cast(TPB)); size_t dst_offset = batch.offset() * build_config_.dataset_dim; + if (needs_l2_norms) { + // we compute l2 norms on the fp32 data directly. + compute_l2_norms_kernel<<(raft::warp_size())) * + raft::warp_size(), + stream>>>( + batch.data(), build_config_.dataset_dim, l2_norms_.data_handle() + batch.offset()); + } convert_copy_kernel<<>>( batch.data(), d_data_half_.value().data_handle() + dst_offset, n_elems); } @@ -1567,28 +1578,15 @@ void GNND::build(Data_t* data, std::cout << "keeping data dtype and copying to device" << std::endl; } - if (needs_l2_norms) { - if (d_data_half_.has_value()) { - compute_l2_norms_kernel<<< - nrow_, + if (needs_l2_norms && !compress_host_data) { + compute_l2_norms_kernel<<< + nrow_, + raft::warp_size(), + sizeof(input_t) * ceildiv(build_config_.dataset_dim, static_cast(raft::warp_size())) * raft::warp_size(), - sizeof(half) * ceildiv(build_config_.dataset_dim, static_cast(raft::warp_size())) * - raft::warp_size(), - stream>>>( - static_cast(d_data_ptr_), build_config_.dataset_dim, l2_norms_.data_handle()); - raft::resource::sync_stream(res); - } else { - compute_l2_norms_kernel<<(raft::warp_size())) * - raft::warp_size(), - stream>>>(static_cast(d_data_ptr_), - build_config_.dataset_dim, - l2_norms_.data_handle()); - raft::resource::sync_stream(res); - } + stream>>>( + static_cast(d_data_ptr_), build_config_.dataset_dim, l2_norms_.data_handle()); + raft::resource::sync_stream(res); } graph_.clear(); From fbb902306242d3de0247536c4240941af32720a8 Mon Sep 17 00:00:00 2001 From: jinsolp Date: Thu, 19 Mar 2026 02:35:30 +0000 Subject: [PATCH 3/5] 
change padding for uint8 and int8 --- cpp/src/neighbors/detail/nn_descent.cuh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/neighbors/detail/nn_descent.cuh b/cpp/src/neighbors/detail/nn_descent.cuh index 78b6fed0c7..c51cdc06f2 100644 --- a/cpp/src/neighbors/detail/nn_descent.cuh +++ b/cpp/src/neighbors/detail/nn_descent.cuh @@ -143,16 +143,16 @@ struct dtype_traits<__half> { template <> struct dtype_traits { - static constexpr int APAD = 16; - static constexpr int BPAD = 16; + static constexpr int APAD = 4; // 1 byte padding to avoid bank conflict + static constexpr int BPAD = 4; static constexpr int TILE_COL_WIDTH = 128; static __device__ __forceinline__ float to_float(uint8_t v) { return static_cast(v); } }; template <> struct dtype_traits { - static constexpr int APAD = 16; - static constexpr int BPAD = 16; + static constexpr int APAD = 4; // 1 byte padding to avoid bank conflict + static constexpr int BPAD = 4; static constexpr int TILE_COL_WIDTH = 128; static __device__ __forceinline__ float to_float(int8_t v) { return static_cast(v); } }; From 157fe30f6d9130be8acdeadb83d96a0ef91462ea Mon Sep 17 00:00:00 2001 From: jinsolp Date: Fri, 20 Mar 2026 21:47:24 +0000 Subject: [PATCH 4/5] rm print and revert test --- cpp/src/neighbors/detail/nn_descent.cuh | 2 -- python/cuvs/cuvs/tests/test_nn_descent.py | 6 +++++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/cpp/src/neighbors/detail/nn_descent.cuh b/cpp/src/neighbors/detail/nn_descent.cuh index a62c68afd9..90fca202e4 100644 --- a/cpp/src/neighbors/detail/nn_descent.cuh +++ b/cpp/src/neighbors/detail/nn_descent.cuh @@ -1442,7 +1442,6 @@ void GNND::local_join(cudaStream_t stream, DistEpilogue_t dist_ auto launch_kernel = [&](auto* typed_ptr) { if (use_simt) { - std::cout << "using simt kernel" << std::endl; local_join_kernel_simt<<>>(graph_.h_graph_new.data_handle(), h_rev_graph_new_.data_handle(), d_list_sizes_new_.data_handle(), @@ -1460,7 +1459,6 @@ void 
GNND::local_join(cudaStream_t stream, DistEpilogue_t dist_ build_config_.metric, dist_epilogue); } else { - std::cout << "using wmma kernel" << std::endl; local_join_kernel_wmma<<>>(graph_.h_graph_new.data_handle(), h_rev_graph_new_.data_handle(), d_list_sizes_new_.data_handle(), diff --git a/python/cuvs/cuvs/tests/test_nn_descent.py b/python/cuvs/cuvs/tests/test_nn_descent.py index 93b39727d3..27403ab70b 100644 --- a/python/cuvs/cuvs/tests/test_nn_descent.py +++ b/python/cuvs/cuvs/tests/test_nn_descent.py @@ -88,4 +88,8 @@ def test_nn_descent_compress_to_fp16(n_cols, compress_to_fp16, dtype): _, bf_indices = brute_force.search(index, gpu_X, k=graph_degree) bf_indices = bf_indices.copy_to_host() - assert calc_recall(nnd_indices, bf_indices) > 0.9 + if n_cols <= 16 and compress_to_fp16 and dtype == np.float32: + # for small dim, if data is fp32 but compress_to_fp16 is True, the recall will be low + assert calc_recall(nnd_indices, bf_indices) < 0.7 + else: + assert calc_recall(nnd_indices, bf_indices) > 0.9 From e59e6c7c8dd472f083cdf643da7f3ad1b827a882 Mon Sep 17 00:00:00 2001 From: jinsolp Date: Wed, 8 Apr 2026 00:07:45 +0000 Subject: [PATCH 5/5] fix c abi breakages --- c/include/cuvs/neighbors/nn_descent.h | 138 +++++++++++++++++- c/src/neighbors/nn_descent.cpp | 128 ++++++++++++---- .../cuvs/neighbors/nn_descent/nn_descent.pxd | 38 +++-- .../cuvs/neighbors/nn_descent/nn_descent.pyx | 10 +- 4 files changed, 265 insertions(+), 49 deletions(-) diff --git a/c/include/cuvs/neighbors/nn_descent.h b/c/include/cuvs/neighbors/nn_descent.h index 1db0eea139..1c04cfe263 100644 --- a/c/include/cuvs/neighbors/nn_descent.h +++ b/c/include/cuvs/neighbors/nn_descent.h @@ -15,6 +15,23 @@ extern "C" { #endif +/** + * @brief Dtype to use for distance computation + * - `NND_DIST_COMP_AUTO`: Automatically determine the best dtype for distance computation based on + * the dataset dimensions. 
+ * - `NND_DIST_COMP_FP32`: Use fp32 distance computation for better precision at the cost of + * performance and memory usage. + * - `NND_DIST_COMP_FP16`: Use fp16 distance computation. + * + * @deprecated To be removed in 26.08. Use cuvsNNDescentIndexParams_v6 with compress_to_fp16 + * instead. + */ +typedef enum { + NND_DIST_COMP_AUTO = 0, + NND_DIST_COMP_FP32 = 1, + NND_DIST_COMP_FP16 = 2 +} cuvsNNDescentDistCompDtype; + /** * @defgroup nn_descent_c_index_params The nn-descent algorithm parameters. * @{ @@ -35,11 +52,50 @@ extern "C" { * the graph for. More iterations produce a better quality graph at cost of performance * `termination_threshold`: The delta at which nn-descent will terminate its iterations * `return_distances`: Boolean to decide whether to return distances array + * `dist_comp_dtype`: dtype to use for distance computation. Defaults to `NND_DIST_COMP_AUTO` which + * automatically determines the best dtype for distance computation based on the dataset dimensions. + * Use `NND_DIST_COMP_FP32` for better precision at the cost of performance and memory usage. This + * option is only valid when data type is fp32. Use `NND_DIST_COMP_FP16` for better performance and + * memory usage at the cost of precision. + * + * @deprecated To be removed in 26.08 and replaced by cuvsNNDescentIndexParams_v6. 
+ */
+struct cuvsNNDescentIndexParams {
+  cuvsDistanceType metric;
+  float metric_arg;
+  size_t graph_degree;
+  size_t intermediate_graph_degree;
+  size_t max_iterations;
+  float termination_threshold;
+  bool return_distances;
+  cuvsNNDescentDistCompDtype dist_comp_dtype;
+};
+
+typedef struct cuvsNNDescentIndexParams* cuvsNNDescentIndexParams_t;
+
+/**
+ * @brief Parameters used to build an nn-descent index (v6)
+ *
+ * `metric`: The distance metric to use
+ * `metric_arg`: The argument used by distance metrics like Minkowski distance
+ * `graph_degree`: For an input dataset of dimensions (N, D),
+ * determines the final dimensions of the all-neighbors knn graph
+ * which turns out to be of dimensions (N, graph_degree)
+ * `intermediate_graph_degree`: Internally, nn-descent builds an
+ * all-neighbors knn graph of dimensions (N, intermediate_graph_degree)
+ * before selecting the final `graph_degree` neighbors. It's recommended
+ * that `intermediate_graph_degree` >= 1.5 * graph_degree
+ * `max_iterations`: The number of iterations that nn-descent will refine
+ * the graph for. More iterations produce a better quality graph at cost of performance
+ * `termination_threshold`: The delta at which nn-descent will terminate its iterations
+ * `return_distances`: Boolean to decide whether to return distances array
  * `compress_to_fp16`: When true and the input data is fp32, distance computation is done in
  * fp16 for better performance and lower memory usage at the cost of precision. Has no effect on
  * non-fp32 input types.
+ * + * @since 26.06 */ -struct cuvsNNDescentIndexParams { +struct cuvsNNDescentIndexParams_v6 { cuvsDistanceType metric; float metric_arg; size_t graph_degree; @@ -50,23 +106,47 @@ struct cuvsNNDescentIndexParams { bool compress_to_fp16; }; -typedef struct cuvsNNDescentIndexParams* cuvsNNDescentIndexParams_t; +typedef struct cuvsNNDescentIndexParams_v6* cuvsNNDescentIndexParams_v6_t; /** * @brief Allocate NN-Descent Index params, and populate with default values * + * @deprecated To be removed in 26.08 and replaced by cuvsNNDescentIndexParamsCreate_v6. + * * @param[in] index_params cuvsNNDescentIndexParams_t to allocate * @return cuvsError_t */ cuvsError_t cuvsNNDescentIndexParamsCreate(cuvsNNDescentIndexParams_t* index_params); +/** + * @brief Allocate NN-Descent Index params (v6), and populate with default values + * + * @since 26.06 + * + * @param[in] index_params cuvsNNDescentIndexParams_v6_t to allocate + * @return cuvsError_t + */ +cuvsError_t cuvsNNDescentIndexParamsCreate_v6(cuvsNNDescentIndexParams_v6_t* index_params); + /** * @brief De-allocate NN-Descent Index params * + * @deprecated To be removed in 26.08 and replaced by cuvsNNDescentIndexParamsDestroy_v6. + * * @param[in] index_params * @return cuvsError_t */ cuvsError_t cuvsNNDescentIndexParamsDestroy(cuvsNNDescentIndexParams_t index_params); + +/** + * @brief De-allocate NN-Descent Index params (v6) + * + * @since 26.06 + * + * @param[in] index_params + * @return cuvsError_t + */ +cuvsError_t cuvsNNDescentIndexParamsDestroy_v6(cuvsNNDescentIndexParams_v6_t index_params); /** * @} */ @@ -145,6 +225,8 @@ cuvsError_t cuvsNNDescentIndexDestroy(cuvsNNDescentIndex_t index); * cuvsError_t res_destroy_status = cuvsResourcesDestroy(res); * @endcode * + * @deprecated To be removed in 26.08 and replaced by cuvsNNDescentBuild_v6. 
+ * * @param[in] res cuvsResources_t opaque C handle * @param[in] index_params cuvsNNDescentIndexParams_t used to build NN-Descent index * @param[in] dataset DLManagedTensor* training dataset on host or device memory @@ -157,6 +239,58 @@ cuvsError_t cuvsNNDescentBuild(cuvsResources_t res, DLManagedTensor* dataset, DLManagedTensor* graph, cuvsNNDescentIndex_t index); + +/** + * @brief Build a NN-Descent index (v6) with a `DLManagedTensor` which has underlying + * `DLDeviceType` equal to `kDLCUDA`, `kDLCUDAHost`, `kDLCUDAManaged`, + * or `kDLCPU`. Also, acceptable underlying types are: + * 1. `kDLDataType.code == kDLFloat` and `kDLDataType.bits = 32` + * 2. `kDLDataType.code == kDLFloat` and `kDLDataType.bits = 16` + * 3. `kDLDataType.code == kDLInt` and `kDLDataType.bits = 8` + * 4. `kDLDataType.code == kDLUInt` and `kDLDataType.bits = 8` + * + * @code {.c} + * #include + * #include + * + * // Create cuvsResources_t + * cuvsResources_t res; + * cuvsError_t res_create_status = cuvsResourcesCreate(&res); + * + * // Assume a populated `DLManagedTensor` type here + * DLManagedTensor dataset; + * + * // Create default index params + * cuvsNNDescentIndexParams_v6_t index_params; + * cuvsError_t params_create_status = cuvsNNDescentIndexParamsCreate_v6(&index_params); + * + * // Create NN-Descent index + * cuvsNNDescentIndex_t index; + * cuvsError_t index_create_status = cuvsNNDescentIndexCreate(&index); + * + * // Build the NN-Descent Index + * cuvsError_t build_status = cuvsNNDescentBuild_v6(res, index_params, &dataset, NULL, index); + * + * // de-allocate `index_params`, `index` and `res` + * cuvsError_t params_destroy_status = cuvsNNDescentIndexParamsDestroy_v6(index_params); + * cuvsError_t index_destroy_status = cuvsNNDescentIndexDestroy(index); + * cuvsError_t res_destroy_status = cuvsResourcesDestroy(res); + * @endcode + * + * @since 26.06 + * + * @param[in] res cuvsResources_t opaque C handle + * @param[in] index_params cuvsNNDescentIndexParams_v6_t used to build 
NN-Descent index + * @param[in] dataset DLManagedTensor* training dataset on host or device memory + * @param[inout] graph Optional preallocated graph on host memory to store output + * @param[out] index cuvsNNDescentIndex_t Newly built NN-Descent index + * @return cuvsError_t + */ +cuvsError_t cuvsNNDescentBuild_v6(cuvsResources_t res, + cuvsNNDescentIndexParams_v6_t index_params, + DLManagedTensor* dataset, + DLManagedTensor* graph, + cuvsNNDescentIndex_t index); /** * @} */ diff --git a/c/src/neighbors/nn_descent.cpp b/c/src/neighbors/nn_descent.cpp index d3753e925a..a704bcd3eb 100644 --- a/c/src/neighbors/nn_descent.cpp +++ b/c/src/neighbors/nn_descent.cpp @@ -28,23 +28,13 @@ namespace { template void* _build(cuvsResources_t res, - cuvsNNDescentIndexParams params, + cuvs::neighbors::nn_descent::index_params build_params, DLManagedTensor* dataset_tensor, DLManagedTensor* graph_tensor) { auto res_ptr = reinterpret_cast(res); auto dataset = dataset_tensor->dl_tensor; - auto build_params = cuvs::neighbors::nn_descent::index_params(); - build_params.metric = static_cast((int)params.metric), - build_params.metric_arg = params.metric_arg; - build_params.graph_degree = params.graph_degree; - build_params.intermediate_graph_degree = params.intermediate_graph_degree; - build_params.max_iterations = params.max_iterations; - build_params.termination_threshold = params.termination_threshold; - build_params.return_distances = params.return_distances; - build_params.compress_to_fp16 = params.compress_to_fp16; - using graph_type = raft::host_matrix_view; std::optional graph; if (graph_tensor != NULL) { graph = cuvs::core::from_dlpack(graph_tensor); } @@ -64,6 +54,35 @@ void* _build(cuvsResources_t res, } } +cuvs::neighbors::nn_descent::index_params convert_params(cuvsNNDescentIndexParams const& params) +{ + auto build_params = cuvs::neighbors::nn_descent::index_params(); + build_params.metric = static_cast((int)params.metric); + build_params.metric_arg = params.metric_arg; + 
build_params.graph_degree = params.graph_degree; + build_params.intermediate_graph_degree = params.intermediate_graph_degree; + build_params.max_iterations = params.max_iterations; + build_params.termination_threshold = params.termination_threshold; + build_params.return_distances = params.return_distances; + build_params.compress_to_fp16 = (params.dist_comp_dtype == NND_DIST_COMP_FP16); + return build_params; +} + +cuvs::neighbors::nn_descent::index_params convert_params_v6( + cuvsNNDescentIndexParams_v6 const& params) +{ + auto build_params = cuvs::neighbors::nn_descent::index_params(); + build_params.metric = static_cast((int)params.metric); + build_params.metric_arg = params.metric_arg; + build_params.graph_degree = params.graph_degree; + build_params.intermediate_graph_degree = params.intermediate_graph_degree; + build_params.max_iterations = params.max_iterations; + build_params.termination_threshold = params.termination_threshold; + build_params.return_distances = params.return_distances; + build_params.compress_to_fp16 = params.compress_to_fp16; + return build_params; +} + template void _get_graph(cuvsResources_t res, cuvsNNDescentIndex_t index, DLManagedTensor* graph) { @@ -113,6 +132,37 @@ void _get_distances(cuvsResources_t res, cuvsNNDescentIndex_t index, DLManagedTe RAFT_FAIL("Unsupported nn-descent index dtype: %d and bits: %d", dtype.code, dtype.bits); } } + +cuvsError_t _nn_descent_build(cuvsResources_t res, + DLManagedTensor* dataset_tensor, + DLManagedTensor* graph_tensor, + cuvsNNDescentIndex_t index, + cuvs::neighbors::nn_descent::index_params build_params) +{ + return cuvs::core::translate_exceptions([=] { + index->dtype.code = kDLUInt; + index->dtype.bits = 32; + + auto dtype = dataset_tensor->dl_tensor.dtype; + + if ((dtype.code == kDLFloat) && (dtype.bits == 32)) { + index->addr = reinterpret_cast( + _build(res, build_params, dataset_tensor, graph_tensor)); + } else if ((dtype.code == kDLFloat) && (dtype.bits == 16)) { + index->addr = 
reinterpret_cast( + _build(res, build_params, dataset_tensor, graph_tensor)); + } else if ((dtype.code == kDLInt) && (dtype.bits == 8)) { + index->addr = reinterpret_cast( + _build(res, build_params, dataset_tensor, graph_tensor)); + } else if ((dtype.code == kDLUInt) && (dtype.bits == 8)) { + index->addr = reinterpret_cast( + _build(res, build_params, dataset_tensor, graph_tensor)); + } else { + RAFT_FAIL("Unsupported nn-descent dataset dtype: %d and bits: %d", dtype.code, dtype.bits); + } + }); +} + } // namespace extern "C" cuvsError_t cuvsNNDescentIndexCreate(cuvsNNDescentIndex_t* index) @@ -141,28 +191,18 @@ extern "C" cuvsError_t cuvsNNDescentBuild(cuvsResources_t res, DLManagedTensor* graph_tensor, cuvsNNDescentIndex_t index) { - return cuvs::core::translate_exceptions([=] { - index->dtype.code = kDLUInt; - index->dtype.bits = 32; - - auto dtype = dataset_tensor->dl_tensor.dtype; + auto build_params = convert_params(*params); + return _nn_descent_build(res, dataset_tensor, graph_tensor, index, build_params); +} - if ((dtype.code == kDLFloat) && (dtype.bits == 32)) { - index->addr = reinterpret_cast( - _build(res, *params, dataset_tensor, graph_tensor)); - } else if ((dtype.code == kDLFloat) && (dtype.bits == 16)) { - index->addr = reinterpret_cast( - _build(res, *params, dataset_tensor, graph_tensor)); - } else if ((dtype.code == kDLInt) && (dtype.bits == 8)) { - index->addr = reinterpret_cast( - _build(res, *params, dataset_tensor, graph_tensor)); - } else if ((dtype.code == kDLUInt) && (dtype.bits == 8)) { - index->addr = reinterpret_cast( - _build(res, *params, dataset_tensor, graph_tensor)); - } else { - RAFT_FAIL("Unsupported nn-descent dataset dtype: %d and bits: %d", dtype.code, dtype.bits); - } - }); +extern "C" cuvsError_t cuvsNNDescentBuild_v6(cuvsResources_t res, + cuvsNNDescentIndexParams_v6_t params, + DLManagedTensor* dataset_tensor, + DLManagedTensor* graph_tensor, + cuvsNNDescentIndex_t index) +{ + auto build_params = 
convert_params_v6(*params); + return _nn_descent_build(res, dataset_tensor, graph_tensor, index, build_params); } extern "C" cuvsError_t cuvsNNDescentIndexParamsCreate(cuvsNNDescentIndexParams_t* params) @@ -172,6 +212,25 @@ extern "C" cuvsError_t cuvsNNDescentIndexParamsCreate(cuvsNNDescentIndexParams_t cuvs::neighbors::nn_descent::index_params cpp_params; *params = new cuvsNNDescentIndexParams{ + .metric = static_cast((int)cpp_params.metric), + .metric_arg = cpp_params.metric_arg, + .graph_degree = cpp_params.graph_degree, + .intermediate_graph_degree = cpp_params.intermediate_graph_degree, + .max_iterations = cpp_params.max_iterations, + .termination_threshold = cpp_params.termination_threshold, + .return_distances = cpp_params.return_distances, + .dist_comp_dtype = cpp_params.compress_to_fp16 ? NND_DIST_COMP_FP16 + : NND_DIST_COMP_AUTO}; + }); +} + +extern "C" cuvsError_t cuvsNNDescentIndexParamsCreate_v6(cuvsNNDescentIndexParams_v6_t* params) +{ + return cuvs::core::translate_exceptions([=] { + // get defaults from cpp parameters struct + cuvs::neighbors::nn_descent::index_params cpp_params; + + *params = new cuvsNNDescentIndexParams_v6{ .metric = static_cast((int)cpp_params.metric), .metric_arg = cpp_params.metric_arg, .graph_degree = cpp_params.graph_degree, @@ -188,6 +247,11 @@ extern "C" cuvsError_t cuvsNNDescentIndexParamsDestroy(cuvsNNDescentIndexParams_ return cuvs::core::translate_exceptions([=] { delete params; }); } +extern "C" cuvsError_t cuvsNNDescentIndexParamsDestroy_v6(cuvsNNDescentIndexParams_v6_t params) +{ + return cuvs::core::translate_exceptions([=] { delete params; }); +} + extern "C" cuvsError_t cuvsNNDescentIndexGetGraph(cuvsResources_t res, cuvsNNDescentIndex_t index, DLManagedTensor* graph) diff --git a/python/cuvs/cuvs/neighbors/nn_descent/nn_descent.pxd b/python/cuvs/cuvs/neighbors/nn_descent/nn_descent.pxd index 8d26921784..aece64be8f 100644 --- a/python/cuvs/cuvs/neighbors/nn_descent/nn_descent.pxd +++ 
b/python/cuvs/cuvs/neighbors/nn_descent/nn_descent.pxd @@ -13,6 +13,12 @@ from cuvs.distance_type cimport cuvsDistanceType cdef extern from "cuvs/neighbors/nn_descent.h" nogil: + # Deprecated — to be removed in 26.08 and replaced by cuvsNNDescentIndexParams_v6. + ctypedef enum cuvsNNDescentDistCompDtype: + NND_DIST_COMP_AUTO + NND_DIST_COMP_FP32 + NND_DIST_COMP_FP16 + ctypedef struct cuvsNNDescentIndexParams: cuvsDistanceType metric float metric_arg @@ -21,21 +27,33 @@ cdef extern from "cuvs/neighbors/nn_descent.h" nogil: size_t max_iterations float termination_threshold bool return_distances - bool compress_to_fp16 + cuvsNNDescentDistCompDtype dist_comp_dtype ctypedef cuvsNNDescentIndexParams* cuvsNNDescentIndexParams_t + ctypedef struct cuvsNNDescentIndexParams_v6: + cuvsDistanceType metric + float metric_arg + size_t graph_degree + size_t intermediate_graph_degree + size_t max_iterations + float termination_threshold + bool return_distances + bool compress_to_fp16 + + ctypedef cuvsNNDescentIndexParams_v6* cuvsNNDescentIndexParams_v6_t + ctypedef struct cuvsNNDescentIndex: uintptr_t addr DLDataType dtype ctypedef cuvsNNDescentIndex* cuvsNNDescentIndex_t - cuvsError_t cuvsNNDescentIndexParamsCreate( - cuvsNNDescentIndexParams_t* params) + cuvsError_t cuvsNNDescentIndexParamsCreate_v6( + cuvsNNDescentIndexParams_v6_t* params) - cuvsError_t cuvsNNDescentIndexParamsDestroy( - cuvsNNDescentIndexParams_t index) + cuvsError_t cuvsNNDescentIndexParamsDestroy_v6( + cuvsNNDescentIndexParams_v6_t index) cuvsError_t cuvsNNDescentIndexCreate(cuvsNNDescentIndex_t* index) @@ -49,8 +67,8 @@ cdef extern from "cuvs/neighbors/nn_descent.h" nogil: cuvsNNDescentIndex_t index, DLManagedTensor * output) - cuvsError_t cuvsNNDescentBuild(cuvsResources_t res, - cuvsNNDescentIndexParams* params, - DLManagedTensor* dataset, - DLManagedTensor* graph, - cuvsNNDescentIndex_t index) except + + cuvsError_t cuvsNNDescentBuild_v6(cuvsResources_t res, + cuvsNNDescentIndexParams_v6* params, + 
DLManagedTensor* dataset, + DLManagedTensor* graph, + cuvsNNDescentIndex_t index) except + diff --git a/python/cuvs/cuvs/neighbors/nn_descent/nn_descent.pyx b/python/cuvs/cuvs/neighbors/nn_descent/nn_descent.pyx index 658a2ff983..2df9fb9699 100644 --- a/python/cuvs/cuvs/neighbors/nn_descent/nn_descent.pyx +++ b/python/cuvs/cuvs/neighbors/nn_descent/nn_descent.pyx @@ -69,14 +69,14 @@ cdef class IndexParams: precision. Has no effect on non-fp32 input types. """ - cdef cuvsNNDescentIndexParams* params + cdef cuvsNNDescentIndexParams_v6* params cdef object _metric def __cinit__(self): - cuvsNNDescentIndexParamsCreate(&self.params) + cuvsNNDescentIndexParamsCreate_v6(&self.params) def __dealloc__(self): - check_cuvs(cuvsNNDescentIndexParamsDestroy(self.params)) + check_cuvs(cuvsNNDescentIndexParamsDestroy_v6(self.params)) def __init__(self, *, metric=None, @@ -236,7 +236,7 @@ def build(IndexParams index_params, dataset, graph=None, resources=None): cdef Index idx = Index() cdef cydlpack.DLManagedTensor* dataset_dlpack = \ cydlpack.dlpack_c(dataset_ai) - cdef cuvsNNDescentIndexParams* params = index_params.params + cdef cuvsNNDescentIndexParams_v6* params = index_params.params cdef cuvsResources_t res = resources.get_c_obj() @@ -256,7 +256,7 @@ def build(IndexParams index_params, dataset, graph=None, resources=None): graph_dlpack = cydlpack.dlpack_c(graph_ai) with cuda_interruptible(): - check_cuvs(cuvsNNDescentBuild( + check_cuvs(cuvsNNDescentBuild_v6( res, params, dataset_dlpack,