diff --git a/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib_wrapper.h b/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib_wrapper.h
index e4169f6f8..2df090126 100644
--- a/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib_wrapper.h
+++ b/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib_wrapper.h
@@ -16,6 +16,7 @@
 #pragma once
 
 #include "cuvs_cagra_wrapper.h"
+#include <chrono>
 #include <cuvs/neighbors/hnsw.hpp>
 
 #include <memory>
@@ -85,13 +86,21 @@ class cuvs_cagra_hnswlib : public algo<T>, public algo_gpu {
 template <typename T, typename IdxT>
 void cuvs_cagra_hnswlib<T, IdxT>::build(const T* dataset, size_t nrow)
 {
+  auto start_time = std::chrono::steady_clock::now();  // monotonic: safe for elapsed-time math
   cagra_build_.build(dataset, nrow);
+  auto end_time = std::chrono::steady_clock::now();
+  auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time);
+  std::cout << "CAGRA build time: " << duration.count() << " ms" << std::endl;
   auto* cagra_index      = cagra_build_.get_index();
   auto host_dataset_view = raft::make_host_matrix_view<const T, int64_t>(dataset, nrow, this->dim_);
   auto opt_dataset_view =
     std::optional<raft::host_matrix_view<const T, int64_t>>(std::move(host_dataset_view));
+  start_time  = std::chrono::steady_clock::now();  // restart the timer for the conversion phase
   hnsw_index_ = cuvs::neighbors::hnsw::from_cagra(
     handle_, build_param_.hnsw_index_params, *cagra_index, opt_dataset_view);
+  end_time = std::chrono::steady_clock::now();
+  duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time);
+  std::cout << "HNSW conversion time: " << duration.count() << " ms" << std::endl;
 }
 
 template <typename T, typename IdxT>
diff --git a/cpp/include/cuvs/neighbors/hnsw.h b/cpp/include/cuvs/neighbors/hnsw.h
index b7eda54b8..fd36c16b4 100644
--- a/cpp/include/cuvs/neighbors/hnsw.h
+++ b/cpp/include/cuvs/neighbors/hnsw.h
@@ -51,9 +51,8 @@ struct cuvsHnswIndexParams {
   /** Size of the candidate list during hierarchy construction when hierarchy is `CPU`*/
   int ef_construction;
   /** Number of host threads to use to construct hierarchy when hierarchy is `CPU`
-  NOTE: Constructing the hierarchy when converting from a CAGRA graph is highly sensitive
-  to parallelism, and increasing the number of threads can reduce the quality of the index.
-   */
+      When the value is 0, the number of threads is automatically set to the maximum
+      number of threads available. */
   int num_threads;
 };
 
@@ -65,389 +64,389 @@ typedef struct cuvsHnswIndexParams* cuvsHnswIndexParams_t;
  * @param[in] params cuvsHnswIndexParams_t to allocate
  * @return cuvsError_t
  */
-cuvsError_t cuvsHnswIndexParamsCreate(cuvsHnswIndexParams_t* params);
-
-/**
- * @brief De-allocate HNSW Index params
- *
- * @param[in] params
- * @return cuvsError_t
- */
-cuvsError_t cuvsHnswIndexParamsDestroy(cuvsHnswIndexParams_t params);
-
-/**
- * @}
- */
+  cuvsError_t cuvsHnswIndexParamsCreate(cuvsHnswIndexParams_t* params);
 
-/**
- * @defgroup hnsw_c_index C API for hnswlib wrapper index
- * @{
- */
-
-/**
- * @brief Struct to hold address of cuvs::neighbors::Hnsw::index and its active trained dtype
- *
- */
-typedef struct {
-  uintptr_t addr;
-  DLDataType dtype;
+  /**
+   * @brief De-allocate HNSW Index params
+   *
+   * @param[in] params
+   * @return cuvsError_t
+   */
+  cuvsError_t cuvsHnswIndexParamsDestroy(cuvsHnswIndexParams_t params);
 
-} cuvsHnswIndex;
+  /**
+   * @}
+   */
 
-typedef cuvsHnswIndex* cuvsHnswIndex_t;
+  /**
+   * @defgroup hnsw_c_index C API for hnswlib wrapper index
+   * @{
+   */
 
-/**
- * @brief Allocate HNSW index
- *
- * @param[in] index cuvsHnswIndex_t to allocate
- * @return HnswError_t
- */
-cuvsError_t cuvsHnswIndexCreate(cuvsHnswIndex_t* index);
+  /**
+   * @brief Struct to hold address of cuvs::neighbors::hnsw::index and its active trained dtype
+   *
+   */
+  typedef struct {
+    uintptr_t addr;
+    DLDataType dtype;
 
-/**
- * @brief De-allocate HNSW index
- *
- * @param[in] index cuvsHnswIndex_t to de-allocate
- */
-cuvsError_t cuvsHnswIndexDestroy(cuvsHnswIndex_t index);
+  } cuvsHnswIndex;
 
-/**
- * @}
- */
+  typedef cuvsHnswIndex* cuvsHnswIndex_t;
 
-/**
- * @defgroup hnsw_c_extend_params Parameters for extending HNSW index
- * @{
- */
+  /**
+   * @brief Allocate HNSW index
+   *
+   * @param[in] index cuvsHnswIndex_t to allocate
+   * @return cuvsError_t
+   */
+  cuvsError_t cuvsHnswIndexCreate(cuvsHnswIndex_t* index);
 
-struct cuvsHnswExtendParams {
-  /** Number of CPU threads used to extend additional vectors */
-  int num_threads;
-};
+  /**
+   * @brief De-allocate HNSW index
+   *
+   * @param[in] index cuvsHnswIndex_t to de-allocate
+   */
+  cuvsError_t cuvsHnswIndexDestroy(cuvsHnswIndex_t index);
 
-typedef struct cuvsHnswExtendParams* cuvsHnswExtendParams_t;
+  /**
+   * @}
+   */
 
-/**
- * @brief Allocate HNSW extend params, and populate with default values
- *
- * @param[in] params cuvsHnswExtendParams_t to allocate
- * @return cuvsError_t
- */
-cuvsError_t cuvsHnswExtendParamsCreate(cuvsHnswExtendParams_t* params);
+  /**
+   * @defgroup hnsw_c_extend_params Parameters for extending HNSW index
+   * @{
+   */
 
-/**
- * @brief De-allocate HNSW extend params
- *
- * @param[in] params cuvsHnswExtendParams_t to de-allocate
- * @return cuvsError_t
- */
+  struct cuvsHnswExtendParams {
+    /** Number of CPU threads used to extend additional vectors */
+    int num_threads;
+  };
 
-cuvsError_t cuvsHnswExtendParamsDestroy(cuvsHnswExtendParams_t params);
+  typedef struct cuvsHnswExtendParams* cuvsHnswExtendParams_t;
 
-/**
- * @}
- */
+  /**
+   * @brief Allocate HNSW extend params, and populate with default values
+   *
+   * @param[in] params cuvsHnswExtendParams_t to allocate
+   * @return cuvsError_t
+   */
+  cuvsError_t cuvsHnswExtendParamsCreate(cuvsHnswExtendParams_t* params);
 
-/**
- * @defgroup hnsw_c_index_load Load CAGRA index as hnswlib index
- * @{
- */
+  /**
+   * @brief De-allocate HNSW extend params
+   *
+   * @param[in] params cuvsHnswExtendParams_t to de-allocate
+   * @return cuvsError_t
+   */
 
-/**
- * @brief Convert a CAGRA Index to an HNSW index.
- * NOTE: When hierarchy is:
- *       1. `NONE`: This method uses the filesystem to write the CAGRA index in
- * `/tmp/<random_number>.bin` before reading it as an hnswlib index, then deleting the temporary
- * file. The returned index is immutable and can only be searched by the hnswlib wrapper in cuVS, as
- * the format is not compatible with the original hnswlib.
- *       2. `CPU`: The returned index is mutable and can be extended with additional vectors. The
- * serialized index is also compatible with the original hnswlib library.
- *
- * @param[in] res cuvsResources_t opaque C handle
- * @param[in] params cuvsHnswIndexParams_t used to load Hnsw index
- * @param[in] cagra_index cuvsCagraIndex_t to convert to HNSW index
- * @param[out] hnsw_index cuvsHnswIndex_t to return the HNSW index
- *
- * @return cuvsError_t
- *
- * @code{.c}
- * #include <cuvs/core/c_api.h>
- * #include <cuvs/neighbors/cagra.h>
- * #include <cuvs/neighbors/hnsw.h>
- *
- * // Create cuvsResources_t
- * cuvsResources_t res;
- * cuvsError_t res_create_status = cuvsResourcesCreate(&res);
- *
- * // create a CAGRA index with `cuvsCagraBuild`
- *
- * // Convert the CAGRA index to an HNSW index
- * cuvsHnswIndex_t hnsw_index;
- * cuvsHnswIndexCreate(&hnsw_index);
- * cuvsHnswIndexParams_t hnsw_params;
- * cuvsHnswIndexParamsCreate(&hnsw_params);
- * cuvsHnswFromCagra(res, hnsw_params, cagra_index, hnsw_index);
- *
- * // de-allocate `hnsw_params`, `hnsw_index` and `res`
- * cuvsError_t hnsw_params_destroy_status = cuvsHnswIndexParamsDestroy(hnsw_params);
- * cuvsError_t hnsw_index_destroy_status = cuvsHnswIndexDestroy(hnsw_index);
- * cuvsError_t res_destroy_status = cuvsResourcesDestroy(res);
- * @endcode
- */
-cuvsError_t cuvsHnswFromCagra(cuvsResources_t res,
-                              cuvsHnswIndexParams_t params,
-                              cuvsCagraIndex_t cagra_index,
-                              cuvsHnswIndex_t hnsw_index);
+  cuvsError_t cuvsHnswExtendParamsDestroy(cuvsHnswExtendParams_t params);
 
-/**
- * @}
- */
+  /**
+   * @}
+   */
 
-/**
- * @defgroup hnsw_c_index_extend Extend HNSW index with additional vectors
- * @{
- */
+  /**
+   * @defgroup hnsw_c_index_load Load CAGRA index as hnswlib index
+   * @{
+   */
 
-/**
- * @brief Add new vectors to an HNSW index
- * NOTE: The HNSW index can only be extended when the hierarchy is `CPU`
- *       when converting from a CAGRA index.
-
- * @param[in] res cuvsResources_t opaque C handle
- * @param[in] params cuvsHnswExtendParams_t used to extend Hnsw index
- * @param[in] additional_dataset DLManagedTensor* additional dataset to extend the index
- * @param[inout] index cuvsHnswIndex_t to extend
-  *
-  * @return cuvsError_t
-  *
-  * @code{.c}
-  * #include <cuvs/core/c_api.h>
-  * #include <cuvs/neighbors/cagra.h>
-  * #include <cuvs/neighbors/hnsw.h>
-  *
-  * // Create cuvsResources_t
-  * cuvsResources_t res;
-  * cuvsError_t res_create_status = cuvsResourcesCreate(&res);
-  *
-  * // create an index with `cuvsCagraBuild`
-  *
-  * // Convert the CAGRA index to an HNSW index
-  * cuvsHnswIndex_t hnsw_index;
-  * cuvsHnswIndexCreate(&hnsw_index);
-  * cuvsHnswIndexParams_t hnsw_params;
-  * cuvsHnswIndexParamsCreate(&hnsw_params);
-  * cuvsHnswFromCagra(res, hnsw_params, cagra_index, hnsw_index);
-  *
-  * // Extend the HNSW index with additional vectors
-  * DLManagedTensor additional_dataset;
-  * cuvsHnswExtendParams_t extend_params;
-  * cuvsHnswExtendParamsCreate(&extend_params);
-  * cuvsHnswExtend(res, extend_params, additional_dataset, hnsw_index);
-  *
-  * // de-allocate `hnsw_params`, `hnsw_index`, `extend_params` and `res`
-  * cuvsError_t hnsw_params_destroy_status = cuvsHnswIndexParamsDestroy(hnsw_params);
-  * cuvsError_t hnsw_index_destroy_status = cuvsHnswIndexDestroy(hnsw_index);
-  * cuvsError_t extend_params_destroy_status = cuvsHnswExtendParamsDestroy(extend_params);
-  * cuvsError_t res_destroy_status = cuvsResourcesDestroy(res);
-  * @endcode
-  */
-
-cuvsError_t cuvsHnswExtend(cuvsResources_t res,
-                           cuvsHnswExtendParams_t params,
-                           DLManagedTensor* additional_dataset,
-                           cuvsHnswIndex_t index);
+  /**
+   * @brief Convert a CAGRA Index to an HNSW index.
+   * NOTE: When hierarchy is:
+   *       1. `NONE`: This method uses the filesystem to write the CAGRA index in
+   * `/tmp/<random_number>.bin` before reading it as an hnswlib index, then deleting the temporary
+   * file. The returned index is immutable and can only be searched by the hnswlib wrapper in cuVS,
+   * as the format is not compatible with the original hnswlib.
+   *       2. `CPU`: The returned index is mutable and can be extended with additional vectors. The
+   * serialized index is also compatible with the original hnswlib library.
+   *
+   * @param[in] res cuvsResources_t opaque C handle
+   * @param[in] params cuvsHnswIndexParams_t used to load Hnsw index
+   * @param[in] cagra_index cuvsCagraIndex_t to convert to HNSW index
+   * @param[out] hnsw_index cuvsHnswIndex_t to return the HNSW index
+   *
+   * @return cuvsError_t
+   *
+   * @code{.c}
+   * #include <cuvs/core/c_api.h>
+   * #include <cuvs/neighbors/cagra.h>
+   * #include <cuvs/neighbors/hnsw.h>
+   *
+   * // Create cuvsResources_t
+   * cuvsResources_t res;
+   * cuvsError_t res_create_status = cuvsResourcesCreate(&res);
+   *
+   * // create a CAGRA index with `cuvsCagraBuild`
+   *
+   * // Convert the CAGRA index to an HNSW index
+   * cuvsHnswIndex_t hnsw_index;
+   * cuvsHnswIndexCreate(&hnsw_index);
+   * cuvsHnswIndexParams_t hnsw_params;
+   * cuvsHnswIndexParamsCreate(&hnsw_params);
+   * cuvsHnswFromCagra(res, hnsw_params, cagra_index, hnsw_index);
+   *
+   * // de-allocate `hnsw_params`, `hnsw_index` and `res`
+   * cuvsError_t hnsw_params_destroy_status = cuvsHnswIndexParamsDestroy(hnsw_params);
+   * cuvsError_t hnsw_index_destroy_status = cuvsHnswIndexDestroy(hnsw_index);
+   * cuvsError_t res_destroy_status = cuvsResourcesDestroy(res);
+   * @endcode
+   */
+  cuvsError_t cuvsHnswFromCagra(cuvsResources_t res,
+                                cuvsHnswIndexParams_t params,
+                                cuvsCagraIndex_t cagra_index,
+                                cuvsHnswIndex_t hnsw_index);
 
-/**
- * @}
- */
+  /**
+   * @}
+   */
 
-/**
- * @defgroup hnsw_c_search_params C API for hnswlib wrapper search params
- * @{
- */
+  /**
+   * @defgroup hnsw_c_index_extend Extend HNSW index with additional vectors
+   * @{
+   */
 
-struct cuvsHnswSearchParams {
-  int32_t ef;
-  int32_t num_threads;
-};
+  /**
+   * @brief Add new vectors to an HNSW index
+   * NOTE: The HNSW index can only be extended when the hierarchy is `CPU`
+   *       when converting from a CAGRA index.
+   *
+   * @param[in] res cuvsResources_t opaque C handle
+   * @param[in] params cuvsHnswExtendParams_t used to extend Hnsw index
+   * @param[in] additional_dataset DLManagedTensor* additional dataset to extend the index
+   * @param[inout] index cuvsHnswIndex_t to extend
+    *
+    * @return cuvsError_t
+    *
+    * @code{.c}
+    * #include <cuvs/core/c_api.h>
+    * #include <cuvs/neighbors/cagra.h>
+    * #include <cuvs/neighbors/hnsw.h>
+    *
+    * // Create cuvsResources_t
+    * cuvsResources_t res;
+    * cuvsError_t res_create_status = cuvsResourcesCreate(&res);
+    *
+    * // create an index with `cuvsCagraBuild`
+    *
+    * // Convert the CAGRA index to an HNSW index
+    * cuvsHnswIndex_t hnsw_index;
+    * cuvsHnswIndexCreate(&hnsw_index);
+    * cuvsHnswIndexParams_t hnsw_params;
+    * cuvsHnswIndexParamsCreate(&hnsw_params);
+    * cuvsHnswFromCagra(res, hnsw_params, cagra_index, hnsw_index);
+    *
+    * // Extend the HNSW index with additional vectors
+    * DLManagedTensor additional_dataset;
+    * cuvsHnswExtendParams_t extend_params;
+    * cuvsHnswExtendParamsCreate(&extend_params);
+    * cuvsHnswExtend(res, extend_params, &additional_dataset, hnsw_index);
+    *
+    * // de-allocate `hnsw_params`, `hnsw_index`, `extend_params` and `res`
+    * cuvsError_t hnsw_params_destroy_status = cuvsHnswIndexParamsDestroy(hnsw_params);
+    * cuvsError_t hnsw_index_destroy_status = cuvsHnswIndexDestroy(hnsw_index);
+    * cuvsError_t extend_params_destroy_status = cuvsHnswExtendParamsDestroy(extend_params);
+    * cuvsError_t res_destroy_status = cuvsResourcesDestroy(res);
+    * @endcode
+    */
+
+  cuvsError_t cuvsHnswExtend(cuvsResources_t res,
+                             cuvsHnswExtendParams_t params,
+                             DLManagedTensor* additional_dataset,
+                             cuvsHnswIndex_t index);
+
+  /**
+   * @}
+   */
 
-typedef struct cuvsHnswSearchParams* cuvsHnswSearchParams_t;
+  /**
+   * @defgroup hnsw_c_search_params C API for hnswlib wrapper search params
+   * @{
+   */
 
-/**
- * @brief Allocate HNSW search params, and populate with default values
- *
- * @param[in] params cuvsHnswSearchParams_t to allocate
- * @return cuvsError_t
- */
-cuvsError_t cuvsHnswSearchParamsCreate(cuvsHnswSearchParams_t* params);
+  struct cuvsHnswSearchParams {
+    int32_t ef;
+    int32_t num_threads;
+  };
 
-/**
- * @brief De-allocate HNSW search params
- *
- * @param[in] params cuvsHnswSearchParams_t to de-allocate
- * @return cuvsError_t
- */
-cuvsError_t cuvsHnswSearchParamsDestroy(cuvsHnswSearchParams_t params);
+  typedef struct cuvsHnswSearchParams* cuvsHnswSearchParams_t;
 
-/**
- * @}
- */
+  /**
+   * @brief Allocate HNSW search params, and populate with default values
+   *
+   * @param[in] params cuvsHnswSearchParams_t to allocate
+   * @return cuvsError_t
+   */
+  cuvsError_t cuvsHnswSearchParamsCreate(cuvsHnswSearchParams_t* params);
 
-/**
- * @defgroup hnsw_c_index_search C API for CUDA ANN Graph-based nearest neighbor search
- * @{
- */
-/**
- * @brief Search a HNSW index with a `DLManagedTensor` which has underlying
- *        `DLDeviceType` equal to `kDLCPU`, `kDLCUDAHost`, or `kDLCUDAManaged`.
- *        It is also important to note that the HNSW Index must have been built
- *        with the same type of `queries`, such that `index.dtype.code ==
- *        queries.dl_tensor.dtype.code`
- *        Supported types for input are:
- *        1. `queries`:
- *          a. `kDLDataType.code == kDLFloat` and `kDLDataType.bits = 32`
- *          b. `kDLDataType.code == kDLInt` and `kDLDataType.bits = 8`
- *          c. `kDLDataType.code == kDLUInt` and `kDLDataType.bits = 8`
- *        2. `neighbors`: `kDLDataType.code == kDLUInt` and `kDLDataType.bits = 64`
- *        3. `distances`: `kDLDataType.code == kDLFloat` and `kDLDataType.bits = 32`
- * NOTE: When hierarchy is `NONE`, the HNSW index can only be searched by the hnswlib wrapper in
- * cuVS, as the format is not compatible with the original hnswlib.
- *
- * @code {.c}
- * #include <cuvs/core/c_api.h>
- * #include <cuvs/neighbors/hnsw.h>
- *
- * // Create cuvsResources_t
- * cuvsResources_t res;
- * cuvsError_t res_create_status = cuvsResourcesCreate(&res);
- *
- * // Assume a populated `DLManagedTensor` type here
- * DLManagedTensor dataset;
- * DLManagedTensor queries;
- * DLManagedTensor neighbors;
- *
- * // Create default search params
- * cuvsHnswSearchParams_t params;
- * cuvsError_t params_create_status = cuvsHnswSearchParamsCreate(&params);
- *
- * // Search the `index` built using `cuvsHnswFromCagra`
- * cuvsError_t search_status = cuvsHnswSearch(res, params, index, &queries, &neighbors,
- * &distances);
- *
- * // de-allocate `params` and `res`
- * cuvsError_t params_destroy_status = cuvsHnswSearchParamsDestroy(params);
- * cuvsError_t res_destroy_status = cuvsResourcesDestroy(res);
- * @endcode
- *
- * @param[in] res cuvsResources_t opaque C handle
- * @param[in] params cuvsHnswSearchParams_t used to search Hnsw index
- * @param[in] index cuvsHnswIndex which has been returned by `cuvsHnswFromCagra`
- * @param[in] queries DLManagedTensor* queries dataset to search
- * @param[out] neighbors DLManagedTensor* output `k` neighbors for queries
- * @param[out] distances DLManagedTensor* output `k` distances for queries
- */
-cuvsError_t cuvsHnswSearch(cuvsResources_t res,
-                           cuvsHnswSearchParams_t params,
-                           cuvsHnswIndex_t index,
-                           DLManagedTensor* queries,
-                           DLManagedTensor* neighbors,
-                           DLManagedTensor* distances);
+  /**
+   * @brief De-allocate HNSW search params
+   *
+   * @param[in] params cuvsHnswSearchParams_t to de-allocate
+   * @return cuvsError_t
+   */
+  cuvsError_t cuvsHnswSearchParamsDestroy(cuvsHnswSearchParams_t params);
 
-/**
- * @}
- */
+  /**
+   * @}
+   */
 
-/**
- * @defgroup hnsw_c_serialize HNSW C-API serialize functions
- * @{
- */
+  /**
+   * @defgroup hnsw_c_index_search C API for CUDA ANN Graph-based nearest neighbor search
+   * @{
+   */
+  /**
+   * @brief Search a HNSW index with a `DLManagedTensor` which has underlying
+   *        `DLDeviceType` equal to `kDLCPU`, `kDLCUDAHost`, or `kDLCUDAManaged`.
+   *        It is also important to note that the HNSW Index must have been built
+   *        with the same type of `queries`, such that `index.dtype.code ==
+   *        queries.dl_tensor.dtype.code`
+   *        Supported types for input are:
+   *        1. `queries`:
+   *          a. `kDLDataType.code == kDLFloat` and `kDLDataType.bits = 32`
+   *          b. `kDLDataType.code == kDLInt` and `kDLDataType.bits = 8`
+   *          c. `kDLDataType.code == kDLUInt` and `kDLDataType.bits = 8`
+   *        2. `neighbors`: `kDLDataType.code == kDLUInt` and `kDLDataType.bits = 64`
+   *        3. `distances`: `kDLDataType.code == kDLFloat` and `kDLDataType.bits = 32`
+   * NOTE: When hierarchy is `NONE`, the HNSW index can only be searched by the hnswlib wrapper in
+   * cuVS, as the format is not compatible with the original hnswlib.
+   *
+   * @code {.c}
+   * #include <cuvs/core/c_api.h>
+   * #include <cuvs/neighbors/hnsw.h>
+   *
+   * // Create cuvsResources_t
+   * cuvsResources_t res;
+   * cuvsError_t res_create_status = cuvsResourcesCreate(&res);
+   *
+   * // Assume a populated `DLManagedTensor` type here
+   * DLManagedTensor queries;
+   * DLManagedTensor neighbors;
+   * DLManagedTensor distances;
+   *
+   * // Create default search params
+   * cuvsHnswSearchParams_t params;
+   * cuvsError_t params_create_status = cuvsHnswSearchParamsCreate(&params);
+   *
+   * // Search the `index` built using `cuvsHnswFromCagra`
+   * cuvsError_t search_status = cuvsHnswSearch(res, params, index, &queries, &neighbors,
+   * &distances);
+   *
+   * // de-allocate `params` and `res`
+   * cuvsError_t params_destroy_status = cuvsHnswSearchParamsDestroy(params);
+   * cuvsError_t res_destroy_status = cuvsResourcesDestroy(res);
+   * @endcode
+   *
+   * @param[in] res cuvsResources_t opaque C handle
+   * @param[in] params cuvsHnswSearchParams_t used to search Hnsw index
+   * @param[in] index cuvsHnswIndex which has been returned by `cuvsHnswFromCagra`
+   * @param[in] queries DLManagedTensor* queries dataset to search
+   * @param[out] neighbors DLManagedTensor* output `k` neighbors for queries
+   * @param[out] distances DLManagedTensor* output `k` distances for queries
+   */
+  cuvsError_t cuvsHnswSearch(cuvsResources_t res,
+                             cuvsHnswSearchParams_t params,
+                             cuvsHnswIndex_t index,
+                             DLManagedTensor* queries,
+                             DLManagedTensor* neighbors,
+                             DLManagedTensor* distances);
+
+  /**
+   * @}
+   */
 
-/**
- * @brief Serialize a CAGRA index to a file as an hnswlib index
- * NOTE: When hierarchy is `NONE`, the saved hnswlib index is immutable and can only be read by the
- * hnswlib wrapper in cuVS, as the serialization format is not compatible with the original hnswlib.
- * However, when hierarchy is `CPU`, the saved hnswlib index is compatible with the original hnswlib
- * library.
- *
- * @param[in] res cuvsResources_t opaque C handle
- * @param[in] filename the name of the file to save the index
- * @param[in] index cuvsHnswIndex_t to serialize
- * @return cuvsError_t
- *
- * @code{.c}
- * #include <cuvs/core/c_api.h>
- * #include <cuvs/neighbors/cagra.h>
- * #include <cuvs/neighbors/hnsw.h>
- *
- * // Create cuvsResources_t
- * cuvsResources_t res;
- * cuvsError_t res_create_status = cuvsResourcesCreate(&res);
- *
- * // create an index with `cuvsCagraBuild`
- *
- * // Convert the CAGRA index to an HNSW index
- * cuvsHnswIndex_t hnsw_index;
- * cuvsHnswIndexCreate(&hnsw_index);
- * cuvsHnswIndexParams_t hnsw_params;
- * cuvsHnswIndexParamsCreate(&hnsw_params);
- * cuvsHnswFromCagra(res, hnsw_params, cagra_index, hnsw_index);
- *
- * // Serialize the HNSW index
- * cuvsHnswSerialize(res, "/path/to/index", hnsw_index);
- *
- * // de-allocate `hnsw_params`, `hnsw_index` and `res`
- * cuvsError_t hnsw_params_destroy_status = cuvsHnswIndexParamsDestroy(hnsw_params);
- * cuvsError_t hnsw_index_destroy_status = cuvsHnswIndexDestroy(hnsw_index);
- * cuvsError_t res_destroy_status = cuvsResourcesDestroy(res);
- * @endcode
- */
-cuvsError_t cuvsHnswSerialize(cuvsResources_t res, const char* filename, cuvsHnswIndex_t index);
+  /**
+   * @defgroup hnsw_c_serialize HNSW C-API serialize functions
+   * @{
+   */
 
-/**
- * Load hnswlib index from file which was serialized from a HNSW index.
- * NOTE: When hierarchy is `NONE`, the loaded hnswlib index is immutable, and only be read by the
- * hnswlib wrapper in cuVS, as the serialization format is not compatible with the original hnswlib.
- * Experimental, both the API and the serialization format are subject to change.
- *
- * @code{.c}
- * #include <cuvs/core/c_api.h>
- * #include <cuvs/neighbors/cagra.h>
- * #include <cuvs/neighbors/hnsw.h>
- *
- * // Create cuvsResources_t
- * cuvsResources_t res;
- * cuvsError_t res_create_status = cuvsResourcesCreate(&res);
- *
- * // create an index with `cuvsCagraBuild`
- * cuvsCagraSerializeHnswlib(res, "/path/to/index", index);
- *
- * // Load the serialized CAGRA index from file as an hnswlib index
- * // The index should have the same dtype as the one used to build CAGRA the index
- * cuvsHnswIndex_t hnsw_index;
- * cuvsHnswIndexCreate(&hnsw_index);
- * cuvsHnsWIndexParams_t hnsw_params;
- * cuvsHnswIndexParamsCreate(&hnsw_params);
- * hnsw_params->hierarchy = NONE;
- * hnsw_index->dtype = index->dtype;
- * cuvsHnswDeserialize(res, hnsw_params, "/path/to/index", dim, metric hnsw_index);
- * @endcode
- *
- * @param[in] res cuvsResources_t opaque C handle
- * @param[in] params cuvsHnswIndexParams_t used to load Hnsw index
- * @param[in] filename the name of the file that stores the index
- * @param[in] dim the dimension of the vectors in the index
- * @param[in] metric the distance metric used to build the index
- * @param[out] index HNSW index loaded disk
- */
-cuvsError_t cuvsHnswDeserialize(cuvsResources_t res,
-                                cuvsHnswIndexParams_t params,
-                                const char* filename,
-                                int dim,
-                                cuvsDistanceType metric,
-                                cuvsHnswIndex_t index);
-/**
- * @}
- */
+  /**
+   * @brief Serialize a CAGRA index to a file as an hnswlib index
+   * NOTE: When hierarchy is `NONE`, the saved hnswlib index is immutable and can only be read by
+   * the hnswlib wrapper in cuVS, as the serialization format is not compatible with the original
+   * hnswlib. However, when hierarchy is `CPU`, the saved hnswlib index is compatible with the
+   * original hnswlib library.
+   *
+   * @param[in] res cuvsResources_t opaque C handle
+   * @param[in] filename the name of the file to save the index
+   * @param[in] index cuvsHnswIndex_t to serialize
+   * @return cuvsError_t
+   *
+   * @code{.c}
+   * #include <cuvs/core/c_api.h>
+   * #include <cuvs/neighbors/cagra.h>
+   * #include <cuvs/neighbors/hnsw.h>
+   *
+   * // Create cuvsResources_t
+   * cuvsResources_t res;
+   * cuvsError_t res_create_status = cuvsResourcesCreate(&res);
+   *
+   * // create an index with `cuvsCagraBuild`
+   *
+   * // Convert the CAGRA index to an HNSW index
+   * cuvsHnswIndex_t hnsw_index;
+   * cuvsHnswIndexCreate(&hnsw_index);
+   * cuvsHnswIndexParams_t hnsw_params;
+   * cuvsHnswIndexParamsCreate(&hnsw_params);
+   * cuvsHnswFromCagra(res, hnsw_params, cagra_index, hnsw_index);
+   *
+   * // Serialize the HNSW index
+   * cuvsHnswSerialize(res, "/path/to/index", hnsw_index);
+   *
+   * // de-allocate `hnsw_params`, `hnsw_index` and `res`
+   * cuvsError_t hnsw_params_destroy_status = cuvsHnswIndexParamsDestroy(hnsw_params);
+   * cuvsError_t hnsw_index_destroy_status = cuvsHnswIndexDestroy(hnsw_index);
+   * cuvsError_t res_destroy_status = cuvsResourcesDestroy(res);
+   * @endcode
+   */
+  cuvsError_t cuvsHnswSerialize(cuvsResources_t res, const char* filename, cuvsHnswIndex_t index);
+
+  /**
+   * Load hnswlib index from file which was serialized from a HNSW index.
+   * NOTE: When hierarchy is `NONE`, the loaded hnswlib index is immutable and can only be read by
+   * the hnswlib wrapper in cuVS, as the serialization format is not compatible with the original
+   * hnswlib. Experimental, both the API and the serialization format are subject to change.
+   *
+   * @code{.c}
+   * #include <cuvs/core/c_api.h>
+   * #include <cuvs/neighbors/cagra.h>
+   * #include <cuvs/neighbors/hnsw.h>
+   *
+   * // Create cuvsResources_t
+   * cuvsResources_t res;
+   * cuvsError_t res_create_status = cuvsResourcesCreate(&res);
+   *
+   * // create an index with `cuvsCagraBuild`
+   * cuvsCagraSerializeHnswlib(res, "/path/to/index", index);
+   *
+   * // Load the serialized CAGRA index from file as an hnswlib index
+   * // The index should have the same dtype as the one used to build CAGRA the index
+   * cuvsHnswIndex_t hnsw_index;
+   * cuvsHnswIndexCreate(&hnsw_index);
+   * cuvsHnswIndexParams_t hnsw_params;
+   * cuvsHnswIndexParamsCreate(&hnsw_params);
+   * hnsw_params->hierarchy = NONE;
+   * hnsw_index->dtype = index->dtype;
+   * cuvsHnswDeserialize(res, hnsw_params, "/path/to/index", dim, metric, hnsw_index);
+   * @endcode
+   *
+   * @param[in] res cuvsResources_t opaque C handle
+   * @param[in] params cuvsHnswIndexParams_t used to load Hnsw index
+   * @param[in] filename the name of the file that stores the index
+   * @param[in] dim the dimension of the vectors in the index
+   * @param[in] metric the distance metric used to build the index
+   * @param[out] index HNSW index loaded from disk
+   */
+  cuvsError_t cuvsHnswDeserialize(cuvsResources_t res,
+                                  cuvsHnswIndexParams_t params,
+                                  const char* filename,
+                                  int dim,
+                                  cuvsDistanceType metric,
+                                  cuvsHnswIndex_t index);
+  /**
+   * @}
+   */
 
 #ifdef __cplusplus
 }
diff --git a/cpp/include/cuvs/neighbors/hnsw.hpp b/cpp/include/cuvs/neighbors/hnsw.hpp
index f0b433d8e..db58641c8 100644
--- a/cpp/include/cuvs/neighbors/hnsw.hpp
+++ b/cpp/include/cuvs/neighbors/hnsw.hpp
@@ -54,10 +54,10 @@ struct index_params : cuvs::neighbors::index_params {
   /** Size of the candidate list during hierarchy construction when hierarchy is `CPU`*/
   int ef_construction = 200;
   /** Number of host threads to use to construct hierarchy when hierarchy is `CPU`
-  NOTE: Constructing the hierarchy when converting from a CAGRA graph is highly sensitive
-  to parallelism, and increasing the number of threads can reduce the quality of the index.
+      When the value is 0, the number of threads is automatically set to the
+      maximum number of threads available.
    */
-  int num_threads = 2;
+  int num_threads = 0;
 };
 
 /**@}*/
diff --git a/cpp/src/neighbors/detail/hnsw.hpp b/cpp/src/neighbors/detail/hnsw.hpp
index e129d23e8..75b1c9ce6 100644
--- a/cpp/src/neighbors/detail/hnsw.hpp
+++ b/cpp/src/neighbors/detail/hnsw.hpp
@@ -21,64 +21,12 @@
 #include <hnswlib/hnswalg.h>
 #include <hnswlib/hnswlib.h>
 #include <memory>
+#include <omp.h>
 #include <random>
 #include <thread>
 
 namespace cuvs::neighbors::hnsw::detail {
 
-// Multithreaded executor
-// The helper function is copied from the hnswlib repository
-// as for some reason, adding vectors to the hnswlib index does not
-// work well with omp parallel for
-template <class Function>
-inline void ParallelFor(size_t start, size_t end, size_t numThreads, Function fn)
-{
-  if (numThreads <= 0) { numThreads = std::thread::hardware_concurrency(); }
-
-  if (numThreads == 1) {
-    for (size_t id = start; id < end; id++) {
-      fn(id, 0);
-    }
-  } else {
-    std::vector<std::thread> threads;
-    std::atomic<size_t> current(start);
-
-    // keep track of exceptions in threads
-    // https://stackoverflow.com/a/32428427/1713196
-    std::exception_ptr lastException = nullptr;
-    std::mutex lastExceptMutex;
-
-    for (size_t threadId = 0; threadId < numThreads; ++threadId) {
-      threads.push_back(std::thread([&, threadId] {
-        while (true) {
-          size_t id = current.fetch_add(1);
-
-          if (id >= end) { break; }
-
-          try {
-            fn(id, threadId);
-          } catch (...) {
-            std::unique_lock<std::mutex> lastExcepLock(lastExceptMutex);
-            lastException = std::current_exception();
-            /*
-             * This will work even when current is the largest value that
-             * size_t can fit, because fetch_add returns the previous value
-             * before the increment (what will result in overflow
-             * and produce 0 instead of current + 1).
-             */
-            current = end;
-            break;
-          }
-        }
-      }));
-    }
-    for (auto& thread : threads) {
-      thread.join();
-    }
-    if (lastException) { std::rethrow_exception(lastException); }
-  }
-}
-
 template <typename T>
 struct hnsw_dist_t {
   using type = void;
@@ -217,10 +165,13 @@ std::enable_if_t<hierarchy == HnswHierarchy::CPU, std::unique_ptr<index<T>>> fro
     cagra_index.graph().extent(1) / 2,
     params.ef_construction);
   appr_algo->base_layer_init = false;  // tell hnswlib to build upper layers only
-  ParallelFor(0, host_dataset_view.extent(0), params.num_threads, [&](size_t i, size_t threadId) {
+  // num_threads == 0 means: use as many threads as OpenMP makes available
+  auto num_threads = params.num_threads == 0 ? omp_get_max_threads() : params.num_threads;
+#pragma omp parallel for num_threads(num_threads)
+  for (int64_t i = 0; i < host_dataset_view.extent(0); i++) {
     appr_algo->addPoint((void*)(host_dataset_view.data_handle() + i * host_dataset_view.extent(1)),
                         i);
-  });
+  }
   appr_algo->base_layer_init = true;  // reset to true to allow addition of new points
 
   // move cagra graph to host
@@ -236,11 +187,13 @@ std::enable_if_t<hierarchy == HnswHierarchy::CPU, std::unique_ptr<index<T>>> fro
 // copy cagra graph to hnswlib base layer
 #pragma omp parallel for
   for (size_t i = 0; i < static_cast<size_t>(host_graph.extent(0)); ++i) {
-    auto ll_i = appr_algo->get_linklist0(i);
+    auto hnsw_internal_id = appr_algo->label_lookup_.find(i)->second;
+    auto ll_i             = appr_algo->get_linklist0(hnsw_internal_id);
     appr_algo->setListCount(ll_i, host_graph.extent(1));
     auto* data = (uint32_t*)(ll_i + 1);
     for (size_t j = 0; j < static_cast<size_t>(host_graph.extent(1)); ++j) {
-      data[j] = host_graph(i, j);
+      auto neighbor_internal_id = appr_algo->label_lookup_.find(host_graph(i, j))->second;
+      data[j]                   = neighbor_internal_id;
     }
   }
 
@@ -275,19 +228,15 @@ void extend(raft::resources const& res,
     const_cast<void*>(idx.get_index()));
   auto current_element_count = hnswlib_index->getCurrentElementCount();
   auto new_element_count     = additional_dataset.extent(0);
-  auto num_threads           = params.num_threads == 0 ? std::thread::hardware_concurrency()
-                                                       : static_cast<size_t>(params.num_threads);
+  auto num_threads           = params.num_threads == 0 ? omp_get_max_threads() : params.num_threads;
 
   hnswlib_index->resizeIndex(current_element_count + new_element_count);
-  ParallelFor(current_element_count,
-              current_element_count + new_element_count,
-              num_threads,
-              [&](size_t i, size_t threadId) {
-                hnswlib_index->addPoint(
-                  (void*)(additional_dataset.data_handle() +
-                          (i - current_element_count) * additional_dataset.extent(1)),
-                  i);
-              });
+#pragma omp parallel for num_threads(num_threads)
+  for (int64_t i = 0; i < additional_dataset.extent(0); i++) {
+    hnswlib_index->addPoint(
+      (void*)(additional_dataset.data_handle() + i * additional_dataset.extent(1)),
+      current_element_count + i);
+  }
 }
 
 template <typename T>
diff --git a/cpp/src/neighbors/hnsw_c.cpp b/cpp/src/neighbors/hnsw_c.cpp
index 0233a510a..628d87e00 100644
--- a/cpp/src/neighbors/hnsw_c.cpp
+++ b/cpp/src/neighbors/hnsw_c.cpp
@@ -123,7 +123,7 @@ extern "C" cuvsError_t cuvsHnswIndexParamsCreate(cuvsHnswIndexParams_t* params)
 {
   return cuvs::core::translate_exceptions([=] {
     *params = new cuvsHnswIndexParams{
-      .hierarchy = cuvsHnswHierarchy::NONE, .ef_construction = 200, .num_threads = 2};
+      .hierarchy = cuvsHnswHierarchy::NONE, .ef_construction = 200, .num_threads = 0};
   });
 }
 
diff --git a/python/cuvs/cuvs/neighbors/hnsw/hnsw.pyx b/python/cuvs/cuvs/neighbors/hnsw/hnsw.pyx
index 4c44350e8..72a3617bd 100644
--- a/python/cuvs/cuvs/neighbors/hnsw/hnsw.pyx
+++ b/python/cuvs/cuvs/neighbors/hnsw/hnsw.pyx
@@ -52,12 +52,10 @@ cdef class IndexParams:
     ef_construction : int, default = 200 (optional)
         Maximum number of candidate list size used during construction
         when hierarchy is `cpu`.
-    num_threads : int, default = 2 (optional)
+    num_threads : int, default = 0 (optional)
         Number of CPU threads used to increase construction parallelism
-        when hierarchy is `cpu`.
-        NOTE: Constructing the hierarchy when converting from a CAGRA graph
-        is highly sensitive to parallelism, and increasing the number of
-        threads can reduce the quality of the index.
+        when hierarchy is `cpu`. When the value is 0, the number of threads is
+        automatically set to the maximum number of threads available.
     """
 
     cdef cuvsHnswIndexParams* params
@@ -71,7 +69,7 @@ cdef class IndexParams:
     def __init__(self, *,
                  hierarchy="none",
                  ef_construction=200,
-                 num_threads=2):
+                 num_threads=0):
         if hierarchy == "none":
             self.params.hierarchy = cuvsHnswHierarchy.NONE
         elif hierarchy == "cpu":
diff --git a/python/cuvs/cuvs/test/test_hnsw.py b/python/cuvs/cuvs/test/test_hnsw.py
index 20f583ae8..b6ddf14df 100644
--- a/python/cuvs/cuvs/test/test_hnsw.py
+++ b/python/cuvs/cuvs/test/test_hnsw.py
@@ -54,7 +54,7 @@ def run_hnsw_build_search_test(
 
     assert index.trained
 
-    hnsw_params = hnsw.IndexParams(hierarchy=hierarchy, num_threads=1)
+    hnsw_params = hnsw.IndexParams(hierarchy=hierarchy)
     hnsw_index = hnsw.from_cagra(hnsw_params, index)
 
     queries = generate_data((n_queries, n_cols), dtype)
@@ -135,7 +135,7 @@ def run_hnsw_extend_test(
 
     assert index.trained
 
-    hnsw_params = hnsw.IndexParams(hierarchy="cpu", num_threads=1)
+    hnsw_params = hnsw.IndexParams(hierarchy="cpu")
     hnsw_index = hnsw.from_cagra(hnsw_params, index)
     hnsw.extend(hnsw.ExtendParams(), hnsw_index, add_dataset)
 
diff --git a/python/cuvs_bench/cuvs_bench/config/algos/cuvs_cagra_hnswlib.yaml b/python/cuvs_bench/cuvs_bench/config/algos/cuvs_cagra_hnswlib.yaml
index 90a561bca..b02efaa8f 100644
--- a/python/cuvs_bench/cuvs_bench/config/algos/cuvs_cagra_hnswlib.yaml
+++ b/python/cuvs_bench/cuvs_bench/config/algos/cuvs_cagra_hnswlib.yaml
@@ -1,5 +1,6 @@
 name: cuvs_cagra_hnswlib
 constraints:
+  build: cuvs_bench.config.algos.constraints.cuvs_cagra_build
   search: cuvs_bench.config.algos.constraints.hnswlib_search
 groups:
   base:
@@ -7,8 +8,7 @@ groups:
       graph_degree: [32, 64, 96, 128]
       intermediate_graph_degree: [32, 64, 96, 128]
       graph_build_algo: ["NN_DESCENT"]
-      hierarchy: ["none", "cpu"]
+      hierarchy: ["cpu"]
       ef_construction: [64, 128, 256, 512]
-      num_threads: [2, 5, 10]
     search:
       ef: [10, 20, 40, 60, 80, 120, 200, 400, 600, 800]