[#24465] Fix Usearch / Hnswlib precision discrepancy

Summary: Fixing some inconsistencies in index parameters that are causing a discrepancy between Usearch and Hnswlib performance: - Correctly specifying connectivity for hnswlib as num_neighbors_per_vertex instead of max_neighbors_per_vertex. - Passing the ef option into hnswlib configuration. Adding internal statistics introspection to Usearch and Hnswlib index wrappers. PR for hnswlib changes: nmslib/hnswlib#594. PR for usearch changes: unum-cloud/usearch#508 Also allow specifying multiple values of k to pass in as input, as long as they are not greater than the precomputed ground truth result list size. Updating hnsw_tool to always convert uint8_t coordinates to float32 when using Hnswlib to have a fair comparison with Usearch on the SIFT1B dataset. Usearch does not currently support the uint8_t type natively. The changes to src/inline-thirdparty will be pushed as separate commits generated by `build-support/thirdparty_tool --sync-inline-thirdparty`. Test Plan: Jenkins Manual testing using hnsw_tool - hnswlib: https://gist.githubusercontent.com/mbautin/d21580dcac0b51ad2d7bc9fc130c5f9e/raw ``` Hnswlib index with 5 levels max_elements: 1000000 M: 16 maxM: 16 maxM0: 32 ef_construction: 128 ef: 10 mult: 0.360674 Level 0: 1000000 nodes, 21613828 edges, 21.61 average edges per node Level 1: 62323 nodes, 885027 edges, 14.20 average edges per node Level 2: 3855 nodes, 50515 edges, 13.10 average edges per node Level 3: 238 nodes, 2543 edges, 10.68 average edges per node Level 4: 17 nodes, 244 edges, 14.35 average edges per node Totals: 1066433 nodes, 22552157 edges, 21.15 average edges per node i-recall @ 50, i=1..10: 1-recall @ 50: 0.9695000052 2-recall @ 50: 0.9645000100 3-recall @ 50: 0.9604333043 4-recall @ 50: 0.9568499923 5-recall @ 50: 0.9541400075 6-recall @ 50: 0.9504333138 7-recall @ 50: 0.9467428327 8-recall @ 50: 0.9435999990 9-recall @ 50: 0.9406333566 10-recall @ 50: 0.9377999902 ``` - usearch: https://gist.githubusercontent.com/mbautin/74948b310780562e74831eb29e43cb13/raw ``` Usearch index with 4 levels connectivity: 16 connectivity_base: 32 expansion_add: 128 expansion_search: 10 inverse_log_connectivity: 0.360674 Level 0: 1000000 nodes, 20973352 edges, 20.97 average edges per node Level 1: 64036 nodes, 890428 edges, 13.91 average edges per node Level 2: 5090 nodes, 66295 edges, 13.02 average edges per node Level 3: 481 nodes, 5304 edges, 11.03 average edges per node Totals: 1069607 nodes, 21935379 edges, 20.51 average edges per node i-recall@50, i=1..10: 1-recall @ 40: 0.9305999875 2-recall @ 40: 0.9201999903 3-recall @ 40: 0.9141333103 4-recall @ 40: 0.9085000157 5-recall @ 40: 0.9036399722 6-recall @ 40: 0.8987166882 7-recall @ 40: 0.8932142854 8-recall @ 40: 0.8890249729 9-recall @ 40: 0.8852999806 10-recall @ 40: 0.8813199997 ``` Reviewers: sergei, aleksandr.ponomarenko Reviewed By: sergei, aleksandr.ponomarenko Subscribers: ybase Differential Revision: https://phorge.dev.yugabyte.com/D38977
yugabyte · Oct 22, 2024 · 1b2db39 · 1b2db39
1 parent 9007253
commit 1b2db39
Show file tree

Hide file tree

Showing 17 changed files with 608 additions and 115 deletions.
diff --git a/build-support/inline_thirdparty.yml b/build-support/inline_thirdparty.yml
@@ -15,8 +15,8 @@
 
 dependencies:
   - name: usearch
-    git_url: https://github.com/unum-cloud/usearch
-    commit: 191d9bb46fe5e2a44d1505ce7563ed51c7e55868
+    git_url: https://github.com/yugabyte/usearch
+    tag: v2.15.3-yb-2
     src_dir: include
     dest_dir: usearch
 
@@ -27,8 +27,10 @@ dependencies:
     dest_dir: fp16
 
   - name: hnswlib
-    git_url: https://github.com/nmslib/hnswlib
-    commit: 2142dc6f4dd08e64ab727a7bbd93be7f732e80b0
+    git_url: https://github.com/yugabyte/hnswlib
+    # c1b9b79a is the commit in the upstream repo that we branched off of. We apply our patches to
+    # our branch c1b9b79a-yb while waiting for our PRs to be merged in the official Hnswlib.
+    tag: vc1b9b79a-yb-1
     src_dir: hnswlib
     dest_dir: hnswlib/hnswlib
 

diff --git a/src/yb/gutil/stl_util.h b/src/yb/gutil/stl_util.h
@@ -898,6 +898,51 @@ struct PointerHash {
 template <class Value>
 using UnorderedStringMap = std::unordered_map<std::string, Value, StringHash, std::equal_to<void>>;
 
+// Define a concept that ensures Container's value_type is T
+template<typename Container, typename T>
+concept ContainerOf = requires {
+  typename Container::value_type;
+} && std::same_as<typename Container::value_type, T>;
+
+// A unified way to insert values into supported containers
+
+// Overload for std::set
+template <typename Element, typename Compare, typename Allocator>
+void InsertIntoContainer(
+    std::set<Element, Compare, Allocator>& container,
+    const Element& value) {
+  container.insert(value);
+}
+
+template <typename Element, typename Compare, typename Allocator>
+void InsertIntoContainer(
+    std::set<Element, Compare, Allocator>& container,
+    Element&& value) {
+  container.insert(std::move(value));
+}
+
+// Overload for std::vector
+template <typename Element, typename Allocator>
+void InsertIntoContainer(
+    std::vector<Element, Allocator>& container,
+    const Element& value) {
+  container.push_back(value);
+}
+
+template <typename Element, typename Allocator>
+void InsertIntoContainer(
+    std::vector<Element, Allocator>& container,
+    Element&& value) {
+  container.push_back(std::move(value));
+}
+
+// Fallback to generate a compile-time error for unsupported containers
+template <typename Container, typename Element>
+void InsertIntoContainer(Container&, const Element&) {
+  static_assert(sizeof(Container) == 0,
+                "InsertIntoContainer is not supported for this container type.");
+}
+
 } // namespace yb
 
 // For backward compatibility

diff --git a/src/yb/tools/hnsw_tool.cc b/src/yb/tools/hnsw_tool.cc
diff --git a/src/yb/util/stol_utils-test.cc b/src/yb/util/stol_utils-test.cc
@@ -21,21 +21,27 @@ namespace util {
 class StolUtilsTest : public YBTest {
  protected:
   void TestStoi(const std::string& str, const int32_t int_val) {
-    auto decoded_int = CheckedStoi(str);
-    ASSERT_OK(decoded_int);
-    ASSERT_EQ(int_val, *decoded_int);
+    auto decoded_int = ASSERT_RESULT(CheckedStoi(str));
+    ASSERT_EQ(int_val, decoded_int);
   }
 
   void TestStoll(const std::string& str, const int64_t int_val) {
-    auto decoded_int = CheckedStoll(str);
-    ASSERT_OK(decoded_int);
-    ASSERT_EQ(int_val, *decoded_int);
+    auto decoded_int = ASSERT_RESULT(CheckedStoll(str));
+    ASSERT_EQ(int_val, decoded_int);
+  }
+
+  void TestStoull(const std::string& str, const uint64_t int_val) {
+    auto decoded_int = ASSERT_RESULT(CheckedStoull(str));
+    ASSERT_EQ(int_val, decoded_int);
   }
 
   void TestStold(const std::string& str, const long double double_val) {
-    auto decoded_double = CheckedStold(str);
-    ASSERT_OK(decoded_double);
-    ASSERT_DOUBLE_EQ(double_val, *decoded_double);
+    auto decoded_double = ASSERT_RESULT(CheckedStold(str));
+    if (std::isnan(double_val) && std::isnan(decoded_double)) {
+      // Equality might not work for two NaN values, but we consider them equal here.
+      return;
+    }
+    ASSERT_DOUBLE_EQ(double_val, decoded_double);
   }
 };
 
@@ -70,11 +76,31 @@ TEST_F(StolUtilsTest, TestCheckedStoild) {
   ASSERT_NOK(CheckedStoll("123 123"));
   ASSERT_NOK(CheckedStoll(""));
 
+  TestStoull("0", 0);
+  TestStoull("123", 123);
+  TestStoull("9223372036854775808", 9223372036854775808ULL);
+  TestStoull("18446744073709551615", 18446744073709551615ULL);
+  ASSERT_NOK(CheckedStoull("abcd"));
+  ASSERT_NOK(CheckedStoull(" 123"));
+  ASSERT_NOK(CheckedStoull("123abc"));
+  ASSERT_NOK(CheckedStoull("-1"));
+  ASSERT_NOK(CheckedStoull("-9223372036854775809"));
+  ASSERT_NOK(CheckedStoull("123.1"));
+  ASSERT_NOK(CheckedStoull("123456789123456789123456789"));
+  ASSERT_NOK(CheckedStoull("123 123"));
+  ASSERT_NOK(CheckedStoull(""));
+
   TestStold("123", 123);
   TestStold("-123", -123);
   TestStold("123.1", 123.1);
   TestStold("1.7e308", 1.7e308);
   TestStold("-1.7e308", -1.7e308);
+  TestStold("inf", std::numeric_limits<long double>::infinity());
+  TestStold("InF", std::numeric_limits<long double>::infinity());
+  TestStold("-inf", -std::numeric_limits<long double>::infinity());
+  TestStold("-iNf", -std::numeric_limits<long double>::infinity());
+  TestStold("nAn", std::numeric_limits<long double>::quiet_NaN());
+  TestStold("nan", std::numeric_limits<long double>::quiet_NaN());
   ASSERT_NOK(CheckedStold("abcd"));
   ASSERT_NOK(CheckedStold(" 123"));
   ASSERT_NOK(CheckedStold("123abc"));
@@ -84,5 +110,137 @@ TEST_F(StolUtilsTest, TestCheckedStoild) {
   ASSERT_NOK(CheckedStold(""));
 }
 
+
+TEST_F(StolUtilsTest, TestCheckedParseNumber) {
+  // Test with double
+  {
+    auto result = CheckedParseNumber<double>("123.456");
+    ASSERT_OK(result);
+    ASSERT_DOUBLE_EQ(123.456, *result);
+
+    result = CheckedParseNumber<double>("-123.456");
+    ASSERT_OK(result);
+    ASSERT_DOUBLE_EQ(-123.456, *result);
+
+    result = CheckedParseNumber<double>("1.7e308");
+    ASSERT_OK(result);
+    ASSERT_DOUBLE_EQ(1.7e308, *result);
+
+    result = CheckedParseNumber<double>("inf");
+    ASSERT_OK(result);
+    ASSERT_EQ(std::numeric_limits<double>::infinity(), *result);
+
+    result = CheckedParseNumber<double>("-inf");
+    ASSERT_OK(result);
+    ASSERT_EQ(-std::numeric_limits<double>::infinity(), *result);
+
+    result = CheckedParseNumber<double>("nan");
+    ASSERT_OK(result);
+    ASSERT_TRUE(std::isnan(*result));
+
+    result = CheckedParseNumber<double>("1.8e308");
+    ASSERT_NOK(result);  // Out of range (infinite)
+
+    result = CheckedParseNumber<double>("abc");
+    ASSERT_NOK(result);  // Invalid input
+  }
+
+  // Test with float
+  {
+    auto result = CheckedParseNumber<float>("123.456");
+    ASSERT_OK(result);
+    ASSERT_FLOAT_EQ(123.456f, *result);
+
+    result = CheckedParseNumber<float>("-123.456");
+    ASSERT_OK(result);
+    ASSERT_FLOAT_EQ(-123.456f, *result);
+
+    result = CheckedParseNumber<float>("3.4e38");
+    ASSERT_OK(result);
+    ASSERT_FLOAT_EQ(3.4e38f, *result);
+
+    result = CheckedParseNumber<float>("inf");
+    ASSERT_OK(result);
+    ASSERT_EQ(std::numeric_limits<float>::infinity(), *result);
+
+    result = CheckedParseNumber<float>("-inf");
+    ASSERT_OK(result);
+    ASSERT_EQ(-std::numeric_limits<float>::infinity(), *result);
+
+    result = CheckedParseNumber<float>("nan");
+    ASSERT_OK(result);
+    ASSERT_TRUE(std::isnan(*result));
+
+    result = CheckedParseNumber<float>("3.5e38");
+    ASSERT_NOK(result);  // Out of range (infinite)
+
+    result = CheckedParseNumber<float>("abc");
+    ASSERT_NOK(result);  // Invalid input
+  }
+}
+
+TEST_F(StolUtilsTest, TestParseCommaSeparatedListOfNumbers) {
+  // Test with vector<int32_t>
+  {
+    auto result =
+        ParseCommaSeparatedListOfNumbers<int32_t, std::vector<int32_t>>("1,2,3");
+    ASSERT_OK(result);
+    ASSERT_EQ((std::vector<int32_t>{1, 2, 3}), *result);
+
+    // Test with bounds
+    result = ParseCommaSeparatedListOfNumbers<int32_t, std::vector<int32_t>>("4,5,6", 4, 6);
+    ASSERT_OK(result);
+    ASSERT_EQ((std::vector<int32_t>{4, 5, 6}), *result);
+
+    // Test with out-of-bounds value
+    result = ParseCommaSeparatedListOfNumbers<int32_t, std::vector<int32_t>>("3,4,5", 4, 6);
+    ASSERT_NOK(result);  // 3 is less than lower bound 4
+  }
+
+  // Test with set<double>
+  {
+    auto result = ParseCommaSeparatedListOfNumbers<double, std::set<double>>("1.1,2.2,3.3");
+    ASSERT_OK(result);
+    ASSERT_EQ((std::set<double>{1.1, 2.2, 3.3}), *result);
+
+    // Test with bounds
+    result = ParseCommaSeparatedListOfNumbers<double, std::set<double>>("2.5,3.5", 2.0, 4.0);
+    ASSERT_OK(result);
+    ASSERT_EQ((std::set<double>{2.5, 3.5}), *result);
+
+    // Test with out-of-bounds value
+    result = ParseCommaSeparatedListOfNumbers<double, std::set<double>>("1.5,2.5", 2.0, 4.0);
+    ASSERT_NOK(result);  // 1.5 is less than lower bound 2.0
+  }
+
+  // Test with vector<uint64_t>
+  {
+    auto result =
+        ParseCommaSeparatedListOfNumbers<uint64_t, std::vector<uint64_t>>("10000000000,20000000000");
+    ASSERT_OK(result);
+    ASSERT_EQ((std::vector<uint64_t>{10000000000ULL, 20000000000ULL}), *result);
+  }
+
+  // Test with invalid input
+  {
+    auto result = ParseCommaSeparatedListOfNumbers<int32_t, std::vector<int32_t>>("1,abc,3");
+    ASSERT_NOK(result);  // "abc" cannot be parsed as int32_t
+  }
+
+  // Test with empty input
+  {
+    auto result = ParseCommaSeparatedListOfNumbers<int32_t, std::vector<int32_t>>("");
+    ASSERT_OK(result);
+    ASSERT_TRUE(result->empty());
+  }
+
+  // Test with spaces and tabs
+  {
+    auto result = ParseCommaSeparatedListOfNumbers<int32_t, std::vector<int32_t>>(" 7 ,\t8 , 9 ");
+    ASSERT_OK(result);
+    ASSERT_EQ((std::vector<int32_t>{7, 8, 9}), *result);
+  }
+}
+
 } // namespace util
 } // namespace yb
diff --git a/src/yb/util/stol_utils.cc b/src/yb/util/stol_utils.cc
@@ -15,14 +15,19 @@
 
 #include <cstring>
 
+#include "yb/util/string_util.h"
+
 using namespace std::placeholders;
 
 namespace yb {
 
 namespace {
 
-Status CreateInvalid(Slice input, int err = 0) {
+Status CreateInvalid(Slice input, int err = 0, const char* type_name = "") {
   auto message = Format("$0 is not a valid number", input.ToDebugString());
+  if (*type_name) {
+    message += Format(" for type $0", type_name);
+  }
   if (err != 0) {
     message += ": ";
     message += std::strerror(err);
@@ -41,17 +46,31 @@ Status CheckNotSpace(Slice slice) {
 template <typename T, typename StrToT>
 Result<T> CheckedSton(Slice slice, StrToT str_to_t) {
   RETURN_NOT_OK(CheckNotSpace(slice));
+  if constexpr (std::is_floating_point_v<T>) {
+    auto maybe_special_value = TryParsingNonNumberValue<T>(slice);
+    if (maybe_special_value) {
+      return *maybe_special_value;
+    }
+  }
+
   char* str_end;
   errno = 0;
   T result = str_to_t(slice.cdata(), &str_end);
   // Check errno.
   if (errno != 0) {
-    return CreateInvalid(slice, errno);
+    return CreateInvalid(slice, errno, typeid(T).name());
+  }
+  if constexpr (std::is_unsigned<T>::value) {
+    // Do not allow any minus signs for unsigned types.
+    for (auto* p = slice.cdata(); p != str_end; ++p) {
+      if (*p == '-') {
+        return CreateInvalid(slice, /* err= */ 0, typeid(T).name());
+      }
+    }
   }
-
   // Check that entire string was processed.
   if (str_end != slice.cend()) {
-    return CreateInvalid(slice);
+    return CreateInvalid(slice, /* err= */ 0, typeid(T).name());
   }
 
   return result;