From 3cf591f7553459cc9f083092694321df5c68945e Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Fri, 5 Jan 2024 07:07:37 +0000 Subject: [PATCH] Improve: Test coverage --- include/stringzilla/stringzilla.h | 4 +- scripts/bench_container.cpp | 57 +++++++++++----------- scripts/test.cpp | 81 ++++++++++++++++++++++++++++--- 3 files changed, 106 insertions(+), 36 deletions(-) diff --git a/include/stringzilla/stringzilla.h b/include/stringzilla/stringzilla.h index ea268d6b..a63aba55 100644 --- a/include/stringzilla/stringzilla.h +++ b/include/stringzilla/stringzilla.h @@ -2068,6 +2068,7 @@ SZ_PUBLIC sz_bool_t sz_string_grow(sz_string_t *string, sz_size_t new_space, sz_ string->on_heap.start = new_start; string->on_heap.space = new_space; string->on_heap.padding = 0; + string->on_heap.length = string_length; // Deallocate the old string. if (string_is_on_heap) allocator->free(string_start, string_space, allocator->handle); @@ -2146,8 +2147,9 @@ SZ_PUBLIC void sz_string_erase(sz_string_t *string, sz_size_t offset, sz_size_t } SZ_PUBLIC void sz_string_free(sz_string_t *string, sz_memory_allocator_t *allocator) { - if (sz_string_is_on_stack(string)) return; + if (!sz_string_is_on_stack(string)) allocator->free(string->on_heap.start, string->on_heap.space, allocator->handle); + sz_string_init(string); } SZ_PUBLIC void sz_fill_serial(sz_ptr_t target, sz_size_t length, sz_u8_t value) { diff --git a/scripts/bench_container.cpp b/scripts/bench_container.cpp index a7aff5b9..6a77ba45 100644 --- a/scripts/bench_container.cpp +++ b/scripts/bench_container.cpp @@ -14,56 +14,59 @@ using namespace ashvardanian::stringzilla::scripts; +template +std::vector to(std::vector const &strings) { + std::vector result; + result.reserve(strings.size()); + for (string_type_from const &string : strings) result.push_back({string.data(), string.size()}); + return result; +} + /** * @brief Evaluation for search string operations: find. */ template -void bench(std::vector const &strings) { +void bench(std::string name, std::vector const &strings) { using key_type = typename container_at::key_type; + std::vector keys = to(strings); // Build up the container container_at container; - for (key_type const &key : strings) container[key] = 0; + for (key_type const &key : keys) container[key] = 0; tracked_function_gt variant; - variant.results = bench_on_tokens(strings, [&](key_type const &key) { - container[key]++; - return 1; + variant.name = name; + variant.results = bench_on_tokens(keys, [&](key_type const &key) { + container.find(key)->second++; + return key.size(); }); variant.print(); } -template -std::vector to(std::vector const &strings) { - std::vector result; - result.reserve(strings.size()); - for (string_type_from const &string : strings) result.push_back({string.data(), string.size()}); - return result; -} - template void bench_tokens(strings_type const &strings) { if (strings.size() == 0) return; - - // Pure STL - bench>(to(strings)); - bench>(to(strings)); - bench>(to(strings)); - bench>(to(strings)); + auto const &s = strings; // StringZilla structures - bench>(to(strings)); - bench>(to(strings)); - bench>(to(strings)); - bench>(to(strings)); + bench>("map", s); + bench>("map", s); + bench>("unordered_map", s); + bench>("unordered_map", s); + + // Pure STL + bench>("map", s); + bench>("map", s); + bench>("unordered_map", s); + bench>("unordered_map", s); // STL structures with StringZilla operations - // bench>(to(strings)); - // bench>(to(strings)); - // bench>(to(strings)); - // bench>(to(strings)); + // bench>("map", s); + // bench>("map", s); + // bench>("unordered_map", s); + // bench>("unordered_map", s); } int main(int argc, char const **argv) { diff --git a/scripts/test.cpp b/scripts/test.cpp index 5a9e051e..3dc46fba 100644 --- a/scripts/test.cpp +++ b/scripts/test.cpp @@ -1,8 +1,9 @@ -#include // assertions -#include // `std::printf` -#include // `std::memcpy` -#include // `std::distance` -#include // `std::vector` +#include // `std::transform` +#include // assertions +#include // `std::printf` +#include // `std::memcpy` +#include // `std::distance` +#include // `std::vector` #define SZ_USE_X86_AVX2 0 #define SZ_USE_X86_AVX512 0 @@ -26,6 +27,8 @@ template void eval(std::string_view haystack_pattern, std::string_view needle_stl, std::size_t misalignment) { constexpr std::size_t max_repeats = 128; alignas(64) char haystack[misalignment + max_repeats * haystack_pattern.size()]; + std::vector offsets_stl; + std::vector offsets_sz; for (std::size_t repeats = 0; repeats != 128; ++repeats) { std::size_t haystack_length = (repeats + 1) * haystack_pattern.size(); @@ -47,16 +50,37 @@ void eval(std::string_view haystack_pattern, std::string_view needle_stl, std::s auto count_stl = std::distance(begin_stl, end_stl); auto count_sz = std::distance(begin_sz, end_sz); + // To simplify debugging, let's first export all the match offsets, and only then compare them + std::transform(begin_stl, end_stl, std::back_inserter(offsets_stl), + [&](auto const &match) { return match.data() - haystack_stl.data(); }); + std::transform(begin_sz, end_sz, std::back_inserter(offsets_sz), + [&](auto const &match) { return match.data() - haystack_sz.data(); }); + // Compare results - for (; begin_stl != end_stl && begin_sz != end_sz; ++begin_stl, ++begin_sz) { + for (std::size_t match_idx = 0; begin_stl != end_stl && begin_sz != end_sz; + ++begin_stl, ++begin_sz, ++match_idx) { auto match_stl = *begin_stl; auto match_sz = *begin_sz; - assert(match_stl.data() == match_sz.data()); + if (match_stl.data() != match_sz.data()) { + std::printf("Mismatch at index #%zu: %zu != %zu\n", match_idx, match_stl.data() - haystack_stl.data(), + match_sz.data() - haystack_sz.data()); + std::printf("Breakdown of found matches:\n"); + std::printf("- STL (%zu): ", offsets_stl.size()); + for (auto offset : offsets_stl) std::printf("%zu ", offset); + std::printf("\n"); + std::printf("- StringZilla (%zu): ", offsets_sz.size()); + for (auto offset : offsets_sz) std::printf("%zu ", offset); + std::printf("\n"); + assert(false); + } } // If one range is not finished, assert failure assert(count_stl == count_sz); assert(begin_stl == end_stl && begin_sz == end_sz); + + offsets_stl.clear(); + offsets_sz.clear(); } } @@ -107,16 +131,48 @@ void eval(std::string_view haystack_pattern, std::string_view needle_stl) { int main(int argc, char const **argv) { std::printf("Hi Ash! ... or is it someone else?!\n"); + // Comparing relative order of the strings + assert("a"_sz.compare("a") == 0); + assert("a"_sz.compare("ab") == -1); + assert("ab"_sz.compare("a") == 1); + assert("a"_sz.compare("a\0"_sz) == -1); + assert("a\0"_sz.compare("a") == 1); + assert("a\0"_sz.compare("a\0"_sz) == 0); + assert("a"_sz == "a"_sz); + assert("a"_sz != "a\0"_sz); + assert("a\0"_sz == "a\0"_sz); + + assert(sz_size_bit_ceil(0) == 1); + assert(sz_size_bit_ceil(1) == 1); + assert(sz_size_bit_ceil(2) == 2); + assert(sz_size_bit_ceil(3) == 4); + assert(sz_size_bit_ceil(127) == 128); + assert(sz_size_bit_ceil(128) == 128); + assert(sz::string("abc").edit_distance("_abc") == 1); assert(sz::string("").edit_distance("_") == 1); assert(sz::string("_").edit_distance("") == 1); + assert(sz::string("_").edit_distance("xx") == 2); + assert(sz::string("_").edit_distance("xx", 1) == 1); + assert(sz::string("_").edit_distance("xx", 0) == 0); std::string_view alphabet = "abcdefghijklmnopqrstuvwxyz"; // 26 characters std::string_view base64 = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789+-"; // 64 characters std::string_view common = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789+-=@$%"; // 68 characters + // Make sure copy constructors work as expected: + { + std::vector strings; + for (std::size_t alphabet_slice = 0; alphabet_slice != alphabet.size(); ++alphabet_slice) + strings.push_back(alphabet.substr(0, alphabet_slice)); + std::vector copies {strings}; + std::vector assignments = strings; + assert(std::equal(strings.begin(), strings.end(), copies.begin())); + assert(std::equal(strings.begin(), strings.end(), assignments.begin())); + } + // When haystack is only formed of needles: - // eval("a", "a"); + eval("a", "a"); eval("ab", "ab"); eval("abc", "abc"); eval("abcd", "abcd"); @@ -124,6 +180,15 @@ int main(int argc, char const **argv) { eval(base64, base64); eval(common, common); + // When we are dealing with NULL characters inside the string + eval("\0", "\0"); + eval("a\0", "a\0"); + eval("ab\0", "ab"); + eval("ab\0", "ab\0"); + eval("abc\0", "abc"); + eval("abc\0", "abc\0"); + eval("abcd\0", "abcd"); + // When haystack is formed of equidistant needles: eval("ab", "a"); eval("abc", "a");