diff --git a/cpp/README.md b/cpp/README.md index 089f215979..3a44aabf77 100644 --- a/cpp/README.md +++ b/cpp/README.md @@ -1,8 +1,26 @@ # Apache Fury™ C++ -## How to build +Fury is a blazingly-fast multi-language serialization framework powered by just-in-time compilation and zero-copy. + +## Build Fury C++ ```bash +# Build all projects bazel build //:all +# Run all tests bazel test //:all ``` + +## Environment + +- Bazel version: 6.3.2 + +## Benchmark + +```bash +bazel build //cpp/fury/benchmark:all +bazel test //cpp/fury/benchmark:all +# You can also run a single benchmark to see how efficient it is. +# For example +bazel run //cpp/fury/benchmark:benchmark_string_util +``` diff --git a/cpp/fury/benchmark/BUILD b/cpp/fury/benchmark/BUILD new file mode 100644 index 0000000000..f6e3f697da --- /dev/null +++ b/cpp/fury/benchmark/BUILD @@ -0,0 +1,24 @@ +load("@rules_cc//cc:defs.bzl", "cc_library", "cc_test") + +cc_library( + name = "fury_benchmark", + srcs = glob(["*.cc"], exclude=["benchmark*.cc"]), + hdrs = glob(["*.h"]), + strip_include_prefix = "/cpp", + alwayslink = True, + linkstatic = True, + deps = [ + "//cpp/fury/util:fury_util", + "@com_google_benchmark//:benchmark", + ], + visibility = ["//visibility:public"], +) + + +cc_test( + name = "benchmark_string_util", + srcs = ["benchmark_string_util.cc"], + deps = [ + ":fury_benchmark", + ], +) diff --git a/cpp/fury/benchmark/benchmark_string_util.cc b/cpp/fury/benchmark/benchmark_string_util.cc new file mode 100644 index 0000000000..851cd3e538 --- /dev/null +++ b/cpp/fury/benchmark/benchmark_string_util.cc @@ -0,0 +1,510 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include + +#include +#include +#include + +#include "fury/util/string_util.h" + +#include +#include + +/* + * TEST + */ + +// Generate random bytes (0x00 to 0xFF) +std::string generateRandom(size_t length) { + std::string result; + result.reserve(length); + + std::mt19937 generator(std::random_device{}()); + std::uniform_int_distribution distribution(0x00, 0xFF); + + for (size_t i = 0; i < length; ++i) { + result.push_back(static_cast(distribution(generator))); + } + return result; +} + +// Generate ASCII string (0x00 to 0x7F) +std::string generateAscii(size_t length) { + const char charset[] = + "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; + std::default_random_engine rng(std::random_device{}()); + std::uniform_int_distribution<> dist(0, sizeof(charset) - 2); + + std::string result; + result.reserve(length); + for (size_t i = 0; i < length; ++i) { + result += charset[dist(rng)]; + } + return result; +} + +// Generate Latin-1 string (0x00 to 0xFF) as std::u16string +std::u16string generateLatin1(size_t length) { + std::u16string result; + result.reserve(length); + + std::mt19937 generator(std::random_device{}()); + std::uniform_int_distribution distribution(0x00, 0xFF); + + for (size_t i = 0; i < length; ++i) { + result.push_back(static_cast(distribution(generator))); + } + return result; +} + +// Generate UTF-8 string (valid Unicode code points) +std::string generateUtf8(size_t length) { + std::string result; + result.reserve(length); + + std::mt19937 generator(std::random_device{}()); + std::uniform_int_distribution distribution(0, 0x10FFFF); + + while (result.size() < length) { + uint32_t code_point = distribution(generator); + + // Skip surrogate pairs (0xD800 to 0xDFFF) and invalid Unicode code points + if ((code_point >= 0xD800 && code_point <= 0xDFFF) || + code_point > 0x10FFFF) { + continue; + } + + if (code_point <= 0x7F) { + result.push_back(static_cast(code_point)); + } else if (code_point <= 0x7FF) { + result.push_back(0xC0 | (code_point >> 6)); + result.push_back(0x80 | (code_point & 0x3F)); + } else if (code_point <= 0xFFFF) { + result.push_back(0xE0 | (code_point >> 12)); + result.push_back(0x80 | ((code_point >> 6) & 0x3F)); + result.push_back(0x80 | (code_point & 0x3F)); + } else { + result.push_back(0xF0 | (code_point >> 18)); + result.push_back(0x80 | ((code_point >> 12) & 0x3F)); + result.push_back(0x80 | ((code_point >> 6) & 0x3F)); + result.push_back(0x80 | (code_point & 0x3F)); + } + } + return result; +} + +// Generate UTF-16 string (valid Unicode code points) +std::u16string generateUtf16(size_t length) { + std::u16string result; + result.reserve(length); + + std::mt19937 generator(std::random_device{}()); + std::uniform_int_distribution distribution(0, 0x10FFFF); + + while (result.size() < length) { + uint32_t code_point = distribution(generator); + + // Skip surrogate pairs (0xD800 to 0xDFFF) and invalid Unicode code points + if ((code_point >= 0xD800 && code_point <= 0xDFFF) || + code_point > 0x10FFFF) { + continue; + } + + if (code_point <= 0xFFFF) { + result.push_back(static_cast(code_point)); + } else { + // Handle code points greater than 0xFFFF (requires surrogate pairs) + code_point -= 0x10000; + char16_t high_surrogate = 0xD800 | ((code_point >> 10) & 0x3FF); + char16_t low_surrogate = 0xDC00 | (code_point & 0x3FF); + result.push_back(high_surrogate); + result.push_back(low_surrogate); + } + } + return result; +} + +/* + * TEST NUM + */ +const size_t num_tests = 1000; +const size_t string_length = 1000; + +/* + * TEST Strings + */ +// Generate a vector of Ascii strings for testing +std::vector generateAsciiString(size_t num_tests, + size_t string_length) { + std::vector test_strings; + for (size_t i = 0; i < string_length; ++i) { + test_strings.push_back(generateUtf8(num_tests)); + } + return test_strings; +} + +const std::vector test_ascii_strings = + generateAsciiString(num_tests, string_length); + +// Generate a vector of Latin-1 strings for testing +std::vector generateLatin1String(size_t num_tests, + size_t string_length) { + std::vector test_strings; + for (size_t i = 0; i < num_tests; ++i) { + test_strings.push_back(generateLatin1(string_length)); + } + return test_strings; +} + +const std::vector test_latin1_strings = + generateLatin1String(num_tests, string_length); + +// Generate random UTF-16 string +std::vector generateUTF16String(size_t num_tests, + size_t string_length) { + std::vector test_strings; + for (size_t i = 0; i < string_length; ++i) { + test_strings.push_back(generateUtf16(num_tests)); + } + return test_strings; +} + +const std::vector test_utf16_strings = + generateUTF16String(num_tests, string_length); + +// Generate random UTF-8 string +std::vector generateUTF8String(size_t num_tests, + size_t string_length) { + std::vector test_strings; + for (size_t i = 0; i < string_length; ++i) { + test_strings.push_back(generateUtf8(num_tests)); + } + return test_strings; +} + +const std::vector test_utf8_strings = + generateUTF8String(num_tests, string_length); + +/* + * TEST IsAscii + */ + +// Check if a string is ASCII (all characters <= 0x7F) +bool isAscii_BaseLine(const std::string &str) { + for (char c : str) { + if (static_cast(c) > 0x7F) { + return false; + } + } + return true; +} + +// Benchmark function for Baseline ASCII check +static void BM_IsAscii_BaseLine(benchmark::State &state) { + for (auto _ : state) { + for (const auto &str : test_ascii_strings) { + bool result = isAscii_BaseLine(str); + benchmark::DoNotOptimize(result); // Prevent compiler optimization + } + } +} + +// Benchmark function for SIMD ASCII check +static void BM_IsAscii_SIMD(benchmark::State &state) { + for (auto _ : state) { + for (const auto &str : test_ascii_strings) { + bool result = fury::isAscii(str); + benchmark::DoNotOptimize(result); // Prevent compiler optimization + } + } +} + +BENCHMARK(BM_IsAscii_BaseLine); +BENCHMARK(BM_IsAscii_SIMD); + +// Baseline implementation to check if a string is Latin-1 +bool isLatin1_BaseLine(const std::u16string &str) { + const std::uint16_t *data = + reinterpret_cast(str.data()); + size_t size = str.size(); + + for (size_t i = 0; i < size; ++i) { + if (data[i] > 0xFF) { + return false; + } + } + return true; +} + +// Benchmark function for Baseline Latin-1 check +static void BM_IsLatin1_BaseLine(benchmark::State &state) { + for (auto _ : state) { + for (const auto &str : test_latin1_strings) { + bool result = isLatin1_BaseLine(str); + benchmark::DoNotOptimize(result); // Prevent compiler optimization + } + } +} + +// Benchmark function for Optimized Latin-1 check +static void BM_IsLatin1_SIMD(benchmark::State &state) { + for (auto _ : state) { + for (const auto &str : test_latin1_strings) { + bool result = fury::isLatin1(str); + benchmark::DoNotOptimize(result); // Prevent compiler optimization + } + } +} + +BENCHMARK(BM_IsLatin1_BaseLine); +BENCHMARK(BM_IsLatin1_SIMD); + +/* + * TEST Utf16HasSurrogatePairs + */ +// Check if a UTF-16 string contains surrogate pairs +bool utf16HasSurrogatePairs_BaseLine(const std::u16string &str) { + for (size_t i = 0; i < str.size(); ++i) { + char16_t c = str[i]; + if (c >= 0xD800 && c <= 0xDFFF) { + return true; + } + } + return false; +} + +// Benchmark function for checking if a UTF-16 string contains surrogate pairs +static void BM_Utf16HasSurrogatePairs_BaseLine(benchmark::State &state) { + for (auto _ : state) { + for (const auto &str : test_utf16_strings) { + bool result = utf16HasSurrogatePairs_BaseLine(str); + benchmark::DoNotOptimize(result); // Prevent compiler optimization + } + } +} + +// Benchmark function for checking if a UTF-16 string contains surrogate pairs +// with SIMD +static void BM_Utf16HasSurrogatePairs_SIMD(benchmark::State &state) { + for (auto _ : state) { + for (const auto &str : test_utf16_strings) { + bool result = fury::utf16HasSurrogatePairs(str); + benchmark::DoNotOptimize(result); // Prevent compiler optimization + } + } +} +BENCHMARK(BM_Utf16HasSurrogatePairs_BaseLine); +BENCHMARK(BM_Utf16HasSurrogatePairs_SIMD); + +/* + * TEST Utf16ToUtf8 + */ + +// UTF16 to UTF8 using the standard library +std::string utf16ToUtf8StandardLibrary(const std::u16string &utf16) { + std::wstring_convert, char16_t> convert; + return convert.to_bytes(utf16); +} + +// UTF16 to UTF8 baseline conversion (without SIMD) +std::string utf16ToUtf8BaseLine(const std::u16string &utf16, + bool is_little_endian = true) { + size_t utf16_length = utf16.length(); + size_t utf8_length = utf16_length * 3; + std::string utf8_result(utf8_length, '\0'); + + size_t i = 0, j = 0; + while (i < utf16_length) { + char16_t utf16_char = utf16[i++]; + if (utf16_char < 0x80) { + utf8_result[j++] = static_cast(utf16_char); + } else if (utf16_char < 0x800) { + utf8_result[j++] = static_cast(0xC0 | (utf16_char >> 6)); + utf8_result[j++] = static_cast(0x80 | (utf16_char & 0x3F)); + } else { + utf8_result[j++] = static_cast(0xE0 | (utf16_char >> 12)); + utf8_result[j++] = static_cast(0x80 | ((utf16_char >> 6) & 0x3F)); + utf8_result[j++] = static_cast(0x80 | (utf16_char & 0x3F)); + } + } + + utf8_result.resize(j); + return utf8_result; +} + +// Benchmark function for Standard Library UTF-16 to UTF-8 conversion +static void BM_Utf16ToUtf8_StandardLibrary(benchmark::State &state) { + for (auto _ : state) { + for (const auto &str : test_utf16_strings) { + std::string utf8 = utf16ToUtf8StandardLibrary(str); + benchmark::DoNotOptimize( + utf8); // Prevents the compiler from optimizing away unused variables + } + } +} + +// Benchmark function for Baseline UTF-16 to UTF-8 conversion +static void BM_Utf16ToUtf8_BaseLine(benchmark::State &state) { + for (auto _ : state) { + for (const auto &str : test_utf16_strings) { + std::string utf8 = utf16ToUtf8BaseLine(str, true); + benchmark::DoNotOptimize( + utf8); // Prevents the compiler from optimizing away unused variables + } + } +} + +// Benchmark function for SIMD-based UTF-16 to UTF-8 conversion +static void BM_Utf16ToUtf8_SIMD(benchmark::State &state) { + for (auto _ : state) { + for (const auto &str : test_utf16_strings) { + std::string utf8 = fury::utf16ToUtf8(str, true); + benchmark::DoNotOptimize( + utf8); // Prevents the compiler from optimizing away unused variables + } + } +} + +BENCHMARK(BM_Utf16ToUtf8_StandardLibrary); +BENCHMARK(BM_Utf16ToUtf8_BaseLine); +BENCHMARK(BM_Utf16ToUtf8_SIMD); + +/* + * TEST Utf8ToUtf16 + */ + +// UTF8 to UTF16 using the standard library +std::u16string utf8ToUtf16StandardLibrary(const std::string &utf8) { + std::wstring_convert, char16_t> convert; + return convert.from_bytes(utf8); +} + +// UTF8 to UTF16 baseline conversion (without SIMD) +std::u16string utf8ToUtf16BaseLine(const std::string &utf8, + bool is_little_endian) { + std::u16string utf16; // Resulting UTF-16 string + size_t i = 0; // Index for traversing the UTF-8 string + size_t n = utf8.size(); // Total length of the UTF-8 string + + // Loop through each byte of the UTF-8 string + while (i < n) { + uint32_t code_point = 0; // The Unicode code point + unsigned char c = utf8[i]; // Current byte of the UTF-8 string + + // Determine the number of bytes for this character based on its first byte + if ((c & 0x80) == 0) { + // 1-byte character (ASCII) + code_point = c; + ++i; + } else if ((c & 0xE0) == 0xC0) { + // 2-byte character + code_point = c & 0x1F; + code_point = (code_point << 6) | (utf8[i + 1] & 0x3F); + i += 2; + } else if ((c & 0xF0) == 0xE0) { + // 3-byte character + code_point = c & 0x0F; + code_point = (code_point << 6) | (utf8[i + 1] & 0x3F); + code_point = (code_point << 6) | (utf8[i + 2] & 0x3F); + i += 3; + } else if ((c & 0xF8) == 0xF0) { + // 4-byte character + code_point = c & 0x07; + code_point = (code_point << 6) | (utf8[i + 1] & 0x3F); + code_point = (code_point << 6) | (utf8[i + 2] & 0x3F); + code_point = (code_point << 6) | (utf8[i + 3] & 0x3F); + i += 4; + } else { + // Invalid UTF-8 byte sequence + throw std::invalid_argument("Invalid UTF-8 encoding."); + } + + // If the code point is beyond the BMP range, use surrogate pairs + if (code_point >= 0x10000) { + code_point -= 0x10000; // Subtract 0x10000 to get the surrogate pair + uint16_t high_surrogate = 0xD800 + (code_point >> 10); // High surrogate + uint16_t low_surrogate = 0xDC00 + (code_point & 0x3FF); // Low surrogate + + // If not little-endian, swap bytes of the surrogates + if (!is_little_endian) { + high_surrogate = (high_surrogate >> 8) | (high_surrogate << 8); + low_surrogate = (low_surrogate >> 8) | (low_surrogate << 8); + } + + // Add both high and low surrogates to the UTF-16 string + utf16.push_back(high_surrogate); + utf16.push_back(low_surrogate); + } else { + // For code points within the BMP range, directly store as a 16-bit value + uint16_t utf16_char = static_cast(code_point); + + // If not little-endian, swap the bytes of the 16-bit character + if (!is_little_endian) { + utf16_char = (utf16_char >> 8) | (utf16_char << 8); + } + + // Add the UTF-16 character to the string + utf16.push_back(utf16_char); + } + } + + // Return the resulting UTF-16 string + return utf16; +} + +// Benchmark function for Standard Library UTF-8 to UTF-16 conversion +static void BM_Utf8ToUtf16_StandardLibrary(benchmark::State &state) { + for (auto _ : state) { + for (const auto &str : test_utf8_strings) { + std::u16string utf16 = utf8ToUtf16StandardLibrary(str); + benchmark::DoNotOptimize( + utf16); // Prevents the compiler from optimizing away unused variables + } + } +} + +// Benchmark function for Baseline UTF-8 to UTF-16 conversion +static void BM_Utf8ToUtf16_BaseLine(benchmark::State &state) { + for (auto _ : state) { + for (const auto &str : test_utf8_strings) { + std::u16string utf16 = utf8ToUtf16BaseLine(str, true); + benchmark::DoNotOptimize( + utf16); // Prevents the compiler from optimizing away unused variables + } + } +} + +// Benchmark function for SIMD-based UTF-8 to UTF-16 conversion +static void BM_Utf8ToUtf16_SIMD(benchmark::State &state) { + for (auto _ : state) { + for (const auto &str : test_utf8_strings) { + std::u16string utf16 = fury::utf8ToUtf16(str, true); + benchmark::DoNotOptimize( + utf16); // Prevents the compiler from optimizing away unused variables + } + } +} + +BENCHMARK(BM_Utf8ToUtf16_StandardLibrary); +BENCHMARK(BM_Utf8ToUtf16_BaseLine); +BENCHMARK(BM_Utf8ToUtf16_SIMD); + +BENCHMARK_MAIN(); diff --git a/cpp/fury/util/BUILD b/cpp/fury/util/BUILD index 073c0e0632..36fe126b37 100644 --- a/cpp/fury/util/BUILD +++ b/cpp/fury/util/BUILD @@ -2,7 +2,7 @@ load("@rules_cc//cc:defs.bzl", "cc_library", "cc_test") cc_library( name = "fury_util", - srcs = glob(["*.cc"], exclude=["*test.cc", "benchmark.cc"]), + srcs = glob(["*.cc"], exclude=["*test.cc"]), hdrs = glob(["*.h"]), copts = ["-mavx2"], # Enable AVX2 support linkopts = ["-mavx2"], # Ensure linker also knows about AVX2 @@ -14,7 +14,6 @@ cc_library( "@com_google_absl//absl/debugging:failure_signal_handler", "@com_google_absl//absl/debugging:stacktrace", "@com_google_absl//absl/debugging:symbolize", - "@com_google_benchmark//:benchmark", ], visibility = ["//visibility:public"], ) @@ -64,12 +63,3 @@ cc_test( "@com_google_googletest//:gtest", ], ) - -cc_test( - name = "benchmark", - srcs = ["benchmark.cc"], - deps = [ - ":fury_util", - "@com_google_benchmark//:benchmark", - ], -) \ No newline at end of file diff --git a/cpp/fury/util/benchmark.cc b/cpp/fury/util/benchmark.cc deleted file mode 100644 index e911a70cba..0000000000 --- a/cpp/fury/util/benchmark.cc +++ /dev/null @@ -1,119 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -#include - -#include -#include -#include - -#include "string_util.h" - -#include -#include - -// Function to generate a random UTF-16 string -std::u16string generateRandomUTF16String(size_t length) { - const char charset[] = - "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; - std::default_random_engine rng(std::random_device{}()); - std::uniform_int_distribution<> dist(0, sizeof(charset) - 2); - - std::u16string result; - result.reserve(length); - for (size_t i = 0; i < length; ++i) { - result += static_cast(charset[dist(rng)]); - } - - return result; -} - -std::vector generateUTF16String(size_t num_tests) { - std::vector test_strings; - for (size_t i = 0; i < num_tests; ++i) { - test_strings.push_back(generateRandomUTF16String(num_tests)); - } - return test_strings; -} - -const std::vector test_strings = generateUTF16String(1000); - -// UTF16 to UTF8 using the standard library -std::string utf16ToUtf8StandardLibrary(const std::u16string &utf16) { - std::wstring_convert, char16_t> convert; - return convert.to_bytes(utf16); -} - -// UTF16 to UTF8 baseline conversion (without SIMD) -std::string utf16ToUtf8BaseLine(const std::u16string &utf16, - bool is_little_endian = true) { - size_t utf16_length = utf16.length(); - size_t utf8_length = utf16_length * 3; - std::string utf8_result(utf8_length, '\0'); - - size_t i = 0, j = 0; - while (i < utf16_length) { - char16_t utf16_char = utf16[i++]; - if (utf16_char < 0x80) { - utf8_result[j++] = static_cast(utf16_char); - } else if (utf16_char < 0x800) { - utf8_result[j++] = static_cast(0xC0 | (utf16_char >> 6)); - utf8_result[j++] = static_cast(0x80 | (utf16_char & 0x3F)); - } else { - utf8_result[j++] = static_cast(0xE0 | (utf16_char >> 12)); - utf8_result[j++] = static_cast(0x80 | ((utf16_char >> 6) & 0x3F)); - utf8_result[j++] = static_cast(0x80 | (utf16_char & 0x3F)); - } - } - - utf8_result.resize(j); - return utf8_result; -} - -// Benchmark function for Standard Library UTF-16 to UTF-8 conversion -static void BM_StandardLibrary(benchmark::State &state) { - for (auto _ : state) { - for (const auto &str : test_strings) { - std::string utf8 = utf16ToUtf8StandardLibrary(str); - } - } -} - -// Benchmark function for Baseline UTF-16 to UTF-8 conversion -static void BM_BaseLine(benchmark::State &state) { - for (auto _ : state) { - for (const auto &str : test_strings) { - std::string utf8 = utf16ToUtf8BaseLine(str, true); - } - } -} - -// Benchmark function for SIMD-based UTF-16 to UTF-8 conversion -static void BM_SIMD(benchmark::State &state) { - for (auto _ : state) { - for (const auto &str : test_strings) { - std::string utf8 = fury::utf16ToUtf8(str, true); - } - } -} - -BENCHMARK(BM_StandardLibrary); -BENCHMARK(BM_BaseLine); -BENCHMARK(BM_SIMD); -BENCHMARK_MAIN(); diff --git a/cpp/fury/util/string_util_test.cc b/cpp/fury/util/string_util_test.cc index f2a2fdab09..2f267e146b 100644 --- a/cpp/fury/util/string_util_test.cc +++ b/cpp/fury/util/string_util_test.cc @@ -17,7 +17,6 @@ * under the License. */ -#include #include #include #include @@ -30,8 +29,8 @@ namespace fury { -// Function to generate a random string -std::string generateRandomString(size_t length) { +// Generate ASCII string +std::string generateAscii(size_t length) { const char charset[] = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; std::default_random_engine rng(std::random_device{}()); @@ -42,57 +41,26 @@ std::string generateRandomString(size_t length) { for (size_t i = 0; i < length; ++i) { result += charset[dist(rng)]; } - return result; } -bool isAscii_BaseLine(const std::string &str) { - for (char c : str) { - if (static_cast(c) >= 128) { - return false; - } - } - return true; -} - -TEST(StringUtilTest, TestisAsciiFunctions) { - std::string testStr = generateRandomString(100000); - auto start_time = std::chrono::high_resolution_clock::now(); - bool result = isAscii_BaseLine(testStr); - auto end_time = std::chrono::high_resolution_clock::now(); - auto duration = std::chrono::duration_cast( - end_time - start_time) - .count(); - FURY_LOG(FURY_INFO) << "BaseLine Running Time: " << duration << " ns."; - - start_time = std::chrono::high_resolution_clock::now(); - result = isAscii(testStr); - end_time = std::chrono::high_resolution_clock::now(); - duration = std::chrono::duration_cast(end_time - - start_time) - .count(); - FURY_LOG(FURY_INFO) << "Optimized Running Time: " << duration << " ns."; - - EXPECT_TRUE(result); -} - TEST(StringUtilTest, TestisAsciiLogic) { - // Test strings with only Latin characters + // Test strings with only Ascii characters EXPECT_TRUE(isAscii("Fury")); - EXPECT_TRUE(isAscii(generateRandomString(80))); + EXPECT_TRUE(isAscii(generateAscii(80))); - // Test unaligned strings with only Latin characters - EXPECT_TRUE(isAscii(generateRandomString(80) + "1")); - EXPECT_TRUE(isAscii(generateRandomString(80) + "12")); - EXPECT_TRUE(isAscii(generateRandomString(80) + "123")); + // Test unaligned strings with only Ascii characters + EXPECT_TRUE(isAscii(generateAscii(80) + "1")); + EXPECT_TRUE(isAscii(generateAscii(80) + "12")); + EXPECT_TRUE(isAscii(generateAscii(80) + "123")); - // Test strings with non-Latin characters + // Test strings with non-Ascii characters EXPECT_FALSE(isAscii("你好, Fury")); - EXPECT_FALSE(isAscii(generateRandomString(80) + "你好")); - EXPECT_FALSE(isAscii(generateRandomString(80) + "1你好")); - EXPECT_FALSE(isAscii(generateRandomString(11) + "你")); - EXPECT_FALSE(isAscii(generateRandomString(10) + "你好")); - EXPECT_FALSE(isAscii(generateRandomString(9) + "性能好")); + EXPECT_FALSE(isAscii(generateAscii(80) + "你好")); + EXPECT_FALSE(isAscii(generateAscii(80) + "1你好")); + EXPECT_FALSE(isAscii(generateAscii(11) + "你")); + EXPECT_FALSE(isAscii(generateAscii(10) + "你好")); + EXPECT_FALSE(isAscii(generateAscii(9) + "性能好")); EXPECT_FALSE(isAscii("\u1234")); EXPECT_FALSE(isAscii("a\u1234")); EXPECT_FALSE(isAscii("ab\u1234")); @@ -249,7 +217,7 @@ std::string generateRandomUTF8String(size_t length) { str.push_back(0xE0 | (code_point >> 12)); str.push_back(0x80 | ((code_point >> 6) & 0x3F)); str.push_back(0x80 | (code_point & 0x3F)); - } else if (code_point <= 0x10FFFF) { + } else { str.push_back(0xF0 | (code_point >> 18)); str.push_back(0x80 | ((code_point >> 12) & 0x3F)); str.push_back(0x80 | ((code_point >> 6) & 0x3F)); @@ -260,78 +228,6 @@ std::string generateRandomUTF8String(size_t length) { return str; } -std::u16string utf8ToUtf16BaseLine(const std::string &utf8, - bool is_little_endian) { - std::u16string utf16; // Resulting UTF-16 string - size_t i = 0; // Index for traversing the UTF-8 string - size_t n = utf8.size(); // Total length of the UTF-8 string - - // Loop through each byte of the UTF-8 string - while (i < n) { - uint32_t code_point = 0; // The Unicode code point - unsigned char c = utf8[i]; // Current byte of the UTF-8 string - - // Determine the number of bytes for this character based on its first byte - if ((c & 0x80) == 0) { - // 1-byte character (ASCII) - code_point = c; - ++i; - } else if ((c & 0xE0) == 0xC0) { - // 2-byte character - code_point = c & 0x1F; - code_point = (code_point << 6) | (utf8[i + 1] & 0x3F); - i += 2; - } else if ((c & 0xF0) == 0xE0) { - // 3-byte character - code_point = c & 0x0F; - code_point = (code_point << 6) | (utf8[i + 1] & 0x3F); - code_point = (code_point << 6) | (utf8[i + 2] & 0x3F); - i += 3; - } else if ((c & 0xF8) == 0xF0) { - // 4-byte character - code_point = c & 0x07; - code_point = (code_point << 6) | (utf8[i + 1] & 0x3F); - code_point = (code_point << 6) | (utf8[i + 2] & 0x3F); - code_point = (code_point << 6) | (utf8[i + 3] & 0x3F); - i += 4; - } else { - // Invalid UTF-8 byte sequence - throw std::invalid_argument("Invalid UTF-8 encoding."); - } - - // If the code point is beyond the BMP range, use surrogate pairs - if (code_point >= 0x10000) { - code_point -= 0x10000; // Subtract 0x10000 to get the surrogate pair - uint16_t high_surrogate = 0xD800 + (code_point >> 10); // High surrogate - uint16_t low_surrogate = 0xDC00 + (code_point & 0x3FF); // Low surrogate - - // If not little-endian, swap bytes of the surrogates - if (!is_little_endian) { - high_surrogate = (high_surrogate >> 8) | (high_surrogate << 8); - low_surrogate = (low_surrogate >> 8) | (low_surrogate << 8); - } - - // Add both high and low surrogates to the UTF-16 string - utf16.push_back(high_surrogate); - utf16.push_back(low_surrogate); - } else { - // For code points within the BMP range, directly store as a 16-bit value - uint16_t utf16_char = static_cast(code_point); - - // If not little-endian, swap the bytes of the 16-bit character - if (!is_little_endian) { - utf16_char = (utf16_char >> 8) | (utf16_char << 8); - } - - // Add the UTF-16 character to the string - utf16.push_back(utf16_char); - } - } - - // Return the resulting UTF-16 string - return utf16; -} - // Testing Basic Logic TEST(UTF8ToUTF16Test, BasicConversion) { std::string utf8 = u8"Hello, 世界!"; @@ -396,75 +292,6 @@ TEST(UTF8ToUTF16Test, RoundTripConversion) { ASSERT_EQ(original_utf8, utf8_converted_back); } -// Testing Performance -TEST(UTF8ToUTF16Test, PerformanceTest) { - const size_t num_tests = 1000; - const size_t string_length = 1000; - // Default little_endian - bool is_little_endian = true; - - // Random UTF-8 - std::vector test_strings; - for (size_t i = 0; i < num_tests; ++i) { - test_strings.push_back(generateRandomUTF8String(string_length)); - } - - // Standard Library - try { - auto start_time = std::chrono::high_resolution_clock::now(); - std::wstring_convert, wchar_t> convert; - // Loop through test strings and convert each UTF-8 string to UTF-16 - for (const auto &str : test_strings) { - std::wstring wide_str = convert.from_bytes(str); - std::u16string utf16; - for (wchar_t wc : wide_str) { - utf16.push_back(static_cast(wc)); - } - } - auto end_time = std::chrono::high_resolution_clock::now(); - auto duration = std::chrono::duration_cast( - end_time - start_time) - .count(); - FURY_LOG(FURY_INFO) << "Standard Library Running Time: " << duration - << " ns"; - } catch (const std::exception &e) { - FURY_LOG(FURY_FATAL) << "Caught exception in standard library conversion: " - << e.what(); - } - - // BaseLine - try { - auto start_time = std::chrono::high_resolution_clock::now(); - for (const auto &str : test_strings) { - std::u16string utf16 = utf8ToUtf16BaseLine(str, is_little_endian); - } - auto end_time = std::chrono::high_resolution_clock::now(); - auto duration = std::chrono::duration_cast( - end_time - start_time) - .count(); - FURY_LOG(FURY_INFO) << "BaseLine Running Time: " << duration << " ns"; - } catch (const std::exception &e) { - FURY_LOG(FURY_FATAL) << "Caught exception in baseline conversion: " - << e.what(); - } - - // Optimized (SIMD) - try { - auto start_time = std::chrono::high_resolution_clock::now(); - for (const auto &str : test_strings) { - std::u16string utf16 = fury::utf8ToUtf16(str, is_little_endian); - } - auto end_time = std::chrono::high_resolution_clock::now(); - auto duration = std::chrono::duration_cast( - end_time - start_time) - .count(); - FURY_LOG(FURY_INFO) << "SIMD Optimized Running Time: " << duration << " ns"; - } catch (const std::exception &e) { - FURY_LOG(FURY_FATAL) << "Caught exception in SIMD optimized conversion: " - << e.what(); - } -} - } // namespace fury int main(int argc, char **argv) {