From 8191e1f57518d18a2b8f7516a48d908c5b62fea6 Mon Sep 17 00:00:00 2001 From: Yagiz Nizipli Date: Mon, 16 Sep 2024 20:19:46 -0400 Subject: [PATCH] src: improve utf8 string generation performance PR-URL: https://github.com/nodejs/node/pull/54873 Reviewed-By: Daniel Lemire Reviewed-By: Matteo Collina Reviewed-By: James M Snell Reviewed-By: Stephen Belanger --- src/string_bytes.cc | 40 ++++++++++++++++++++++------------------ src/util.cc | 28 +++++++++++++++++++++++++--- 2 files changed, 47 insertions(+), 21 deletions(-) diff --git a/src/string_bytes.cc b/src/string_bytes.cc index 7a5d40b89d4625..8a94d0eb63245c 100644 --- a/src/string_bytes.cc +++ b/src/string_bytes.cc @@ -388,21 +388,21 @@ Maybe StringBytes::StorageSize(Isolate* isolate, Local val, enum encoding encoding) { HandleScope scope(isolate); - size_t data_size = 0; - bool is_buffer = Buffer::HasInstance(val); - if (is_buffer && (encoding == BUFFER || encoding == LATIN1)) { + if (Buffer::HasInstance(val) && (encoding == BUFFER || encoding == LATIN1)) { return Just(Buffer::Length(val)); } Local str; if (!val->ToString(isolate->GetCurrentContext()).ToLocal(&str)) return Nothing(); + String::ValueView view(isolate, str); + size_t data_size = 0; switch (encoding) { case ASCII: case LATIN1: - data_size = str->Length(); + data_size = view.length(); break; case BUFFER: @@ -410,25 +410,25 @@ Maybe StringBytes::StorageSize(Isolate* isolate, // A single UCS2 codepoint never takes up more than 3 utf8 bytes. // It is an exercise for the caller to decide when a string is // long enough to justify calling Size() instead of StorageSize() - data_size = 3 * str->Length(); + data_size = 3 * view.length(); break; case UCS2: - data_size = str->Length() * sizeof(uint16_t); + data_size = view.length() * sizeof(uint16_t); break; case BASE64URL: - data_size = simdutf::base64_length_from_binary(str->Length(), + data_size = simdutf::base64_length_from_binary(view.length(), simdutf::base64_url); break; case BASE64: - data_size = simdutf::base64_length_from_binary(str->Length()); + data_size = simdutf::base64_length_from_binary(view.length()); break; case HEX: - CHECK(str->Length() % 2 == 0 && "invalid hex string length"); - data_size = str->Length() / 2; + CHECK(view.length() % 2 == 0 && "invalid hex string length"); + data_size = view.length() / 2; break; default: @@ -449,32 +449,36 @@ Maybe StringBytes::Size(Isolate* isolate, Local str; if (!val->ToString(isolate->GetCurrentContext()).ToLocal(&str)) return Nothing(); + String::ValueView view(isolate, str); switch (encoding) { case ASCII: case LATIN1: - return Just(str->Length()); + return Just(view.length()); case BUFFER: case UTF8: - return Just(str->Utf8Length(isolate)); + if (view.is_one_byte()) { + return Just(simdutf::utf8_length_from_latin1( + reinterpret_cast(view.data8()), view.length())); + } + return Just(simdutf::utf8_length_from_utf16( + reinterpret_cast(view.data16()), view.length())); case UCS2: - return Just(str->Length() * sizeof(uint16_t)); + return Just(view.length() * sizeof(uint16_t)); case BASE64URL: { - String::Value value(isolate, str); - return Just(simdutf::base64_length_from_binary(value.length(), + return Just(simdutf::base64_length_from_binary(view.length(), simdutf::base64_url)); } case BASE64: { - String::Value value(isolate, str); - return Just(simdutf::base64_length_from_binary(value.length())); + return Just(simdutf::base64_length_from_binary(view.length())); } case HEX: - return Just(str->Length() / 2); + return Just(view.length() / 2); } UNREACHABLE(); diff --git a/src/util.cc b/src/util.cc index f9aad9ef5a6213..173f115a162562 100644 --- a/src/util.cc +++ b/src/util.cc @@ -48,6 +48,8 @@ #include #endif +#include + #include #include #include @@ -100,11 +102,31 @@ static void MakeUtf8String(Isolate* isolate, MaybeStackBuffer* target) { Local string; if (!value->ToString(isolate->GetCurrentContext()).ToLocal(&string)) return; + String::ValueView value_view(isolate, string); + + auto value_length = value_view.length(); + + if (value_view.is_one_byte()) { + auto const_char = reinterpret_cast(value_view.data8()); + auto expected_length = + target->capacity() < (static_cast(value_length) * 2 + 1) + ? simdutf::utf8_length_from_latin1(const_char, value_length) + : value_length * 2; + + // Add +1 for null termination. + target->AllocateSufficientStorage(expected_length + 1); + const auto actual_length = simdutf::convert_latin1_to_utf8( + const_char, value_length, target->out()); + target->SetLengthAndZeroTerminate(actual_length); + return; + } - size_t storage; - if (!StringBytes::StorageSize(isolate, string, UTF8).To(&storage)) return; - storage += 1; + // Add +1 for null termination. + size_t storage = (3 * value_length) + 1; target->AllocateSufficientStorage(storage); + + // TODO(@anonrig): Use simdutf to speed up non-one-byte strings once it's + // implemented const int flags = String::NO_NULL_TERMINATION | String::REPLACE_INVALID_UTF8; const int length =