/// @brief Truncates a UTF-8 string to at most `length` bytes without splitting
///        a multi-byte code point.
///
/// If `str` already fits in `length` bytes it is returned unchanged. Otherwise
/// the cut point is moved back from byte offset `length` to the nearest code
/// point boundary, so the result is the longest prefix of whole code points
/// that fits. For example, "abcdefO(1/ε^2)" is 15 bytes ('ε' takes two);
/// truncating it to 12 bytes yields "abcdefO(1/ε" (12 bytes), and truncating
/// it to 11 bytes yields "abcdefO(1/" (10 bytes).
///
/// The input is treated as a sized byte range: embedded '\0' bytes are
/// ordinary characters and do not end the scan.
///
/// Malformed input never hangs: if no lead byte is found within the three
/// bytes preceding the cut point, the string is cut at exactly `length` bytes.
///
/// @param str    The text to truncate; expected to be UTF-8 encoded.
/// @param length Maximum size of the result, in bytes.
/// @return A view into `str` (no copy is made) of size <= `length`. It is
///         valid only as long as the storage behind `str` is.
std::string_view truncate_utf8(std::string_view str, size_t length) {
  if (str.size() <= length) {
    return str;
  }

  // Continuation bytes have the bit pattern 10xxxxxx.
  const auto is_continuation = [](char c) {
    return (static_cast<unsigned char>(c) & 0xC0) == 0x80;
  };

  // A UTF-8 code point has at most 3 continuation bytes, so the boundary at
  // or before `length` is never more than 3 bytes back in well-formed input.
  constexpr size_t kMaxContinuationBytes = 3;

  // `cut` is the index of the first byte that gets dropped. Indexing str[cut]
  // is in bounds because cut <= length < str.size().
  size_t cut = length;
  size_t steps = 0;
  while (cut > 0 && steps < kMaxContinuationBytes && is_continuation(str[cut])) {
    --cut;
    ++steps;
  }

  if (is_continuation(str[cut])) {
    // Malformed input (no lead byte nearby): fall back to a plain byte cut
    // rather than discarding data or looping.
    cut = length;
  }
  return str.substr(0, cut);
}
VLOG(1) << "String length" << copied_val.size() << " exceeds the maximum length: " << width_ << ", cut off."; - copied_val = copied_val.substr(0, width_); + copied_val = truncate_utf8(copied_val, width_); } if (idx >= basic_size_ && idx < basic_size_ + extra_size_) { size_t offset = pos_.fetch_add(copied_val.size());