From 4014ca37948c0bd9db481bf6d37b1c5ef71e2262 Mon Sep 17 00:00:00 2001 From: yangzq50 Date: Fri, 24 Jan 2025 17:52:17 +0800 Subject: [PATCH] Update http support for Array (#2494) ### What problem does this PR solve? Support create, insert and select Array type in HTTP API. Issue link:#2455 ### Type of change - [x] New Feature (non-breaking change which adds functionality) - [x] Test cases --- example/http/insert_search_data.sh | 16 +- src/executor/operator/physical_import.cpp | 24 +++ src/network/http_server.cpp | 196 +++++++++++++--------- src/parser/expr/constant_expr.cpp | 18 ++ src/parser/expr/constant_expr.h | 2 + 5 files changed, 171 insertions(+), 85 deletions(-) diff --git a/example/http/insert_search_data.sh b/example/http/insert_search_data.sh index dfbc03adea..ce2c180569 100755 --- a/example/http/insert_search_data.sh +++ b/example/http/insert_search_data.sh @@ -48,6 +48,10 @@ curl --request POST \ { "name": "tensor", "type": "tensor,4,float" + }, + { + "name": "nested_array", + "type": "array,array,varchar" } ] } ' @@ -67,7 +71,8 @@ curl --request POST \ "vec": [1.0, 1.2, 0.8, 0.9], "sparse_column": {"10":1.1, "20":2.2, "30": 3.3}, "year": 2024, - "tensor": [[1.0, 0.0, 0.0, 0.0], [1.1, 0.0, 0.0, 0.0]] + "tensor": [[1.0, 0.0, 0.0, 0.0], [1.1, 0.0, 0.0, 0.0]], + "nested_array": {"array": [{"array": ["hello", "world"]}, {"array": ["!"]}, {"array": [""]}, {"array": []}]} }, { "num": 2, @@ -76,7 +81,8 @@ curl --request POST \ "vec": [4.0, 4.2, 4.3, 4.5], "sparse_column": {"40":4.4, "50":5.5, "60": 6.6}, "year": 2023, - "tensor": [[4.0, 0.0, 4.3, 4.5], [4.0, 4.2, 4.4, 5.0]] + "tensor": [[4.0, 0.0, 4.3, 4.5], [4.0, 4.2, 4.4, 5.0]], + "nested_array": {"array": [{"array": ["hello world!"]}]} }, { "num": 3, @@ -85,7 +91,8 @@ curl --request POST \ "vec": [4.0, 4.2, 4.3, 4.2], "sparse_column": {"70":7.7, "80":8.8, "90": 9.9}, "year": 2019, - "tensor": [[0.9, 0.1, 0.0, 0.0], [1.1, 0.0, 0.0, 0.0]] + "tensor": [[0.9, 0.1, 0.0, 0.0], [1.1, 0.0, 0.0, 0.0]], + "nested_array": {"array": [{"array": []}]} }, { "num": 4, @@ -94,7 +101,8 @@ curl --request POST \ "vec": [4.0, 4.2, 4.3, 4.5], "sparse_column": {"20":7.7, "80":7.8, "90": 97.9}, "year": 2018, - "tensor": [[5.0, 4.2, 4.3, 4.5], [4.0, 4.2, 4.3, 4.4]] + "tensor": [[5.0, 4.2, 4.3, 4.5], [4.0, 4.2, 4.3, 4.4]], + "nested_array": {"array": []} } ] ' diff --git a/src/executor/operator/physical_import.cpp b/src/executor/operator/physical_import.cpp index 265f48a246..4172761706 100644 --- a/src/executor/operator/physical_import.cpp +++ b/src/executor/operator/physical_import.cpp @@ -936,6 +936,30 @@ SharedPtr BuildConstantExprFromJson(const nlohmann::json &json_obj } } } + case nlohmann::json::value_t::object: { + if (json_object.size() == 1 && json_object.begin().key() == "array") { + const auto &array_obj = json_object.begin().value(); + if (array_obj.type() != nlohmann::json::value_t::array) { + const auto error_info = fmt::format("Unrecognized json object type in array: {}, expect array!", array_obj.type_name()); + RecoverableError(Status::ImportFileFormatError(error_info)); + return nullptr; + } + auto res = MakeShared(LiteralType::kCurlyBracketsArray); + for (const auto &elem : array_obj) { + auto elem_expr = BuildConstantExprFromJson(elem); + if (!elem_expr) { + RecoverableError(Status::ImportFileFormatError("Failed to build expr for element of array!")); + return nullptr; + } + res->curly_brackets_array_.push_back(std::move(elem_expr)); + } + return res; + } else { + const auto error_info = fmt::format("Unrecognized json object type: {}", json_object.type_name()); + RecoverableError(Status::ImportFileFormatError(error_info)); + return nullptr; + } + } default: { const auto error_info = fmt::format("Unrecognized json object type: {}", json_object.type_name()); RecoverableError(Status::ImportFileFormatError(error_info)); diff --git a/src/network/http_server.cpp b/src/network/http_server.cpp index 3c0dff0682..8fa9881461 100644 --- a/src/network/http_server.cpp +++ b/src/network/http_server.cpp @@ -57,6 +57,7 @@ import logical_type; import embedding_info; import sparse_info; import decimal_info; +import array_info; import status; import constant_expr; import command_statement; @@ -66,6 +67,98 @@ namespace { using namespace infinity; +Pair, infinity::Status> ParseColumnType(const Span tokens, const nlohmann::json &field_element) { + SharedPtr column_type; + if (tokens.empty()) { + return {nullptr, infinity::Status::ParserError("Empty column type")}; + } else if (tokens[0] == "vector" || tokens[0] == "multivector" || tokens[0] == "tensor" || tokens[0] == "tensorarray") { + if (tokens.size() != 3) { + return {nullptr, infinity::Status::ParserError("vector / multivector / tensor / tensorarray type syntax error")}; + } + const SizeT dimension = std::stoull(tokens[1]); + const auto &etype = tokens[2]; + EmbeddingDataType e_data_type; + if (etype == "int" || etype == "integer" || etype == "int32") { + e_data_type = EmbeddingDataType::kElemInt32; + } else if (etype == "uint8") { + e_data_type = EmbeddingDataType::kElemUInt8; + } else if (etype == "int8" || etype == "tinyint") { + e_data_type = EmbeddingDataType::kElemInt8; + } else if (etype == "float" || etype == "float32") { + e_data_type = EmbeddingDataType::kElemFloat; + } else if (etype == "double" || etype == "float64") { + e_data_type = EmbeddingDataType::kElemDouble; + } else if (etype == "float16") { + e_data_type = EmbeddingDataType::kElemFloat16; + } else if (etype == "bfloat16") { + e_data_type = EmbeddingDataType::kElemBFloat16; + } else if (etype == "bit") { + e_data_type = EmbeddingDataType::kElemBit; + } else { + return {nullptr, infinity::Status::InvalidEmbeddingDataType(etype)}; + } + auto type_info = EmbeddingInfo::Make(e_data_type, dimension); + auto logical_type_v = LogicalType::kInvalid; + if (tokens[0] == "vector") { + logical_type_v = LogicalType::kEmbedding; + } else if (tokens[0] == "multivector") { + logical_type_v = LogicalType::kMultiVector; + } else if (tokens[0] == "tensor") { + logical_type_v = LogicalType::kTensor; + } else if (tokens[0] == "tensorarray") { + logical_type_v = LogicalType::kTensorArray; + } + column_type = std::make_shared(logical_type_v, std::move(type_info)); + } else if (tokens[0] == "sparse") { + if (tokens.size() != 4) { + return {nullptr, infinity::Status::ParserError("sparse type syntax error")}; + } + const SizeT dimension = std::stoull(tokens[1]); + const auto &dtype = tokens[2]; + const auto &itype = tokens[3]; + EmbeddingDataType d_data_type; + EmbeddingDataType i_data_type; + if (dtype == "integer" || dtype == "int" || dtype == "int32") { + d_data_type = EmbeddingDataType::kElemInt32; + } else if (dtype == "float" || dtype == "float32") { + d_data_type = EmbeddingDataType::kElemFloat; + } else if (dtype == "double" || dtype == "float64") { + d_data_type = EmbeddingDataType::kElemDouble; + } else { + return {nullptr, infinity::Status::InvalidEmbeddingDataType(dtype)}; + } + + if (itype == "tinyint" || itype == "int8") { + i_data_type = EmbeddingDataType::kElemInt8; + } else if (itype == "smallint" || itype == "int16") { + i_data_type = EmbeddingDataType::kElemInt16; + } else if (itype == "integer" || itype == "int" || itype == "int32") { + i_data_type = EmbeddingDataType::kElemInt32; + } else if (itype == "bigint" || itype == "int64") { + i_data_type = EmbeddingDataType::kElemInt64; + } else { + return {nullptr, infinity::Status::InvalidEmbeddingDataType(itype)}; + } + auto type_info = SparseInfo::Make(d_data_type, i_data_type, dimension, SparseStoreType::kSort); + column_type = std::make_shared(LogicalType::kSparse, std::move(type_info)); + } else if (tokens[0] == "decimal") { + auto type_info = DecimalInfo::Make(field_element["precision"], field_element["scale"]); + column_type = std::make_shared(LogicalType::kDecimal, std::move(type_info)); + } else if (tokens[0] == "array") { + auto [element_type, stat] = ParseColumnType(tokens.subspan<1>(), field_element); + if (!stat.ok()) { + return {nullptr, std::move(stat)}; + } + auto type_info = ArrayInfo::Make(std::move(*element_type)); + column_type = std::make_shared(LogicalType::kArray, std::move(type_info)); + } else if (tokens.size() == 1) { + column_type = DataType::StringDeserialize(tokens[0]); + } else { + return {nullptr, infinity::Status::ParserError(fmt::format("{} isn't supported.", tokens[0]))}; + } + return std::make_pair(std::move(column_type), infinity::Status::OK()); +} + infinity::Status ParseColumnDefs(const nlohmann::json &fields, Vector &column_definitions) { SizeT column_count = fields.size(); for (SizeT column_id = 0; column_id < column_count; ++column_id) { @@ -86,89 +179,12 @@ infinity::Status ParseColumnDefs(const nlohmann::json &fields, Vector column_type{nullptr}; - SharedPtr type_info{nullptr}; try { - if (tokens[0] == "vector" || tokens[0] == "multivector" || tokens[0] == "tensor" || tokens[0] == "tensorarray") { - if (tokens.size() != 3) { - return infinity::Status::ParserError("vector / multivector / tensor / tensorarray type syntax error"); - } - SizeT dimension = std::stoull(tokens[1]); - String etype = tokens[2]; - EmbeddingDataType e_data_type; - if (etype == "int" || etype == "integer" || etype == "int32") { - e_data_type = EmbeddingDataType::kElemInt32; - } else if (etype == "uint8") { - e_data_type = EmbeddingDataType::kElemUInt8; - } else if (etype == "int8" || etype == "tinyint") { - e_data_type = EmbeddingDataType::kElemInt8; - } else if (etype == "float" || etype == "float32") { - e_data_type = EmbeddingDataType::kElemFloat; - } else if (etype == "double" || etype == "float64") { - e_data_type = EmbeddingDataType::kElemDouble; - } else if (etype == "float16") { - e_data_type = EmbeddingDataType::kElemFloat16; - } else if (etype == "bfloat16") { - e_data_type = EmbeddingDataType::kElemBFloat16; - } else if (etype == "bit") { - e_data_type = EmbeddingDataType::kElemBit; - } else { - return infinity::Status::InvalidEmbeddingDataType(etype); - } - type_info = EmbeddingInfo::Make(e_data_type, dimension); - LogicalType logical_type_v = LogicalType::kInvalid; - if (tokens[0] == "vector") { - logical_type_v = LogicalType::kEmbedding; - } else if (tokens[0] == "multivector") { - logical_type_v = LogicalType::kMultiVector; - } else if (tokens[0] == "tensor") { - logical_type_v = LogicalType::kTensor; - } else if (tokens[0] == "tensorarray") { - logical_type_v = LogicalType::kTensorArray; - } - column_type = std::make_shared(logical_type_v, type_info); - } else if (tokens[0] == "sparse") { - if (tokens.size() != 4) { - return infinity::Status::ParserError("sparse type syntax error"); - } - SizeT dimension = std::stoull(tokens[1]); - String dtype = tokens[2]; - String itype = tokens[3]; - EmbeddingDataType d_data_type; - EmbeddingDataType i_data_type; - if (dtype == "integer" || dtype == "int" || dtype == "int32") { - d_data_type = EmbeddingDataType::kElemInt32; - } else if (dtype == "float" || dtype == "float32") { - d_data_type = EmbeddingDataType::kElemFloat; - } else if (dtype == "double" || dtype == "float64") { - d_data_type = EmbeddingDataType::kElemDouble; - } else { - return infinity::Status::InvalidEmbeddingDataType(dtype); - } - - if (itype == "tinyint" || itype == "int8") { - i_data_type = EmbeddingDataType::kElemInt8; - } else if (itype == "smallint" || itype == "int16") { - i_data_type = EmbeddingDataType::kElemInt16; - } else if (itype == "integer" || itype == "int" || itype == "int32") { - i_data_type = EmbeddingDataType::kElemInt32; - } else if (itype == "bigint" || itype == "int64") { - i_data_type = EmbeddingDataType::kElemInt64; - } else { - return infinity::Status::InvalidEmbeddingDataType(itype); - } - type_info = SparseInfo::Make(d_data_type, i_data_type, dimension, SparseStoreType::kSort); - column_type = std::make_shared(LogicalType::kSparse, type_info); - } else if (tokens[0] == "decimal") { - type_info = DecimalInfo::Make(field_element["precision"], field_element["scale"]); - column_type = std::make_shared(LogicalType::kDecimal, type_info); - } else if (tokens[0] == "array") { - type_info = nullptr; - return infinity::Status::ParserError("Array isn't implemented here."); - } else if (tokens.size() == 1) { - column_type = DataType::StringDeserialize(tokens[0]); - } else { - return infinity::Status::ParserError(fmt::format("{} isn't supported.", tokens[0])); + auto [result_type, err_status] = ParseColumnType(tokens, field_element); + if (!err_status.ok()) { + return std::move(err_status); } + column_type = std::move(result_type); } catch (std::exception &e) { return infinity::Status::ParserError("type syntax error"); } @@ -1168,6 +1184,24 @@ class InsertHandler final : public HttpRequestHandler { break; } case nlohmann::json::value_t::object: { + // check array type + if (value.size() == 1 && value.begin().key() == "array") { + SharedPtr array_expr; + try { + auto array_result = BuildConstantExprFromJson(value); + if (!array_result) { + throw std::runtime_error("Empty return value!"); + } + array_expr = std::move(array_result); + } catch (std::exception &e) { + json_response["error_code"] = ErrorCode::kSyntaxError; + json_response["error_message"] = fmt::format("Error when parsing array value: {}", e.what()); + return ResponseFactory::createResponse(http_status, json_response.dump()); + } + auto const_expr = std::make_unique(std::move(*array_expr)); + insert_one_row->values_.emplace_back(std::move(const_expr)); + break; + } std::unique_ptr const_sparse_expr = {}; if (value.size() == 0) { json_response["error_code"] = ErrorCode::kInvalidEmbeddingDataType; diff --git a/src/parser/expr/constant_expr.cpp b/src/parser/expr/constant_expr.cpp index c217ff812f..5f8869741d 100644 --- a/src/parser/expr/constant_expr.cpp +++ b/src/parser/expr/constant_expr.cpp @@ -26,6 +26,24 @@ namespace infinity { +ConstantExpr::ConstantExpr(ConstantExpr &&constant_expr) noexcept : ParsedExpr(ParsedExprType::kConstant) { + literal_type_ = constant_expr.literal_type_; + bool_value_ = constant_expr.bool_value_; + integer_value_ = constant_expr.integer_value_; + double_value_ = constant_expr.double_value_; + str_value_ = constant_expr.str_value_; + constant_expr.str_value_ = nullptr; + interval_type_ = constant_expr.interval_type_; + date_value_ = constant_expr.date_value_; + constant_expr.date_value_ = nullptr; + long_array_ = std::move(constant_expr.long_array_); + double_array_ = std::move(constant_expr.double_array_); + sub_array_array_ = std::move(constant_expr.sub_array_array_); + long_sparse_array_ = std::move(constant_expr.long_sparse_array_); + double_sparse_array_ = std::move(constant_expr.double_sparse_array_); + curly_brackets_array_ = std::move(constant_expr.curly_brackets_array_); +} + ConstantExpr::~ConstantExpr() { switch (literal_type_) { case LiteralType::kString: { diff --git a/src/parser/expr/constant_expr.h b/src/parser/expr/constant_expr.h index 03b6b086fd..50fde5864e 100644 --- a/src/parser/expr/constant_expr.h +++ b/src/parser/expr/constant_expr.h @@ -51,6 +51,8 @@ class ConstantExpr : public ParsedExpr { public: explicit ConstantExpr(LiteralType literal_type) : ParsedExpr(ParsedExprType::kConstant), literal_type_(literal_type) {} + ConstantExpr(ConstantExpr &&constant_expr) noexcept; + ~ConstantExpr() override; [[nodiscard]] std::string ToString() const override;