From e40416ecc9c17db45ffa5c8f8d267a72162335a1 Mon Sep 17 00:00:00 2001
From: HarryLeeIBM
Date: Wed, 23 Aug 2023 14:17:19 -0700
Subject: [PATCH] Fix parquet endian issue for s390x

Byte-swap the values the Parquet reader loads directly from file bytes
(level byte counts, PLAIN-encoded values, BYTE_ARRAY length prefixes,
BYTE_STREAM_SPLIT values and the footer metadata length) so that they
are interpreted as little-endian on big-endian hosts such as s390x.
On little-endian hosts the conversions are no-ops and DecodePlain keeps
its single bulk memcpy.

---
NOTE(review): the big-endian branch of DecodePlain reverses all sizeof(T)
bytes of each value. Confirm that this is the intended layout for any
multi-field T that DecodePlain is instantiated with.

 cpp/src/parquet/column_reader.cc |  2 +-
 cpp/src/parquet/encoding.cc      | 23 +++++++++++++++++------
 cpp/src/parquet/file_reader.cc   |  4 ++--
 3 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc
index 3294aaaf283f1..fd00c489c3f8e 100644
--- a/cpp/src/parquet/column_reader.cc
+++ b/cpp/src/parquet/column_reader.cc
@@ -118,7 +118,7 @@ int LevelDecoder::SetData(Encoding::type encoding, int16_t max_level,
       if (data_size < 4) {
         throw ParquetException("Received invalid levels (corrupt data page?)");
       }
-      num_bytes = ::arrow::util::SafeLoadAs<int32_t>(data);
+      num_bytes = ::arrow::bit_util::FromLittleEndian(::arrow::util::SafeLoadAs<int32_t>(data));
       if (num_bytes < 0 || num_bytes > data_size - 4) {
         throw ParquetException("Received invalid number of bytes (corrupt data page?)");
       }
diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc
index dda0e7701b1e4..2f0cad15854a4 100644
--- a/cpp/src/parquet/encoding.cc
+++ b/cpp/src/parquet/encoding.cc
@@ -1028,7 +1028,7 @@ int PlainDecoder<DType>::DecodeArrow(
   VisitNullBitmapInline(
       valid_bits, valid_bits_offset, num_values, null_count,
       [&]() {
-        builder->UnsafeAppend(SafeLoadAs<value_type>(data_));
+        builder->UnsafeAppend(::arrow::bit_util::FromLittleEndian(SafeLoadAs<value_type>(data_)));
         data_ += sizeof(value_type);
       },
       [&]() { builder->UnsafeAppendNull(); });
@@ -1055,7 +1055,8 @@ int PlainDecoder<DType>::DecodeArrow(
   VisitNullBitmapInline(
       valid_bits, valid_bits_offset, num_values, null_count,
       [&]() {
-        PARQUET_THROW_NOT_OK(builder->Append(SafeLoadAs<value_type>(data_)));
+        PARQUET_THROW_NOT_OK(
+            builder->Append(::arrow::bit_util::FromLittleEndian(SafeLoadAs<value_type>(data_))));
         data_ += sizeof(value_type);
       },
       [&]() { PARQUET_THROW_NOT_OK(builder->AppendNull()); });
@@ -1075,7 +1076,17 @@ inline int DecodePlain(const uint8_t* data, int64_t data_size, int num_values,
   }
   // If bytes_to_decode == 0, data could be null
   if (bytes_to_decode > 0) {
+#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+    for (int i = 0; i < num_values; ++i)
+    {
+      memcpy(out + i, data + sizeof(T) * i, sizeof(T));
+      auto begin = reinterpret_cast<uint8_t*>(out + i);
+      auto end = begin + sizeof(T);
+      std::reverse(begin, end);
+    }
+#else
     memcpy(out, data, bytes_to_decode);
+#endif
   }
   return static_cast<int>(bytes_to_decode);
 }
@@ -1098,7 +1109,7 @@ static inline int64_t ReadByteArray(const uint8_t* data, int64_t data_size,
   if (ARROW_PREDICT_FALSE(data_size < 4)) {
     ParquetException::EofException();
   }
-  const int32_t len = SafeLoadAs<int32_t>(data);
+  const int32_t len = ::arrow::bit_util::FromLittleEndian(SafeLoadAs<int32_t>(data));
   if (len < 0) {
     throw ParquetException("Invalid BYTE_ARRAY value");
   }
@@ -1387,7 +1398,7 @@ class PlainByteArrayDecoder : public PlainDecoder<ByteArrayType>,
           if (ARROW_PREDICT_FALSE(len_ < 4)) {
             ParquetException::EofException();
           }
-          auto value_len = SafeLoadAs<int32_t>(data_);
+          auto value_len = ::arrow::bit_util::FromLittleEndian(SafeLoadAs<int32_t>(data_));
           if (ARROW_PREDICT_FALSE(value_len < 0 || value_len > INT32_MAX - 4)) {
             return Status::Invalid("Invalid or corrupted value_len '", value_len, "'");
           }
@@ -1433,7 +1444,7 @@ class PlainByteArrayDecoder : public PlainDecoder<ByteArrayType>,
           if (ARROW_PREDICT_FALSE(len_ < 4)) {
             ParquetException::EofException();
           }
-          auto value_len = SafeLoadAs<int32_t>(data_);
+          auto value_len = ::arrow::bit_util::FromLittleEndian(SafeLoadAs<int32_t>(data_));
           if (ARROW_PREDICT_FALSE(value_len < 0 || value_len > INT32_MAX - 4)) {
             return Status::Invalid("Invalid or corrupted value_len '", value_len, "'");
           }
@@ -3312,7 +3323,7 @@ int ByteStreamSplitDecoder<DType>::DecodeArrow(
           const size_t byte_index = b * num_values_in_buffer_ + offset;
           gathered_byte_data[b] = data[byte_index];
         }
-        builder->UnsafeAppend(SafeLoadAs<T>(&gathered_byte_data[0]));
+        builder->UnsafeAppend(::arrow::bit_util::FromLittleEndian(SafeLoadAs<T>(&gathered_byte_data[0])));
         ++offset;
       },
       [&]() { builder->UnsafeAppendNull(); });
diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc
index adda9a027bded..d0d4a6b626f7b 100644
--- a/cpp/src/parquet/file_reader.cc
+++ b/cpp/src/parquet/file_reader.cc
@@ -473,9 +473,9 @@ class SerializedFile : public ParquetFileReader::Contents {
           "is not a parquet file.");
     }
     // Both encrypted/unencrypted footers have the same footer length check.
-    uint32_t metadata_len = ::arrow::util::SafeLoadAs<uint32_t>(
+    uint32_t metadata_len = ::arrow::bit_util::FromLittleEndian(::arrow::util::SafeLoadAs<uint32_t>(
         reinterpret_cast<const uint8_t*>(footer_buffer->data()) + footer_read_size -
-        kFooterSize);
+        kFooterSize));
     if (metadata_len > source_size_ - kFooterSize) {
       throw ParquetInvalidOrCorruptedFileException(
           "Parquet file size is ", source_size_,