diff --git a/velox/dwio/common/IntDecoder.h b/velox/dwio/common/IntDecoder.h index 913692b03eb90..d9fc2bbac4e17 100644 --- a/velox/dwio/common/IntDecoder.h +++ b/velox/dwio/common/IntDecoder.h @@ -150,9 +150,6 @@ class IntDecoder { template T readInt(); - // Reads Int96 timestamp composed of days and nanos as int128_t. - int128_t readInt96(); - template T readVInt(); @@ -169,6 +166,7 @@ class IntDecoder { void skipVarints(uint64_t items); int128_t readVsHugeInt(); uint128_t readVuHugeInt(); + int128_t readTimestamp(); // NOTE: there is opportunity for performance gains here by avoiding this by // directly supporting deserialization into the correct target data type @@ -437,24 +435,28 @@ inline T IntDecoder::readInt() { if (useVInts_) { return readVInt(); } + if (bigEndian_) { return readLittleEndianFromBigEndian(); - } else { - if constexpr (std::is_same_v) { - if (numBytes_ == 12) { - VELOX_DCHECK(!useVInts_, "Int96 should not be VInt encoded."); - return readInt96(); - } - VELOX_NYI(); + } + + if constexpr (std::is_same_v) { + // Timestamp as Int64 or Int96 + if (numBytes_ == 8 || numBytes_ == 12) { + return readTimestamp(); } - return readLongLE(); + VELOX_NYI(); } + + return readLongLE(); } template -inline int128_t IntDecoder::readInt96() { +inline int128_t IntDecoder::readTimestamp() { + VELOX_DCHECK(!useVInts_, "Timestamp should not be VInt encoded."); + int128_t result = 0; - for (int i = 0; i < 12; ++i) { + for (int i = 0; i < numBytes_; ++i) { auto ch = readByte(); result |= static_cast(ch & BASE_256_MASK) << (i * 8); } diff --git a/velox/dwio/parquet/reader/TimestampColumnReader.h b/velox/dwio/parquet/reader/TimestampColumnReader.h index 308743ecea4fc..4c950df46c450 100644 --- a/velox/dwio/parquet/reader/TimestampColumnReader.h +++ b/velox/dwio/parquet/reader/TimestampColumnReader.h @@ -75,24 +75,26 @@ class TimestampColumnReader : public IntegerColumnReader { continue; } - // Convert int128_t to Timestamp by extracting days and nanos. + // Convert int128_t to Timestamp by extracting seconds and nanos. const int128_t encoded = reinterpret_cast(rawValues[i]); - const int32_t days = static_cast(encoded >> 64); - uint64_t nanos = encoded & ((((1ULL << 63) - 1ULL) << 1) + 1); - const auto timestamp = Timestamp::fromDaysAndNanos(days, nanos); - - nanos = timestamp.getNanos(); - switch (timestampPrecision_) { - case TimestampPrecision::kMilliseconds: - nanos = nanos / 1'000'000 * 1'000'000; - break; - case TimestampPrecision::kMicroseconds: - nanos = nanos / 1'000 * 1'000; - break; - case TimestampPrecision::kNanoseconds: - break; + int128_t seconds = encoded; + int precisionWidth = 0; + while (seconds >= 10'000'000'000) { + seconds /= 10; + ++precisionWidth; } - rawValues[i] = Timestamp(timestamp.getSeconds(), nanos); + VELOX_CHECK( + precisionWidth == 3 || precisionWidth == 6 || precisionWidth == 9); + + uint64_t nanos = encoded - seconds * std::pow(10, precisionWidth); + const int timestampPrecision = static_cast(timestampPrecision_); + if (precisionWidth < timestampPrecision) { + nanos *= std::pow(10, timestampPrecision - precisionWidth); + } else if (timestampPrecision < precisionWidth) { + nanos /= std::pow(10, precisionWidth - timestampPrecision); + } + + rawValues[i] = Timestamp(static_cast(seconds), nanos); } }