From b439c144d009cdf0fb932d11dad1f3209764d42c Mon Sep 17 00:00:00 2001 From: Chengcheng Jin Date: Thu, 22 Aug 2024 10:17:33 +0000 Subject: [PATCH 1/5] [GLUTEN-6961] [VL] [feat] ArrowWritableColumnVector support write to decimal --- cpp/velox/tests/VeloxRowToColumnarTest.cc | 50 ++++++++++++++++++- .../vectorized/ArrowWritableColumnVector.java | 16 ++++-- 2 files changed, 62 insertions(+), 4 deletions(-) diff --git a/cpp/velox/tests/VeloxRowToColumnarTest.cc b/cpp/velox/tests/VeloxRowToColumnarTest.cc index c784dbd59c34..0d11dd4acbc9 100644 --- a/cpp/velox/tests/VeloxRowToColumnarTest.cc +++ b/cpp/velox/tests/VeloxRowToColumnarTest.cc @@ -87,10 +87,58 @@ TEST_F(VeloxRowToColumnarTest, allTypes) { makeNullableFlatVector( {std::nullopt, true, false, std::nullopt, true, true, false, true, std::nullopt, std::nullopt}), makeFlatVector( - {"alice0", "bob1", "alice2", "bob3", "Alice4", "Bob5", "AlicE6", "boB7", "ALICE8", "BOB9"}), + {"alice0", + "bob1", + "alice2", + "bob3", + "Alice4", + "Bob5123456789098766notinline", + "AlicE6", + "boB7", + "ALICE8", + "BOB9"}), makeNullableFlatVector( {"alice", "bob", std::nullopt, std::nullopt, "Alice", "Bob", std::nullopt, "alicE", std::nullopt, "boB"}), }); testRowVectorEqual(vector); } + +TEST_F(VeloxRowToColumnarTest, bigint) { + auto vector = makeRowVector({ + makeNullableFlatVector({1, 2, 3, std::nullopt, 4, std::nullopt, 5, 6, std::nullopt, 7}), + }); + testRowVectorEqual(vector); +} + +TEST_F(VeloxRowToColumnarTest, decimal) { + auto vector = makeRowVector({ + makeNullableFlatVector( + {123456, HugeInt::build(1045, 1789), 3678, std::nullopt, 4, std::nullopt, 5, 687987, std::nullopt, 7}, + DECIMAL(38, 2)), + makeNullableFlatVector( + {178987, 2, 3, std::nullopt, 4, std::nullopt, 5, 6, std::nullopt, 7}, DECIMAL(12, 3)), + }); + testRowVectorEqual(vector); +} + +TEST_F(VeloxRowToColumnarTest, timestamp) { + auto vector = makeRowVector({ + makeNullableFlatVector( + {Timestamp(-946684800, 0), + Timestamp(-7266, 0), + Timestamp(0, 0), + Timestamp(946684800, 0), + Timestamp(9466848000, 0), + Timestamp(94668480000, 0), + Timestamp(946729316, 0), + Timestamp(946729316, 0), + Timestamp(946729316, 0), + Timestamp(7266, 0), + Timestamp(-50049331200, 0), + Timestamp(253405036800, 0), + Timestamp(-62480037600, 0), + std::nullopt}), + }); + testRowVectorEqual(vector); +} } // namespace gluten diff --git a/gluten-data/src/main/java/org/apache/gluten/vectorized/ArrowWritableColumnVector.java b/gluten-data/src/main/java/org/apache/gluten/vectorized/ArrowWritableColumnVector.java index dfd570debc0a..da741af0f01c 100644 --- a/gluten-data/src/main/java/org/apache/gluten/vectorized/ArrowWritableColumnVector.java +++ b/gluten-data/src/main/java/org/apache/gluten/vectorized/ArrowWritableColumnVector.java @@ -718,6 +718,17 @@ public Decimal getDecimal(int rowId, int precision, int scale) { return accessor.getDecimal(rowId, precision, scale); } + @Override + public void putDecimal(int rowId, Decimal value, int precision) { + if (precision <= Decimal.MAX_INT_DIGITS()) { + putInt(rowId, (int) value.toUnscaledLong()); + } else if (precision <= Decimal.MAX_LONG_DIGITS()) { + putLong(rowId, value.toUnscaledLong()); + } else { + writer.setBytes(rowId, value.toJavaBigDecimal()); + } + } + @Override public UTF8String getUTF8String(int rowId) { if (isNullAt(rowId)) { @@ -1255,9 +1266,8 @@ void setNull(int rowId) { throw new UnsupportedOperationException(); } - void setNotNull(int rowId) { - throw new UnsupportedOperationException(); - } + // Arrow not need to setNotNull, set the valus is enough. + void setNotNull(int rowId) {} void setNulls(int rowId, int count) { throw new UnsupportedOperationException(); From 8b8aa2904d491714b34a39e66c637d285a168604 Mon Sep 17 00:00:00 2001 From: Chengcheng Jin Date: Fri, 23 Aug 2024 10:33:01 +0000 Subject: [PATCH 2/5] add test --- .../vectorized/ArrowColumnVectorTest.java | 50 +++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 backends-velox/src/test/java/org/apache/gluten/vectorized/ArrowColumnVectorTest.java diff --git a/backends-velox/src/test/java/org/apache/gluten/vectorized/ArrowColumnVectorTest.java b/backends-velox/src/test/java/org/apache/gluten/vectorized/ArrowColumnVectorTest.java new file mode 100644 index 000000000000..11330544df78 --- /dev/null +++ b/backends-velox/src/test/java/org/apache/gluten/vectorized/ArrowColumnVectorTest.java @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.gluten.vectorized; + +import org.apache.spark.sql.execution.vectorized.MutableColumnarRow; +import org.apache.spark.sql.types.Decimal; +import org.apache.spark.sql.types.StructType; +import org.apache.spark.util.TaskResources$; +import org.junit.Assert; +import org.junit.Test; + +public class ArrowColumnVectorTest { + + @Test + public void testWriteByMutableColumnarRow() { + TaskResources$.MODULE$.runUnsafe( + () -> { + final ArrowWritableColumnVector[] columns = newArrowColumns("a decimal(20, 1)", 20); + MutableColumnarRow row = new MutableColumnarRow(columns); + Decimal decimal = new Decimal(); + decimal.set(234, 20, 1); + row.setDecimal(0, decimal, 20); + Assert.assertEquals(row.getDecimal(0, 20, 1), decimal); + return null; + }); + } + + private static ArrowWritableColumnVector[] newArrowColumns(String schema, int numRows) { + ArrowWritableColumnVector[] columns = + ArrowWritableColumnVector.allocateColumns(numRows, StructType.fromDDL(schema)); + for (ArrowWritableColumnVector col : columns) { + col.setValueCount(numRows); + } + return columns; + } +} From cd1f114ddd8a96d04b31283459c52b29cb18ac16 Mon Sep 17 00:00:00 2001 From: Chengcheng Jin Date: Fri, 23 Aug 2024 10:46:23 +0000 Subject: [PATCH 3/5] refactor --- .../vectorized/ArrowWritableColumnVector.java | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/gluten-data/src/main/java/org/apache/gluten/vectorized/ArrowWritableColumnVector.java b/gluten-data/src/main/java/org/apache/gluten/vectorized/ArrowWritableColumnVector.java index da741af0f01c..55bef42664b7 100644 --- a/gluten-data/src/main/java/org/apache/gluten/vectorized/ArrowWritableColumnVector.java +++ b/gluten-data/src/main/java/org/apache/gluten/vectorized/ArrowWritableColumnVector.java @@ -718,17 +718,6 @@ public Decimal getDecimal(int rowId, int precision, int scale) { return accessor.getDecimal(rowId, precision, scale); } - @Override - public void putDecimal(int rowId, Decimal value, int precision) { - if (precision <= Decimal.MAX_INT_DIGITS()) { - putInt(rowId, (int) value.toUnscaledLong()); - } else if (precision <= Decimal.MAX_LONG_DIGITS()) { - putLong(rowId, value.toUnscaledLong()); - } else { - writer.setBytes(rowId, value.toJavaBigDecimal()); - } - } - @Override public UTF8String getUTF8String(int rowId) { if (isNullAt(rowId)) { @@ -1755,6 +1744,14 @@ final void setLong(int rowId, long value) { final void setBytes(int rowId, BigDecimal value) { writer.setSafe(rowId, value); } + + final void setBytes(int rowId, int count, byte[] src, int srcIndex) { + if (count == src.length && srcIndex == 0) { + writer.setBigEndianSafe(rowId, src); + } else { + throw new UnsupportedOperationException(); + } + } } private static class StringWriter extends ArrowVectorWriter { From 001e259499b6ced6c4d51f3be9fd51b935917908 Mon Sep 17 00:00:00 2001 From: Chengcheng Jin Date: Fri, 23 Aug 2024 10:48:53 +0000 Subject: [PATCH 4/5] fix comments --- .../org/apache/gluten/vectorized/ArrowWritableColumnVector.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gluten-data/src/main/java/org/apache/gluten/vectorized/ArrowWritableColumnVector.java b/gluten-data/src/main/java/org/apache/gluten/vectorized/ArrowWritableColumnVector.java index 55bef42664b7..3e237b896b73 100644 --- a/gluten-data/src/main/java/org/apache/gluten/vectorized/ArrowWritableColumnVector.java +++ b/gluten-data/src/main/java/org/apache/gluten/vectorized/ArrowWritableColumnVector.java @@ -1255,7 +1255,7 @@ void setNull(int rowId) { throw new UnsupportedOperationException(); } - // Arrow not need to setNotNull, set the valus is enough. + // Most of date type of Arrow does not need to setNotNull, set the value is enough. void setNotNull(int rowId) {} void setNulls(int rowId, int count) { From 58cec3e368fa7ac925c6ec4259c9441022d50bb1 Mon Sep 17 00:00:00 2001 From: Hongze Zhang Date: Thu, 29 Aug 2024 11:10:08 +0800 Subject: [PATCH 5/5] fixup --- .../vectorized/ArrowWritableColumnVector.java | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/gluten-data/src/main/java/org/apache/gluten/vectorized/ArrowWritableColumnVector.java b/gluten-data/src/main/java/org/apache/gluten/vectorized/ArrowWritableColumnVector.java index 3e237b896b73..336d33771b90 100644 --- a/gluten-data/src/main/java/org/apache/gluten/vectorized/ArrowWritableColumnVector.java +++ b/gluten-data/src/main/java/org/apache/gluten/vectorized/ArrowWritableColumnVector.java @@ -1255,8 +1255,13 @@ void setNull(int rowId) { throw new UnsupportedOperationException(); } - // Most of date type of Arrow does not need to setNotNull, set the value is enough. - void setNotNull(int rowId) {} + void setNotNull(int rowId) { + // Arrow Java library doesn't usually expose this API from its vectors. So we have to + // allow no-op here than throwing exceptions which could fail caller. And basically it's + // acceptable because finally Spark will set value after this method returned, + // During which Arrow Java will set the validity buffer anyway. As if the call to + // `setNotNull` is just deferred. + } void setNulls(int rowId, int count) { throw new UnsupportedOperationException(); @@ -1748,9 +1753,9 @@ final void setBytes(int rowId, BigDecimal value) { final void setBytes(int rowId, int count, byte[] src, int srcIndex) { if (count == src.length && srcIndex == 0) { writer.setBigEndianSafe(rowId, src); - } else { - throw new UnsupportedOperationException(); + return; } + throw new UnsupportedOperationException(); } }