diff --git a/.github/workflows/velox_docker.yml b/.github/workflows/velox_docker.yml
index 592552a19ac5..ebdc9578d3eb 100644
--- a/.github/workflows/velox_docker.yml
+++ b/.github/workflows/velox_docker.yml
@@ -615,6 +615,8 @@ jobs:
        if: ${{ steps.cache.outputs.cache-hit != 'true' }}
        run: |
          source /opt/rh/gcc-toolset-9/enable
+          source ./dev/build_arrow.sh
+          install_arrow_deps
          ./dev/builddeps-veloxbe.sh --run_setup_script=OFF --enable_ep_cache=OFF --build_tests=ON \
            --build_examples=ON --build_benchmarks=ON --build_protobuf=ON
      - name: Gluten CPP Test
diff --git a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseMergeTreeWriteOnHDFSSuite.scala b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseMergeTreeWriteOnHDFSSuite.scala
index bbfac80a7374..99b212059966 100644
--- a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseMergeTreeWriteOnHDFSSuite.scala
+++ b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseMergeTreeWriteOnHDFSSuite.scala
@@ -652,7 +652,7 @@ class GlutenClickHouseMergeTreeWriteOnHDFSSuite
          it.next()
          files += 1
        }
-        assertResult(4)(files)
+        assertResult(72)(files)
      }
    }
  }
diff --git a/backends-velox/src/test/scala/org/apache/gluten/execution/ScalarFunctionsValidateSuite.scala b/backends-velox/src/test/scala/org/apache/gluten/execution/ScalarFunctionsValidateSuite.scala
index 43d54fb62e4b..12fa3b46d72a 100644
--- a/backends-velox/src/test/scala/org/apache/gluten/execution/ScalarFunctionsValidateSuite.scala
+++ b/backends-velox/src/test/scala/org/apache/gluten/execution/ScalarFunctionsValidateSuite.scala
@@ -16,58 +16,14 @@
  */
 package org.apache.gluten.execution
 
-import org.apache.spark.{SparkConf, SparkException}
+import org.apache.spark.SparkException
 import org.apache.spark.sql.catalyst.optimizer.NullPropagation
 import org.apache.spark.sql.execution.ProjectExec
 import org.apache.spark.sql.types._
 
 import java.sql.Timestamp
 
-class ScalarFunctionsValidateSuiteRasOff extends ScalarFunctionsValidateSuite {
-  override protected def sparkConf: SparkConf = {
-    super.sparkConf
-      .set("spark.gluten.ras.enabled", "false")
-  }
-
-  // Since https://github.com/apache/incubator-gluten/pull/6200.
- test("Test input_file_name function") { - runQueryAndCompare("""SELECT input_file_name(), l_orderkey - | from lineitem limit 100""".stripMargin) { - checkGlutenOperatorMatch[ProjectExecTransformer] - } - - runQueryAndCompare("""SELECT input_file_name(), l_orderkey - | from - | (select l_orderkey from lineitem - | union all - | select o_orderkey as l_orderkey from orders) - | limit 100""".stripMargin) { - checkGlutenOperatorMatch[ProjectExecTransformer] - } - } -} - -class ScalarFunctionsValidateSuiteRasOn extends ScalarFunctionsValidateSuite { - override protected def sparkConf: SparkConf = { - super.sparkConf - .set("spark.gluten.ras.enabled", "true") - } - - // TODO: input_file_name is not yet supported in RAS - ignore("Test input_file_name function") { - runQueryAndCompare("""SELECT input_file_name(), l_orderkey - | from lineitem limit 100""".stripMargin) { _ => } - - runQueryAndCompare("""SELECT input_file_name(), l_orderkey - | from - | (select l_orderkey from lineitem - | union all - | select o_orderkey as l_orderkey from orders) - | limit 100""".stripMargin) { _ => } - } -} - -abstract class ScalarFunctionsValidateSuite extends FunctionsValidateTest { +class ScalarFunctionsValidateSuite extends FunctionsValidateTest { disableFallbackCheck import testImplicits._ @@ -702,6 +658,22 @@ abstract class ScalarFunctionsValidateSuite extends FunctionsValidateTest { } } + test("Test input_file_name function") { + runQueryAndCompare("""SELECT input_file_name(), l_orderkey + | from lineitem limit 100""".stripMargin) { + checkGlutenOperatorMatch[ProjectExecTransformer] + } + + runQueryAndCompare("""SELECT input_file_name(), l_orderkey + | from + | (select l_orderkey from lineitem + | union all + | select o_orderkey as l_orderkey from orders) + | limit 100""".stripMargin) { + checkGlutenOperatorMatch[ProjectExecTransformer] + } + } + test("Test sequence function optimized by Spark constant folding") { withSQLConf(("spark.sql.optimizer.excludedRules", NullPropagation.ruleName)) { runQueryAndCompare("""SELECT sequence(1, 5), l_orderkey diff --git a/cpp-ch/clickhouse.version b/cpp-ch/clickhouse.version index 706c510a6b69..6afd19152c79 100644 --- a/cpp-ch/clickhouse.version +++ b/cpp-ch/clickhouse.version @@ -1,4 +1,4 @@ CH_ORG=Kyligence -CH_BRANCH=rebase_ch/20240718 -CH_COMMIT=7f849f9ccf1 +CH_BRANCH=rebase_ch/20240711 +CH_COMMIT=4ab4aa7fe04 diff --git a/cpp-ch/local-engine/Disks/ObjectStorages/CompactObjectStorageDiskTransaction.cpp b/cpp-ch/local-engine/Disks/ObjectStorages/CompactObjectStorageDiskTransaction.cpp deleted file mode 100644 index 7a3ba4bed244..000000000000 --- a/cpp-ch/local-engine/Disks/ObjectStorages/CompactObjectStorageDiskTransaction.cpp +++ /dev/null @@ -1,133 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "CompactObjectStorageDiskTransaction.h" - -#include -#include - -namespace local_engine -{ -int getFileOrder(const std::string & path) -{ - if (path.ends_with("columns.txt")) - return 1; - if (path.ends_with("metadata_version.txt")) - return 2; - if (path.ends_with("count.txt")) - return 3; - if (path.ends_with("default_compression_codec.txt")) - return 4; - if (path.ends_with("checksums.txt")) - return 5; - if (path.ends_with("uuid.txt")) - return 6; - if (path.ends_with(".cmrk3") || path.ends_with(".cmrk2") || path.ends_with(".cmrk1") || - path.ends_with(".mrk3") || path.ends_with(".mrk2") || path.ends_with(".mrk1")) - return 10; - if (path.ends_with("idx")) - return 20; - if (path.ends_with("bin")) - return 1000; - return 100; -} - -bool isMetaDataFile(const std::string & path) -{ - return !path.ends_with("bin"); -} - -using FileMappings = std::vector>>; - -void CompactObjectStorageDiskTransaction::commit() -{ - auto metadata_tx = disk.getMetadataStorage()->createTransaction(); - std::filesystem::path data_path = std::filesystem::path(prefix_path) / "data.bin"; - std::filesystem::path meta_path = std::filesystem::path(prefix_path) / "meta.bin"; - - auto object_storage = disk.getObjectStorage(); - auto data_key = object_storage->generateObjectKeyForPath(data_path); - auto meta_key = object_storage->generateObjectKeyForPath(meta_path); - - disk.createDirectories(prefix_path); - auto data_write_buffer = object_storage->writeObject(DB::StoredObject(data_key.serialize(), data_path), DB::WriteMode::Rewrite); - auto meta_write_buffer = object_storage->writeObject(DB::StoredObject(meta_key.serialize(), meta_path), DB::WriteMode::Rewrite); - String buffer; - buffer.resize(1024 * 1024); - - auto merge_files = [&](std::ranges::input_range auto && list, DB::WriteBuffer & out, const DB::ObjectStorageKey & key , const String &local_path) - { - size_t offset = 0; - std::ranges::for_each( - list, - [&](auto & item) - { - DB::DiskObjectStorageMetadata metadata(object_storage->getCommonKeyPrefix(), item.first); - DB::ReadBufferFromFilePRead read(item.second->getAbsolutePath()); - int file_size = 0; - while (int count = read.readBig(buffer.data(), buffer.size())) - { - file_size += count; - out.write(buffer.data(), count); - } - metadata.addObject(key, offset, file_size); - metadata_tx->writeStringToFile(item.first, metadata.serializeToString()); - offset += file_size; - }); - - // You can load the complete file in advance through this metadata original, which improves the download efficiency of mergetree metadata. 
- DB::DiskObjectStorageMetadata whole_meta(object_storage->getCommonKeyPrefix(), local_path); - whole_meta.addObject(key, 0, offset); - metadata_tx->writeStringToFile(local_path, whole_meta.serializeToString()); - out.sync(); - }; - - merge_files(files | std::ranges::views::filter([](auto file) { return !isMetaDataFile(file.first); }), *data_write_buffer, data_key, data_path); - merge_files(files | std::ranges::views::filter([](auto file) { return isMetaDataFile(file.first); }), *meta_write_buffer, meta_key, meta_path); - - metadata_tx->commit(); - files.clear(); -} - -std::unique_ptr CompactObjectStorageDiskTransaction::writeFile( - const std::string & path, - size_t buf_size, - DB::WriteMode mode, - const DB::WriteSettings &, - bool) -{ - if (mode != DB::WriteMode::Rewrite) - { - throw DB::Exception(DB::ErrorCodes::NOT_IMPLEMENTED, "Operation `writeFile` with Append is not implemented"); - } - if (prefix_path.empty()) - prefix_path = path.substr(0, path.find_last_of('/')); - else if (!path.starts_with(prefix_path)) - throw DB::Exception( - DB::ErrorCodes::NOT_IMPLEMENTED, - "Don't support write file in different dirs, path {}, prefix path: {}", - path, - prefix_path); - auto tmp = std::make_shared(tmp_data); - files.emplace_back(path, tmp); - auto tx = disk.getMetadataStorage()->createTransaction(); - tx->createDirectoryRecursive(std::filesystem::path(path).parent_path()); - tx->createEmptyMetadataFile(path); - tx->commit(); - return std::make_unique(tmp->getAbsolutePath(), buf_size); -} -} \ No newline at end of file diff --git a/cpp-ch/local-engine/Disks/ObjectStorages/CompactObjectStorageDiskTransaction.h b/cpp-ch/local-engine/Disks/ObjectStorages/CompactObjectStorageDiskTransaction.h deleted file mode 100644 index e15c362f304a..000000000000 --- a/cpp-ch/local-engine/Disks/ObjectStorages/CompactObjectStorageDiskTransaction.h +++ /dev/null @@ -1,175 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#pragma once -#include -#include -#include -#include - - -namespace DB -{ -namespace ErrorCodes -{ -extern const int NOT_IMPLEMENTED; -} -} - -namespace local_engine -{ - -class CompactObjectStorageDiskTransaction: public DB::IDiskTransaction { - public: - explicit CompactObjectStorageDiskTransaction(DB::IDisk & disk_, const DB::DiskPtr tmp_) - : disk(disk_), tmp_data(tmp_) - { - chassert(!tmp_->isRemote()); - } - - void commit() override; - - void undo() override - { - throw DB::Exception(DB::ErrorCodes::NOT_IMPLEMENTED, "Operation `undo` is not implemented"); - } - - void createDirectory(const std::string & path) override - { - disk.createDirectory(path); - } - - void createDirectories(const std::string & path) override - { - disk.createDirectories(path); - } - - void createFile(const std::string & path) override - { - disk.createFile(path); - } - - void clearDirectory(const std::string & path) override - { - throw DB::Exception(DB::ErrorCodes::NOT_IMPLEMENTED, "Operation `clearDirectory` is not implemented"); - } - - void moveDirectory(const std::string & from_path, const std::string & to_path) override - { - throw DB::Exception(DB::ErrorCodes::NOT_IMPLEMENTED, "Operation `moveDirectory` is not implemented"); - } - - void moveFile(const String & from_path, const String & to_path) override - { - throw DB::Exception(DB::ErrorCodes::NOT_IMPLEMENTED, "Operation `moveFile` is not implemented"); - } - - void replaceFile(const std::string & from_path, const std::string & to_path) override - { - throw DB::Exception(DB::ErrorCodes::NOT_IMPLEMENTED, "Operation `replaceFile` is not implemented"); - } - - void copyFile(const std::string & from_file_path, const std::string & to_file_path, const DB::ReadSettings & read_settings, const DB::WriteSettings & write_settings) override - { - throw DB::Exception(DB::ErrorCodes::NOT_IMPLEMENTED, "Operation `copyFile` is not implemented"); - } - - std::unique_ptr writeFile( /// NOLINT - const std::string & path, - size_t buf_size, - DB::WriteMode mode, - const DB::WriteSettings & settings, - bool /*autocommit */) override; - - - void writeFileUsingBlobWritingFunction(const String & path, DB::WriteMode mode, WriteBlobFunction && write_blob_function) override - { - throw DB::Exception(DB::ErrorCodes::NOT_IMPLEMENTED, "Operation `writeFileUsingBlobWritingFunction` is not implemented"); - } - - void removeFile(const std::string & path) override - { - throw DB::Exception(DB::ErrorCodes::NOT_IMPLEMENTED, "Operation `removeFile` is not implemented"); - } - - void removeFileIfExists(const std::string & path) override - { - throw DB::Exception(DB::ErrorCodes::NOT_IMPLEMENTED, "Operation `removeFileIfExists` is not implemented"); - } - - void removeDirectory(const std::string & path) override - { - throw DB::Exception(DB::ErrorCodes::NOT_IMPLEMENTED, "Operation `removeDirectory` is not implemented"); - } - - void removeRecursive(const std::string & path) override - { - throw DB::Exception(DB::ErrorCodes::NOT_IMPLEMENTED, "Operation `removeRecursive` is not implemented"); - } - - void removeSharedFile(const std::string & path, bool keep_shared_data) override - { - throw DB::Exception(DB::ErrorCodes::NOT_IMPLEMENTED, "Operation `removeSharedFile` is not implemented"); - } - - void removeSharedRecursive(const std::string & path, bool keep_all_shared_data, const DB::NameSet & file_names_remove_metadata_only) override - { - throw DB::Exception(DB::ErrorCodes::NOT_IMPLEMENTED, "Operation `removeSharedRecursive` is not implemented"); - } - - void 
removeSharedFileIfExists(const std::string & path, bool keep_shared_data) override - { - throw DB::Exception(DB::ErrorCodes::NOT_IMPLEMENTED, "Operation `removeSharedFileIfExists` is not implemented"); - } - - void removeSharedFiles(const DB::RemoveBatchRequest & files, bool keep_all_batch_data, const DB::NameSet & file_names_remove_metadata_only) override - { - throw DB::Exception(DB::ErrorCodes::NOT_IMPLEMENTED, "Operation `removeSharedFiles` is not implemented"); - } - - void setLastModified(const std::string & path, const Poco::Timestamp & timestamp) override - { - disk.setLastModified(path, timestamp); - } - - void chmod(const String & path, mode_t mode) override - { - disk.chmod(path, mode); - } - - void setReadOnly(const std::string & path) override - { - disk.setReadOnly(path); - } - - void createHardLink(const std::string & src_path, const std::string & dst_path) override - { - throw DB::Exception(DB::ErrorCodes::NOT_IMPLEMENTED, "Operation `createHardLink` is not implemented"); - } - - void truncateFile(const std::string & /* src_path */, size_t /* target_size */) override - { - throw DB::Exception(DB::ErrorCodes::NOT_IMPLEMENTED, "Operation `truncateFile` is not implemented"); - } - -private: - DB::IDisk & disk; - DB::DiskPtr tmp_data; - std::vector>> files; - String prefix_path = ""; -}; -} - diff --git a/cpp-ch/local-engine/Disks/ObjectStorages/GlutenDiskHDFS.cpp b/cpp-ch/local-engine/Disks/ObjectStorages/GlutenDiskHDFS.cpp index 9c4b390ea8b0..f207ad232b4f 100644 --- a/cpp-ch/local-engine/Disks/ObjectStorages/GlutenDiskHDFS.cpp +++ b/cpp-ch/local-engine/Disks/ObjectStorages/GlutenDiskHDFS.cpp @@ -20,19 +20,12 @@ #include #include - -#include "CompactObjectStorageDiskTransaction.h" #if USE_HDFS namespace local_engine { using namespace DB; -DiskTransactionPtr GlutenDiskHDFS::createTransaction() -{ - return std::make_shared(*this, SerializedPlanParser::global_context->getTempDataOnDisk()->getVolume()->getDisk()); -} - void GlutenDiskHDFS::createDirectory(const String & path) { DiskObjectStorage::createDirectory(path); diff --git a/cpp-ch/local-engine/Disks/ObjectStorages/GlutenDiskHDFS.h b/cpp-ch/local-engine/Disks/ObjectStorages/GlutenDiskHDFS.h index b0f82a340b1f..97a99f1deaba 100644 --- a/cpp-ch/local-engine/Disks/ObjectStorages/GlutenDiskHDFS.h +++ b/cpp-ch/local-engine/Disks/ObjectStorages/GlutenDiskHDFS.h @@ -21,8 +21,6 @@ #include #include -#include -#include #if USE_HDFS #include #endif @@ -53,8 +51,6 @@ class GlutenDiskHDFS : public DB::DiskObjectStorage throttler = std::make_shared(max_speed); } - DB::DiskTransactionPtr createTransaction() override; - void createDirectory(const String & path) override; void createDirectories(const String & path) override; @@ -76,17 +72,7 @@ class GlutenDiskHDFS : public DB::DiskObjectStorage { DB::ObjectStoragePtr tmp = object_storage_creator(config, context); hdfs_object_storage = typeid_cast>(tmp); - // only for java ut - bool is_cache = object_storage->supportsCache(); - if (is_cache) - { - auto cache_os = reinterpret_cast(object_storage.get()); - object_storage = hdfs_object_storage; - auto cache = DB::FileCacheFactory::instance().getOrCreate(cache_os->getCacheName(), cache_os->getCacheSettings(), "storage_configuration.disks.hdfs_cache"); - wrapWithCache(cache, cache_os->getCacheSettings(), cache_os->getCacheConfigName()); - } - else - object_storage = hdfs_object_storage; + object_storage = hdfs_object_storage; } private: std::shared_ptr hdfs_object_storage; diff --git a/cpp-ch/local-engine/Operator/DefaultHashAggregateResult.cpp 
b/cpp-ch/local-engine/Operator/DefaultHashAggregateResult.cpp index fbad02fda592..35f891581595 100644 --- a/cpp-ch/local-engine/Operator/DefaultHashAggregateResult.cpp +++ b/cpp-ch/local-engine/Operator/DefaultHashAggregateResult.cpp @@ -116,7 +116,7 @@ class DefaultHashAggrgateResultTransform : public DB::IProcessor has_input = true; output_chunk = DB::Chunk(result_cols, 1); auto info = std::make_shared(); - output_chunk.getChunkInfos().add(std::move(info)); + output_chunk.setChunkInfo(info); return Status::Ready; } @@ -124,10 +124,10 @@ class DefaultHashAggrgateResultTransform : public DB::IProcessor if (input.hasData()) { output_chunk = input.pull(true); - if (output_chunk.getChunkInfos().empty()) + if (!output_chunk.hasChunkInfo()) { auto info = std::make_shared(); - output_chunk.getChunkInfos().add(std::move(info)); + output_chunk.setChunkInfo(info); } has_input = true; return Status::Ready; diff --git a/cpp-ch/local-engine/Parser/AdvancedParametersParseUtil.cpp b/cpp-ch/local-engine/Parser/AdvancedParametersParseUtil.cpp deleted file mode 100644 index a7a07c0bf31a..000000000000 --- a/cpp-ch/local-engine/Parser/AdvancedParametersParseUtil.cpp +++ /dev/null @@ -1,127 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include -#include -#include -#include -#include -#include -#include -namespace DB::ErrorCodes -{ - extern const int BAD_ARGUMENTS; -} - -namespace local_engine -{ - -template -void tryAssign(const std::unordered_map & kvs, const String & key, T & v); - -template<> -void tryAssign(const std::unordered_map & kvs, const String & key, String & v) -{ - auto it = kvs.find(key); - if (it != kvs.end()) - v = it->second; -} - -template<> -void tryAssign(const std::unordered_map & kvs, const String & key, bool & v) -{ - auto it = kvs.find(key); - if (it != kvs.end()) - { - if (it->second == "0" || it->second == "false" || it->second == "FALSE") - { - v = false; - } - else - { - v = true; - } - } -} - -template -void readStringUntilCharsInto(String & s, DB::ReadBuffer & buf) -{ - while (!buf.eof()) - { - char * next_pos = find_first_symbols(buf.position(), buf.buffer().end()); - - s.append(buf.position(), next_pos - buf.position()); - buf.position() = next_pos; - - if (buf.hasPendingData()) - return; - } -} - -/// In the format: Seg1:k1=v1\nk2=v2\n..\nSeg2:k1=v1\n... 
-std::unordered_map> convertToKVs(const String & advance) -{ - std::unordered_map> res; - std::unordered_map *kvs; - DB::ReadBufferFromString in(advance); - while(!in.eof()) - { - String key; - readStringUntilCharsInto<'=', '\n', ':'>(key, in); - if (key.empty()) - { - if (!in.eof()) - { - char c; - DB::readChar(c, in); - } - continue; - } - - char c; - DB::readChar(c, in); - if (c == ':') - { - res[key] = {}; - kvs = &res[key]; - continue; - } - - if (c != '=') - throw DB::Exception(DB::ErrorCodes::BAD_ARGUMENTS, "Invalid format, = is expected: {}", advance); - - String value; - readStringUntilCharsInto<'\n'>(value, in); - (*kvs)[key] = value; - } - return res; -} - -JoinOptimizationInfo JoinOptimizationInfo::parse(const String & advance) -{ - JoinOptimizationInfo info; - auto kkvs = convertToKVs(advance); - auto & kvs = kkvs["JoinParameters"]; - tryAssign(kvs, "isBHJ", info.is_broadcast); - tryAssign(kvs, "isSMJ", info.is_smj); - tryAssign(kvs, "buildHashTableId", info.storage_join_key); - tryAssign(kvs, "isNullAwareAntiJoin", info.is_null_aware_anti_join); - tryAssign(kvs, "isExistenceJoin", info.is_existence_join); - return info; -} -} - diff --git a/cpp-ch/local-engine/Parser/AdvancedParametersParseUtil.h b/cpp-ch/local-engine/Parser/AdvancedParametersParseUtil.h deleted file mode 100644 index 5a15a3ea8abc..000000000000 --- a/cpp-ch/local-engine/Parser/AdvancedParametersParseUtil.h +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#pragma once -#include - -namespace local_engine -{ - -std::unordered_map> convertToKVs(const String & advance); - - -struct JoinOptimizationInfo -{ - bool is_broadcast = false; - bool is_smj = false; - bool is_null_aware_anti_join = false; - bool is_existence_join = false; - String storage_join_key; - - static JoinOptimizationInfo parse(const String & advance); -}; -} - diff --git a/cpp-ch/local-engine/Parser/CrossRelParser.cpp b/cpp-ch/local-engine/Parser/CrossRelParser.cpp index 4405a57cb575..ea898640146b 100644 --- a/cpp-ch/local-engine/Parser/CrossRelParser.cpp +++ b/cpp-ch/local-engine/Parser/CrossRelParser.cpp @@ -25,7 +25,6 @@ #include #include #include -#include #include #include #include @@ -52,6 +51,17 @@ using namespace DB; namespace local_engine { +String parseCrossJoinOptimizationInfos(const substrait::CrossRel & join) +{ + google::protobuf::StringValue optimization; + optimization.ParseFromString(join.advanced_extension().optimization().value()); + String storage_join_key; + ReadBufferFromString in(optimization.value()); + assertString("JoinParameters:", in); + assertString("buildHashTableId=", in); + readString(storage_join_key, in); + return storage_join_key; +} std::shared_ptr createCrossTableJoin(substrait::CrossRel_JoinType join_type) { @@ -144,10 +154,7 @@ void CrossRelParser::renamePlanColumns(DB::QueryPlan & left, DB::QueryPlan & rig DB::QueryPlanPtr CrossRelParser::parseJoin(const substrait::CrossRel & join, DB::QueryPlanPtr left, DB::QueryPlanPtr right) { - google::protobuf::StringValue optimization_info; - optimization_info.ParseFromString(join.advanced_extension().optimization().value()); - auto join_opt_info = JoinOptimizationInfo::parse(optimization_info.value()); - const auto & storage_join_key = join_opt_info.storage_join_key; + auto storage_join_key = parseCrossJoinOptimizationInfos(join); auto storage_join = BroadCastJoinBuilder::getJoin(storage_join_key) ; renamePlanColumns(*left, *right, *storage_join); auto table_join = createCrossTableJoin(join.type()); diff --git a/cpp-ch/local-engine/Parser/JoinRelParser.cpp b/cpp-ch/local-engine/Parser/JoinRelParser.cpp index 1141f479642a..03734a2a9f0d 100644 --- a/cpp-ch/local-engine/Parser/JoinRelParser.cpp +++ b/cpp-ch/local-engine/Parser/JoinRelParser.cpp @@ -26,7 +26,6 @@ #include #include #include -#include #include #include #include @@ -47,8 +46,60 @@ namespace ErrorCodes extern const int BAD_ARGUMENTS; } } + +struct JoinOptimizationInfo +{ + bool is_broadcast = false; + bool is_smj = false; + bool is_null_aware_anti_join = false; + bool is_existence_join = false; + std::string storage_join_key; +}; + using namespace DB; +JoinOptimizationInfo parseJoinOptimizationInfo(const substrait::JoinRel & join) +{ + google::protobuf::StringValue optimization; + optimization.ParseFromString(join.advanced_extension().optimization().value()); + JoinOptimizationInfo info; + if (optimization.value().contains("isBHJ=")) + { + ReadBufferFromString in(optimization.value()); + assertString("JoinParameters:", in); + assertString("isBHJ=", in); + readBoolText(info.is_broadcast, in); + assertChar('\n', in); + if (info.is_broadcast) + { + assertString("isNullAwareAntiJoin=", in); + readBoolText(info.is_null_aware_anti_join, in); + assertChar('\n', in); + assertString("buildHashTableId=", in); + readString(info.storage_join_key, in); + assertChar('\n', in); + } + } + else + { + ReadBufferFromString in(optimization.value()); + assertString("JoinParameters:", in); + assertString("isSMJ=", in); + readBoolText(info.is_smj, in); + 
assertChar('\n', in); + if (info.is_smj) + { + assertString("isNullAwareAntiJoin=", in); + readBoolText(info.is_null_aware_anti_join, in); + assertChar('\n', in); + assertString("isExistenceJoin=", in); + readBoolText(info.is_existence_join, in); + assertChar('\n', in); + } + } + return info; +} + namespace local_engine { std::shared_ptr createDefaultTableJoin(substrait::JoinRel_JoinType join_type) @@ -210,9 +261,7 @@ void JoinRelParser::renamePlanColumns(DB::QueryPlan & left, DB::QueryPlan & righ DB::QueryPlanPtr JoinRelParser::parseJoin(const substrait::JoinRel & join, DB::QueryPlanPtr left, DB::QueryPlanPtr right) { - google::protobuf::StringValue optimization_info; - optimization_info.ParseFromString(join.advanced_extension().optimization().value()); - auto join_opt_info = JoinOptimizationInfo::parse(optimization_info.value()); + auto join_opt_info = parseJoinOptimizationInfo(join); auto storage_join = join_opt_info.is_broadcast ? BroadCastJoinBuilder::getJoin(join_opt_info.storage_join_key) : nullptr; if (storage_join) { diff --git a/cpp-ch/local-engine/Storages/CustomStorageMergeTree.cpp b/cpp-ch/local-engine/Storages/CustomStorageMergeTree.cpp index 961f482c7cae..3e93edaaab70 100644 --- a/cpp-ch/local-engine/Storages/CustomStorageMergeTree.cpp +++ b/cpp-ch/local-engine/Storages/CustomStorageMergeTree.cpp @@ -148,26 +148,9 @@ CustomStorageMergeTree::CustomStorageMergeTree( std::atomic CustomStorageMergeTree::part_num; - - -void CustomStorageMergeTree::prefectchMetaDataFile(std::unordered_set parts) -{ - auto disk = getDisks().front(); - if (!disk->isRemote()) return; - std::vector meta_paths; - std::ranges::for_each(parts, [&](const String & name) { meta_paths.emplace_back(fs::path(relative_data_path) / name / "meta.bin"); }); - for (const auto & meta_path: meta_paths) - { - if (!disk->exists(meta_path)) continue; - auto in = disk->readFile(meta_path); - String ignore_data; - readStringUntilEOF(ignore_data, *in); - } -} - std::vector CustomStorageMergeTree::loadDataPartsWithNames(std::unordered_set parts) { - prefectchMetaDataFile(parts); + auto parts_lock = lockParts(); std::vector data_parts; const auto disk = getStoragePolicy()->getDisks().at(0); for (const auto& name : parts) @@ -178,6 +161,8 @@ std::vector CustomStorageMergeTree::loadDataPartsWithNames data_parts.emplace_back(res.part); } + // without it "test mergetree optimize partitioned by one low card column" will log ERROR + calculateColumnAndSecondaryIndexSizesImpl(); return data_parts; } @@ -226,7 +211,6 @@ MergeTreeData::LoadPartResult CustomStorageMergeTree::loadDataPart( res.part->loadVersionMetadata(); res.part->setState(to_state); - auto parts_lock = lockParts(); DataPartIteratorByInfo it; bool inserted; @@ -255,9 +239,6 @@ MergeTreeData::LoadPartResult CustomStorageMergeTree::loadDataPart( if (res.part->hasLightweightDelete()) has_lightweight_delete_parts.store(true); - // without it "test mergetree optimize partitioned by one low card column" will log ERROR - calculateColumnAndSecondaryIndexSizesImpl(); - LOG_TRACE(log, "Finished loading {} part {} on disk {}", magic_enum::enum_name(to_state), part_name, part_disk_ptr->getName()); return res; } diff --git a/cpp-ch/local-engine/Storages/CustomStorageMergeTree.h b/cpp-ch/local-engine/Storages/CustomStorageMergeTree.h index 9144aba429c0..cd507a3ac751 100644 --- a/cpp-ch/local-engine/Storages/CustomStorageMergeTree.h +++ b/cpp-ch/local-engine/Storages/CustomStorageMergeTree.h @@ -65,7 +65,6 @@ class CustomStorageMergeTree final : public MergeTreeData private: 
SimpleIncrement increment; - void prefectchMetaDataFile(std::unordered_set parts); void startBackgroundMovesIfNeeded() override; std::unique_ptr getDefaultSettings() const override; LoadPartResult loadDataPart( diff --git a/cpp-ch/local-engine/Storages/Mergetree/MergeSparkMergeTreeTask.cpp b/cpp-ch/local-engine/Storages/Mergetree/MergeSparkMergeTreeTask.cpp index 05b2623b4d16..8e8e4c556beb 100644 --- a/cpp-ch/local-engine/Storages/Mergetree/MergeSparkMergeTreeTask.cpp +++ b/cpp-ch/local-engine/Storages/Mergetree/MergeSparkMergeTreeTask.cpp @@ -164,7 +164,7 @@ void MergeSparkMergeTreeTask::finish() // MergeTreeData::Transaction transaction(storage, txn.get()); // storage.merger_mutator.renameMergedTemporaryPart(new_part, future_part->parts, txn, transaction); // transaction.commit(); - new_part->getDataPartStoragePtr()->commitTransaction(); + ThreadFuzzer::maybeInjectSleep(); ThreadFuzzer::maybeInjectMemoryLimitException(); diff --git a/cpp-ch/local-engine/Storages/Mergetree/MetaDataHelper.cpp b/cpp-ch/local-engine/Storages/Mergetree/MetaDataHelper.cpp index 8d43af06829c..c6a8e03a6ba0 100644 --- a/cpp-ch/local-engine/Storages/Mergetree/MetaDataHelper.cpp +++ b/cpp-ch/local-engine/Storages/Mergetree/MetaDataHelper.cpp @@ -113,6 +113,8 @@ void restoreMetaData(CustomStorageMergeTreePtr & storage, const MergeTreeTable & auto item_path = part_path / item.first; auto out = metadata_disk->writeFile(item_path); out->write(item.second.data(), item.second.size()); + out->finalize(); + out->sync(); } }; thread_pool.scheduleOrThrow(job); diff --git a/cpp-ch/local-engine/Storages/Mergetree/SparkMergeTreeWriter.cpp b/cpp-ch/local-engine/Storages/Mergetree/SparkMergeTreeWriter.cpp index 403b845147fa..406f2aaa23df 100644 --- a/cpp-ch/local-engine/Storages/Mergetree/SparkMergeTreeWriter.cpp +++ b/cpp-ch/local-engine/Storages/Mergetree/SparkMergeTreeWriter.cpp @@ -121,11 +121,12 @@ void SparkMergeTreeWriter::write(const DB::Block & block) checkAndMerge(); } -bool SparkMergeTreeWriter::chunkToPart(Chunk && plan_chunk) +bool SparkMergeTreeWriter::chunkToPart(Chunk && chunk) { - if (Chunk result_chunk = DB::Squashing::squash(std::move(plan_chunk))) + if (chunk.hasChunkInfo()) { - auto result = squashing->getHeader().cloneWithColumns(result_chunk.detachColumns()); + Chunk squash_chunk = DB::Squashing::squash(std::move(chunk)); + Block result = header.cloneWithColumns(squash_chunk.getColumns()); return blockToPart(result); } return false; @@ -231,42 +232,18 @@ void SparkMergeTreeWriter::commitPartToRemoteStorageIfNeeded() auto read_settings = context->getReadSettings(); auto write_settings = context->getWriteSettings(); Stopwatch watch; - - // Temporary support for S3 - bool s3_disk = dest_storage->getStoragePolicy()->getAnyDisk()->getName().contains("s3"); for (const auto & merge_tree_data_part : new_parts.unsafeGet()) { String local_relative_path = storage->getRelativeDataPath() + "/" + merge_tree_data_part->name; String remote_relative_path = dest_storage->getRelativeDataPath() + "/" + merge_tree_data_part->name; - if (s3_disk) - { - storage->getStoragePolicy()->getAnyDisk()->copyDirectoryContent( + storage->getStoragePolicy()->getAnyDisk()->copyDirectoryContent( local_relative_path, dest_storage->getStoragePolicy()->getAnyDisk(), remote_relative_path, read_settings, write_settings, nullptr); - } - else - { - std::vector files; - storage->getStoragePolicy()->getAnyDisk()->listFiles(local_relative_path, files); - auto src_disk = storage->getStoragePolicy()->getAnyDisk(); - auto dest_disk = 
dest_storage->getStoragePolicy()->getAnyDisk(); - auto tx = dest_disk->createTransaction(); - for (const auto & file : files) - { - auto read_buffer = src_disk->readFile(local_relative_path + "/" + file, read_settings); - auto write_buffer = tx->writeFile(remote_relative_path + "/" + file, DBMS_DEFAULT_BUFFER_SIZE, WriteMode::Rewrite, write_settings); - copyData(*read_buffer, *write_buffer); - write_buffer->finalize(); - } - tx->commit(); - } - - LOG_DEBUG( &Poco::Logger::get("SparkMergeTreeWriter"), "Upload part {} to disk {} success.", @@ -329,6 +306,7 @@ DB::MergeTreeDataWriter::TemporaryPart SparkMergeTreeWriter::writeTempPartAndFin { MergeTreeDataWriter::TemporaryPart temp_part; writeTempPart(temp_part, block_with_partition, metadata_snapshot); + temp_part.finalize(); return temp_part; } @@ -421,7 +399,6 @@ void SparkMergeTreeWriter::writeTempPart( new_data_part->partition = std::move(partition); new_data_part->minmax_idx = std::move(minmax_idx); - data_part_storage->beginTransaction(); SyncGuardPtr sync_guard; if (new_data_part->isStoredOnDisk()) { @@ -464,8 +441,6 @@ void SparkMergeTreeWriter::writeTempPart( temp_part.part = new_data_part; temp_part.streams.emplace_back(MergeTreeDataWriter::TemporaryPart::Stream{.stream = std::move(out), .finalizer = std::move(finalizer)}); - temp_part.finalize(); - data_part_storage->commitTransaction(); } std::vector SparkMergeTreeWriter::getAllPartInfo() diff --git a/cpp-ch/local-engine/Storages/Mergetree/SparkMergeTreeWriter.h b/cpp-ch/local-engine/Storages/Mergetree/SparkMergeTreeWriter.h index 269b0352c056..13ac22394477 100644 --- a/cpp-ch/local-engine/Storages/Mergetree/SparkMergeTreeWriter.h +++ b/cpp-ch/local-engine/Storages/Mergetree/SparkMergeTreeWriter.h @@ -77,7 +77,7 @@ class SparkMergeTreeWriter void saveMetadata(); void commitPartToRemoteStorageIfNeeded(); void finalizeMerge(); - bool chunkToPart(Chunk && plan_chunk); + bool chunkToPart(Chunk && chunk); bool blockToPart(Block & block); bool useLocalStorage() const; diff --git a/cpp-ch/local-engine/Storages/SourceFromJavaIter.cpp b/cpp-ch/local-engine/Storages/SourceFromJavaIter.cpp index 1c5902c8ca67..37501e98504a 100644 --- a/cpp-ch/local-engine/Storages/SourceFromJavaIter.cpp +++ b/cpp-ch/local-engine/Storages/SourceFromJavaIter.cpp @@ -109,13 +109,13 @@ DB::Chunk SourceFromJavaIter::generate() auto info = std::make_shared(); info->is_overflows = data->info.is_overflows; info->bucket_num = data->info.bucket_num; - result.getChunkInfos().add(std::move(info)); + result.setChunkInfo(info); } else { result = BlockUtil::buildRowCountChunk(rows); auto info = std::make_shared(); - result.getChunkInfos().add(std::move(info)); + result.setChunkInfo(info); } } return result; diff --git a/dev/build_arrow.sh b/dev/build_arrow.sh index b914a9fce48b..19d5627146b1 100755 --- a/dev/build_arrow.sh +++ b/dev/build_arrow.sh @@ -15,19 +15,17 @@ # limitations under the License. 
CURRENT_DIR=$(cd "$(dirname "$BASH_SOURCE")"; pwd) -export SUDO=sudo source ${CURRENT_DIR}/build_helper_functions.sh VELOX_ARROW_BUILD_VERSION=15.0.0 ARROW_PREFIX=$CURRENT_DIR/../ep/_ep/arrow_ep BUILD_TYPE=Release function prepare_arrow_build() { - mkdir -p ${ARROW_PREFIX}/../ && pushd ${ARROW_PREFIX}/../ && sudo rm -rf arrow_ep/ + mkdir -p ${ARROW_PREFIX}/../ && cd ${ARROW_PREFIX}/../ && sudo rm -rf arrow_ep/ wget_and_untar https://archive.apache.org/dist/arrow/arrow-${VELOX_ARROW_BUILD_VERSION}/apache-arrow-${VELOX_ARROW_BUILD_VERSION}.tar.gz arrow_ep cd arrow_ep patch -p1 < $CURRENT_DIR/../ep/build-velox/src/modify_arrow.patch patch -p1 < $CURRENT_DIR/../ep/build-velox/src/modify_arrow_dataset_scan_option.patch - popd } function install_arrow_deps { @@ -108,10 +106,3 @@ function build_arrow_java() { -Dmaven.test.skip -Drat.skip -Dmaven.gitcommitid.skip -Dcheckstyle.skip -Dassembly.skipAssembly popd } - -echo "Start to build Arrow" -prepare_arrow_build -build_arrow_cpp -echo "Finished building arrow CPP" -build_arrow_java -echo "Finished building arrow Java" diff --git a/dev/builddeps-veloxbe.sh b/dev/builddeps-veloxbe.sh index 38f3978efe47..1676577ae89a 100755 --- a/dev/builddeps-veloxbe.sh +++ b/dev/builddeps-veloxbe.sh @@ -187,8 +187,15 @@ fi concat_velox_param function build_arrow { + echo "Start to build Arrow" + export SUDO=sudo cd $GLUTEN_DIR/dev - ./build_arrow.sh + source build_arrow.sh + prepare_arrow_build + build_arrow_cpp + echo "Finished building arrow CPP" + build_arrow_java + echo "Finished building arrow Java" } function build_velox { diff --git a/ep/build-velox/src/get_velox.sh b/ep/build-velox/src/get_velox.sh index 20ad7caaa437..44e0ae0359b5 100755 --- a/ep/build-velox/src/get_velox.sh +++ b/ep/build-velox/src/get_velox.sh @@ -18,7 +18,7 @@ set -exu VELOX_REPO=https://github.com/oap-project/velox.git -VELOX_BRANCH=2024_07_18 +VELOX_BRANCH=2024_07_16 VELOX_HOME="" #Set on run gluten on HDFS diff --git a/gluten-core/src/main/scala/org/apache/gluten/planner/cost/GlutenCostModel.scala b/gluten-core/src/main/scala/org/apache/gluten/planner/cost/GlutenCostModel.scala index 513a91e4361e..ab0deab1939d 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/planner/cost/GlutenCostModel.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/planner/cost/GlutenCostModel.scala @@ -70,21 +70,20 @@ object GlutenCostModel extends Logging { (n.children.map(longCostOf).toList :+ selfCost).reduce(safeSum) } - // A very rough estimation as of now. The cost model basically considers any - // fallen back ops has extreme high cost so offloads computations as much as possible. + // A very rough estimation as of now. private def selfLongCostOf(node: SparkPlan): Long = { node match { case _: RemoveFilter.NoopFilter => // To make planner choose the tree that has applied rule PushFilterToScan. 0L - case ColumnarToRowExec(child) => 10L - case RowToColumnarExec(child) => 10L - case ColumnarToRowLike(child) => 10L - case RowToColumnarLike(child) => 10L - case p if PlanUtil.isGlutenColumnarOp(p) => 10L - case p if PlanUtil.isVanillaColumnarOp(p) => 1000L + case ColumnarToRowExec(child) => 3L + case RowToColumnarExec(child) => 3L + case ColumnarToRowLike(child) => 3L + case RowToColumnarLike(child) => 3L + case p if PlanUtil.isGlutenColumnarOp(p) => 2L + case p if PlanUtil.isVanillaColumnarOp(p) => 3L // Other row ops. Usually a vanilla row op. 
- case _ => 1000L + case _ => 5L } } diff --git a/gluten-data/src/main/java/org/apache/gluten/memory/listener/ManagedReservationListener.java b/gluten-data/src/main/java/org/apache/gluten/memory/listener/ManagedReservationListener.java index 4af8eb4e3f82..b7d6ecd67589 100644 --- a/gluten-data/src/main/java/org/apache/gluten/memory/listener/ManagedReservationListener.java +++ b/gluten-data/src/main/java/org/apache/gluten/memory/listener/ManagedReservationListener.java @@ -53,15 +53,10 @@ public long reserve(long size) { @Override public long unreserve(long size) { synchronized (this) { - try { - long freed = target.repay(size); - sharedUsage.inc(-freed); - Preconditions.checkState(freed == size); - return freed; - } catch (Exception e) { - LOG.error("Error unreserving memory from target", e); - throw e; - } + long freed = target.repay(size); + sharedUsage.inc(-freed); + Preconditions.checkState(freed == size); + return freed; } } diff --git a/tools/gluten-it/common/src/main/java/org/apache/gluten/integration/command/SparkRunModes.java b/tools/gluten-it/common/src/main/java/org/apache/gluten/integration/command/SparkRunModes.java index 56ef68db9057..cfd3848d8158 100644 --- a/tools/gluten-it/common/src/main/java/org/apache/gluten/integration/command/SparkRunModes.java +++ b/tools/gluten-it/common/src/main/java/org/apache/gluten/integration/command/SparkRunModes.java @@ -129,9 +129,6 @@ public String getSparkMasterUrl() { if (!System.getenv().containsKey("SPARK_HOME")) { throw new IllegalArgumentException("SPARK_HOME not set! Please use --local if there is no local Spark build"); } - if (!System.getenv().containsKey("SPARK_SCALA_VERSION")) { - throw new IllegalArgumentException("SPARK_SCALA_VERSION not set! Please set it first or use --local instead. Example: export SPARK_SCALA_VERSION=2.12"); - } return String.format("local-cluster[%d,%d,%d]", lcWorkers, lcWorkerCores, Utils.byteStringAsMb(lcWorkerMem)); } diff --git a/tools/gluten-it/pom.xml b/tools/gluten-it/pom.xml index a65324f2728d..c092a0ebb0e6 100644 --- a/tools/gluten-it/pom.xml +++ b/tools/gluten-it/pom.xml @@ -17,7 +17,7 @@ 1.8 ${java.version} ${java.version} - 2.12.17 + 2.12.15 3.4.2 2.12 3 @@ -144,28 +144,24 @@ 3.2.2 - 2.12.15 spark-3.3 3.3.1 - 2.12.15 spark-3.4 3.4.2 - 2.12.17 spark-3.5 3.5.1 - 2.12.18 diff --git a/tools/gluten-it/sbin/gluten-it.sh b/tools/gluten-it/sbin/gluten-it.sh index b21038ccdef6..fda117417936 100755 --- a/tools/gluten-it/sbin/gluten-it.sh +++ b/tools/gluten-it/sbin/gluten-it.sh @@ -28,14 +28,6 @@ fi JAR_PATH=$LIB_DIR/* -EMBEDDED_SPARK_HOME=$BASEDIR/../spark-home - -export SPARK_HOME=${SPARK_HOME:-$EMBEDDED_SPARK_HOME} -export SPARK_SCALA_VERSION=${SPARK_SCALA_VERSION:-'2.12'} - -echo "SPARK_HOME set at [$SPARK_HOME]." -echo "SPARK_SCALA_VERSION set at [$SPARK_SCALA_VERSION]." 
-
 $JAVA_HOME/bin/java $GLUTEN_IT_JVM_ARGS \
   -XX:+IgnoreUnrecognizedVMOptions \
   --add-opens=java.base/java.lang=ALL-UNNAMED \
diff --git a/tools/gluten-it/spark-home/jars b/tools/gluten-it/spark-home/jars
deleted file mode 120000
index 2939305caa54..000000000000
--- a/tools/gluten-it/spark-home/jars
+++ /dev/null
@@ -1 +0,0 @@
-../package/target/lib
\ No newline at end of file
diff --git a/tools/gluten-te/centos/shared.sh b/tools/gluten-te/centos/shared.sh
index 0253c16cef5f..d14b35bf9d0f 100755
--- a/tools/gluten-te/centos/shared.sh
+++ b/tools/gluten-te/centos/shared.sh
@@ -24,13 +24,6 @@ source "$SHARED_BASEDIR/defaults.conf"
 
 export DOCKER_BUILDKIT=1
 export BUILDKIT_PROGRESS=plain
 
-# Validate envs
-if [ -z "$HOME" ]
-then
-  echo 'Environment variable $HOME not found. Aborting.'
-  exit 1
-fi
-
 # Set operating system
 OS_IMAGE_NAME=${OS_IMAGE_NAME:-$DEFAULT_OS_IMAGE_NAME}
diff --git a/tools/gluten-te/ubuntu/dockerfile-buildenv b/tools/gluten-te/ubuntu/dockerfile-buildenv
index e520fd295118..41fc202395e8 100644
--- a/tools/gluten-te/ubuntu/dockerfile-buildenv
+++ b/tools/gluten-te/ubuntu/dockerfile-buildenv
@@ -65,7 +65,7 @@ RUN cat /root/.m2/settings.xml
 
 ## APT dependencies
 # Update, then install essentials
-RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y sudo locales wget tar tzdata git ccache ninja-build build-essential llvm-11-dev clang-11 libiberty-dev libdwarf-dev libre2-dev libz-dev libssl-dev libboost-all-dev libcurl4-openssl-dev curl zip unzip tar pkg-config autoconf-archive bison flex
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y sudo locales wget tar tzdata git ccache cmake ninja-build build-essential llvm-11-dev clang-11 libiberty-dev libdwarf-dev libre2-dev libz-dev libssl-dev libboost-all-dev libcurl4-openssl-dev
 
 # install HBM dependencies
 RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y autoconf automake g++ libnuma-dev libtool numactl unzip libdaxctl-dev
@@ -103,14 +103,6 @@ RUN set-login-env "LANG=en_US.UTF-8"
 RUN set-login-env "LANGUAGE=en_US:en"
 RUN set-login-env "LC_ALL=en_US.UTF-8"
 
-# Install CMake
-RUN cd /opt && wget https://github.com/Kitware/CMake/releases/download/v3.28.3/cmake-3.28.3-linux-x86_64.sh \
-  && mkdir cmake \
-  && bash cmake-3.28.3-linux-x86_64.sh --skip-license --prefix=/opt/cmake \
-  && ln -s /opt/cmake/bin/cmake /usr/bin/cmake
-
-RUN cmake --version
-
 # Build & install Spark 3.2.2
 RUN cd /opt && wget https://archive.apache.org/dist/spark/spark-3.2.2/spark-3.2.2-bin-hadoop3.2.tgz
 RUN cd /opt && mkdir spark322 && tar -xvf spark-3.2.2-bin-hadoop3.2.tgz -C spark322 --strip-components=1
diff --git a/tools/gluten-te/ubuntu/examples/buildhere-veloxbe-portable-libs/README.md b/tools/gluten-te/ubuntu/examples/buildhere-veloxbe-portable-libs/README.md
index 28e955dac603..27e97467d6c0 100644
--- a/tools/gluten-te/ubuntu/examples/buildhere-veloxbe-portable-libs/README.md
+++ b/tools/gluten-te/ubuntu/examples/buildhere-veloxbe-portable-libs/README.md
@@ -15,22 +15,19 @@ The folder contains script code to build `libvelox.so` and `libgluten.so` in doc
 export HTTP_PROXY_HOST=myproxy.example.com
 export HTTP_PROXY_PORT=55555
 
-# 2. Set the following env to install Gluten's modified Arrow Jars on host.
-export MOUNT_MAVEN_CACHE=ON
-
-# 3. Build the C++ libs in a ubuntu 20.04 docker container.
+# 2. Build the C++ libs in a ubuntu 20.04 docker container.
 # Note, this command could take much longer time to finish if it's never run before.
 # After the first run, the essential build environment will be cached in docker builder.
 #
 # Additionally, changes to HTTP_PROXY_HOST / HTTP_PROXY_PORT could invalidate the build cache
 # either. For more details, please check docker file `dockerfile-buildenv`.
 cd gluten/
-tools/gluten-te/ubuntu/examples/buildhere-veloxbe-portable-libs/run-default.sh
+tools/gluten-te/ubuntu/examples/buildhere-veloxbe-portable-libs/run.sh
 
-# 4. Check the built libs.
+# 3. Check the built libs.
 ls -l cpp/build/releases/
 
-# 5. If you intend to build Gluten's bundled jar, continue running subsequent Maven commands.
+# 4. If you intend to build Gluten's bundled jar, continue running subsequent Maven commands.
 # For example:
 mvn clean install -P spark-3.4,backends-velox -DskipTests
 ```
\ No newline at end of file
diff --git a/tools/gluten-te/ubuntu/examples/buildhere-veloxbe-portable-libs/run-default.sh b/tools/gluten-te/ubuntu/examples/buildhere-veloxbe-portable-libs/run-default.sh
deleted file mode 100755
index 2648725ce0b5..000000000000
--- a/tools/gluten-te/ubuntu/examples/buildhere-veloxbe-portable-libs/run-default.sh
+++ /dev/null
@@ -1,21 +0,0 @@
-#!/bin/bash
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-set -ex
-
-BASEDIR=$(readlink -f $(dirname $0))
-
-$BASEDIR/run.sh --enable_vcpkg=ON --build_tests=OFF --build_benchmarks=OFF --enable_s3=ON --enable_gcs=ON --enable_hdfs=ON --enable_abfs=ON
diff --git a/tools/gluten-te/ubuntu/examples/buildhere-veloxbe-portable-libs/run.sh b/tools/gluten-te/ubuntu/examples/buildhere-veloxbe-portable-libs/run.sh
index 4d28d45211c7..8a0f71bbcb08 100755
--- a/tools/gluten-te/ubuntu/examples/buildhere-veloxbe-portable-libs/run.sh
+++ b/tools/gluten-te/ubuntu/examples/buildhere-veloxbe-portable-libs/run.sh
@@ -22,6 +22,4 @@ TIMESTAMP=$(date +%s)
 
 export EXTRA_DOCKER_OPTIONS="--name buildhere-veloxbe-portable-libs-$TIMESTAMP -v $BASEDIR/scripts:/opt/scripts"
 
-BASH_ARGS="$*"
-
-$BASEDIR/../../cbash-mount.sh "/opt/scripts/all.sh $BASH_ARGS"
+$BASEDIR/../../cbash-mount.sh '/opt/scripts/all.sh'
diff --git a/tools/gluten-te/ubuntu/examples/buildhere-veloxbe-portable-libs/scripts/all.sh b/tools/gluten-te/ubuntu/examples/buildhere-veloxbe-portable-libs/scripts/all.sh
index 18dd92a343f9..26742355232b 100755
--- a/tools/gluten-te/ubuntu/examples/buildhere-veloxbe-portable-libs/scripts/all.sh
+++ b/tools/gluten-te/ubuntu/examples/buildhere-veloxbe-portable-libs/scripts/all.sh
@@ -44,8 +44,6 @@ function retry {
 
 cd /opt/gluten
 retry apt-get update
-retry apt-get install -y --dry-run # We now have all essentials installed in image.
-
-BASH_ARGS=$@
-
-retry dev/builddeps-veloxbe.sh $BASH_ARGS
+retry apt-get install -y curl zip unzip tar pkg-config autoconf-archive bison flex
+retry source ./dev/vcpkg/env.sh
+retry dev/builddeps-veloxbe.sh --build_tests=OFF --build_benchmarks=OFF --enable_s3=ON --enable_gcs=ON --enable_hdfs=ON --enable_abfs=ON
diff --git a/tools/gluten-te/ubuntu/shared.sh b/tools/gluten-te/ubuntu/shared.sh
index 0253c16cef5f..d14b35bf9d0f 100755
--- a/tools/gluten-te/ubuntu/shared.sh
+++ b/tools/gluten-te/ubuntu/shared.sh
@@ -24,13 +24,6 @@ source "$SHARED_BASEDIR/defaults.conf"
 
 export DOCKER_BUILDKIT=1
 export BUILDKIT_PROGRESS=plain
 
-# Validate envs
-if [ -z "$HOME" ]
-then
-  echo 'Environment variable $HOME not found. Aborting.'
-  exit 1
-fi
-
 # Set operating system
 OS_IMAGE_NAME=${OS_IMAGE_NAME:-$DEFAULT_OS_IMAGE_NAME}