# Patch summary (preamble text before the first "diff --git" header; not part of any hunk).
#
# Purpose: replace the boolean gluten-it option --skip-data-gen with a three-valued
# option --data-gen-strategy (accepted values: skip, once, always; default "always").
#
# Files touched:
#   * .github/workflows/velox_be.yml.deprecated, .github/workflows/velox_docker.yml
#       Every former --skip-data-gen argument becomes --data-gen-strategy=skip
#       (including the commented-out run-tpc-test-ubuntu-sf30 job).
#   * tools/gluten-it/.../command/DataGenMixin.java
#       Removes the skipDataGen field; adds the String field dataGenStrategy.
#       makeActions() maps the string with a switch to DataGenOnly.Skip / Once / Always and
#       throws IllegalArgumentException for any other value. It now always returns a
#       single DataGenOnly action (previously an empty array when skipping).
#   * tools/gluten-it/.../action/DataGenOnly.scala
#       DataGenOnly gains a leading `strategy` constructor parameter. execute() does nothing
#       for Skip, calls gen() for Always, and for Once calls gen() only when
#       suite.dataWritePath(scale, genPartitionedData) does not exist on disk; it returns
#       true in every case. The sealed trait Strategy and its three case objects live in
#       the new companion object.
#
# NOTE(review): Once only checks File.exists() on the data path, so an existing but
# incomplete directory would be treated as already generated - confirm this is acceptable.
# NOTE(review): the Java switch compares the option string exactly, so values such as
# "Skip" or "ALWAYS" would hit the IllegalArgumentException branch - confirm intended.
diff --git a/.github/workflows/velox_be.yml.deprecated b/.github/workflows/velox_be.yml.deprecated index d095af64d7b0..6ff5ec743c14 100644 --- a/.github/workflows/velox_be.yml.deprecated +++ b/.github/workflows/velox_be.yml.deprecated @@ -529,9 +529,9 @@ jobs: $PATH_TO_GLUTEN_TE/$OS_IMAGE_NAME/gha/gha-checkout/exec.sh 'cd /opt/gluten/tools/gluten-it && \ mvn clean install -Pspark-3.2 \ && GLUTEN_IT_JVM_ARGS=-Xmx5G sbin/gluten-it.sh queries \ - --local --preset=velox --benchmark-type=h --error-on-memleak --off-heap-size=10g -s=1.0 --threads=16 --iterations=1 --skip-data-gen --random-kill-tasks \ + --local --preset=velox --benchmark-type=h --error-on-memleak --off-heap-size=10g -s=1.0 --threads=16 --iterations=1 --data-gen-strategy=skip --random-kill-tasks \ && GLUTEN_IT_JVM_ARGS=-Xmx50G sbin/gluten-it.sh queries \ - --local --preset=velox --benchmark-type=ds --error-on-memleak --off-heap-size=50g -s=30.0 --threads=32 --iterations=1 --skip-data-gen --random-kill-tasks' + --local --preset=velox --benchmark-type=ds --error-on-memleak --off-heap-size=50g -s=30.0 --threads=32 --iterations=1 --data-gen-strategy=skip --random-kill-tasks' - name: Exit docker container if: ${{ always() }} run: | @@ -580,7 +580,7 @@ jobs: mvn clean install -Pspark-3.2 \ && GLUTEN_IT_JVM_ARGS=-Xmx50G sbin/gluten-it.sh parameterized \ --local --preset=velox --benchmark-type=ds --error-on-memleak --queries=q67,q95 -s=30.0 --threads=12 --shuffle-partitions=72 --iterations=1 \ - --skip-data-gen -m=OffHeapExecutionMemory \ + --data-gen-strategy=skip -m=OffHeapExecutionMemory \ -d=ISOLATION:OFF,spark.gluten.memory.isolation=false \ -d=OFFHEAP_SIZE:5g,spark.memory.offHeap.size=5g \ -d=OFFHEAP_SIZE:3g,spark.memory.offHeap.size=3g \ @@ -592,7 +592,7 @@ jobs: mvn clean install -Pspark-3.2 \ && GLUTEN_IT_JVM_ARGS=-Xmx50G sbin/gluten-it.sh parameterized \ --local --preset=velox --benchmark-type=ds --error-on-memleak --queries=q67,q95 -s=30.0 --threads=12 --shuffle-partitions=72 --iterations=1 \ - 
--skip-data-gen -m=OffHeapExecutionMemory \ + --data-gen-strategy=skip -m=OffHeapExecutionMemory \ -d=ISOLATION:ON,spark.gluten.memory.isolation=true,spark.memory.storageFraction=0.1 \ -d=OFFHEAP_SIZE:5g,spark.memory.offHeap.size=5g \ -d=OFFHEAP_SIZE:3g,spark.memory.offHeap.size=3g \ @@ -603,7 +603,7 @@ jobs: $PATH_TO_GLUTEN_TE/$OS_IMAGE_NAME/gha/gha-checkout/exec.sh 'cd /opt/gluten/tools/gluten-it && \ GLUTEN_IT_JVM_ARGS=-Xmx50G sbin/gluten-it.sh parameterized \ --local --preset=velox --benchmark-type=ds --error-on-memleak --queries=q23a,q23b -s=30.0 --threads=12 --shuffle-partitions=72 --iterations=1 \ - --skip-data-gen -m=OffHeapExecutionMemory \ + --data-gen-strategy=skip -m=OffHeapExecutionMemory \ -d=ISOLATION:OFF,spark.gluten.memory.isolation=false \ -d=ISOLATION:ON,spark.gluten.memory.isolation=true,spark.memory.storageFraction=0.1 \ -d=OFFHEAP_SIZE:2g,spark.memory.offHeap.size=2g \ @@ -615,7 +615,7 @@ jobs: $PATH_TO_GLUTEN_TE/$OS_IMAGE_NAME/gha/gha-checkout/exec.sh 'cd /opt/gluten/tools/gluten-it && \ GLUTEN_IT_JVM_ARGS=-Xmx50G sbin/gluten-it.sh parameterized \ --local --preset=velox --benchmark-type=ds --error-on-memleak --queries=q97 -s=30.0 --threads=12 --shuffle-partitions=72 --iterations=1 \ - --skip-data-gen -m=OffHeapExecutionMemory \ + --data-gen-strategy=skip -m=OffHeapExecutionMemory \ -d=ISOLATION:OFF,spark.gluten.memory.isolation=false \ -d=ISOLATION:ON,spark.gluten.memory.isolation=true,spark.memory.storageFraction=0.1 \ -d=OFFHEAP_SIZE:2g,spark.memory.offHeap.size=2g \ diff --git a/.github/workflows/velox_docker.yml b/.github/workflows/velox_docker.yml index 1e88e034e585..47dd7a9190b8 100644 --- a/.github/workflows/velox_docker.yml +++ b/.github/workflows/velox_docker.yml @@ -296,7 +296,7 @@ jobs: cd tools/gluten-it \ && GLUTEN_IT_JVM_ARGS=-Xmx3G sbin/gluten-it.sh parameterized \ --local --preset=velox --benchmark-type=ds --error-on-memleak --queries=q67,q95 -s=30.0 --threads=12 --shuffle-partitions=72 --iterations=1 \ - --skip-data-gen 
-m=OffHeapExecutionMemory \ + --data-gen-strategy=skip -m=OffHeapExecutionMemory \ -d=ISOLATION:OFF,spark.gluten.memory.isolation=false \ -d=OFFHEAP_SIZE:6g,spark.memory.offHeap.size=6g \ -d=OFFHEAP_SIZE:4g,spark.memory.offHeap.size=4g \ @@ -308,7 +308,7 @@ jobs: cd tools/gluten-it \ && GLUTEN_IT_JVM_ARGS=-Xmx3G sbin/gluten-it.sh parameterized \ --local --preset=velox --benchmark-type=ds --error-on-memleak --queries=q67 -s=30.0 --threads=12 --shuffle-partitions=72 --iterations=1 \ - --skip-data-gen -m=OffHeapExecutionMemory \ + --data-gen-strategy=skip -m=OffHeapExecutionMemory \ -d=ISOLATION:ON,spark.gluten.memory.isolation=true,spark.memory.storageFraction=0.1 \ -d=OFFHEAP_SIZE:6g,spark.memory.offHeap.size=6g \ -d=OFFHEAP_SIZE:4g,spark.memory.offHeap.size=4g \ @@ -319,7 +319,7 @@ jobs: cd tools/gluten-it \ && GLUTEN_IT_JVM_ARGS=-Xmx3G sbin/gluten-it.sh parameterized \ --local --preset=velox --benchmark-type=ds --error-on-memleak --queries=q95 -s=30.0 --threads=12 --shuffle-partitions=72 --iterations=1 \ - --skip-data-gen -m=OffHeapExecutionMemory \ + --data-gen-strategy=skip -m=OffHeapExecutionMemory \ -d=ISOLATION:ON,spark.gluten.memory.isolation=true,spark.memory.storageFraction=0.1 \ -d=OFFHEAP_SIZE:6g,spark.memory.offHeap.size=6g \ -d=OFFHEAP_SIZE:4g,spark.memory.offHeap.size=4g \ @@ -330,7 +330,7 @@ jobs: cd tools/gluten-it \ && GLUTEN_IT_JVM_ARGS=-Xmx3G sbin/gluten-it.sh parameterized \ --local --preset=velox --benchmark-type=ds --error-on-memleak --queries=q23a,q23b -s=30.0 --threads=12 --shuffle-partitions=72 --iterations=1 \ - --skip-data-gen -m=OffHeapExecutionMemory \ + --data-gen-strategy=skip -m=OffHeapExecutionMemory \ -d=ISOLATION:OFF,spark.gluten.memory.isolation=false \ -d=OFFHEAP_SIZE:2g,spark.memory.offHeap.size=2g \ 
-d=FLUSH_MODE:DISABLED,spark.gluten.sql.columnar.backend.velox.flushablePartialAggregation=false,spark.gluten.sql.columnar.backend.velox.maxPartialAggregationMemoryRatio=1.0,spark.gluten.sql.columnar.backend.velox.maxExtendedPartialAggregationMemoryRatio=1.0,spark.gluten.sql.columnar.backend.velox.abandonPartialAggregationMinPct=100,spark.gluten.sql.columnar.backend.velox.abandonPartialAggregationMinRows=0 \ @@ -341,7 +341,7 @@ jobs: cd tools/gluten-it \ && GLUTEN_IT_JVM_ARGS=-Xmx3G sbin/gluten-it.sh parameterized \ --local --preset=velox --benchmark-type=ds --error-on-memleak --queries=q23a,q23b -s=30.0 --threads=12 --shuffle-partitions=72 --iterations=1 \ - --skip-data-gen -m=OffHeapExecutionMemory \ + --data-gen-strategy=skip -m=OffHeapExecutionMemory \ -d=ISOLATION:ON,spark.gluten.memory.isolation=true,spark.memory.storageFraction=0.1 \ -d=OFFHEAP_SIZE:2g,spark.memory.offHeap.size=2g \ -d=FLUSH_MODE:DISABLED,spark.gluten.sql.columnar.backend.velox.flushablePartialAggregation=false,spark.gluten.sql.columnar.backend.velox.maxPartialAggregationMemoryRatio=1.0,spark.gluten.sql.columnar.backend.velox.maxExtendedPartialAggregationMemoryRatio=1.0,spark.gluten.sql.columnar.backend.velox.abandonPartialAggregationMinPct=100,spark.gluten.sql.columnar.backend.velox.abandonPartialAggregationMinRows=0 \ @@ -352,7 +352,7 @@ jobs: cd tools/gluten-it \ && GLUTEN_IT_JVM_ARGS=-Xmx3G sbin/gluten-it.sh parameterized \ --local --preset=velox --benchmark-type=ds --error-on-memleak --queries=q97 -s=30.0 --threads=12 --shuffle-partitions=72 --iterations=1 \ - --skip-data-gen -m=OffHeapExecutionMemory \ + --data-gen-strategy=skip -m=OffHeapExecutionMemory \ -d=ISOLATION:OFF,spark.gluten.memory.isolation=false \ -d=ISOLATION:ON,spark.gluten.memory.isolation=true,spark.memory.storageFraction=0.1 \ -d=OFFHEAP_SIZE:2g,spark.memory.offHeap.size=2g \ @@ -408,7 +408,7 @@ jobs: cd tools/gluten-it \ && GLUTEN_IT_JVM_ARGS=-Xmx6G sbin/gluten-it.sh queries \ --local --preset=velox 
--benchmark-type=ds --error-on-memleak -s=30.0 --off-heap-size=8g --threads=12 --shuffle-partitions=72 --iterations=1 \ - --skip-data-gen --random-kill-tasks --no-session-reuse + --data-gen-strategy=skip --random-kill-tasks --no-session-reuse # run-tpc-test-ubuntu-sf30: # needs: build-native-lib-centos-7 @@ -457,10 +457,10 @@ jobs: # cd tools/gluten-it \ # && GLUTEN_IT_JVM_ARGS=-Xmx6G sbin/gluten-it.sh queries-compare \ # --local --preset=velox --benchmark-type=h --error-on-memleak -s=30.0 --off-heap-size=8g --threads=12 --shuffle-partitions=72 --iterations=1 \ - # --skip-data-gen --shard=${{ matrix.shard }} \ + # --data-gen-strategy=skip --shard=${{ matrix.shard }} \ # && GLUTEN_IT_JVM_ARGS=-Xmx6G sbin/gluten-it.sh queries-compare \ # --local --preset=velox --benchmark-type=ds --error-on-memleak -s=30.0 --off-heap-size=8g --threads=12 --shuffle-partitions=72 --iterations=1 \ - # --skip-data-gen --shard=${{ matrix.shard }} + # --data-gen-strategy=skip --shard=${{ matrix.shard }} run-tpc-test-centos8-uniffle: needs: build-native-lib-centos-7 diff --git a/tools/gluten-it/common/src/main/java/org/apache/gluten/integration/command/DataGenMixin.java b/tools/gluten-it/common/src/main/java/org/apache/gluten/integration/command/DataGenMixin.java index 0682f5601a92..3854d078e261 100644 --- a/tools/gluten-it/common/src/main/java/org/apache/gluten/integration/command/DataGenMixin.java +++ b/tools/gluten-it/common/src/main/java/org/apache/gluten/integration/command/DataGenMixin.java @@ -17,23 +17,35 @@ package org.apache.gluten.integration.command; import org.apache.gluten.integration.action.Action; +import org.apache.gluten.integration.action.DataGenOnly; import picocli.CommandLine; public class DataGenMixin { + @CommandLine.Option(names = {"--data-gen-strategy"}, description = "The strategy of data generation, accepted values: skip, once, always", defaultValue = "always") + private String dataGenStrategy; + @CommandLine.Option(names = {"-s", "--scale"}, description = "The 
scale factor of sample TPC-H dataset", defaultValue = "0.1") private double scale; @CommandLine.Option(names = {"--gen-partitioned-data"}, description = "Generate data with partitions", defaultValue = "false") private boolean genPartitionedData; - @CommandLine.Option(names = {"--skip-data-gen"}, description = "Skip data generation", defaultValue = "false") - private boolean skipDataGen; - public Action[] makeActions() { - if (skipDataGen) { - return new Action[0]; + final DataGenOnly.Strategy strategy; + switch (dataGenStrategy) { + case "skip": + strategy = DataGenOnly.Skip$.MODULE$; + break; + case "once": + strategy = DataGenOnly.Once$.MODULE$; + break; + case "always": + strategy = DataGenOnly.Always$.MODULE$; + break; + default: + throw new IllegalArgumentException("Unexpected data-gen strategy: " + dataGenStrategy); } - return new Action[]{new org.apache.gluten.integration.action.DataGenOnly(scale, genPartitionedData)}; + return new Action[]{new org.apache.gluten.integration.action.DataGenOnly(strategy, scale, genPartitionedData)}; } public double getScale() { diff --git a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/action/DataGenOnly.scala b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/action/DataGenOnly.scala index bc43834610a4..dc54e9737703 100644 --- a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/action/DataGenOnly.scala +++ b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/action/DataGenOnly.scala @@ -20,11 +20,36 @@ import org.apache.gluten.integration.Suite import java.io.File -case class DataGenOnly(scale: Double, genPartitionedData: Boolean) extends Action { +case class DataGenOnly(strategy: DataGenOnly.Strategy, scale: Double, genPartitionedData: Boolean) + extends Action { override def execute(suite: Suite): Boolean = { + strategy match { + case DataGenOnly.Skip => + // Do nothing + case DataGenOnly.Once => + val dataPath = suite.dataWritePath(scale, 
genPartitionedData) + val alreadyExists = new File(dataPath).exists() + if (alreadyExists) { + println(s"Data already exists at $dataPath, skipping generating it.") + } else { + gen(suite) + } + case DataGenOnly.Always => + gen(suite) + } + true + } + + private def gen(suite: Suite): Unit = { suite.sessionSwitcher.useSession("baseline", "Data Gen") val dataGen = suite.createDataGen(scale, genPartitionedData) dataGen.gen() - true } } + +object DataGenOnly { + sealed trait Strategy + case object Skip extends Strategy + case object Once extends Strategy + case object Always extends Strategy +}