Skip to content

Commit

Permalink
[VL] Gluten-it: --data-gen-strategy=once to skip data-gen when it already exists (#6587)
Browse files Browse the repository at this point in the history
  • Loading branch information
zhztheplayer authored Jul 26, 2024
1 parent d90a7f4 commit a4cafee
Show file tree
Hide file tree
Showing 4 changed files with 60 additions and 23 deletions.
12 changes: 6 additions & 6 deletions .github/workflows/velox_be.yml.deprecated
Original file line number Diff line number Diff line change
Expand Up @@ -529,9 +529,9 @@ jobs:
$PATH_TO_GLUTEN_TE/$OS_IMAGE_NAME/gha/gha-checkout/exec.sh 'cd /opt/gluten/tools/gluten-it && \
mvn clean install -Pspark-3.2 \
&& GLUTEN_IT_JVM_ARGS=-Xmx5G sbin/gluten-it.sh queries \
--local --preset=velox --benchmark-type=h --error-on-memleak --off-heap-size=10g -s=1.0 --threads=16 --iterations=1 --skip-data-gen --random-kill-tasks \
--local --preset=velox --benchmark-type=h --error-on-memleak --off-heap-size=10g -s=1.0 --threads=16 --iterations=1 --data-gen-strategy=skip --random-kill-tasks \
&& GLUTEN_IT_JVM_ARGS=-Xmx50G sbin/gluten-it.sh queries \
--local --preset=velox --benchmark-type=ds --error-on-memleak --off-heap-size=50g -s=30.0 --threads=32 --iterations=1 --skip-data-gen --random-kill-tasks'
--local --preset=velox --benchmark-type=ds --error-on-memleak --off-heap-size=50g -s=30.0 --threads=32 --iterations=1 --data-gen-strategy=skip --random-kill-tasks'
- name: Exit docker container
if: ${{ always() }}
run: |
Expand Down Expand Up @@ -580,7 +580,7 @@ jobs:
mvn clean install -Pspark-3.2 \
&& GLUTEN_IT_JVM_ARGS=-Xmx50G sbin/gluten-it.sh parameterized \
--local --preset=velox --benchmark-type=ds --error-on-memleak --queries=q67,q95 -s=30.0 --threads=12 --shuffle-partitions=72 --iterations=1 \
--skip-data-gen -m=OffHeapExecutionMemory \
--data-gen-strategy=skip -m=OffHeapExecutionMemory \
-d=ISOLATION:OFF,spark.gluten.memory.isolation=false \
-d=OFFHEAP_SIZE:5g,spark.memory.offHeap.size=5g \
-d=OFFHEAP_SIZE:3g,spark.memory.offHeap.size=3g \
Expand All @@ -592,7 +592,7 @@ jobs:
mvn clean install -Pspark-3.2 \
&& GLUTEN_IT_JVM_ARGS=-Xmx50G sbin/gluten-it.sh parameterized \
--local --preset=velox --benchmark-type=ds --error-on-memleak --queries=q67,q95 -s=30.0 --threads=12 --shuffle-partitions=72 --iterations=1 \
--skip-data-gen -m=OffHeapExecutionMemory \
--data-gen-strategy=skip -m=OffHeapExecutionMemory \
-d=ISOLATION:ON,spark.gluten.memory.isolation=true,spark.memory.storageFraction=0.1 \
-d=OFFHEAP_SIZE:5g,spark.memory.offHeap.size=5g \
-d=OFFHEAP_SIZE:3g,spark.memory.offHeap.size=3g \
Expand All @@ -603,7 +603,7 @@ jobs:
$PATH_TO_GLUTEN_TE/$OS_IMAGE_NAME/gha/gha-checkout/exec.sh 'cd /opt/gluten/tools/gluten-it && \
GLUTEN_IT_JVM_ARGS=-Xmx50G sbin/gluten-it.sh parameterized \
--local --preset=velox --benchmark-type=ds --error-on-memleak --queries=q23a,q23b -s=30.0 --threads=12 --shuffle-partitions=72 --iterations=1 \
--skip-data-gen -m=OffHeapExecutionMemory \
--data-gen-strategy=skip -m=OffHeapExecutionMemory \
-d=ISOLATION:OFF,spark.gluten.memory.isolation=false \
-d=ISOLATION:ON,spark.gluten.memory.isolation=true,spark.memory.storageFraction=0.1 \
-d=OFFHEAP_SIZE:2g,spark.memory.offHeap.size=2g \
Expand All @@ -615,7 +615,7 @@ jobs:
$PATH_TO_GLUTEN_TE/$OS_IMAGE_NAME/gha/gha-checkout/exec.sh 'cd /opt/gluten/tools/gluten-it && \
GLUTEN_IT_JVM_ARGS=-Xmx50G sbin/gluten-it.sh parameterized \
--local --preset=velox --benchmark-type=ds --error-on-memleak --queries=q97 -s=30.0 --threads=12 --shuffle-partitions=72 --iterations=1 \
--skip-data-gen -m=OffHeapExecutionMemory \
--data-gen-strategy=skip -m=OffHeapExecutionMemory \
-d=ISOLATION:OFF,spark.gluten.memory.isolation=false \
-d=ISOLATION:ON,spark.gluten.memory.isolation=true,spark.memory.storageFraction=0.1 \
-d=OFFHEAP_SIZE:2g,spark.memory.offHeap.size=2g \
Expand Down
18 changes: 9 additions & 9 deletions .github/workflows/velox_docker.yml
Original file line number Diff line number Diff line change
Expand Up @@ -296,7 +296,7 @@ jobs:
cd tools/gluten-it \
&& GLUTEN_IT_JVM_ARGS=-Xmx3G sbin/gluten-it.sh parameterized \
--local --preset=velox --benchmark-type=ds --error-on-memleak --queries=q67,q95 -s=30.0 --threads=12 --shuffle-partitions=72 --iterations=1 \
--skip-data-gen -m=OffHeapExecutionMemory \
--data-gen-strategy=skip -m=OffHeapExecutionMemory \
-d=ISOLATION:OFF,spark.gluten.memory.isolation=false \
-d=OFFHEAP_SIZE:6g,spark.memory.offHeap.size=6g \
-d=OFFHEAP_SIZE:4g,spark.memory.offHeap.size=4g \
Expand All @@ -308,7 +308,7 @@ jobs:
cd tools/gluten-it \
&& GLUTEN_IT_JVM_ARGS=-Xmx3G sbin/gluten-it.sh parameterized \
--local --preset=velox --benchmark-type=ds --error-on-memleak --queries=q67 -s=30.0 --threads=12 --shuffle-partitions=72 --iterations=1 \
--skip-data-gen -m=OffHeapExecutionMemory \
--data-gen-strategy=skip -m=OffHeapExecutionMemory \
-d=ISOLATION:ON,spark.gluten.memory.isolation=true,spark.memory.storageFraction=0.1 \
-d=OFFHEAP_SIZE:6g,spark.memory.offHeap.size=6g \
-d=OFFHEAP_SIZE:4g,spark.memory.offHeap.size=4g \
Expand All @@ -319,7 +319,7 @@ jobs:
cd tools/gluten-it \
&& GLUTEN_IT_JVM_ARGS=-Xmx3G sbin/gluten-it.sh parameterized \
--local --preset=velox --benchmark-type=ds --error-on-memleak --queries=q95 -s=30.0 --threads=12 --shuffle-partitions=72 --iterations=1 \
--skip-data-gen -m=OffHeapExecutionMemory \
--data-gen-strategy=skip -m=OffHeapExecutionMemory \
-d=ISOLATION:ON,spark.gluten.memory.isolation=true,spark.memory.storageFraction=0.1 \
-d=OFFHEAP_SIZE:6g,spark.memory.offHeap.size=6g \
-d=OFFHEAP_SIZE:4g,spark.memory.offHeap.size=4g \
Expand All @@ -330,7 +330,7 @@ jobs:
cd tools/gluten-it \
&& GLUTEN_IT_JVM_ARGS=-Xmx3G sbin/gluten-it.sh parameterized \
--local --preset=velox --benchmark-type=ds --error-on-memleak --queries=q23a,q23b -s=30.0 --threads=12 --shuffle-partitions=72 --iterations=1 \
--skip-data-gen -m=OffHeapExecutionMemory \
--data-gen-strategy=skip -m=OffHeapExecutionMemory \
-d=ISOLATION:OFF,spark.gluten.memory.isolation=false \
-d=OFFHEAP_SIZE:2g,spark.memory.offHeap.size=2g \
-d=FLUSH_MODE:DISABLED,spark.gluten.sql.columnar.backend.velox.flushablePartialAggregation=false,spark.gluten.sql.columnar.backend.velox.maxPartialAggregationMemoryRatio=1.0,spark.gluten.sql.columnar.backend.velox.maxExtendedPartialAggregationMemoryRatio=1.0,spark.gluten.sql.columnar.backend.velox.abandonPartialAggregationMinPct=100,spark.gluten.sql.columnar.backend.velox.abandonPartialAggregationMinRows=0 \
Expand All @@ -341,7 +341,7 @@ jobs:
cd tools/gluten-it \
&& GLUTEN_IT_JVM_ARGS=-Xmx3G sbin/gluten-it.sh parameterized \
--local --preset=velox --benchmark-type=ds --error-on-memleak --queries=q23a,q23b -s=30.0 --threads=12 --shuffle-partitions=72 --iterations=1 \
--skip-data-gen -m=OffHeapExecutionMemory \
--data-gen-strategy=skip -m=OffHeapExecutionMemory \
-d=ISOLATION:ON,spark.gluten.memory.isolation=true,spark.memory.storageFraction=0.1 \
-d=OFFHEAP_SIZE:2g,spark.memory.offHeap.size=2g \
-d=FLUSH_MODE:DISABLED,spark.gluten.sql.columnar.backend.velox.flushablePartialAggregation=false,spark.gluten.sql.columnar.backend.velox.maxPartialAggregationMemoryRatio=1.0,spark.gluten.sql.columnar.backend.velox.maxExtendedPartialAggregationMemoryRatio=1.0,spark.gluten.sql.columnar.backend.velox.abandonPartialAggregationMinPct=100,spark.gluten.sql.columnar.backend.velox.abandonPartialAggregationMinRows=0 \
Expand All @@ -352,7 +352,7 @@ jobs:
cd tools/gluten-it \
&& GLUTEN_IT_JVM_ARGS=-Xmx3G sbin/gluten-it.sh parameterized \
--local --preset=velox --benchmark-type=ds --error-on-memleak --queries=q97 -s=30.0 --threads=12 --shuffle-partitions=72 --iterations=1 \
--skip-data-gen -m=OffHeapExecutionMemory \
--data-gen-strategy=skip -m=OffHeapExecutionMemory \
-d=ISOLATION:OFF,spark.gluten.memory.isolation=false \
-d=ISOLATION:ON,spark.gluten.memory.isolation=true,spark.memory.storageFraction=0.1 \
-d=OFFHEAP_SIZE:2g,spark.memory.offHeap.size=2g \
Expand Down Expand Up @@ -408,7 +408,7 @@ jobs:
cd tools/gluten-it \
&& GLUTEN_IT_JVM_ARGS=-Xmx6G sbin/gluten-it.sh queries \
--local --preset=velox --benchmark-type=ds --error-on-memleak -s=30.0 --off-heap-size=8g --threads=12 --shuffle-partitions=72 --iterations=1 \
--skip-data-gen --random-kill-tasks --no-session-reuse
--data-gen-strategy=skip --random-kill-tasks --no-session-reuse
# run-tpc-test-ubuntu-sf30:
# needs: build-native-lib-centos-7
Expand Down Expand Up @@ -457,10 +457,10 @@ jobs:
# cd tools/gluten-it \
# && GLUTEN_IT_JVM_ARGS=-Xmx6G sbin/gluten-it.sh queries-compare \
# --local --preset=velox --benchmark-type=h --error-on-memleak -s=30.0 --off-heap-size=8g --threads=12 --shuffle-partitions=72 --iterations=1 \
# --skip-data-gen --shard=${{ matrix.shard }} \
# --data-gen-strategy=skip --shard=${{ matrix.shard }} \
# && GLUTEN_IT_JVM_ARGS=-Xmx6G sbin/gluten-it.sh queries-compare \
# --local --preset=velox --benchmark-type=ds --error-on-memleak -s=30.0 --off-heap-size=8g --threads=12 --shuffle-partitions=72 --iterations=1 \
# --skip-data-gen --shard=${{ matrix.shard }}
# --data-gen-strategy=skip --shard=${{ matrix.shard }}

run-tpc-test-centos8-uniffle:
needs: build-native-lib-centos-7
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,23 +17,35 @@
package org.apache.gluten.integration.command;

import org.apache.gluten.integration.action.Action;
import org.apache.gluten.integration.action.DataGenOnly;
import picocli.CommandLine;

public class DataGenMixin {
@CommandLine.Option(names = {"--data-gen-strategy"}, description = "The strategy of data generation, accepted values: skip, once, always", defaultValue = "always")
private String dataGenStrategy;

@CommandLine.Option(names = {"-s", "--scale"}, description = "The scale factor of sample TPC-H dataset", defaultValue = "0.1")
private double scale;

@CommandLine.Option(names = {"--gen-partitioned-data"}, description = "Generate data with partitions", defaultValue = "false")
private boolean genPartitionedData;

@CommandLine.Option(names = {"--skip-data-gen"}, description = "Skip data generation", defaultValue = "false")
private boolean skipDataGen;

public Action[] makeActions() {
if (skipDataGen) {
return new Action[0];
final DataGenOnly.Strategy strategy;
switch (dataGenStrategy) {
case "skip":
strategy = DataGenOnly.Skip$.MODULE$;
break;
case "once":
strategy = DataGenOnly.Once$.MODULE$;
break;
case "always":
strategy = DataGenOnly.Always$.MODULE$;
break;
default:
throw new IllegalArgumentException("Unexpected data-gen strategy: " + dataGenStrategy);
}
return new Action[]{new org.apache.gluten.integration.action.DataGenOnly(scale, genPartitionedData)};
return new Action[]{new org.apache.gluten.integration.action.DataGenOnly(strategy, scale, genPartitionedData)};
}

public double getScale() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,36 @@ import org.apache.gluten.integration.Suite

import java.io.File

case class DataGenOnly(scale: Double, genPartitionedData: Boolean) extends Action {
case class DataGenOnly(strategy: DataGenOnly.Strategy, scale: Double, genPartitionedData: Boolean)
extends Action {
override def execute(suite: Suite): Boolean = {
strategy match {
case DataGenOnly.Skip =>
// Do nothing
case DataGenOnly.Once =>
val dataPath = suite.dataWritePath(scale, genPartitionedData)
val alreadyExists = new File(dataPath).exists()
if (alreadyExists) {
println(s"Data already exists at $dataPath, skipping generating it.")
} else {
gen(suite)
}
case DataGenOnly.Always =>
gen(suite)
}
true
}

private def gen(suite: Suite): Unit = {
suite.sessionSwitcher.useSession("baseline", "Data Gen")
val dataGen = suite.createDataGen(scale, genPartitionedData)
dataGen.gen()
true
}
}

/**
 * Companion object holding the data-generation strategies that a [[DataGenOnly]] action
 * dispatches on in `execute`.
 */
object DataGenOnly {
/** Policy that decides whether the `DataGenOnly` action generates data when executed. */
sealed trait Strategy
/** Never generate data; `execute` does nothing for this strategy. */
case object Skip extends Strategy
/** Generate data only when nothing exists yet at the suite's data write path. */
case object Once extends Strategy
/** Generate data on every execution, without checking whether data already exists. */
case object Always extends Strategy
}

0 comments on commit a4cafee

Please sign in to comment.