Merge pull request #549 from kroma-network/perf/add-optimization-flags

perf: add optimization flags
kroma-network · Oct 18, 2024 · 67bf0f9 · 67bf0f9
2 parents ab62362 + fdbef39
commit 67bf0f9
Show file tree

Hide file tree

Showing 18 changed files with 87 additions and 112 deletions.
diff --git a/.bazelrc b/.bazelrc
@@ -24,6 +24,9 @@ build --announce_rc
 # TODO(chokobole): Remove when `cc_shared_library` is enabled by default
 build --experimental_cc_shared_library
 
+# Automatically apply the platform-specific config sections (e.g. build:linux, build:windows) that match the host OS.
+common --enable_platform_specific_config
+
 build:macos_x86_64 --config=macos
 build:macos_x86_64 --cpu=darwin_x86_64
 build:macos_x86_64 --host_cpu=darwin_x86_64
@@ -52,9 +55,22 @@ build:rocm --repo_env TACHYON_NEED_ROCM=1
 # Options extracted from configure script
 build:numa --//:has_numa
 
+# Fastbuild config
+build:fastbuild -c fastbuild
+
 # Debug config
 build:dbg -c dbg
 
+# Opt config
+build:opt -c opt
+build:opt --//:has_openmp
+
+build:maxopt -c opt
+build:maxopt --//:has_openmp
+build:maxopt --config=native
+build:maxopt --copt=-flto
+build:maxopt --linkopt=-flto
+
 # By default, build Tachyon in C++ 17 mode.
 build:linux --cxxopt=-std=c++17
 build:linux --host_cxxopt=-std=c++17
@@ -70,11 +86,14 @@ build:windows --host_cxxopt=/std:c++17
 build:avx_linux --copt=-mavx
 build:avx2_linux --copt=-mavx2
 build:avx512_linux --copt=-mavx512f
-build:native_arch_linux --copt=-march=native
 build:avx_windows --copt=/arch=AVX
 build:avx2_windows --copt=/arch=AVX2
 build:avx512_windows --copt=/arch=AVX512
 
+# Compile for the native architecture
+# WARN(batzor): this makes the build non-portable; the binaries may not run on CPUs other than the build host's.
+build:native --copt=-march=native
+
 # Enable googletest build with absl.
 # See https://github.com/google/googletest/blob/v1.13.0/BUILD.bazel#L67C1-L70
 build --define absl=1

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -12,17 +12,6 @@ jobs:
       matrix:
         os: [ubuntu-latest, macos-latest-xlarge]
         build_flag: [fastbuild, opt]
-        openmp: ["openmp", "no-openmp"]
-        include:
-          - os: ubuntu-latest
-            bazel_config: linux
-          - os: macos-latest-xlarge
-            bazel_config: macos_arm64
-          - openmp: "openmp"
-            has_openmp: "--//:has_openmp"
-        exclude:
-          - build_flag: fastbuild
-            openmp: "openmp"
 
     runs-on: ${{ matrix.os }}
     steps:
@@ -50,7 +39,7 @@ jobs:
           # Avoid downloading Bazel every time.
           bazelisk-cache: true
           # Share a single build cache between workflows.
-          disk-cache: ${{ matrix.os }}-${{ matrix.build_flag }}-${{ matrix.openmp }}
+          disk-cache: ${{ matrix.os }}-${{ matrix.build_flag }}-build
           # Share repository cache between workflows.
           repository-cache: false
           # Cache external repositories
@@ -65,49 +54,47 @@ jobs:
         run: python3 -m pip install numpy
 
       - name: Install OpenMP on linux
-        if: matrix.os == 'ubuntu-latest' && matrix.openmp == 'openmp'
+        if: matrix.os == 'ubuntu-latest' && matrix.build_flag == 'opt'
         run: sudo apt-get install -y libomp-dev
 
       - name: Install OpenMP on macos
-        if: matrix.os == 'macos-latest-xlarge' && matrix.openmp == 'openmp'
+        if: matrix.os == 'macos-latest-xlarge' && matrix.build_flag == 'opt'
         run: brew install libomp
 
       - name: Add .bazelrc.user on linux
         if: matrix.os == 'ubuntu-latest'
-        run: echo "build --config linux" > .bazelrc.user &&
-          echo "build --action_env=CARGO=$HOME/.cargo/bin/cargo" >> .bazelrc.user &&
+        run: echo "build --action_env=CARGO=$HOME/.cargo/bin/cargo" >> .bazelrc.user &&
           echo "build --@rules_rust//rust/toolchain/channel=nightly" >> .bazelrc.user
 
       - name: Add .bazelrc.user on macos
         if: matrix.os == 'macos-latest-xlarge'
         run: brew install coreutils &&
           export PATH="/opt/homebrew/opt/coreutils/libexec/gnubin:$PATH" &&
-          echo "build --config macos_arm64" > .bazelrc.user &&
           echo "build --action_env=CARGO=$HOME/.cargo/bin/cargo" >> .bazelrc.user &&
           echo "build --@rules_rust//rust/toolchain/channel=nightly" >> .bazelrc.user
 
       - name: Build
-        run: bazel build -c ${{ matrix.build_flag }} ${{ matrix.has_openmp }} //...
+        run: bazel build --config ${{ matrix.build_flag }} //...
 
       - name: Test
         # NOTE(chokobole): Test timeouts are overridden 1.5x of the default timeout due to timeout failure on GitHub Actions.
         # See https://github.com/kroma-network/tachyon/actions/runs/9581476338/job/26418352737.
-        run: bazel test -c ${{ matrix.build_flag }} ${{ matrix.has_openmp }} --test_output=errors --test_tag_filters -benchmark,-manual,-cuda //... --test_timeout=90,450,1350,5400
+        run: bazel test --config ${{ matrix.build_flag }} --test_output=errors --test_tag_filters -benchmark,-manual,-cuda //... --test_timeout=90,450,1350,5400
 
       - name: Test Node Binding
         run: |
           cd tachyon/node/test
-          bazel test --config ${{ matrix.bazel_config }} -c ${{ matrix.build_flag}} --test_output=errors //...
+          bazel test -c ${{ matrix.build_flag}} --test_output=errors //...
 
       - name: Test Py Binding
         run: |
           cd tachyon/py/test
-          bazel test --config ${{ matrix.bazel_config }} -c ${{ matrix.build_flag}} --test_output=errors //...
+          bazel test -c ${{ matrix.build_flag}} --test_output=errors //...
 
       - name: Test Circom
         run: |
           cd vendors/circom
-          CARGO_BAZEL_REPIN=true bazel test --config ${{ matrix.bazel_config }} -c ${{ matrix.build_flag}} --test_output=errors //...
+          CARGO_BAZEL_REPIN=true bazel test --config ${{ matrix.build_flag}} --test_output=errors //...
 
   lint:
     runs-on: ubuntu-latest

diff --git a/README.md b/README.md
@@ -124,13 +124,13 @@ brew install gmp libomp
 ### Build
 
 ```shell
-bazel build --config {os} //...
+bazel build //...
 ```
 
 ### Test
 
 ```shell
-bazel test --config {os} //...
+bazel test //...
 ```
 
 Check [How To Build](/docs/how_to_use/how_to_build.md) for more information.
diff --git a/benchmark/BUILD.bazel b/benchmark/BUILD.bazel
@@ -5,6 +5,7 @@ tachyon_cc_library(
     name = "simple_reporter",
     srcs = ["simple_reporter.cc"],
     hdrs = ["simple_reporter.h"],
+    force_rtti = if_has_matplotlib(True, False),
     local_defines = tachyon_matplotlib_defines(),
     visibility = ["//benchmark:__subpackages__"],
     deps = [

diff --git a/benchmark/fft/README.md b/benchmark/fft/README.md
@@ -20,7 +20,7 @@ CPU Caches:
 ### FFT
 
 ```shell
-bazel run -c opt --//:has_openmp --//:has_rtti --//:has_matplotlib //benchmark/fft:fft_benchmark -- -k 16 -k 17 -k 18 -k 19 -k 20 -k 21 -k 22 -k 23 --vendor arkworks --vendor bellman --vendor halo2 --check_results
+bazel run --config opt --//:has_matplotlib //benchmark/fft:fft_benchmark -- -k 16 -k 17 -k 18 -k 19 -k 20 -k 21 -k 22 -k 23 --vendor arkworks --vendor bellman --vendor halo2 --check_results
 ```
 
 #### On Intel i9-13900K
@@ -56,7 +56,7 @@ bazel run -c opt --//:has_openmp --//:has_rtti --//:has_matplotlib //benchmark/f
 ### IFFT
 
 ```shell
-bazel run -c opt --//:has_openmp --//:has_rtti --//:has_matplotlib //benchmark/fft:fft_benchmark -- -k 16 -k 17 -k 18 -k 19 -k 20 -k 21 -k 22 -k 23 --vendor arkworks --vendor bellman --vendor halo2 --run_ifft --check_results
+bazel run --config opt --//:has_matplotlib //benchmark/fft:fft_benchmark -- -k 16 -k 17 -k 18 -k 19 -k 20 -k 21 -k 22 -k 23 --vendor arkworks --vendor bellman --vendor halo2 --run_ifft --check_results
 ```
 
 #### On Intel i9-13900K
@@ -94,7 +94,7 @@ bazel run -c opt --//:has_openmp --//:has_rtti --//:has_matplotlib //benchmark/f
 ### FFT
 
 ```shell
-bazel run -c opt --config cuda --//:has_openmp --//:has_rtti --//:has_matplotlib //benchmark/fft:fft_benchmark_gpu -- -k 16 -k 17 -k 18 -k 19 -k 20 -k 21 -k 22 -k 23 --check_results
+bazel run --config opt --config cuda --//:has_matplotlib //benchmark/fft:fft_benchmark_gpu -- -k 16 -k 17 -k 18 -k 19 -k 20 -k 21 -k 22 -k 23 --check_results
 ```
 
 #### On RTX-4090
@@ -115,7 +115,7 @@ bazel run -c opt --config cuda --//:has_openmp --//:has_rtti --//:has_matplotlib
 ### IFFT
 
 ```shell
-bazel run -c opt --config cuda --//:has_openmp --//:has_rtti --//:has_matplotlib //benchmark/fft:fft_benchmark_gpu -- -k 16 -k 17 -k 18 -k 19 -k 20 -k 21 -k 22 -k 23 --run_ifft --check_results
+bazel run --config opt --config cuda --//:has_matplotlib //benchmark/fft:fft_benchmark_gpu -- -k 16 -k 17 -k 18 -k 19 -k 20 -k 21 -k 22 -k 23 --run_ifft --check_results
 ```
 
 #### On RTX-4090

diff --git a/benchmark/fft_batch/README.md b/benchmark/fft_batch/README.md
@@ -20,7 +20,7 @@ CPU Caches:
 ### FFTBatch
 
 ```shell
-bazel run -c opt --//:has_openmp --//:has_rtti --//:has_matplotlib //benchmark/fft_batch:fft_batch_benchmark -- -k 20 -k 21 -k 22 -k 23 -k 24 -k 25 -k 26 --vendor plonky3 -p baby_bear --check_results
+bazel run --config opt --//:has_matplotlib //benchmark/fft_batch:fft_batch_benchmark -- -k 20 -k 21 -k 22 -k 23 -k 24 -k 25 -k 26 --vendor plonky3 -p baby_bear --check_results
 ```
 
 WARNING: On Mac M3, tests beyond degree 24 are not feasible due to memory constraints.
@@ -54,7 +54,7 @@ WARNING: On Mac M3, tests beyond degree 24 are not feasible due to memory constr
 ### CosetLDEBatch
 
 ```shell
-bazel run -c opt --//:has_openmp --//:has_rtti --//:has_matplotlib //benchmark/fft_batch:fft_batch_benchmark -- -k 20 -k 21 -k 22 -k 23 -k 24 -k 25 --vendor plonky3 -p baby_bear --run_coset_lde --check_results
+bazel run --config opt --//:has_matplotlib //benchmark/fft_batch:fft_batch_benchmark -- -k 20 -k 21 -k 22 -k 23 -k 24 -k 25 --vendor plonky3 -p baby_bear --run_coset_lde --check_results
 ```
 
 WARNING: On Mac M3, tests beyond degree 24 are not feasible due to memory constraints.

diff --git a/benchmark/fri/README.md b/benchmark/fri/README.md
@@ -18,7 +18,7 @@ CPU Caches:
 ```
 
 ```shell
-bazel run -c opt --//:has_openmp --//:has_rtti --//:has_matplotlib //benchmark/fri:fri_benchmark -- -k 18 -k 19 -k 20 -k 21 -k 22 --batch_size 100 --input_num 4 --round_num 4 --log_blowup 2 --vendor plonky3 --check_results
+bazel run --config opt --//:has_matplotlib //benchmark/fri:fri_benchmark -- -k 18 -k 19 -k 20 -k 21 -k 22 --batch_size 100 --input_num 4 --round_num 4 --log_blowup 2 --vendor plonky3 --check_results
 ```
 
 ## On Intel i9-13900K

diff --git a/benchmark/msm/README.md b/benchmark/msm/README.md
@@ -20,7 +20,7 @@ CPU Caches:
 ### Uniform points
 
 ```shell
-bazel run -c opt --//:has_openmp --//:has_rtti --//:has_matplotlib //benchmark/msm:msm_benchmark -- -k 16 -k 17 -k 18 -k 19 -k 20 -k 21 -k 22 -k 23 --vendor arkworks --vendor bellman --vendor halo2 --check_results
+bazel run --config opt --//:has_matplotlib //benchmark/msm:msm_benchmark -- -k 16 -k 17 -k 18 -k 19 -k 20 -k 21 -k 22 -k 23 --vendor arkworks --vendor bellman --vendor halo2 --check_results
 ```
 
 #### On Intel i9-13900K
@@ -56,7 +56,7 @@ bazel run -c opt --//:has_openmp --//:has_rtti --//:has_matplotlib //benchmark/m
 ### Non-uniform points
 
 ```shell
-bazel run -c opt --//:has_openmp --//:has_rtti --//:has_matplotlib //benchmark/msm:msm_benchmark -- -k 16 -k 17 -k 18 -k 19 -k 20 -k 21 -k 22 -k 23 --vendor arkworks --vendor bellman --vendor halo2 --test_set non_uniform --check_results
+bazel run --config opt --//:has_matplotlib //benchmark/msm:msm_benchmark -- -k 16 -k 17 -k 18 -k 19 -k 20 -k 21 -k 22 -k 23 --vendor arkworks --vendor bellman --vendor halo2 --test_set non_uniform --check_results
 ```
 
 #### On Intel i9-13900K
@@ -94,7 +94,7 @@ bazel run -c opt --//:has_openmp --//:has_rtti --//:has_matplotlib //benchmark/m
 ### Uniform points
 
 ```shell
-bazel run -c opt --//:has_openmp --//:has_rtti --//:has_matplotlib --config cuda //benchmark/msm:msm_benchmark_gpu -- -k 16 -k 17 -k 18 -k 19 -k 20 -k 21 -k 22 -k 23 --test_set non_uniform --check_results
+bazel run --config opt --//:has_matplotlib --config cuda //benchmark/msm:msm_benchmark_gpu -- -k 16 -k 17 -k 18 -k 19 -k 20 -k 21 -k 22 -k 23 --check_results
 ```
 
 #### On RTX-4090
@@ -115,7 +115,7 @@ bazel run -c opt --//:has_openmp --//:has_rtti --//:has_matplotlib --config cuda
 ### Non-uniform points
 
 ```shell
-bazel run -c opt --//:has_openmp --//:has_rtti --//:has_matplotlib --config cuda //benchmark/msm:msm_benchmark_gpu -- -k 16 -k 17 -k 18 -k 19 -k 20 -k 21 -k 22 -k 23 --check_results
+bazel run --config opt --//:has_matplotlib --config cuda //benchmark/msm:msm_benchmark_gpu -- -k 16 -k 17 -k 18 -k 19 -k 20 -k 21 -k 22 -k 23 --test_set non_uniform --check_results
 ```
 
 #### On RTX-4090

diff --git a/benchmark/poseidon/README.md b/benchmark/poseidon/README.md
@@ -16,7 +16,7 @@ CPU Caches:
 ```
 
 ```shell
-bazel run -c opt --//:has_openmp --//:has_rtti --//:has_matplotlib //benchmark/poseidon:poseidon_benchmark -- --check_results
+bazel run --config opt --//:has_matplotlib //benchmark/poseidon:poseidon_benchmark -- --check_results
 ```
 
 ## On Intel i9-13900K

diff --git a/benchmark/poseidon2/README.md b/benchmark/poseidon2/README.md
@@ -20,7 +20,7 @@ Note that Poseidon2 runs 10000x per test due to some time results being too smal
 ## BN254
 
 ```shell
-bazel run -c opt --//:has_openmp --//:has_rtti --//:has_matplotlib //benchmark/poseidon2:poseidon2_benchmark -- -p bn254_fr --vendor horizen --vendor plonky3 --check_results
+bazel run --config opt --//:has_matplotlib //benchmark/poseidon2:poseidon2_benchmark -- -p bn254_fr --vendor horizen --vendor plonky3 --check_results
 ```
 
 ### On Intel i9-13900K
@@ -66,7 +66,7 @@ Note: Horizen and Plonky3 compute values with a different internal matrix, requi
 ### Horizen
 
 ```shell
-bazel run -c opt --//:has_openmp --//:has_rtti --//:has_matplotlib //benchmark/poseidon2:poseidon2_benchmark -- -p baby_bear --vendor horizen --check_results
+bazel run --config opt --//:has_matplotlib //benchmark/poseidon2:poseidon2_benchmark -- -p baby_bear --vendor horizen --check_results
 ```
 
 #### On Intel i9-13900K
@@ -108,7 +108,7 @@ bazel run -c opt --//:has_openmp --//:has_rtti --//:has_matplotlib //benchmark/p
 ### Plonky3
 
 ```shell
-bazel run -c opt --//:has_openmp --//:has_rtti --//:has_matplotlib //benchmark/poseidon2:poseidon2_benchmark -- -p baby_bear --vendor plonky3 --check_results
+bazel run --config opt --//:has_matplotlib //benchmark/poseidon2:poseidon2_benchmark -- -p baby_bear --vendor plonky3 --check_results
 ```
 
 #### On Intel i9-13900K

diff --git a/docker/Dockerfile.halo2.jammy b/docker/Dockerfile.halo2.jammy
@@ -4,8 +4,8 @@ LABEL maintainer="The Tachyon Authors <[email protected]>"
 COPY . /usr/src/tachyon
 WORKDIR /usr/src/tachyon
 
-RUN bazel build -c opt --config linux --//:has_openmp --//:c_shared_object //scripts/packages/debian/runtime:debian && \
-    bazel build -c opt --config linux --//:has_openmp --//:c_shared_object //scripts/packages/debian/dev:debian
+RUN bazel build --config opt --config linux --//:c_shared_object //scripts/packages/debian/runtime:debian && \
+    bazel build --config opt --config linux --//:c_shared_object //scripts/packages/debian/dev:debian
 
 FROM ubuntu:jammy AS tachyon-halo2
 LABEL maintainer="The Tachyon Authors <[email protected]>"