diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml index ae87e0f63..8a1d151e6 100644 --- a/.github/workflows/cmake.yml +++ b/.github/workflows/cmake.yml @@ -45,7 +45,9 @@ jobs: - name: Install prerequisite MacOS packages if: ${{ matrix.os == 'macos-latest' }} - run: brew install ninja boost eigen open-mpi bison ccache + run: | + brew install ninja boost eigen open-mpi bison ccache + echo "MPIEXEC=/opt/homebrew/bin/mpiexec" >> $GITHUB_ENV - name: Install prerequisites Ubuntu packages if: ${{ matrix.os == 'ubuntu-22.04' }} @@ -54,6 +56,7 @@ jobs: sudo apt-add-repository "deb https://apt.kitware.com/ubuntu/ $(lsb_release -cs) main" sudo apt-get update sudo apt-get -y install ninja-build g++-12 liblapack-dev libboost-dev libboost-serialization-dev libboost-random-dev libeigen3-dev openmpi-bin libopenmpi-dev libtbb-dev ccache flex bison cmake doxygen + echo "MPIEXEC=/usr/bin/mpiexec" >> $GITHUB_ENV - name: Create Build Environment # Some projects don't allow in-source building, so create a separate build directory @@ -110,10 +113,13 @@ jobs: working-directory: ${{github.workspace}}/build shell: bash run: | - cmake -S $GITHUB_WORKSPACE/doc/dox/dev/devsamp/main -B test_install_devsamp_main -DCMAKE_PREFIX_PATH=${{github.workspace}}/install || (cat test_install_devsamp_main/CMakeFiles/CMakeOutput.log && cat test_install_devsamp_main/CMakeFiles/CMakeError.log) - cmake --build test_install_devsamp_main + cmake -S $GITHUB_WORKSPACE/doc/dox/dev/devsamp/helloworld -B test_install_devsamp_helloworld -DCMAKE_PREFIX_PATH=${{github.workspace}}/install || (cat test_install_devsamp_helloworld/CMakeFiles/CMakeOutput.log && cat test_install_devsamp_helloworld/CMakeFiles/CMakeError.log) + cmake --build test_install_devsamp_helloworld + $MPIEXEC -n 2 test_install_devsamp_helloworld/helloworld-parsec + $MPIEXEC -n 2 test_install_devsamp_helloworld/helloworld-mad cmake -S $GITHUB_WORKSPACE/doc/dox/dev/devsamp/fibonacci -B test_install_devsamp_fibonacci 
-DCMAKE_PREFIX_PATH=${{github.workspace}}/install || (cat test_install_devsamp_fibonacci/CMakeFiles/CMakeOutput.log && cat test_install_devsamp_fibonacci/CMakeFiles/CMakeError.log) cmake --build test_install_devsamp_fibonacci + $MPIEXEC -n 2 test_install_devsamp_fibonacci/fibonacci-parsec cmake -E make_directory test_install_userexamples cat > test_install_userexamples/CMakeLists.txt < cmake -S . -B build && cmake --build build --target hw-parsec +> cmake -S . -B build && cmake --build build --target helloworld-parsec ``` +The complete example, including the CMake build harness using a slightly easier way to build the executable (using the `add_ttg_executable` CMake macro), can be found in [dox examples](https://github.com/TESSEorg/ttg/tree/master/doc/dox/dev/devsamp/helloworld). + ## "Hello, World!" Walkthrough Although it does not involve any useful flow of computation and/or data, the above "Hello, World!" TTG program introduces several key TTG concepts and illustrates what you need to do to write a complete TTG program. So let's walk through it. @@ -95,7 +97,7 @@ Every TTG program must: - make TTG executable and kickstart the execution by sending a control or data message to the TTG, - shut down the runtime -Let's go over each of these steps using the "Hello, World!" example. +Let's go over each of these steps using the "Hello, World!" example. The complete example, including the CMake build harness, can be found in [dox examples](https://github.com/TESSEorg/ttg/tree/master/doc/dox/dev/devsamp/helloworld). ### Select the TTG Backend @@ -138,12 +140,12 @@ To make a TTG create and connect one or more TTs. The simplest TTG consists of a The "Hello, World!" example contains a single TT that executes a single task (hence, task ID can be omitted, i.e., void) that does not take and produce any data. 
The easiest way to make such a TT is by wrapping a callable (e.g., a lambda) with `ttg::make_tt`: ```cpp - auto tt = ttg::make_tt([]() { std::cout << "Hello, World!"; }); + auto tt = ttg::make_tt([]() { std::cout << "Hello, World!\n"; }); ``` ## Execute TTG -To execute a TTG we must make it executable (this will declare the TTG complete). To execute the TTG its root TT must receive at least one message; since in this case the task does not receive either task ID or data the message is empty (i.e., void): +To execute a TTG we must make it executable (this will declare the TTG program complete, so no additional changes to the flowgraph are possible). To execute the TTG its root TT must receive at least one message; since in this case the task does not receive either task ID or data the message is empty (i.e., void): ```cpp ttg::make_graph_executable(tt); @@ -152,7 +154,7 @@ To execute a TTG we must make it executable (this will declare the TTG complete) tt->invoke(); ``` -Note that we must ensure that only one such message must be generated. Since TTG execution uses the Single Program Multiple Data (SPMD) model, +`ttg::execute()` must occur before, not after, sending any messages. Note also that we must ensure that only one such message is generated. Since TTG execution uses the Single Program Multiple Data (SPMD) model, when launching the TTG program as multiple processes only the first process (rank) gets to send the message. ## Finalize TTG @@ -243,6 +245,7 @@ $F_{n-1},F_{n-2} \to F_{n}$). To illustrate the real power of TTG let's tweak the problem slightly: instead of computing first $N$ Fibonacci numbers let's find the largest Fibonacci number smaller than some $N$. The key difference in the latter case is that, unlike the former, the number of tasks is NOT known a priori; furthermore, to make a decision whether we need to compute next Fibonacci number we must examine the value returned by the previous task. 
This is an example of data-dependent tasking, where the decision which (if any) task to execute next depends on the values produced by previous tasks. The ability to compose regular as well as data-dependent task graphs is a distinguishing strength of TTG. To make things even more interesting, we will demonstrate how to implement such program both for execution on CPUs as well as on accelerators (GPUs). +The complete examples, including the CMake build harness, can be found in [dox examples](https://github.com/TESSEorg/ttg/tree/master/doc/dox/dev/devsamp/fibonacci). ### The CPU Version @@ -300,12 +303,11 @@ int main(int argc, char* argv[]) { auto fib = make_ttg_fib_lt(N); ttg::make_graph_executable(fib.get()); + ttg::execute(); if (ttg::default_execution_context().rank() == 0) fib->template in<0>()->send(1, Fn{});; - ttg::execute(); ttg::fence(); - ttg::finalize(); return 0; } @@ -394,6 +396,22 @@ auto make_ttg_fib_lt(const int64_t F_n_max = 1000) { ops.emplace_back(std::move(print)); return make_ttg(std::move(ops), ins, std::make_tuple(), "Fib_n < N"); } + +int main(int argc, char* argv[]) { + ttg::initialize(argc, argv, -1); + int64_t N = 1000; + if (argc > 1) N = std::atol(argv[1]); + + auto fib = make_ttg_fib_lt(N); + ttg::make_graph_executable(fib.get()); + ttg::execute(); + if (ttg::default_execution_context().rank() == 0) + fib->template in<0>()->send(1, Fn{});; + + ttg::fence(); + ttg::finalize(); + return 0; +} ``` Although the structure of the device-capable program is nearly identical to the CPU version, there are important differences: @@ -450,8 +468,6 @@ Here's the CUDA version of the device kernel and its host-side wrapper; ROCm and `cu_next_value` is the device kernel that evaluates $F_{n+1}$ from $F_{n}$ and $F_{n-1}$. `next_value` is a host function that launches `cu_next_value`; this is the function called in the `fib` task. 
-The complete example, including the CMake build harness, can be found in [dox examples](https://github.com/TESSEorg/ttg/tree/master/doc/dox/dev/devsamp/fibonacci). - ## Debugging TTG Programs ### TTG Visualization diff --git a/doc/dox/dev/devsamp/fibonacci/CMakeLists.txt b/doc/dox/dev/devsamp/fibonacci/CMakeLists.txt index e5058cb43..7c57bde7d 100644 --- a/doc/dox/dev/devsamp/fibonacci/CMakeLists.txt +++ b/doc/dox/dev/devsamp/fibonacci/CMakeLists.txt @@ -2,6 +2,11 @@ cmake_minimum_required(VERSION 3.14) project(ttg-devsample-fibonacci) find_package(ttg REQUIRED) +if (NOT TARGET ttg-parsec) # else build from source + include(FetchContent) + FetchContent_Declare(ttg GIT_REPOSITORY https://github.com/TESSEorg/ttg.git) + FetchContent_MakeAvailable( ttg ) +endif() add_ttg_executable(fibonacci fibonacci.cc NOT_EXCLUDE_FROM_ALL) # Fib device test @@ -11,4 +16,4 @@ if (TTG_HAVE_CUDA) fibonacci_cuda_kernel.h fibonacci_cuda_kernel.cu LINK_LIBRARIES std::coroutine RUNTIMES "parsec" NOT_EXCLUDE_FROM_ALL) -endif() \ No newline at end of file +endif() diff --git a/doc/dox/dev/devsamp/fibonacci/README.md b/doc/dox/dev/devsamp/fibonacci/README.md new file mode 100644 index 000000000..b2812041c --- /dev/null +++ b/doc/dox/dev/devsamp/fibonacci/README.md @@ -0,0 +1,17 @@ +# Largest Fibonacci number + +This directory contains TTG programs computing the largest Fibonacci number smaller than $N$: + +- CPU version: `fibonacci.cc` +- Device version: `fibonacci_device.cc` + - CUDA kernel: `fibonacci_cuda_kernel.{cu,h}` + +## Build + +After TTG has been installed to `/path/to/ttg`, do this: + +- configure: `cmake -S . 
-B build -DCMAKE_PREFIX_PATH="/path/to/ttg"` +- build: + - CPU version: `cmake --build build --target fibonacci-parsec` + - CUDA version (TTG must have been configured with CUDA support): `cmake --build build --target fibonacci_cuda-parsec` +- run: `./build/fibonacci-parsec N` or `./build/fibonacci_cuda-parsec N` diff --git a/doc/dox/dev/devsamp/fibonacci/fibonacci.cc b/doc/dox/dev/devsamp/fibonacci/fibonacci.cc index d2d829c45..3de431979 100644 --- a/doc/dox/dev/devsamp/fibonacci/fibonacci.cc +++ b/doc/dox/dev/devsamp/fibonacci/fibonacci.cc @@ -47,12 +47,16 @@ int main(int argc, char* argv[]) { ttg::initialize(argc, argv, -1); int64_t N = (argc > 1) ? std::atol(argv[1]) : 1000; + // make TTG auto fib = make_ttg_fib_lt(N); + // program complete, declare it executable ttg::make_graph_executable(fib.get()); + // start execution + ttg::execute(); + // start the computation by sending the first message if (ttg::default_execution_context().rank() == 0) fib->template in<0>()->send(1, Fn{});; - - ttg::execute(); + // wait for the computation to finish ttg::fence(); ttg::finalize(); diff --git a/doc/dox/dev/devsamp/fibonacci/fibonacci_device.cc b/doc/dox/dev/devsamp/fibonacci/fibonacci_device.cc index a1603cb58..99dbc37ca 100644 --- a/doc/dox/dev/devsamp/fibonacci/fibonacci_device.cc +++ b/doc/dox/dev/devsamp/fibonacci/fibonacci_device.cc @@ -74,13 +74,17 @@ int main(int argc, char* argv[]) { ttg::trace_on(); int64_t N = 1000; if (argc > 1) N = std::atol(argv[1]); - auto fib = make_ttg_fib_lt(N); // computes largest F_n < N + // make TTG + auto fib = make_ttg_fib_lt(N); // computes largest F_n < N + // program complete, declare it executable ttg::make_graph_executable(fib.get()); + // start execution + ttg::execute(ttg::ttg_default_execution_context()); + // start the computation by sending the first message if (ttg::default_execution_context().rank() == 0) fib->template in<0>()->send(1, Fn{});; - - ttg::execute(ttg::ttg_default_execution_context()); + // wait for the computation to finish 
ttg::fence(ttg::ttg_default_execution_context()); ttg::finalize(); diff --git a/doc/dox/dev/devsamp/helloworld/CMakeLists.txt b/doc/dox/dev/devsamp/helloworld/CMakeLists.txt new file mode 100644 index 000000000..c7fddf47f --- /dev/null +++ b/doc/dox/dev/devsamp/helloworld/CMakeLists.txt @@ -0,0 +1,11 @@ +cmake_minimum_required(VERSION 3.14) +project(ttg-devsample-helloworld) + +find_package(ttg REQUIRED) +if (NOT TARGET ttg-parsec) # else build from source + include(FetchContent) + FetchContent_Declare(ttg GIT_REPOSITORY https://github.com/TESSEorg/ttg.git) + FetchContent_MakeAvailable( ttg ) +endif() + +add_ttg_executable(helloworld helloworld.cpp NOT_EXCLUDE_FROM_ALL) diff --git a/doc/dox/dev/devsamp/helloworld/README.md b/doc/dox/dev/devsamp/helloworld/README.md new file mode 100644 index 000000000..4513cd489 --- /dev/null +++ b/doc/dox/dev/devsamp/helloworld/README.md @@ -0,0 +1,11 @@ +# TTG "Hello World" + +This directory contains the TTG "Hello World" program + +## Build + +After TTG has been installed to `/path/to/ttg`, do this: + +- configure: `cmake -S . 
-B build -DCMAKE_PREFIX_PATH="/path/to/ttg"` +- build: `cmake --build build` +- run: `./build/helloworld-parsec` or `./build/helloworld-mad` diff --git a/doc/dox/dev/devsamp/helloworld/helloworld.cpp b/doc/dox/dev/devsamp/helloworld/helloworld.cpp new file mode 100644 index 000000000..b59d939b4 --- /dev/null +++ b/doc/dox/dev/devsamp/helloworld/helloworld.cpp @@ -0,0 +1,17 @@ +#include <ttg.h> + +using namespace ttg; + + int main(int argc, char *argv[]) { + ttg::initialize(argc, argv); + + auto tt = ttg::make_tt([]() { std::cout << "Hello, World!\n"; }); + + ttg::make_graph_executable(tt); + ttg::execute(); + if (ttg::get_default_world().rank() == 0) tt->invoke(); + ttg::fence(); + + ttg::finalize(); + return 0; +} diff --git a/doc/dox/dev/devsamp/main/CMakeLists.txt b/doc/dox/dev/devsamp/main/CMakeLists.txt deleted file mode 100644 index 5a127cd97..000000000 --- a/doc/dox/dev/devsamp/main/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -cmake_minimum_required(VERSION 3.14) -project(ttg-devsample-main) - -find_package(ttg REQUIRED) - -add_ttg_executable(test test.cpp NOT_EXCLUDE_FROM_ALL) diff --git a/doc/dox/dev/devsamp/main/test.cpp b/doc/dox/dev/devsamp/main/test.cpp deleted file mode 100644 index a0cc7b73c..000000000 --- a/doc/dox/dev/devsamp/main/test.cpp +++ /dev/null @@ -1,8 +0,0 @@ -#include <ttg.h> - -using namespace ttg; - -int main(int argc, char* argv[]) { - initialize(argc, argv); - ttg_finalize(); -} diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt index 5c3d4012c..e1fb7d685 100644 --- a/tests/unit/CMakeLists.txt +++ b/tests/unit/CMakeLists.txt @@ -32,14 +32,6 @@ add_ttg_executable(serialization serialization.cc unit_main.cpp add_ttg_executable(serialization_boost serialization_boost.cc LINK_LIBRARIES ttg-serialization-boost RUNTIMES "parsec") -# Fib device test -if (TTG_HAVE_CUDA) - add_ttg_executable(fibonacci_device fibonacci_device.cc - fibonacci_cuda_kernel.h - fibonacci_cuda_kernel.cu - LINK_LIBRARIES std::coroutine RUNTIMES "parsec") -endif() - 
# TODO: convert into unit test #if (TARGET MADworld) #add_executable(splitmd_serialization splitmd_serialization.cc unit_main.cpp) diff --git a/tests/unit/fibonacci_cuda_kernel.cu b/tests/unit/fibonacci_cuda_kernel.cu deleted file mode 100644 index 6fa316468..000000000 --- a/tests/unit/fibonacci_cuda_kernel.cu +++ /dev/null @@ -1,15 +0,0 @@ -#include "fibonacci_cuda_kernel.h" - -#ifdef TTG_HAVE_CUDA - -__global__ void cu_next_value(int64_t* fn_and_fnm1) { - int64_t fnp1 = fn_and_fnm1[0] + fn_and_fnm1[1]; - fn_and_fnm1[1] = fn_and_fnm1[0]; - fn_and_fnm1[0] = fnp1; -} - -void next_value(int64_t* fn_and_fnm1) { - cu_next_value<<<1, 1>>>(fn_and_fnm1); -} - -#endif // TTG_HAVE_CUDA diff --git a/tests/unit/fibonacci_cuda_kernel.h b/tests/unit/fibonacci_cuda_kernel.h deleted file mode 100644 index a096ec3f1..000000000 --- a/tests/unit/fibonacci_cuda_kernel.h +++ /dev/null @@ -1,4 +0,0 @@ -#include "ttg/config.h" -#include - -void next_value(int64_t* fn_and_fnm1); \ No newline at end of file diff --git a/tests/unit/fibonacci_device.cc b/tests/unit/fibonacci_device.cc deleted file mode 100644 index a1603cb58..000000000 --- a/tests/unit/fibonacci_device.cc +++ /dev/null @@ -1,88 +0,0 @@ -#include - -#if defined(TTG_HAVE_CUDA) -#define ES ttg::ExecutionSpace::CUDA -#include "cuda_runtime.h" -#include "fibonacci_cuda_kernel.h" -#else -#error " CUDA is required to build this test!" -#endif - -#include "ttg/serialization.h" - -const int64_t F_n_max = 1000; -/// N.B. 
contains values of F_n and F_{n-1} -struct Fn : public ttg::TTValue { - std::unique_ptr F; // F[0] = F_n, F[1] = F_{n-1} - ttg::Buffer b; - - Fn() : F(std::make_unique(2)), b(F.get(), 2) { F[0] = 1; F[1] = 0; } - - Fn(const Fn&) = delete; - Fn(Fn&& other) = default; - Fn& operator=(const Fn& other) = delete; - Fn& operator=(Fn&& other) = default; - - template - void serialize(Archive& ar) { - ttg::ttg_abort(); - } - template - void serialize(Archive& ar, const unsigned int) { - ttg::ttg_abort(); - } -}; - -auto make_ttg_fib_lt(const int64_t F_n_max = 1000) { - ttg::Edge f2f; - ttg::Edge f2p; - - auto fib = ttg::make_tt( - [=](int64_t n, Fn&& f_n) -> ttg::device::Task { - assert(n > 0); - ttg::trace("in fib: n=", n, " F_n=", f_n.F[0]); - - co_await ttg::device::select(f_n.b); - - next_value(f_n.b.current_device_ptr()); - - // wait for the task to complete and the values to be brought back to the host - co_await ttg::device::wait(f_n.b); - - if (f_n.F[0] < F_n_max) { - co_await ttg::device::forward(ttg::device::send<0>(n + 1, std::move(f_n))); - } else { - co_await ttg::device::forward(ttg::device::sendv<1>(std::move(f_n))); - } - }, - ttg::edges(f2f), ttg::edges(f2f, f2p), "fib"); - auto print = ttg::make_tt( - [=](Fn&& f_n) { - std::cout << "The largest Fibonacci number smaller than " << F_n_max << " is " << f_n.F[1] << std::endl; - }, - ttg::edges(f2p), ttg::edges(), "print"); - - auto ins = std::make_tuple(fib->template in<0>()); - std::vector> ops; - ops.emplace_back(std::move(fib)); - ops.emplace_back(std::move(print)); - return make_ttg(std::move(ops), ins, std::make_tuple(), "Fib_n < N"); -} - -int main(int argc, char* argv[]) { - ttg::initialize(argc, argv, -1); - ttg::trace_on(); - int64_t N = 1000; - if (argc > 1) N = std::atol(argv[1]); - auto fib = make_ttg_fib_lt(N); // computes largest F_n < N - - ttg::make_graph_executable(fib.get()); - if (ttg::default_execution_context().rank() == 0) - fib->template in<0>()->send(1, Fn{});; - - 
ttg::execute(ttg::ttg_default_execution_context()); - ttg::fence(ttg::ttg_default_execution_context()); - - ttg::finalize(); - return 0; -}