From 5e824fc657c4f3cdd8f10feb80b0a4a5e08c0c9b Mon Sep 17 00:00:00 2001
From: Nicholas Sielicki
Date: Mon, 9 Dec 2024 14:14:03 -0800
Subject: [PATCH] feat: add nix build definitions

Signed-off-by: Nicholas Sielicki
---
 .gitignore                                    |   5 +
 flake.lock                                    | 283 ++++++++++++++++++
 flake.nix                                     | 140 +++++++++
 nix/checks.nix                                |  26 ++
 .../0001-add-latest-nccl.nix                  |  13 +
 .../0002-use-latest-nccl.nix                  |   1 +
 .../0003-nccl-tests-use-mpi.nix               |   7 +
 .../0004-add-ncclAws.nix                      |  29 ++
 .../0005-add-nccl-tests-aws.nix               |   8 +
 nix/overlays/libfabric/default.nix            |  58 ++++
 nix/pkgs/aws-ofi-nccl/cleanSource.nix         |  76 +++++
 nix/pkgs/aws-ofi-nccl/default.nix             | 173 +++++++++++
 nix/pkgs/ncclWithExtNet.nix                   |  27 ++
 nix/shell.nix                                 | 203 +++++++++++++
 nix/ubuntuTestRunners.nix                     |  45 +++
 15 files changed, 1094 insertions(+)
 create mode 100644 flake.lock
 create mode 100644 flake.nix
 create mode 100644 nix/checks.nix
 create mode 100644 nix/cudaPackagesExtensions/0001-add-latest-nccl.nix
 create mode 100644 nix/cudaPackagesExtensions/0002-use-latest-nccl.nix
 create mode 100644 nix/cudaPackagesExtensions/0003-nccl-tests-use-mpi.nix
 create mode 100644 nix/cudaPackagesExtensions/0004-add-ncclAws.nix
 create mode 100644 nix/cudaPackagesExtensions/0005-add-nccl-tests-aws.nix
 create mode 100644 nix/overlays/libfabric/default.nix
 create mode 100644 nix/pkgs/aws-ofi-nccl/cleanSource.nix
 create mode 100644 nix/pkgs/aws-ofi-nccl/default.nix
 create mode 100644 nix/pkgs/ncclWithExtNet.nix
 create mode 100644 nix/shell.nix
 create mode 100644 nix/ubuntuTestRunners.nix

diff --git a/.gitignore b/.gitignore
index caeaee5b7..5ae053df3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -77,3 +77,8 @@ m4/lt~obsolete.m4
 .idea/
 .devenv/
 .direnv
+
+result-bin
+result
+.ctags.d
+.tags
diff --git a/flake.lock b/flake.lock
new file mode 100644
index 000000000..1f6b7ae3d
--- /dev/null
+++ b/flake.lock
@@ -0,0 +1,283 @@
+{
+  "nodes": {
+    "cuda-packages": {
+      "inputs": {
+        "flake-parts": [
+          "flake-parts"
+        ],
+        "git-hooks-nix": [
+          "git-hooks"
+        ],
+        "nixpkgs": [
+          "nixpkgs"
+        ],
+        "treefmt-nix": "treefmt-nix"
+      },
+      "locked": {
+        "lastModified": 1733611080,
+        "narHash": "sha256-NoBzdEc79euz3iaWpJD7v0GUEpOl49i6hryZYYV8Pj8=",
+        "owner": "ConnorBaker",
+        "repo": "cuda-packages",
+        "rev": "0b1f47e9877892cda177f86de54210ba592714ca",
+        "type": "github"
+      },
+      "original": {
+        "owner": "ConnorBaker",
+        "repo": "cuda-packages",
+        "type": "github"
+      }
+    },
+    "flake-compat": {
+      "flake": false,
+      "locked": {
+        "lastModified": 1696426674,
+        "narHash": "sha256-kvjfFW7WAETZlt09AgDn1MrtKzP7t90Vf7vypd3OL1U=",
+        "owner": "edolstra",
+        "repo": "flake-compat",
+        "rev": "0f9255e01c2351cc7d116c072cb317785dd33b33",
+        "type": "github"
+      },
+      "original": {
+        "owner": "edolstra",
+        "repo": "flake-compat",
+        "type": "github"
+      }
+    },
+    "flake-parts": {
+      "inputs": {
+        "nixpkgs-lib": "nixpkgs-lib"
+      },
+      "locked": {
+        "lastModified": 1733312601,
+        "narHash": "sha256-4pDvzqnegAfRkPwO3wmwBhVi/Sye1mzps0zHWYnP88c=",
+        "rev": "205b12d8b7cd4802fbcb8e8ef6a0f1408781a4f9",
+        "revCount": 350,
+        "type": "tarball",
+        "url": "https://api.flakehub.com/f/pinned/hercules-ci/flake-parts/0.1.350%2Brev-205b12d8b7cd4802fbcb8e8ef6a0f1408781a4f9/019392c0-0749-7c85-a1dd-d9c67cfdb738/source.tar.gz"
+      },
+      "original": {
+        "type": "tarball",
+        "url": "https://flakehub.com/f/hercules-ci/flake-parts/0.1.350.tar.gz"
+      }
+    },
+    "flake-utils": {
+      "inputs": {
+        "systems": "systems"
+      },
+      "locked": {
+        "lastModified": 1731533236,
+        "narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=",
+        "owner": "numtide",
+        "repo": "flake-utils",
+        "rev": "11707dc2f618dd54ca8739b309ec4fc024de578b",
+        "type": "github"
+      },
+      "original": {
+        "owner": "numtide",
+        "repo": "flake-utils",
+        "type": "github"
+      }
+    },
+    "git-hooks": {
+      "inputs": {
+        "flake-compat": "flake-compat",
+        "gitignore": "gitignore",
+        "nixpkgs": "nixpkgs",
+        "nixpkgs-stable": "nixpkgs-stable"
+      },
+      "locked": {
+        "lastModified": 1733665616,
+        "narHash": "sha256-+XTFXYlFJBxohhMGLDpYdEnhUNdxN8dyTA8WAd+lh2A=",
+        "rev": "d8c02f0ffef0ef39f6063731fc539d8c71eb463a",
+        "revCount": 932,
+        "type": "tarball",
+        "url": "https://api.flakehub.com/f/pinned/cachix/git-hooks.nix/0.1.932%2Brev-d8c02f0ffef0ef39f6063731fc539d8c71eb463a/0193a757-e6ff-7101-a73e-4c0739bea407/source.tar.gz"
+      },
+      "original": {
+        "type": "tarball",
+        "url": "https://flakehub.com/f/cachix/git-hooks.nix/0.1.932.tar.gz"
+      }
+    },
+    "gitignore": {
+      "inputs": {
+        "nixpkgs": [
+          "git-hooks",
+          "nixpkgs"
+        ]
+      },
+      "locked": {
+        "lastModified": 1709087332,
+        "narHash": "sha256-HG2cCnktfHsKV0s4XW83gU3F57gaTljL9KNSuG6bnQs=",
+        "owner": "hercules-ci",
+        "repo": "gitignore.nix",
+        "rev": "637db329424fd7e46cf4185293b9cc8c88c95394",
+        "type": "github"
+      },
+      "original": {
+        "owner": "hercules-ci",
+        "repo": "gitignore.nix",
+        "type": "github"
+      }
+    },
+    "lib-aggregate": {
+      "inputs": {
+        "flake-utils": "flake-utils",
+        "nixpkgs-lib": "nixpkgs-lib_2"
+      },
+      "locked": {
+        "lastModified": 1733660018,
+        "narHash": "sha256-DHW2Hzyo8W6wVPGFaYLM9mKMH/qAtHJSUZ4ti7LHMCY=",
+        "owner": "nix-community",
+        "repo": "lib-aggregate",
+        "rev": "f710791be27b31ea1ee9c40a94d08bcbba99b3c0",
+        "type": "github"
+      },
+      "original": {
+        "owner": "nix-community",
+        "repo": "lib-aggregate",
+        "type": "github"
+      }
+    },
+    "nix-github-actions": {
+      "inputs": {
+        "nixpkgs": [
+          "nixpkgs"
+        ]
+      },
+      "locked": {
+        "lastModified": 1731952509,
+        "narHash": "sha256-p4gB3Rhw8R6Ak4eMl8pqjCPOLCZRqaehZxdZ/mbFClM=",
+        "owner": "nix-community",
+        "repo": "nix-github-actions",
+        "rev": "7b5f051df789b6b20d259924d349a9ba3319b226",
+        "type": "github"
+      },
+      "original": {
+        "owner": "nix-community",
+        "repo": "nix-github-actions",
+        "type": "github"
+      }
+    },
+    "nixpkgs": {
+      "locked": {
+        "lastModified": 1730768919,
+        "narHash": "sha256-8AKquNnnSaJRXZxc5YmF/WfmxiHX6MMZZasRP6RRQkE=",
+        "owner": "NixOS",
+        "repo": "nixpkgs",
+        "rev": "a04d33c0c3f1a59a2c1cb0c6e34cd24500e5a1dc",
+        "type": "github"
+      },
+      "original": {
+        "owner": "NixOS",
+        "ref": "nixpkgs-unstable",
+        "repo": "nixpkgs",
+        "type": "github"
+      }
+    },
+    "nixpkgs-lib": {
+      "locked": {
+        "lastModified": 1733096140,
+        "narHash": "sha256-1qRH7uAUsyQI7R1Uwl4T+XvdNv778H0Nb5njNrqvylY=",
+        "type": "tarball",
+        "url": "https://github.com/NixOS/nixpkgs/archive/5487e69da40cbd611ab2cadee0b4637225f7cfae.tar.gz"
+      },
+      "original": {
+        "type": "tarball",
+        "url": "https://github.com/NixOS/nixpkgs/archive/5487e69da40cbd611ab2cadee0b4637225f7cfae.tar.gz"
+      }
+    },
+    "nixpkgs-lib_2": {
+      "locked": {
+        "lastModified": 1733620091,
+        "narHash": "sha256-5WoMeCkaXqTZwwCNLRzyLxEJn8ISwjx4cNqLgqKwg9s=",
+        "owner": "nix-community",
+        "repo": "nixpkgs.lib",
+        "rev": "f4dc9a6c02e5e14d91d158522f69f6ab4194eb5b",
+        "type": "github"
+      },
+      "original": {
+        "owner": "nix-community",
+        "repo": "nixpkgs.lib",
+        "type": "github"
+      }
+    },
+    "nixpkgs-stable": {
+      "locked": {
+        "lastModified": 1730741070,
+        "narHash": "sha256-edm8WG19kWozJ/GqyYx2VjW99EdhjKwbY3ZwdlPAAlo=",
+        "owner": "NixOS",
+        "repo": "nixpkgs",
+        "rev": "d063c1dd113c91ab27959ba540c0d9753409edf3",
+        "type": "github"
+      },
+      "original": {
+        "owner": "NixOS",
+        "ref": "nixos-24.05",
+        "repo": "nixpkgs",
+        "type": "github"
+      }
+    },
+    "nixpkgs_2": {
+      "locked": {
+        "lastModified": 1733064805,
+        "narHash": "sha256-7NbtSLfZO0q7MXPl5hzA0sbVJt6pWxxtGWbaVUDDmjs=",
+        "rev": "31d66ae40417bb13765b0ad75dd200400e98de84",
+        "revCount": 715040,
+        "type": "tarball",
+        "url": "https://api.flakehub.com/f/pinned/DeterminateSystems/nixpkgs-weekly/0.1.715040%2Brev-31d66ae40417bb13765b0ad75dd200400e98de84/01938b06-3358-73df-a7e1-598cb884b5d0/source.tar.gz"
+      },
+      "original": {
+        "type": "tarball",
+        "url": "https://flakehub.com/f/DeterminateSystems/nixpkgs-weekly/0.1.715040.tar.gz"
+      }
+    },
+    "root": {
+      "inputs": {
+        "cuda-packages": "cuda-packages",
+        "flake-parts": "flake-parts",
+        "git-hooks": "git-hooks",
+        "lib-aggregate": "lib-aggregate",
+        "nix-github-actions": "nix-github-actions",
+        "nixpkgs": "nixpkgs_2"
+      }
+    },
+    "systems": {
+      "locked": {
+        "lastModified": 1681028828,
+        "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
+        "owner": "nix-systems",
+        "repo": "default",
+        "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
+        "type": "github"
+      },
+      "original": {
+        "owner": "nix-systems",
+        "repo": "default",
+        "type": "github"
+      }
+    },
+    "treefmt-nix": {
+      "inputs": {
+        "nixpkgs": [
+          "cuda-packages",
+          "nixpkgs"
+        ]
+      },
+      "locked": {
+        "lastModified": 1733440889,
+        "narHash": "sha256-qKL3vjO+IXFQ0nTinFDqNq/sbbnnS5bMI1y0xX215fU=",
+        "owner": "numtide",
+        "repo": "treefmt-nix",
+        "rev": "50862ba6a8a0255b87377b9d2d4565e96f29b410",
+        "type": "github"
+      },
+      "original": {
+        "owner": "numtide",
+        "repo": "treefmt-nix",
+        "type": "github"
+      }
+    }
+  },
+  "root": "root",
+  "version": 7
+}
diff --git a/flake.nix b/flake.nix
new file mode 100644
index 000000000..845511dd0
--- /dev/null
+++ b/flake.nix
@@ -0,0 +1,140 @@
+# Copyright (c) 2024, Amazon.com, Inc. or its affiliates. All rights reserved.
+#
+# See LICENSE for licensing information
+
+{
+  description = "aws-ofi-nccl development/build flake.";
+
+  outputs =
+    { self, flake-parts, ... }@inputs:
+    let
+      inherit (inputs.lib-aggregate) lib;
+      systems = [
+        "x86_64-linux"
+        "aarch64-linux"
+      ];
+    in
+    flake-parts.lib.mkFlake { inherit inputs; } (
+      { withSystem, flake-parts-lib, ... }:
+      {
+        inherit systems;
+        imports = [
+          inputs.git-hooks.flakeModule
+          flake-parts.flakeModules.easyOverlay
+        ];
+        flake = {
+          githubActionChecks = inputs.nix-github-actions.lib.mkGithubMatrix {
+            checks = self.outputs.packages.x86_64-linux;
+          };
+        };
+        debug = true;
+        perSystem =
+          {
+            system,
+            config,
+            final,
+            pkgs,
+            ...
+          }:
+          {
+            _module.args.pkgs = import inputs.nixpkgs {
+              inherit system;
+              overlays = [
+                (import ./nix/overlays/libfabric)
+                inputs.cuda-packages.overlays.default
+                inputs.self.overlays.default
+              ];
+              config = {
+                cudaSupport = true;
+                cudaForwardCompat = true;
+                cudaCapabilities = [
+                  "7.0"
+                  "7.5"
+                  "8.0"
+                  "8.6"
+                  "8.9"
+                  "9.0"
+                  "9.0a"
+                ];
+                allowBroken = true;
+                allowUnfree = true;
+              };
+            };
+            pre-commit.settings = import ./nix/checks.nix { inherit lib; };
+            devShells.default = import ./nix/shell.nix {
+              inherit
+                pkgs
+                config
+                system
+                inputs
+                self
+                ;
+            };
+            overlayAttrs = {
+              cudaPackagesExtensions = [
+                (import ./nix/cudaPackagesExtensions/0001-add-latest-nccl.nix { inherit (pkgs) fetchFromGitHub; })
+                (import ./nix/cudaPackagesExtensions/0002-use-latest-nccl.nix)
+                (import ./nix/cudaPackagesExtensions/0003-nccl-tests-use-mpi.nix { inherit config; })
+                (import ./nix/cudaPackagesExtensions/0004-add-ncclAws.nix {
+                  inherit lib config;
+                  inherit (pkgs) symlinkJoin patchelf;
+                })
+                (import ./nix/cudaPackagesExtensions/0005-add-nccl-tests-aws.nix {
+                  inherit (pkgs) replaceDependency;
+                })
+              ];
+
+              inherit (config.packages)
+                libfabric
+                openmpi
+                ;
+            };
+            packages = rec {
+              aws-ofi-nccl = (
+                pkgs.callPackage ./nix/pkgs/aws-ofi-nccl {
+                  inherit inputs self;
+                }
+              );
+              ubuntu-test-runners = pkgs.callPackage ./nix/ubuntuTestRunners.nix {
+                nccl-tests = pkgs.pkgsCuda.sm_90.cudaPackages.nccl-tests-aws;
+              };
+              default = aws-ofi-nccl;
+              inherit (pkgs)
+                libfabric
+                openmpi
+                ;
+            };
+          };
+      }
+    );
+
+  inputs = {
+    flake-parts.url = "https://flakehub.com/f/hercules-ci/flake-parts/0.1.350.tar.gz";
+    lib-aggregate.url = "github:nix-community/lib-aggregate";
+    nixpkgs.url = "https://flakehub.com/f/DeterminateSystems/nixpkgs-weekly/0.1.715040.tar.gz";
+    git-hooks.url = "https://flakehub.com/f/cachix/git-hooks.nix/0.1.932.tar.gz";
+    nix-github-actions.url = "github:nix-community/nix-github-actions";
+    nix-github-actions.inputs.nixpkgs.follows = "nixpkgs";
+    cuda-packages.url = "github:ConnorBaker/cuda-packages";
+    cuda-packages.inputs.flake-parts.follows = "flake-parts";
+    cuda-packages.inputs.nixpkgs.follows = "nixpkgs";
+    cuda-packages.inputs.git-hooks-nix.follows = "git-hooks";
+  };
+
+  nixConfig = {
+    allowUnfree = true; # NOTE(review): looks like a nixpkgs option rather than a nix.conf setting (the same keys are set on the nixpkgs import above) - confirm it is needed here
+    cudaSupport = true;
+    extra-substituters = [
+      "https://numtide.cachix.org"
+      "https://nix-community.cachix.org"
+      "https://devenv.cachix.org"
+      "https://cuda-maintainers.cachix.org"
+    ];
+    extra-trusted-public-keys = [
+      "numtide.cachix.org-1:2ps1kLBUWjxIneOy1Ik6cQjb41X0iXVXeHigGmycPPE="
+      "nix-community.cachix.org-1:mB9FSh9qf2dCimDSUo8Zy7bkq5CX+/rkCWyvRCYg3Fs="
+      "devenv.cachix.org-1:w1cLUi8dv3hnoSPGAuibQv+f9TZLr6cv/Hm9XgU50cw="
+      "cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E="
+    ];
+  };
+}
diff --git a/nix/checks.nix b/nix/checks.nix
new file mode 100644
index 000000000..b9556e947
--- /dev/null
+++ b/nix/checks.nix
@@ -0,0 +1,26 @@
+{ lib }:
+{
+  hooks = {
+    nixfmt-rfc-style.enable = true;
+    clang-format = {
+      enable = true;
+      types_or = lib.mkForce [
+        "c"
+        "c++"
+      ];
+    };
+    actionlint.enable = true;
+    check-added-large-files.enable = true;
+    check-xml.enable = true;
+    detect-aws-credentials.enable = true;
+    detect-private-keys.enable = true;
+    editorconfig-checker.enable = true;
+    mdl.enable = true;
+    shfmt.enable = true;
+    shellcheck.enable = true;
+    #check-merge-conflicts.enable = true;
+    no-commit-to-branch.enable = true;
+    forbid-new-submodules.enable = true;
+    convco.enable = true;
+  };
+}
diff --git a/nix/cudaPackagesExtensions/0001-add-latest-nccl.nix b/nix/cudaPackagesExtensions/0001-add-latest-nccl.nix
new file mode 100644
index 000000000..7dd2c3c03
--- /dev/null
+++ b/nix/cudaPackagesExtensions/0001-add-latest-nccl.nix
@@ -0,0 +1,13 @@
+{ fetchFromGitHub }:
+ffinal: pprev: {
+  nccl_latest = pprev.nccl.overrideAttrs (prevAttrs: {
+    src = fetchFromGitHub {
+      owner = "NVIDIA";
+      repo = "nccl";
+      rev = "v2.23.4-1";
+      hash = "sha256-DlMxlLO2F079fBkhORNPVN/ASYiVIRfLJw7bDoiClHw=";
+    };
+    name = "cuda${ffinal.cudaMajorMinorPatchVersion}-nccl-2.23.4-1";
+    version = "2.23.4-1";
+  });
+}
diff --git a/nix/cudaPackagesExtensions/0002-use-latest-nccl.nix b/nix/cudaPackagesExtensions/0002-use-latest-nccl.nix
new file mode 100644
index 000000000..076047126
--- /dev/null
+++ b/nix/cudaPackagesExtensions/0002-use-latest-nccl.nix
@@ -0,0 +1 @@
+ffinal: pprev: { nccl = pprev.nccl_latest; }
diff --git a/nix/cudaPackagesExtensions/0003-nccl-tests-use-mpi.nix b/nix/cudaPackagesExtensions/0003-nccl-tests-use-mpi.nix
new file mode 100644
index 000000000..961d17e5f
--- /dev/null
+++ b/nix/cudaPackagesExtensions/0003-nccl-tests-use-mpi.nix
@@ -0,0 +1,7 @@
+{ config }:
+ffinal: pprev: {
+  nccl-tests = pprev.nccl-tests.override {
+    mpiSupport = true;
+    mpi = config.packages.openmpi;
+  };
+}
diff --git a/nix/cudaPackagesExtensions/0004-add-ncclAws.nix b/nix/cudaPackagesExtensions/0004-add-ncclAws.nix
new file mode 100644
index 000000000..f2aec2b85
--- /dev/null
+++ b/nix/cudaPackagesExtensions/0004-add-ncclAws.nix
@@ -0,0 +1,29 @@
+{
+  lib,
+  config,
+  symlinkJoin,
+  patchelf,
+}:
+ffinal: pprev: {
+  ncclAws = symlinkJoin {
+    inherit (pprev.nccl)
+      name
+      ;
+    paths = [
+      (ffinal.backendStdenv.mkDerivation {
+        name = "${pprev.nccl.name}+ofi-nccl-aws";
+        src = pprev.nccl.out;
+        buildPhase = ''
+          cp -r . $out
+        '';
+        postFixup = ''
+          ${patchelf}/bin/patchelf --add-rpath ${
+            lib.makeLibraryPath [ (lib.getLib config.packages.default) ]
+          } $out/lib/libnccl.so
+        '';
+      })
+      (lib.getLib pprev.nccl)
+      (lib.getDev pprev.nccl)
+    ];
+  };
+}
diff --git a/nix/cudaPackagesExtensions/0005-add-nccl-tests-aws.nix b/nix/cudaPackagesExtensions/0005-add-nccl-tests-aws.nix
new file mode 100644
index 000000000..d49fd5714
--- /dev/null
+++ b/nix/cudaPackagesExtensions/0005-add-nccl-tests-aws.nix
@@ -0,0 +1,8 @@
+{ replaceDependency }:
+ffinal: pprev: {
+  nccl-tests-aws = replaceDependency {
+    drv = ffinal.nccl-tests;
+    oldDependency = ffinal.nccl;
+    newDependency = ffinal.ncclAws;
+  };
+}
diff --git a/nix/overlays/libfabric/default.nix b/nix/overlays/libfabric/default.nix
new file mode 100644
index 000000000..bb70ead7e
--- /dev/null
+++ b/nix/overlays/libfabric/default.nix
@@ -0,0 +1,58 @@
+final: prev: {
+  libfabric = (
+    let
+      joined = (
+        final.symlinkJoin {
+          name = "cuda-build-deps-joined";
+          paths = [
+            (final.lib.getLib final.cudaPackages.cuda_cudart)
+            (final.lib.getDev final.cudaPackages.cuda_cudart)
+            (final.lib.getLib final.cudaPackages.cuda_nvcc)
+            (final.lib.getDev final.cudaPackages.cuda_nvcc)
+            (final.lib.getLib final.cudaPackages.cuda_nvml_dev)
+            (final.lib.getDev final.cudaPackages.cuda_nvml_dev)
+          ];
+        }
+      );
+    in
+    (prev.libfabric.overrideAttrs (pprev: {
+      pname = "libfabric-aws";
+      src = final.fetchFromGitHub {
+        owner = "aws";
+        repo = "libfabric";
+        rev = "v1.22.0amzn4.0";
+        hash = "sha256-Y79fwGJQI+AHqWBmydILFGMLTfFdqC6gr59Xnb24Llc=";
+      };
+      patches = [
+        (final.fetchpatch {
+          url = "https://patch-diff.githubusercontent.com/raw/ofiwg/libfabric/pull/10365.patch";
+          hash = "sha256-dArUPaWQrb5OwTBNY0QCIizSB0aWaupcJaNyq7azU/8=";
+        })
+      ];
+      version = "1.22.0-4.0";
+      buildInputs = (pprev.buildInputs or [ ]) ++ [
+        final.rdma-core
+        final.cudaPackages.cuda_cudart
+        final.cudaPackages.cuda_nvcc
+        final.cudaPackages.cuda_nvml_dev
+      ];
+      configureFlags = (pprev.configureFlags or [ ]) ++ [
+        "--enable-efa=yes"
+        "--with-cuda=${joined}/"
+        "--enable-cuda-dlopen"
+      ];
+      nativeBuildInputs = (pprev.nativeBuildInputs or [ ]) ++ [
+        final.autoAddDriverRunpath
+        final.autoPatchelfHook
+      ];
+      appendRunpaths = final.lib.makeLibraryPath [
+        joined
+      ];
+    })).override
+      ({
+        enableOpx = false;
+        enablePsm2 = false;
+        stdenv = final.cudaPackages.backendStdenv;
+      })
+  );
+}
diff --git a/nix/pkgs/aws-ofi-nccl/cleanSource.nix b/nix/pkgs/aws-ofi-nccl/cleanSource.nix
new file mode 100644
index 000000000..9da6fe133
--- /dev/null
+++ b/nix/pkgs/aws-ofi-nccl/cleanSource.nix
@@ -0,0 +1,76 @@
+{ lib, self }:
+let
+  inherit (lib.fileset)
+    intersection
+    difference
+    unions
+    fileFilter
+    fromSource
+    toSource
+    gitTracked
+    traceVal
+    ;
+  inherit (builtins)
+    any
+    ;
+
+  dirs = {
+    third-party = ./../../../3rd-party;
+    docs = ./../../../doc;
+    headers = ./../../../include;
+    mfour = ./../../../m4;
+    nix = ./../../../nix;
+    tus = ./../../../src;
+    tests = ./../../../tests;
+    topologies = ./../../../topology;
+  };
+
+  sourceFilter = fileFilter (
+    file:
+    any file.hasExt [
+      "c"
+      "cc"
+      "cpp"
+      "h"
+      "hpp"
+      "hh"
+      "xml"
+    ]
+  );
+
+  buildFileFilter = fileFilter (
+    file:
+    any file.hasExt [
+      "in"
+      "m4"
+      "ac"
+      "am"
+    ]
+  );
+
+  cleanRepo = traceVal (gitTracked ../../../.);
+  cleaned = x: intersection x (gitTracked ../../../.);
+  sourceFiles = cleaned (sourceFilter ../../../.);
+  buildFiles = cleaned (buildFileFilter ../../../.);
+  thirdPartyFiles = cleaned ../../../.;
+  thirdPartyBuildFiles = unions [
+    thirdPartyFiles
+    buildFiles
+  ];
+  thirdPartySourceFiles = difference [ # NOTE(review): not reachable from the result below; lib.fileset.difference takes two filesets, not a list - confirm before using
+    thirdPartyFiles
+    thirdPartyBuildFiles
+  ];
+
+  projectSourceFiles = difference [ # NOTE(review): unused by the result below; same list-argument concern as above
+    sourceFiles
+    thirdPartySourceFiles
+  ];
+in
+lib.fileset.toSource {
+  root = ../../../.;
+  fileset = lib.fileset.unions [
+    buildFiles
+    sourceFiles
+  ];
+}
diff --git a/nix/pkgs/aws-ofi-nccl/default.nix b/nix/pkgs/aws-ofi-nccl/default.nix
new file mode 100644
index 000000000..1e4047e9f
--- /dev/null
+++ b/nix/pkgs/aws-ofi-nccl/default.nix
@@ -0,0 +1,173 @@
+{
+  lib,
+  inputs,
+  self,
+  fetchFromGitHub,
+  symlinkJoin,
+  releaseTools,
+  stdenv,
+  config,
+  libfabric,
+  hwloc,
+  perl,
+  libtool,
+  autoconf,
+  automake,
+  autoreconfHook,
+  lttng-ust,
+  valgrind,
+  mpi,
+  cudaPackages ? { },
+  autoAddDriverRunpath,
+  neuronSupport ? (!config.cudaSupport),
+  cudaSupport ? (config.cudaSupport && !neuronSupport),
+  enableTests ? cudaSupport,
+  enableTracePrints ? true,
+  enableLTTNGTracing ? false,
+  enablePickyCompiler ? true,
+  enableWerror ? true,
+  enableNVTXTracing ? false,
+  enableValgrind ? false,
+  enableAwsTuning ? true,
+}:
+
+assert neuronSupport != cudaSupport; # exactly one accelerator backend must be selected
+assert enableNVTXTracing -> cudaSupport; # NVTX tracing is only accepted together with CUDA
+let
+
+  effectiveStdenv = if cudaSupport then cudaPackages.backendStdenv else stdenv;
+
+  cudaBuildDepsJoined = symlinkJoin {
+    name = "cuda-build-deps-joined";
+    paths = lib.optionals (cudaSupport) (
+      [
+        (lib.getDev cudaPackages.cuda_nvcc)
+        cudaPackages.cuda_cudart.include
+      ]
+      ++ (
+        if effectiveStdenv.hostPlatform.isStatic then
+          [
+            (lib.getOutput "static" cudaPackages.cuda_cudart)
+          ]
+        else
+          [
+            (lib.getLib cudaPackages.cuda_cudart)
+          ]
+      )
+    );
+  };
+in
+effectiveStdenv.mkDerivation {
+  name = "aws-ofi-nccl";
+  pname = lib.concatStringsSep "" [
+    "lib"
+    (if neuronSupport then "nccom" else "nccl")
+    "-net-ofi"
+    (lib.optionalString enableAwsTuning "-aws")
+  ];
+  version = inputs.self.shortRev or inputs.self.dirtyShortRev;
+  src = import ./cleanSource.nix {
+    inherit lib;
+    inherit self;
+  };
+
+  nativeBuildInputs =
+    [ autoreconfHook ]
+    ++ lib.optionals cudaSupport [
+      autoAddDriverRunpath
+      cudaPackages.cuda_nvcc
+    ];
+
+  buildInputs =
+    [
+      libfabric
+      hwloc
+    ]
+    ++ lib.optionals cudaSupport [
+      cudaBuildDepsJoined
+    ]
+    ++ lib.optionals enableValgrind [
+      valgrind
+    ]
+    ++ lib.optionals enableTests [
+      mpi
+    ]
+    ++ lib.optionals enableLTTNGTracing [
+      lttng-ust
+    ];
+
+  configureFlags = [
+    # core deps
+    (lib.withFeatureAs true "libfabric" (lib.getDev libfabric))
+    (lib.withFeatureAs true "hwloc" (lib.getDev hwloc))
+    #(lib.withFeatureAs true "nccl-headers" (cudaPackages.nccl.dev))
+
+    # libs
+    (lib.withFeatureAs enableTests "mpi" (lib.getDev mpi))
+    (lib.enableFeature enableTests "tests")
+    (lib.withFeatureAs enableLTTNGTracing "lttng" (lib.getDev lttng-ust))
+    (lib.withFeatureAs enableValgrind "valgrind" (lib.getDev valgrind))
+
+    # accelerator support
+    (lib.enableFeature neuronSupport "neuron")
+    (lib.withFeatureAs cudaSupport "cuda" cudaBuildDepsJoined)
+    (lib.withFeatureAs (enableNVTXTracing && cudaSupport) "nvtx" (lib.getDev cudaPackages.cuda_nvtx))
+    (lib.enableFeature (!effectiveStdenv.hostPlatform.isStatic) "cudart-dynamic")
+
+    # build configuration
+    (lib.enableFeature enableAwsTuning "platform-aws")
+    (lib.enableFeature enablePickyCompiler "picky-compiler")
+    (lib.enableFeature enableWerror "werror")
+    (lib.enableFeature enableTracePrints "trace")
+  ];
+
+  meta = with lib; {
+    homepage = "https://github.com/aws/aws-ofi-nccl";
+    license = licenses.asl20;
+    broken = (cudaSupport && !config.cudaSupport);
+    maintainers = with maintainers; [ sielicki ];
+    platforms = [
+      "x86_64-linux"
+      "aarch64-linux"
+    ];
+  };
+
+  hardeningEnable = [
+    "format"
+    "fortify3"
+    "shadowstack"
+    "pacret"
+    "pic"
+    "pie"
+    "stackprotector"
+    "stackclashprotection"
+    "strictoverflow"
+    "trivialautovarinit"
+  ];
+  enableParallelBuilding = true;
+  separateDebugInfo = true;
+  strictDeps = true;
+
+  outputs = [
+    "dev"
+    "out"
+  ] ++ lib.optionals enableTests [ "bin" ];
+  postInstall = ''
+    find "$out" -name "*.la" -delete # drop libtool archives; succeeds even when none were installed
+    mkdir -p $dev/nix-support/generated-headers/include && cp include/config.h $dev/nix-support/generated-headers/include/
+    cp config.log $dev/nix-support/config.log
+  '';
+
+  doCheck = enableTests;
+  checkPhase = ''
+    set -euo pipefail
+    for test in $(find tests/unit/ -type f -executable -print | xargs) ; do
+      echo "======================================================================"
+      echo "Running $test"
+      if ! "./$test"; then echo "❌ Failed!"; exit 1; fi # testing inside `if` keeps `set -e` from aborting before the failure is reported
+      echo "✅ Passed"
+    done
+    echo "All unit tests passed successfully."
+    set +u
+  '';
+}
diff --git a/nix/pkgs/ncclWithExtNet.nix b/nix/pkgs/ncclWithExtNet.nix
new file mode 100644
index 000000000..3b2c84854
--- /dev/null
+++ b/nix/pkgs/ncclWithExtNet.nix
@@ -0,0 +1,27 @@
+{
+  lib,
+  stdenv,
+  patchelf,
+  symlinkJoin,
+  nccl,
+  plugin,
+}:
+symlinkJoin {
+  name = "${nccl.name}-${plugin.name}-joined";
+  paths = [
+    (stdenv.mkDerivation {
+      name = "${nccl.name}+net-${plugin.name}";
+      src = nccl.out;
+      buildPhase = ''
+        cp -r . $out
+      '';
+      postFixup = ''
+        ${patchelf}/bin/patchelf --add-rpath ${
+          lib.makeLibraryPath [ (lib.getLib plugin) ]
+        } $out/lib/libnccl.so
+      '';
+    })
+    (lib.getLib nccl)
+    (lib.getDev nccl)
+  ];
+}
diff --git a/nix/shell.nix b/nix/shell.nix
new file mode 100644
index 000000000..0df25bf30
--- /dev/null
+++ b/nix/shell.nix
@@ -0,0 +1,203 @@
+{
+  self,
+  config,
+  system,
+  inputs,
+  pkgs,
+}:
+let
+  source-dir = builtins.getEnv "PWD"; # NOTE(review): getEnv yields "" under pure evaluation, which would make the paths below start at "/" - confirm the shell is entered with --impure
+
+  clang-format-file = pkgs.writeTextFile {
+    name = "clang-format-config";
+    text = pkgs.lib.generators.toYAML { } {
+      AlignConsecutiveAssignments = false;
+      AlignConsecutiveBitFields = {
+        AcrossComments = true;
+        AcrossEmptyLines = true;
+        Enabled = true;
+      };
+      AlignConsecutiveDeclarations = false;
+      AlignConsecutiveMacros = {
+        AcrossComments = true;
+        AcrossEmptyLines = true;
+        Enabled = true;
+      };
+      AlignConsecutiveShortCaseStatements = {
+        AcrossComments = true;
+        AcrossEmptyLines = true;
+        AlignCaseColons = false;
+        Enabled = true;
+      };
+      AlignOperands = "Align";
+      AlignTrailingComments = {
+        Kind = "Always";
+        OverEmptyLines = 0;
+      };
+      AllowShortCompoundRequirementOnASingleLine = true;
+      KeepEmptyLines = {
+        AtEndOfFile = false;
+        AtStartOfBlock = false;
+        AtStartOfFile = false;
+      };
+      AllowAllArgumentsOnNextLine = false;
+      AllowShortFunctionsOnASingleLine = "None";
+      AllowShortIfStatementsOnASingleLine = false;
+      AllowShortLoopsOnASingleLine = false;
+      BasedOnStyle = "Google";
+      BinPackArguments = false;
+      BinPackParameters = false;
+      BracedInitializerIndentWidth = 8;
+      BreakBeforeBraces = "Linux";
+      ColumnLimit = 130;
+      ContinuationIndentWidth = 8;
+      IncludeBlocks = "Regroup";
+      IncludeCategories = [
+        {
+          Priority = -40;
+          Regex = "^([\"]config[.]h[\"])$";
+          SortPriority = -40;
+        }
+        {
+          Priority = 5;
+          Regex = "^[<](rdma/|uthash/|nccl/|mpi|hwloc/|lttng/|valgrind/|cuda).*[.]h[>]$";
+          SortPriority = 5;
+        }
+        {
+          Priority = 10;
+          Regex = "^([\"]nccl.*[.]h[\"])$";
+          SortPriority = 10;
+        }
+      ];
+      IndentCaseLabels = false;
+      IndentWidth = 8;
+      InsertBraces = true;
+      InsertNewlineAtEOF = true;
+      LineEnding = "LF";
+      MaxEmptyLinesToKeep = 2;
+      PointerAlignment = "Right";
+      ReferenceAlignment = "Right";
+      ReflowComments = true;
+      RemoveParentheses = "MultipleParentheses";
+      SortIncludes = "CaseSensitive";
+      SpacesBeforeTrailingComments = 2;
+      TabWidth = 8;
+      BreakBinaryOperations = "RespectPrecedence";
+      AllowShortCaseExpressionOnASingleLine = true;
+      UseTab = "ForContinuationAndIndentation";
+    };
+  };
+
+  editorconfig-file = pkgs.writeTextFile {
+    name = "editorconfig-config";
+    text = pkgs.lib.generators.toINIWithGlobalSection { } {
+      globalSection = {
+        root = true;
+      };
+      sections = {
+        "*" = {
+          trim_trailing_whitespace = true;
+          charset = "utf-8";
+          end_of_line = "lf";
+          insert_final_newline = true;
+        };
+        "*.am" = {
+          indent_size = 8;
+          indent_style = "tab";
+        };
+        "*.md" = {
+          indent_size = 2;
+          indent_style = "space";
+        };
+        "*.nix" = {
+          tab_width = 4;
+          indent_size = 2;
+          indent_style = "space";
+        };
+        "*.{c,h,cc,hh,cu}" = { # EditorConfig brace alternation is comma-separated
+          tab_width = 8;
+          indent_size = 8;
+          indent_style = "tab";
+        };
+      };
+    };
+  };
+
+  clangd-file = pkgs.writeTextFile {
+    name = "clangd-config";
+    text = pkgs.lib.generators.toYAML { } {
+      CompileFlags = {
+        Add = [
+          "-Wall"
+          "-Wextra"
+          "-Wformat"
+          "-xc++"
+          "-std=c++23"
+          "-isystem${pkgs.glibc_multi.dev}/include/"
+          "-isystem${pkgs.hwloc.dev}/include/"
+          "-isystem${pkgs.cudaPackages.cuda_cudart.dev}/include/"
+          "-isystem${pkgs.cudaPackages.cuda_nvtx.dev}/include/"
+          "-isystem${config.packages.libfabric.dev}/include/"
+          "-isystem${config.packages.openmpi.dev}/include/"
+          "-I${config.packages.default}/nix-support/generated-headers/include/"
+          "-I${source-dir}/include/"
+          "-I${source-dir}/3rd-party/nccl/cuda/include/"
+        ];
+      };
+      Diagnostics = {
+        ClangTidy = {
+          CheckOptions = {
+            "cppcoreguidelines-avoid-magic-numbers.IgnoreTypeAliases" = true;
+            "readability-magic-numbers.IgnoreTypeAliases" = true;
+          };
+        };
+        Includes = {
+          IgnoreHeader = [
+            "hwloc.h"
+            "config.h"
+          ];
+        };
+      };
+    };
+  };
+  clionConfigureFlags = pkgs.writeTextFile {
+    name = ".configureFlags";
+    text = pkgs.lib.concatStringsSep " " config.packages.default.configureFlags;
+  };
+in
+pkgs.mkShell {
+  inputsFrom = [
+    self.packages.${system}.aws-ofi-nccl
+    config.packages.libfabric
+    config.packages.openmpi
+  ];
+  packages = [
+    #pkgs.llvmPackages_git.clang-analyzer
+    pkgs.llvmPackages_git.clang-tools
+    pkgs.llvmPackages_git.clang
+    pkgs.gcc
+    pkgs.gdb
+    pkgs.include-what-you-use
+    pkgs.llvmPackages_git.libclang.python
+
+    pkgs.ccache
+    pkgs.cppcheck
+    pkgs.universal-ctags
+    pkgs.act
+    pkgs.actionlint
+
+    pkgs.gh
+    pkgs.git
+    pkgs.eksctl
+    pkgs.awscli2
+
+    pkgs.nixfmt-rfc-style
+  ];
+  shellHook = ''
+    rm -f ${source-dir}/.clangd && ln -s ${clangd-file} ${source-dir}/.clangd
+    rm -f ${source-dir}/.editorconfig && ln -s ${editorconfig-file} ${source-dir}/.editorconfig
+    rm -f ${source-dir}/.clang-format && ln -s ${clang-format-file} ${source-dir}/.clang-format
+    rm -f ${source-dir}/.clion-configure-flags && ln -s ${clionConfigureFlags} ${source-dir}/.clion-configure-flags
+    ${config.pre-commit.installationScript}
+  '';
+}
diff --git a/nix/ubuntuTestRunners.nix b/nix/ubuntuTestRunners.nix
new file mode 100644
index 000000000..1fc120cf3
--- /dev/null
+++ b/nix/ubuntuTestRunners.nix
@@ -0,0 +1,45 @@
+{
+  config,
+  lib,
+  symlinkJoin,
+  writeShellScriptBin,
+  openmpi,
+  libfabric,
+  nccl-tests,
+}:
+let
+  tests = [
+    "all_gather"
+    "all_reduce"
+    "alltoall"
+    "broadcast"
+    "gather"
+    "hypercube"
+    "reduce"
+    "reduce_scatter"
+    "scatter"
+    "sendrecv"
+  ];
+  ubuntuLibs = [
+    "/usr/lib/x86_64-linux-gnu/libcuda.so.1"
+    "/usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1"
+    "/usr/lib/x86_64-linux-gnu/libnvidia-ptxjitcompiler.so.1"
+  ];
+  libPathLibs = [ # NOTE(review): not referenced by the runner script below - presumably intended for a library search path; confirm
+    config.packages.default
+    openmpi
+    libfabric
+  ];
+  makeNcclTestRunner =
+    collName:
+    writeShellScriptBin "${collName}_perf" ''
+      LD_PRELOAD="${lib.concatStringsSep ":" ubuntuLibs}" \
+      NCCL_TUNER_PLUGIN=libnccl-ofi-tuner.so \
+      exec ${lib.getExe' nccl-tests "${collName}_perf"} "$@"
+    '';
+  runners = builtins.map makeNcclTestRunner tests;
+in
+symlinkJoin {
+  name = "ubuntu-nccl-tests-wrappers";
+  paths = runners;
+}