Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Stabilize MPI tests for Azure Linux #3521

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 51 additions & 2 deletions lisa/features/infiniband.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,11 @@
from retry import retry

from lisa.base_tools import Cat, Sed, Uname, Wget
from lisa.tools.git import Git
from lisa.feature import Feature
from lisa.features import Disk
from lisa.operating_system import CBLMariner, Oracle, Redhat, Ubuntu
from lisa.tools import Firewall, Ls, Lspci, Make, Service
from lisa.tools import Chmod, Find, Firewall, Ls, Lspci, Make, Service
from lisa.tools.tar import Tar
from lisa.util import (
LisaException,
Expand Down Expand Up @@ -466,7 +467,6 @@ def install_intel_mpi(self) -> None:

def install_open_mpi(self) -> None:
node = self._node
# Install Open MPI
wget = node.tools[Wget]
tar_file = (
"https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.5.tar.gz"
Expand Down Expand Up @@ -497,6 +497,55 @@ def install_open_mpi(self) -> None:
make.make("", cwd=openmpi_folder, sudo=True)
make.make_install(cwd=openmpi_folder, sudo=True)

def install_intel_mpi_benchmarking_tool(
    self,
    # Quoted so the optional type needs no additional typing import.
    tool_names: "List[str] | None" = None,
) -> None:
    """
    Build Intel MPI Benchmarks (IMB) tools from source on CBL-Mariner.

    On any other distro this method returns without doing anything.
    It assumes an MPI package providing mpicc and mpicxx has already
    been built and installed on the node.

    tool_names: make targets to build, for example "IMB-MPI1",
        "IMB-RMA" or "IMB-NBC". When omitted or None, only "IMB-MPI1"
        is built.

    An assertion fails if mpicc or mpicxx cannot be found on the node.
    """
    node = self._node
    if not isinstance(node.os, CBLMariner):
        # These tools are included in other distro packages
        return

    # A None sentinel replaces the former list literal default, which
    # was a single list object shared by every call.
    if tool_names is None:
        tool_names = ["IMB-MPI1"]

    # Clone Intel MPI Benchmarks into the node's working path.
    node.tools[Git].clone(
        url="https://github.com/intel/mpi-benchmarks.git",
        cwd=node.working_path,
    )
    imb_src_folder = node.get_pure_path(f"{node.working_path}/mpi-benchmarks")

    find = node.tools[Find]
    chmod = node.tools[Chmod]
    make = node.tools[Make]

    def locate_compiler(name: str) -> str:
        # Search from the filesystem root; the install prefix of the
        # MPI package is not assumed. The first match is used.
        matches = find.find_files(node.get_pure_path("/"), name, sudo=True)
        message = f"Could not find location of {name} from MPI package"
        assert_that(len(matches)).described_as(message).is_greater_than(0)
        compiler_path = matches[0]
        assert_that(compiler_path).described_as(message).is_not_empty()
        return compiler_path

    mpicc_path = locate_compiler("mpicc")
    mpicxx_path = locate_compiler("mpicxx")

    # Set mode 755 on both compiler wrappers before invoking them via make.
    chmod.chmod(mpicc_path, "755", sudo=True)
    chmod.chmod(mpicxx_path, "755", sudo=True)

    for tool in tool_names:
        # Each tool is a make target; the resulting binary is expected at
        # the top of the source folder under the same name.
        make.make(
            f"{tool} CC={mpicc_path} CXX={mpicxx_path}",
            cwd=imb_src_folder,
            sudo=True,
            shell=False,
            sendYesCmd=False,
        )
        chmod.chmod(f"{imb_src_folder}/{tool}", "755", sudo=True)


def install_ibm_mpi(self, platform_mpi_url: str) -> None:
node = self._node
if isinstance(node.os, Redhat):
Expand Down
12 changes: 9 additions & 3 deletions lisa/tools/make.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,8 @@ def make(
thread_count: int = 0,
update_envs: Optional[Dict[str, str]] = None,
ignore_error: bool = False,
shell: bool = True,
sendYesCmd: bool = True
) -> ExecutableResult:
expected_exit_code: Optional[int] = 0
if thread_count == 0:
Expand All @@ -95,13 +97,17 @@ def make(

if ignore_error:
expected_exit_code = None
# yes '' answers all questions with default value.
command = ""
if sendYesCmd:
# yes '' answers all questions with default value.
command = "yes '' | "

result = self.node.execute(
f"yes '' | make -j{thread_count} {arguments}",
f"{command} make -j{thread_count} {arguments}",
cwd=cwd,
timeout=timeout,
sudo=sudo,
shell=True,
shell=shell,
update_envs=update_envs,
expected_exit_code=expected_exit_code,
expected_exit_code_failure_message="Failed to make",
Expand Down
32 changes: 25 additions & 7 deletions microsoft/testsuites/hpc/infinibandsuite.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
simple_requirement,
)
from lisa.features import AvailabilitySetEnabled, Infiniband, Sriov
from lisa.operating_system import BSD, Windows
from lisa.operating_system import BSD, CBLMariner, Windows
from lisa.sut_orchestrator.azure.tools import Waagent
from lisa.tools import Find, KernelConfig, Ls, Modprobe, Ssh
from lisa.util import (
Expand Down Expand Up @@ -286,6 +286,9 @@ def verify_intel_mpi(self, environment: Environment, log: Logger) -> None:
client_ssh.enable_public_key(server_ssh.generate_key_pairs())
server_ssh.add_known_host(client_ip)
client_ssh.add_known_host(server_ip)
sudo=False
if isinstance(server_node.os, CBLMariner):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please add comments above to explain why Mariner needs sudo.

sudo=True

# Note: Using bash because script is not supported by Dash
# sh points to dash on Ubuntu
Expand All @@ -295,6 +298,7 @@ def verify_intel_mpi(self, environment: Environment, log: Logger) -> None:
"-env I_MPI_FABRICS=shm:ofi -env SECS_PER_SAMPLE=600 "
"-env FI_PROVIDER=mlx -env I_MPI_DEBUG=5 -env I_MPI_PIN_DOMAIN=numa "
"/opt/intel/oneapi/mpi/2021.1.1/bin/IMB-MPI1 pingpong",
sudo=sudo,
expected_exit_code=0,
expected_exit_code_failure_message="Failed intra-node pingpong test "
"with intel mpi",
Expand All @@ -306,6 +310,7 @@ def verify_intel_mpi(self, environment: Environment, log: Logger) -> None:
"-env I_MPI_FABRICS=shm:ofi -env SECS_PER_SAMPLE=600 "
"-env FI_PROVIDER=mlx -env I_MPI_DEBUG=5 -env I_MPI_PIN_DOMAIN=numa "
"/opt/intel/oneapi/mpi/2021.1.1/bin/IMB-MPI1 pingpong",
sudo=sudo,
expected_exit_code=0,
expected_exit_code_failure_message="Failed inter-node pingpong test "
"with intel mpi",
Expand All @@ -319,6 +324,7 @@ def verify_intel_mpi(self, environment: Environment, log: Logger) -> None:
"-n 44 -env I_MPI_FABRICS=shm:ofi -env SECS_PER_SAMPLE=600 "
"-env FI_PROVIDER=mlx -env I_MPI_DEBUG=5 -env I_MPI_PIN_DOMAIN=numa "
f"/opt/intel/oneapi/mpi/2021.1.1/bin/{test}",
sudo=sudo,
expected_exit_code=0,
expected_exit_code_failure_message=f"Failed {test} test with intel mpi",
timeout=3000,
Expand Down Expand Up @@ -360,10 +366,13 @@ def verify_open_mpi(self, environment: Environment, log: Logger) -> None:
raise SkippedException(err)

run_in_parallel([server_ib.install_open_mpi, client_ib.install_open_mpi])

server_node.execute("ldconfig", sudo=True)
client_node.execute("ldconfig", sudo=True)

# Only for mariner, we need to build intel benchmarking tools
# as they are not included in our packages
server_ib.install_intel_mpi_benchmarking_tool()
Copy link
Member

@squirrelsc squirrelsc Nov 20, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please also add an if block here to skip other distros, so that the logic is easier to follow at the test case level. The if block inside install_intel_mpi_benchmarking_tool is only a safeguard.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure.


# Restart the ssh sessions for changes to /etc/security/limits.conf
# to take effect
server_node.close()
Expand All @@ -386,7 +395,7 @@ def verify_open_mpi(self, environment: Environment, log: Logger) -> None:
# Ping Pong test
find = server_node.tools[Find]
find_results = find.find_files(
server_node.get_pure_path("/usr"), "IMB-MPI1", sudo=True
server_node.get_pure_path("/"), "IMB-MPI1", sudo=True
)
assert_that(len(find_results)).described_as(
"Could not find location of IMB-MPI1 for Open MPI"
Expand All @@ -407,7 +416,7 @@ def verify_open_mpi(self, environment: Environment, log: Logger) -> None:

# IMB-MPI Tests
find_results = find.find_files(
server_node.get_pure_path("/usr"), "IMB-MPI1", sudo=True
server_node.get_pure_path("/"), "IMB-MPI1", sudo=True
)
assert_that(len(find_results)).described_as(
"Could not find location of Open MPI test: IMB-MPI1"
Expand All @@ -417,7 +426,7 @@ def verify_open_mpi(self, environment: Environment, log: Logger) -> None:
"Could not find location of Open MPI test: IMB-MPI1"
).is_not_empty()
server_node.execute(
f"/usr/local/bin/mpirun --host {server_ip},{client_ip} "
f"/usr/local/bin/mpirun -hosts {server_ip},{client_ip} "
"-n 2 --mca btl self,vader,openib --mca btl_openib_cq_size 4096 "
"--mca btl_openib_allow_ib 1 --mca "
f"btl_openib_warn_no_device_params_found 0 {test_path}",
Expand Down Expand Up @@ -571,6 +580,12 @@ def verify_mvapich_mpi(self, environment: Environment, log: Logger) -> None:
raise SkippedException(err)

run_in_parallel([server_ib.install_mvapich_mpi, client_ib.install_mvapich_mpi])
test_names = ["IMB-MPI1", "IMB-RMA", "IMB-NBC"]
squirrelsc marked this conversation as resolved.
Show resolved Hide resolved
# Only for mariner, we need to build intel benchmarking tools
# as they are not included in our packages
server_ib.install_intel_mpi_benchmarking_tool(tool_names=test_names)

server_node.execute("ldconfig", sudo=True)

# Restart the ssh sessions for changes to /etc/security/limits.conf
# to take effect
Expand All @@ -590,13 +605,15 @@ def verify_mvapich_mpi(self, environment: Environment, log: Logger) -> None:
client_ssh.enable_public_key(server_ssh.generate_key_pairs())
server_ssh.add_known_host(client_ip)
client_ssh.add_known_host(server_ip)
sudo=False
if isinstance(server_node.os, CBLMariner):
sudo=True

# Run MPI tests
find = server_node.tools[Find]
test_names = ["IMB-MPI1", "IMB-RMA", "IMB-NBC"]
for test in test_names:
find_results = find.find_files(
server_node.get_pure_path("/usr"), test, sudo=True
server_node.get_pure_path("/"), test, sudo=True
)
assert_that(len(find_results)).described_as(
f"Could not find location of MVAPICH MPI test: {test}"
Expand All @@ -611,6 +628,7 @@ def verify_mvapich_mpi(self, environment: Environment, log: Logger) -> None:
expected_exit_code=0,
expected_exit_code_failure_message=f"Failed {test} test "
"with MVAPICH MPI",
sudo=sudo
)

def _check_nd_enabled(self, node: Node) -> None:
Expand Down
Loading