Skip to content

Commit

Permalink
[3.6] Avoid duplication of nodes in `ClusterManager._find_active_node…
Browse files Browse the repository at this point in the history
…s()` (#537)

This is relevant only in cases when the Slurm partition configuration
is edited and PC-managed nodes are added to multiple custom partitions.

Add unit test for `ClusterManager._find_active_nodes()`.


Signed-off-by: Jacopo De Amicis <[email protected]>
  • Loading branch information
jdeamicis authored Jun 19, 2023
1 parent dc340b4 commit e52e25b
Show file tree
Hide file tree
Showing 3 changed files with 128 additions and 1 deletion.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@ This file is used to list changes made in each version of the aws-parallelcluste
3.6.1
------

**CHANGES**
- Avoid duplication of nodes seen by ClusterManager if compute nodes are added to multiple Slurm partitions.

**BUG FIXES**
- Fix fast insufficient capacity fail-over logic when using Multiple Instance Types and no instances are returned

Expand Down
2 changes: 1 addition & 1 deletion src/slurm_plugin/clustermgtd.py
Original file line number Diff line number Diff line change
Expand Up @@ -1127,7 +1127,7 @@ def _find_active_nodes(partitions_name_map):
for partition in partitions_name_map.values():
if partition.state != "INACTIVE":
active_nodes += partition.slurm_nodes
return active_nodes
return list(dict.fromkeys(active_nodes))

def _is_node_in_replacement_valid(self, node, check_node_is_valid):
"""
Expand Down
124 changes: 124 additions & 0 deletions tests/slurm_plugin/test_clustermgtd.py
Original file line number Diff line number Diff line change
Expand Up @@ -3729,3 +3729,127 @@ def test_find_unhealthy_slurm_nodes(
assert_that(unhealthy_dynamic_nodes).is_equal_to(expected_unhealthy_dynamic_nodes)
assert_that(unhealthy_static_nodes).is_equal_to(expected_unhealthy_static_nodes)
assert_that(ice_compute_resources_and_nodes_map).is_equal_to(expected_ice_compute_resources_and_nodes_map)


@pytest.mark.parametrize(
"partitions_name_map, expected_nodelist",
[
pytest.param(
{
"queue1": SimpleNamespace(
name="queue1",
nodenames="queue1-st-cr1-1,queue1-st-cr1-2",
state="UP",
slurm_nodes=[
StaticNode(name="queue1-st-cr1-1", nodeaddr="", nodehostname="", state=""),
StaticNode(name="queue1-st-cr1-2", nodeaddr="", nodehostname="", state=""),
],
),
"queue2": SimpleNamespace(
name="queue2",
nodenames="queue2-st-cr1-1,queue2-st-cr1-2",
state="UP",
slurm_nodes=[
StaticNode(name="queue2-st-cr1-1", nodeaddr="", nodehostname="", state=""),
StaticNode(name="queue2-st-cr1-2", nodeaddr="", nodehostname="", state=""),
],
),
},
[
StaticNode(name="queue1-st-cr1-1", nodeaddr="", nodehostname="", state=""),
StaticNode(name="queue1-st-cr1-2", nodeaddr="", nodehostname="", state=""),
StaticNode(name="queue2-st-cr1-1", nodeaddr="", nodehostname="", state=""),
StaticNode(name="queue2-st-cr1-2", nodeaddr="", nodehostname="", state=""),
],
id="Two non-overlapping partitions",
),
pytest.param(
{
"queue1": SimpleNamespace(
name="queue1",
nodenames="queue1-st-cr1-1,queue1-st-cr1-2",
state="UP",
slurm_nodes=[
StaticNode(name="queue1-st-cr1-1", nodeaddr="", nodehostname="", state=""),
StaticNode(name="queue1-st-cr1-2", nodeaddr="", nodehostname="", state=""),
],
),
"custom_partition": SimpleNamespace(
name="custom_partition",
nodenames="queue1-st-cr1-1,queue1-st-cr1-2",
state="UP",
slurm_nodes=[
StaticNode(name="queue1-st-cr1-1", nodeaddr="", nodehostname="", state=""),
StaticNode(name="queue1-st-cr1-2", nodeaddr="", nodehostname="", state=""),
],
),
},
[
StaticNode(name="queue1-st-cr1-1", nodeaddr="", nodehostname="", state=""),
StaticNode(name="queue1-st-cr1-2", nodeaddr="", nodehostname="", state=""),
],
id="Two overlapping partitions obtained by creating a custom partition that includes nodes from a "
"PC-managed queue",
),
pytest.param(
{},
[],
id="Empty cluster with no partitions",
),
pytest.param(
{
"queue1": SimpleNamespace(
name="queue1",
nodenames="queue1-st-cr1-1,queue1-st-cr1-2",
state="UP",
slurm_nodes=[
StaticNode(name="queue1-st-cr1-1", nodeaddr="", nodehostname="", state=""),
StaticNode(name="queue1-st-cr1-2", nodeaddr="", nodehostname="", state=""),
],
),
"queue2": SimpleNamespace(
name="queue2",
nodenames="queue2-st-cr1-1,queue2-st-cr1-2",
state="INACTIVE",
slurm_nodes=[
StaticNode(name="queue2-st-cr1-1", nodeaddr="", nodehostname="", state=""),
StaticNode(name="queue2-st-cr1-2", nodeaddr="", nodehostname="", state=""),
],
),
},
[
StaticNode(name="queue1-st-cr1-1", nodeaddr="", nodehostname="", state=""),
StaticNode(name="queue1-st-cr1-2", nodeaddr="", nodehostname="", state=""),
],
id="Two non-overlapping partitions, one active and one inactive",
),
pytest.param(
{
"queue1": SimpleNamespace(
name="queue1",
nodenames="queue1-st-cr1-1,queue1-st-cr1-2",
state="INACTIVE",
slurm_nodes=[
StaticNode(name="queue1-st-cr1-1", nodeaddr="", nodehostname="", state=""),
StaticNode(name="queue1-st-cr1-2", nodeaddr="", nodehostname="", state=""),
],
),
"queue2": SimpleNamespace(
name="queue2",
nodenames="queue2-st-cr1-1,queue2-st-cr1-2",
state="INACTIVE",
slurm_nodes=[
StaticNode(name="queue2-st-cr1-1", nodeaddr="", nodehostname="", state=""),
StaticNode(name="queue2-st-cr1-2", nodeaddr="", nodehostname="", state=""),
],
),
},
[],
id="Two inactive non-overlapping partitions",
),
],
)
def test_find_active_nodes(partitions_name_map, expected_nodelist):
"""Unit test for the `ClusterManager._find_active_nodes()` method."""
result_nodelist = ClusterManager._find_active_nodes(partitions_name_map)
assert_that(result_nodelist).is_equal_to(expected_nodelist)

0 comments on commit e52e25b

Please sign in to comment.