# Force nodes to down and power_save when stopping the cluster
If a cluster is stopped while a node is powering up (alloc#/idle#),
the node is kept in the powering-up state when the cluster is started again.

This makes the node unavailable for the entire ResumeTimeout, which is 60 minutes.
Slurm ignores the transition to power_down unless the node is put to down first.

From @demartinofra
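
The fix forces the nodes to down before requesting power_down. A minimal sketch of that two-step transition, assuming a thin scontrol wrapper (the helper below is illustrative, not this repository's exact implementation):

```python
import subprocess


def _update_node_state(node_list, state, reason):
    # Illustrative wrapper around `scontrol update`; the real code path goes
    # through the node-update helpers in common.schedulers.slurm_commands.
    subprocess.run(
        ["scontrol", "update", f"nodename={node_list}", f"state={state}", f"reason={reason}"],
        check=True,
    )


def set_nodes_down_and_power_save(node_list, reason):
    # Mark the nodes DOWN first: Slurm ignores a power_down request for a
    # node that is still powering up (alloc#/idle#).
    _update_node_state(node_list, "down", reason)
    # Then request power_down so the nodes return to power save and come back
    # clean on the next cluster start.
    _update_node_state(node_list, "power_down", reason)
```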

## Manual test
* Created a cluster and submitted a job to it.
* While the node was powering up, stopped the cluster and verified that the node was correctly marked as power_down.
* Restarted the cluster and verified that the node went back to the power save state (after about 2 minutes).
* The job ran correctly on the new node.

Signed-off-by: Enrico Usai <[email protected]>
enrico-usai committed Aug 10, 2021
1 parent 3489e16 commit fe0c2a0
Showing 3 changed files with 15 additions and 7 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md

```diff
@@ -3,6 +3,12 @@ aws-parallelcluster-node CHANGELOG
 
 This file is used to list changes made in each version of the aws-parallelcluster-node package.
 
+2.11.2
+-----
+
+**BUG FIXES**
+- Slurm: fix issue that was causing powering up nodes to not be correctly reset after a stop and start of the cluster.
+
 2.11.1
 -----
 
```

2 changes: 1 addition & 1 deletion src/common/schedulers/slurm_commands.py

```diff
@@ -235,7 +235,7 @@ def update_all_partitions(state, reset_node_addrs_hostname):
         logging.info(f"Setting partition {part.name} state from {part.state} to {state}")
         if reset_node_addrs_hostname:
             logging.info(f"Resetting partition nodes {part.nodes}")
-            reset_nodes(part.nodes, state="power_down", reason="stopping cluster")
+            set_nodes_down_and_power_save(part.nodes, reason="stopping cluster")
         partition_to_update.append(part.name)
     succeeded_partitions = update_partitions(partition_to_update, state)
     return succeeded_partitions == partition_to_update
```
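
For context, a hedged example of how the updated code path is exercised when the cluster is stopped; the import location of PartitionStatus and the use of INACTIVE here are assumptions based on the tests in this commit:

```python
import logging

# Assumed import path, mirroring the patch targets used in the tests below.
from common.schedulers.slurm_commands import PartitionStatus, update_all_partitions

# Stopping the cluster: deactivate every partition and reset its nodes.
# With this commit, the node reset goes through set_nodes_down_and_power_save
# (down first, then power_down) instead of reset_nodes(state="power_down").
if not update_all_partitions(PartitionStatus.INACTIVE, reset_node_addrs_hostname=True):
    logging.error("Failed to update one or more partitions while stopping the cluster")
```
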
14 changes: 8 additions & 6 deletions tests/common/schedulers/test_slurm_commands.py

```diff
@@ -614,7 +614,7 @@ def test_update_partitions(
             ],
             PartitionStatus.INACTIVE,
             True,
-            [call("node-3,node-4", reason="stopping cluster", state="power_down")],
+            [call("node-3,node-4", reason="stopping cluster")],
             ["part-2"],
             ["part-2"],
             True,
@@ -627,8 +627,8 @@
             PartitionStatus.INACTIVE,
             True,
             [
-                call("node-1,node-2", reason="stopping cluster", state="power_down"),
-                call("node-3,node-4", reason="stopping cluster", state="power_down"),
+                call("node-1,node-2", reason="stopping cluster"),
+                call("node-3,node-4", reason="stopping cluster"),
             ],
             ["part-1", "part-2"],
             ["part-1", "part-2"],
@@ -682,7 +682,9 @@ def test_update_all_partitions(
     expected_results,
     mocker,
 ):
-    reset_node_spy = mocker.patch("common.schedulers.slurm_commands.reset_nodes", auto_spec=True)
+    set_nodes_down_and_power_save_spy = mocker.patch(
+        "common.schedulers.slurm_commands.set_nodes_down_and_power_save", auto_spec=True
+    )
     update_partitions_spy = mocker.patch(
         "common.schedulers.slurm_commands.update_partitions", return_value=mock_succeeded_partitions, auto_spec=True
     )
@@ -692,7 +694,7 @@
     assert_that(update_all_partitions(state, reset_node_addrs_hostname=reset_node_info)).is_equal_to(expected_results)
     get_part_spy.assert_called_with(get_all_nodes=True)
     if expected_reset_nodes_calls:
-        reset_node_spy.assert_has_calls(expected_reset_nodes_calls)
+        set_nodes_down_and_power_save_spy.assert_has_calls(expected_reset_nodes_calls)
     else:
-        reset_node_spy.assert_not_called()
+        set_nodes_down_and_power_save_spy.assert_not_called()
     update_partitions_spy.assert_called_with(partitions_to_update, state)
```
