# Force nodes to down and power_save when stopping the cluster
If a cluster is stopped while a node is powering up (alloc#/idle#),
the node is kept in the powering-up state when the cluster is started again.

This makes the node unavailable for the entire ResumeTimeout, which is 60 minutes.
Slurm ignores the transition to power_down unless the node is put to down first.

From @demartinofra
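
The fix forces the nodes to down before requesting power_down. A minimal sketch of that two-step transition, assuming a thin scontrol wrapper (the helper below is illustrative, not this repository's exact implementation):

```python
import subprocess


def _update_node_state(node_list, state, reason):
    # Illustrative wrapper around `scontrol update`; the real code path goes
    # through the node-update helpers in common.schedulers.slurm_commands.
    subprocess.run(
        ["scontrol", "update", f"nodename={node_list}", f"state={state}", f"reason={reason}"],
        check=True,
    )


def set_nodes_down_and_power_save(node_list, reason):
    # Mark the nodes DOWN first: Slurm ignores a power_down request for a
    # node that is still powering up (alloc#/idle#).
    _update_node_state(node_list, "down", reason)
    # Then request power_down so the nodes return to power save and come back
    # clean on the next cluster start.
    _update_node_state(node_list, "power_down", reason)
```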

## Manual test
* Created a cluster and submitted a job to it.
* While the node was powering up, stopped the cluster and verified that the node was correctly marked as power_down.
* Restarted the cluster and verified that the node went back to the power save state (after about 2 minutes).
* The job ran correctly on the new node.

Signed-off-by: Enrico Usai <[email protected]>
enrico-usai committed Aug 10, 2021
1 parent 3489e16 commit fe0c2a0
Showing 3 changed files with 15 additions and 7 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md

```diff
@@ -3,6 +3,12 @@ aws-parallelcluster-node CHANGELOG
 
 This file is used to list changes made in each version of the aws-parallelcluster-node package.
 
+2.11.2
+-----
+
+**BUG FIXES**
+- Slurm: fix issue that was causing powering up nodes to not be correctly reset after a stop and start of the cluster.
+
 2.11.1
 -----
 
```

2 changes: 1 addition & 1 deletion src/common/schedulers/slurm_commands.py

```diff
@@ -235,7 +235,7 @@ def update_all_partitions(state, reset_node_addrs_hostname):
         logging.info(f"Setting partition {part.name} state from {part.state} to {state}")
         if reset_node_addrs_hostname:
             logging.info(f"Resetting partition nodes {part.nodes}")
-            reset_nodes(part.nodes, state="power_down", reason="stopping cluster")
+            set_nodes_down_and_power_save(part.nodes, reason="stopping cluster")
         partition_to_update.append(part.name)
     succeeded_partitions = update_partitions(partition_to_update, state)
     return succeeded_partitions == partition_to_update
```
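
For context, a hedged example of how the updated code path is exercised when the cluster is stopped; the import location of PartitionStatus and the use of INACTIVE here are assumptions based on the tests in this commit:

```python
import logging

# Assumed import path, mirroring the patch targets used in the tests below.
from common.schedulers.slurm_commands import PartitionStatus, update_all_partitions

# Stopping the cluster: deactivate every partition and reset its nodes.
# With this commit, the node reset goes through set_nodes_down_and_power_save
# (down first, then power_down) instead of reset_nodes(state="power_down").
if not update_all_partitions(PartitionStatus.INACTIVE, reset_node_addrs_hostname=True):
    logging.error("Failed to update one or more partitions while stopping the cluster")
```
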
14 changes: 8 additions & 6 deletions tests/common/schedulers/test_slurm_commands.py

```diff
@@ -614,7 +614,7 @@ def test_update_partitions(
             ],
             PartitionStatus.INACTIVE,
             True,
-            [call("node-3,node-4", reason="stopping cluster", state="power_down")],
+            [call("node-3,node-4", reason="stopping cluster")],
             ["part-2"],
             ["part-2"],
             True,
@@ -627,8 +627,8 @@
             PartitionStatus.INACTIVE,
             True,
             [
-                call("node-1,node-2", reason="stopping cluster", state="power_down"),
-                call("node-3,node-4", reason="stopping cluster", state="power_down"),
+                call("node-1,node-2", reason="stopping cluster"),
+                call("node-3,node-4", reason="stopping cluster"),
             ],
             ["part-1", "part-2"],
             ["part-1", "part-2"],
@@ -682,7 +682,9 @@ def test_update_all_partitions(
     expected_results,
     mocker,
 ):
-    reset_node_spy = mocker.patch("common.schedulers.slurm_commands.reset_nodes", auto_spec=True)
+    set_nodes_down_and_power_save_spy = mocker.patch(
+        "common.schedulers.slurm_commands.set_nodes_down_and_power_save", auto_spec=True
+    )
     update_partitions_spy = mocker.patch(
         "common.schedulers.slurm_commands.update_partitions", return_value=mock_succeeded_partitions, auto_spec=True
     )
@@ -692,7 +694,7 @@
     assert_that(update_all_partitions(state, reset_node_addrs_hostname=reset_node_info)).is_equal_to(expected_results)
     get_part_spy.assert_called_with(get_all_nodes=True)
     if expected_reset_nodes_calls:
-        reset_node_spy.assert_has_calls(expected_reset_nodes_calls)
+        set_nodes_down_and_power_save_spy.assert_has_calls(expected_reset_nodes_calls)
     else:
-        reset_node_spy.assert_not_called()
+        set_nodes_down_and_power_save_spy.assert_not_called()
     update_partitions_spy.assert_called_with(partitions_to_update, state)
```
