Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SmartSwitch] Extend reboot script for rebooting SmartSwitch #3566

Open
wants to merge 32 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 28 commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
1686dbe
Extend reboot script for rebooting SmartSwitch
vvolam Nov 4, 2024
23461b2
Add more coverage
vvolam Nov 4, 2024
cef5de7
Add more unittests and optimize tests file
vvolam Nov 4, 2024
d41bf43
Fix minor indentation
vvolam Nov 4, 2024
68e70ab
Move smartswitch helper functions to new reboot_smartswitch_helper.sh
vvolam Nov 6, 2024
3848b75
Fix pre-commit errors
vvolam Nov 8, 2024
84d9e50
Fix few more indentation errors
vvolam Nov 8, 2024
ba5cd5d
Merge remote-tracking branch 'origin/master' into ss-reboot
vvolam Nov 12, 2024
7f75134
Merge remote-tracking branch 'origin/master' into ss-reboot
vvolam Nov 25, 2024
a849e41
Add a new API in chassis.py
vvolam Nov 25, 2024
4975ac0
Fix issues while testing
vvolam Nov 25, 2024
bead103
Fix indentation errors
vvolam Nov 25, 2024
f88491a
Add DPU_BUS_INFO
vvolam Nov 26, 2024
b3dbc0f
Fix pre-commit errors
vvolam Nov 26, 2024
2d8b908
Add more error handling scenarios and increase more coverage
vvolam Nov 26, 2024
1a6ef04
parse_args function is not required
vvolam Nov 26, 2024
ec21d6f
Fix indentation
vvolam Nov 26, 2024
8d59222
Address review comments
vvolam Nov 27, 2024
98406c7
Increase code coverage
vvolam Nov 27, 2024
a6f771e
Update scripts/reboot_smartswitch_helper
vvolam Nov 28, 2024
a3f8af7
Update scripts/reboot_smartswitch_helper
vvolam Nov 28, 2024
36ecf1b
Rename module_base.py to module.py
vvolam Nov 28, 2024
67e7817
Committing missed files in previous commit
vvolam Nov 28, 2024
88af21d
Define a new try_get_args() which takes arguments as inputs
vvolam Nov 28, 2024
3551a5a
Merge remote-tracking branch 'origin/master' into ss-reboot
vvolam Dec 3, 2024
e4bcc95
Fix some arguments
vvolam Dec 3, 2024
edaa0de
Exit the reboot script after completing DPU reboot
vvolam Dec 3, 2024
119d83b
Fix long lines
vvolam Dec 4, 2024
03fd56f
Merge remote-tracking branch 'origin/master' into ss-reboot
vvolam Dec 6, 2024
75f4c26
Update unit tests for update function code
vvolam Dec 6, 2024
7b365fd
Merge remote-tracking branch 'public/master' into ss-reboot
vvolam Jan 3, 2025
2690348
Update scripts/reboot
vvolam Jan 7, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 38 additions & 2 deletions scripts/reboot
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ declare -r EXIT_ERROR=1
declare -r WATCHDOG_UTIL="/usr/local/bin/watchdogutil"
declare -r PRE_REBOOT_HOOK="pre_reboot_hook"

source reboot_smartswitch_helper

DEVPATH="/usr/share/sonic/device"
PLAT_REBOOT="platform_reboot"
PLATFORM_UPDATE_REBOOT_CAUSE="platform_update_reboot_cause"
Expand Down Expand Up @@ -37,10 +39,17 @@ EXIT_NEXT_IMAGE_NOT_EXISTS=4
EXIT_SONIC_INSTALLER_VERIFY_REBOOT=21
EXIT_PLATFORM_FW_AU_FAILURE=22
PLATFORM_FWUTIL_AU_REBOOT_HANDLE="platform_fw_au_reboot_handle"
PLATFORM_JSON_FILE="platform.json"
PLATFORM_JSON_PATH="${DEVPATH}/${PLATFORM}/${PLATFORM_JSON_FILE}"
REBOOT_SCRIPT_NAME=$(basename $0)
REBOOT_TYPE="${REBOOT_SCRIPT_NAME}"
TAG_LATEST=no
REBOOT_FLAGS=""
FORCE_REBOOT="no"
SMART_SWITCH="no"
DPU_MODULE_NAME=""
REBOOT_DPU="no"
PRE_SHUTDOWN="no"

function debug()
{
Expand Down Expand Up @@ -128,6 +137,8 @@ function show_help_and_exit()
echo " "
echo " Available options:"
echo " -h, -? : getting this help"
echo " -d : DPU module name on a smart switch, option is invalid when on DPU"
echo " -p : Pre-shutdown steps on DPU, invalid on NPU"

exit ${EXIT_SUCCESS}
}
Expand All @@ -154,7 +165,7 @@ function reboot_pre_check()
${DEVPATH}/${PLATFORM}/${PLATFORM_REBOOT_PRE_CHECK}
[[ $? -ne 0 ]] && exit $?
fi

# Verify the next image by sonic-installer
local message=$(sonic-installer verify-next-image 2>&1)
if [ $? -ne 0 ]; then
Expand All @@ -178,7 +189,7 @@ function check_conflict_boot_in_fw_update()

function parse_options()
{
while getopts "h?vf" opt; do
while getopts "h?vfpd:" opt; do
case ${opt} in
h|\? )
show_help_and_exit
Expand All @@ -192,6 +203,13 @@ function parse_options()
f )
REBOOT_FLAGS+=" -f"
;;
d )
REBOOT_DPU="yes"
DPU_MODULE_NAME="$OPTARG"
;;
p )
PRE_SHUTDOWN="yes"
vvolam marked this conversation as resolved.
Show resolved Hide resolved
;;
esac
done
}
Expand Down Expand Up @@ -225,6 +243,19 @@ fi

debug "User requested rebooting device ..."

# Validate the -d/-p options for this device type and, on a smart switch,
# reboot the requested DPU (or all DPUs). Any failure aborts the reboot.
handle_smart_switch "$REBOOT_DPU" "$PRE_SHUTDOWN" "$DPU_MODULE_NAME"
smart_switch_result=$?
if [[ $smart_switch_result -ne 0 ]]; then
    exit $smart_switch_result
fi

# On a smartswitch, complete the DPU reboot and exit.
# The is_smartswitch helper answers through its exit status only (it pipes
# into `grep -q` and prints nothing), so capturing its stdout always yields an
# empty string. Derive the "True"/"False" flag from the exit status instead.
is_smartswitch="False"
if is_smartswitch; then
    is_smartswitch="True"
fi
if [[ "$is_smartswitch" == "True" && "$REBOOT_DPU" == "yes" ]]; then
    exit $smart_switch_result
fi
fi
vvolam marked this conversation as resolved.
Show resolved Hide resolved

check_conflict_boot_in_fw_update

setup_reboot_variables
Expand Down Expand Up @@ -287,6 +318,11 @@ if [ -x ${WATCHDOG_UTIL} ]; then
${WATCHDOG_UTIL} arm
fi

# A pre-shutdown request (-p) ends here: report completion and exit
# successfully instead of continuing with the statements that follow.
case "${PRE_SHUTDOWN}" in
    yes)
        echo "${DPU_MODULE_NAME} pre-shutdown steps are completed"
        exit ${EXIT_SUCCESS}
        ;;
esac

if [ -x ${DEVPATH}/${PLATFORM}/${PLAT_REBOOT} ]; then
VERBOSE=yes debug "Rebooting with platform ${PLATFORM} specific tool ..."
${DEVPATH}/${PLATFORM}/${PLAT_REBOOT} $@
Expand Down
291 changes: 291 additions & 0 deletions scripts/reboot_smartswitch_helper
vvolam marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -0,0 +1,291 @@
#!/bin/bash

# Default GNMI port, used when CONFIG_DB has no "gnmi" field under DPU_PORT|<dpu>
declare -r GNMI_PORT=50052
# Reboot type passed down to the platform reboot call when a single DPU is rebooted (-d)
declare -r MODULE_REBOOT_DPU="DPU"
# Reboot type passed down when every DPU is rebooted as part of a smart switch reboot
declare -r MODULE_REBOOT_SMARTSWITCH="SMARTSWITCH"

# Write a timestamped diagnostic line to stderr.
#   $1 - text to log
function log_message() {
    local text=$1
    local stamp
    stamp=$(date '+%Y-%m-%d %H:%M:%S')
    printf '%s - %s\n' "$stamp" "$text" >&2
}

# Tell whether this device is a smart switch.
# Prints nothing; returns 0 when the output of
# utilities_common.chassis.is_smartswitch() contains "True", non-zero otherwise.
function is_smartswitch()
{
    local answer
    answer=$(python3 -c "from utilities_common.chassis import is_smartswitch; print(is_smartswitch())")
    [[ "$answer" == *True* ]]
}

# Tell whether this script is running on a DPU.
# Prints nothing; returns 0 when the output of
# utilities_common.chassis.is_dpu() contains "True", non-zero otherwise.
function is_dpu()
{
    local answer
    answer=$(python3 -c "from utilities_common.chassis import is_dpu; print(is_dpu())")
    [[ "$answer" == *True* ]]
}

# Print the value returned by utilities_common.chassis.get_num_dpus().
# The exit status is that of the python3 invocation.
function get_num_dpus()
{
    local py_code="from utilities_common.chassis import get_num_dpus; print(get_num_dpus())"
    python3 -c "$py_code"
}

# Print the "ips@" field stored in CONFIG_DB for a DPU on the bridge-midplane
# DHCP server port table.
#   $1 - DPU module name
function get_dpu_ip()
{
    local DPU_NAME=$1
    local table_key="DHCP_SERVER_IPV4_PORT|bridge-midplane|${DPU_NAME}"
    sonic-db-cli CONFIG_DB HGET "$table_key" "ips@"
}

# Print the "gnmi" field of the DPU_PORT entry stored in CONFIG_DB for a DPU.
#   $1 - DPU module name
function get_gnmi_port()
{
    local DPU_NAME=$1
    local table_key="DPU_PORT|$DPU_NAME"
    sonic-db-cli CONFIG_DB HGET "$table_key" "gnmi"
}

# Ask a DPU for its reboot status through the gnoi client in the gnmi container.
#   $1 - DPU IP address
#   $2 - GNMI port
# Returns EXIT_SUCCESS when the reply's "active" field reads "false";
# EXIT_ERROR when the query fails, returns nothing, or reports anything else.
# DPU_NAME is only used in log text and is taken from the calling scope.
function get_reboot_status()
{
    local dpu_ip=$1
    local port=$2
    local status_output
    local active_flag

    if ! status_output=$(docker exec -i gnmi gnoi_client -target "${dpu_ip}:${port}" -logtostderr -insecure -rpc RebootStatus 2>/dev/null) \
            || [[ -z "$status_output" ]]; then
        log_message "Error: Failed to send reboot status command to DPU ${DPU_NAME}"
        return ${EXIT_ERROR}
    fi

    # Second whitespace-separated field of the line(s) mentioning "active"
    active_flag=$(echo "$status_output" | awk '/active/ {print $2}')
    if [[ "$active_flag" != "false" ]]; then
        return ${EXIT_ERROR}
    fi

    log_message "DPU ${DPU_NAME} has finished rebooting"
    return ${EXIT_SUCCESS}
}

# Detach a DPU's PCI device before the DPU is rebooted.
#   $1 - DPU module name
#   $2 - PCI bus address of the DPU (only used by the sysfs fallback)
# The vendor hook ModuleHelper.pci_detach_module() is tried first. If it fails,
# the device is removed through /sys/bus/pci/devices/<bus>/remove.
# Returns 0 on success, non-zero when neither path works.
#
# NOTE(review): a reviewer of this change asked whether platform.json should
# provide two explicit paths ("path_pcie_device_remove" and
# "path_pcie_device_rescan") instead of DPU_BUS_INFO, for more reliable
# operation. That question is still open.
function pci_detach_module()
{
    local DPU_NAME=$1
    local DPU_BUS_INFO=$2
    local remove_path

    if python3 -c "from utilities_common.module import ModuleHelper; helper = ModuleHelper(); helper.pci_detach_module('${DPU_NAME}')"; then
        return 0
    fi

    log_message "Error: PCI detach vendor API is not available"

    # Never build a sysfs path from an empty bus address.
    if [[ -z "$DPU_BUS_INFO" ]]; then
        log_message "Error: No PCI bus info available to detach DPU ${DPU_NAME}"
        return "${EXIT_ERROR:-1}"
    fi

    remove_path="/sys/bus/pci/devices/${DPU_BUS_INFO}/remove"
    if [[ ! -e "$remove_path" ]]; then
        log_message "Error: PCI device ${DPU_BUS_INFO} not found for DPU ${DPU_NAME}"
        return "${EXIT_ERROR:-1}"
    fi

    echo 1 > "$remove_path"
}

# Re-attach a DPU's PCI device after the DPU has been rebooted.
#   $1 - DPU module name
#   $2 - PCI bus address of the DPU (accepted for symmetry with
#        pci_detach_module; the fallback below does not use it)
# The vendor hook ModuleHelper.pci_reattach_module() is tried first. If it
# fails, the whole PCI bus is rescanned through /sys/bus/pci/rescan.
# Returns 0 on success, otherwise the status of the sysfs write.
function pci_reattach_module()
{
    local DPU_NAME=$1

    if python3 -c "from utilities_common.module import ModuleHelper; helper = ModuleHelper(); helper.pci_reattach_module('${DPU_NAME}')"; then
        return 0
    fi

    log_message "Error: PCI reattach vendor API is not available"
    # TODO(review): a reviewer of this change warned that this may cause a
    # kernel panic: "/sys/bus/pci/rescan" rescans the entire PCIe tree, and
    # another DPU may be detaching at the same time. Only the root port should
    # be rescanned; the information needed for that is not available here yet.
    echo 1 > /sys/bus/pci/rescan
}

# Reboot a DPU by calling ModuleHelper.reboot_module().
#   $1 - DPU module name
#   $2 - reboot type string handed to reboot_module()
# The exit status is that of the python3 invocation.
function reboot_dpu_platform()
{
    local DPU_NAME=$1
    local REBOOT_TYPE=$2
    local py_code="from utilities_common.module import ModuleHelper; helper = ModuleHelper(); helper.reboot_module('${DPU_NAME}', '${REBOOT_TYPE}')"
    python3 -c "$py_code"
}

# Poll a DPU until it reports that its reboot is no longer active.
#   $1 - DPU IP address
#   $2 - GNMI port
# The time budget is the "dpu_halt_services_timeout" value from
# PLATFORM_JSON_PATH; the DPU is polled every 5 seconds.
# Returns EXIT_SUCCESS once get_reboot_status succeeds, EXIT_ERROR when the
# timeout value cannot be read, is not a non-negative integer, or expires.
# Exits the calling script with EXIT_ERROR when PLATFORM_JSON_PATH is empty.
function wait_for_dpu_reboot_status()
{
    local dpu_ip=$1
    local port=$2

    if [[ -z "$PLATFORM_JSON_PATH" ]]; then
        log_message "Error: PLATFORM_JSON_PATH is not defined"
        exit $EXIT_ERROR
    fi

    # Declare and assign separately: `local var=$(cmd)` reports the status of
    # `local`, which would hide a jq failure.
    local dpu_halt_services_timeout
    if ! dpu_halt_services_timeout=$(jq -r '.dpu_halt_services_timeout' "$PLATFORM_JSON_PATH" 2>/dev/null); then
        log_message "Error: Failed to retrieve dpu_halt_services_timeout from ${PLATFORM_JSON_PATH}"
        return ${EXIT_ERROR}
    fi

    # jq prints "null" for a missing key. A non-numeric value would make the
    # timeout comparison below fail on every pass and the loop run forever.
    if [[ ! "$dpu_halt_services_timeout" =~ ^[0-9]+$ ]]; then
        log_message "Error: Invalid dpu_halt_services_timeout '${dpu_halt_services_timeout}' in ${PLATFORM_JSON_PATH}"
        return ${EXIT_ERROR}
    fi

    local poll_interval=5
    local waited_time=0
    while ! get_reboot_status "${dpu_ip}" "${port}"; do
        sleep "$poll_interval"
        waited_time=$((waited_time + poll_interval))
        if [ "$waited_time" -ge "$dpu_halt_services_timeout" ]; then
            log_message "Error: Timeout waiting for DPU ${DPU_NAME} to finish rebooting"
            return ${EXIT_ERROR}
        fi
    done
    return ${EXIT_SUCCESS}
}

# Send a gNOI Reboot request to a DPU and wait for it to finish rebooting.
#   $1 - DPU module name (optional; falls back to the caller's DPU_NAME so
#        existing call sites that relied on the inherited variable keep working)
# The DPU IP comes from get_dpu_ip and the port from get_gnmi_port, with
# GNMI_PORT as the default.
# Returns EXIT_ERROR when the IP cannot be found or the request fails,
# otherwise the status of wait_for_dpu_reboot_status.
function gnmi_reboot_dpu()
{
    local DPU_NAME=${1:-${DPU_NAME}}
    # Keep these local so they do not leak into the calling script.
    local dpu_ip
    local port

    # Retrieve DPU IP and GNMI port
    dpu_ip=$(get_dpu_ip "${DPU_NAME}")
    log_message "DPU IP ${DPU_NAME}: $dpu_ip"
    port=$(get_gnmi_port "${DPU_NAME}")
    if [[ -z "$port" ]]; then
        port=$GNMI_PORT # Default GNMI port
    fi
    log_message "GNMI port ${DPU_NAME}: $port"

    if [[ -z "$dpu_ip" ]]; then
        log_message "Error: Failed to retrieve DPU IP for ${DPU_NAME}"
        return ${EXIT_ERROR}
    fi

    if ! docker exec -i gnmi gnoi_client -target "${dpu_ip}:${port}" -logtostderr -insecure -rpc Reboot -jsonin '{"method":3}'; then
        log_message "Error: Failed to send reboot command to DPU ${DPU_NAME}"
        return ${EXIT_ERROR}
    fi

    wait_for_dpu_reboot_status "${dpu_ip}" "${port}"
}

# Reboot one DPU from the switch side.
#   $1 - DPU module name (e.g. dpu0)
#   $2 - reboot type forwarded to reboot_dpu_platform
# Steps: request a reboot over gNOI, record the pending PCIe detach in
# STATE_DB, detach the PCI device, reboot the module through the platform API,
# re-attach the PCI device and clear the STATE_DB record.
# Returns EXIT_ERROR when the bus info is missing or when the detach, platform
# reboot or re-attach step fails; otherwise the status of the STATE_DB cleanup.
function reboot_dpu()
{
    local DPU_NAME=$1
    local REBOOT_TYPE=$2
    local DPU_INDEX=${DPU_NAME//[!0-9]/}
    local DPU_BUS_INFO
    local detach_info
    local reattach_status=0
    local cleanup_status

    debug "User requested rebooting device ${DPU_NAME} ..."

    # Send reboot command to DPU. A failure here is only logged; the platform
    # reboot below is still attempted.
    if ! gnmi_reboot_dpu "${DPU_NAME}"; then
        log_message "Error: Failed to send gnoi command to reboot DPU ${DPU_NAME}"
    fi

    DPU_BUS_INFO=$(jq -r --arg DPU_NAME "$DPU_NAME" '.DPUS[$DPU_NAME].bus_info' "$PLATFORM_JSON_PATH")
    if [[ -z "$DPU_BUS_INFO" || "$DPU_BUS_INFO" == "null" ]]; then
        log_message "Error: Failed to retrieve bus info for DPU ${DPU_NAME}"
        return ${EXIT_ERROR}
    fi

    # Update STATE_DB and handle PCIe removal and rescan
    printf -v detach_info '{"dpu_id": "%s", "dpu_state": "detaching", "bus_info": "%s"}' \
        "$DPU_INDEX" "$DPU_BUS_INFO"
    sonic-db-cli STATE_DB set "PCIE_DETACH_INFO|${DPU_NAME}" "$detach_info"

    if ! pci_detach_module "${DPU_NAME}" "${DPU_BUS_INFO}"; then
        log_message "Error: Failed to detach PCI module for DPU ${DPU_NAME}"
        return ${EXIT_ERROR}
    fi

    if ! reboot_dpu_platform "${DPU_NAME}" "${REBOOT_TYPE}"; then
        log_message "Error: Failed to send platform command to reboot DPU ${DPU_NAME}"
        return ${EXIT_ERROR}
    fi

    # NOTE(review): a reviewer of this change asked how reboot completion is
    # guaranteed before pci_reattach_module is called. Nothing in this function
    # waits for it explicitly; confirm that reboot_dpu_platform blocks until
    # the DPU is back.
    pci_reattach_module "${DPU_NAME}" "${DPU_BUS_INFO}" || reattach_status=$?

    # Always clear the detach record, even when the re-attach failed.
    sonic-db-cli STATE_DB del "PCIE_DETACH_INFO|${DPU_NAME}"
    cleanup_status=$?

    if [[ $reattach_status -ne 0 ]]; then
        log_message "Error: Failed to reattach PCI module for DPU ${DPU_NAME}"
        return ${EXIT_ERROR}
    fi
    return $cleanup_status
}

# Reboot every DPU (dpu0 .. dpu<N-1>) in parallel and wait for all of them.
#   $1 - number of DPUs
# Returns the number of DPUs whose reboot failed (0 when all succeeded).
# An empty DPU count is logged and treated as "nothing to do" (returns 0).
function reboot_all_dpus() {
    local NUM_DPU=$1
    local -a pids=()
    local pid
    local i
    local failures=0

    if [[ -z $NUM_DPU ]]; then
        log_message "Error: Failed to retrieve number of DPUs or no DPUs found"
        return 0
    fi

    for (( i = 0; i < NUM_DPU; i++ )); do
        log_message "Rebooting DPU module dpu$i"
        reboot_dpu "dpu$i" "$MODULE_REBOOT_SMARTSWITCH" &
        # $? after `cmd &` is always 0; the job's real status is only available
        # from `wait <pid>`, so remember each PID.
        pids+=("$!")
    done

    for pid in "${pids[@]}"; do
        if ! wait "$pid"; then
            failures=$((failures + 1))
        fi
    done
    return "$failures"
}

# Check that a DPU module name refers to an existing DPU.
#   $1 - DPU module name; must be "dpu<index>" with 0 <= index < number of DPUs
#   $2 - number of DPUs
# Returns 0 when the name is valid, EXIT_ERROR otherwise.
function verify_dpu_module_name() {
    local DPU_MODULE_NAME=$1
    local NUM_DPU=$2
    local name_re='^dpu(0|[1-9][0-9]*)$'
    local index

    if [[ -z "$DPU_MODULE_NAME" ]]; then
        log_message "Error: DPU module name not provided"
        return $EXIT_ERROR
    fi

    if [[ ! "$NUM_DPU" =~ ^[0-9]+$ ]]; then
        log_message "Error: Invalid number of DPUs '${NUM_DPU}'"
        return $EXIT_ERROR
    fi

    # Compare the index numerically. A bracket range such as [0-$NUM_DPU] only
    # matches a single character, so it breaks once there are more than 10 DPUs.
    if [[ ! "$DPU_MODULE_NAME" =~ $name_re ]]; then
        log_message "Error: Invalid DPU module name provided"
        return $EXIT_ERROR
    fi
    index=${BASH_REMATCH[1]}

    # 10# forces base 10 so a count such as "08" is not parsed as octal.
    if (( ${#index} > 9 || index >= 10#$NUM_DPU )); then
        log_message "Error: Invalid DPU module name provided"
        return $EXIT_ERROR
    fi
}

# Validate the -d / -p options against the device type and, on a smart switch,
# reboot the requested DPU or all DPUs.
#   $1 - "yes" when -d was given
#   $2 - "yes" when -p was given
#   $3 - DPU module name given with -d
# Sets the globals NUM_DPU, result and (on the -d path) DPU_MODULE_NAME.
# Returns EXIT_SUCCESS / 0 when there is nothing wrong, EXIT_ERROR for an
# invalid option combination, or the status of the DPU reboot helpers.
function handle_smart_switch() {
    local REBOOT_DPU=$1
    local PRE_SHUTDOWN=$2
    local DPU_NAME=$3

    NUM_DPU=$(get_num_dpus)

    # Running on a DPU: -p is mandatory and -d is not allowed.
    if is_dpu; then
        if [[ "$PRE_SHUTDOWN" != "yes" ]]; then
            log_message "Error: '-p' option not specified for a DPU"
            return $EXIT_ERROR
        fi
        if [[ "$REBOOT_DPU" == "yes" ]]; then
            log_message "Error: '-d' option specified for a DPU"
            return $EXIT_ERROR
        fi
        return $EXIT_SUCCESS
    fi

    # Not a DPU from here on: -p makes no sense.
    if [[ "$PRE_SHUTDOWN" == "yes" ]]; then
        log_message "Error: '-p' option specified for a non-DPU"
        return $EXIT_ERROR
    fi

    # No -d: on a smart switch reboot all DPUs in parallel, otherwise do nothing.
    if [[ "$REBOOT_DPU" != "yes" ]]; then
        if is_smartswitch; then
            reboot_all_dpus "$NUM_DPU" "$MODULE_REBOOT_SMARTSWITCH"
            result=$?
            return $result
        fi
        return 0
    fi

    # -d given: only valid on a smart switch with a known DPU count.
    if ! is_smartswitch; then
        log_message "Error: '-d' option specified for a non-smart-switch"
        return $EXIT_ERROR
    fi

    if [[ -z $NUM_DPU ]]; then
        log_message "Error: Failed to retrieve number of DPUs or no DPUs found"
        return $EXIT_ERROR
    fi

    # Module names are matched in lower case.
    DPU_MODULE_NAME="${DPU_NAME,,}"
    verify_dpu_module_name "$DPU_MODULE_NAME" "$NUM_DPU"
    result=$?
    if [[ $result -ne $EXIT_SUCCESS ]]; then
        return $result
    fi

    log_message "Rebooting device ${DPU_MODULE_NAME}"
    reboot_dpu "$DPU_MODULE_NAME" "$MODULE_REBOOT_DPU"
    result=$?
    return $result
}
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,7 @@
'scripts/psushow',
'scripts/queuestat',
'scripts/reboot',
'scripts/reboot_smartswitch_helper',
'scripts/route_check.py',
'scripts/route_check_test.sh',
'scripts/vnet_route_check.py',
Expand Down
Loading
Loading