Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add monitoring for Node failure #528

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
118 changes: 118 additions & 0 deletions Kubernetes/windows/debug/monitoring/MonitorWindowsNode.ps1
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
[CmdletBinding()]
param
(
# Path to the module defining the strategy to use for monitoring the node.
# The value is forwarded unchanged to Start-Monitoring at the end of this script.
# NOTE(review): this default ends in "StrategyModulePath.psm1" while the default
# declared on Start-Monitoring ends in "StrategyModule.psm1" - confirm which file
# name is intended.
[string]
$StrategyModulePath = "C:\k\debug\StrategyModulePath.psm1"
)

function Start-HNSTrace
{
    <#
    .SYNOPSIS
    Runs the log collection script and then starts the HNS trace.

    .DESCRIPTION
    Invokes collectlogs.ps1 and starthnstrace.ps1 from the current directory.
    The trace is written to HNSTrace.etl inside the current directory.
    #>

    .\collectlogs.ps1

    # Not read by any statement in this function. It is kept because scripts
    # invoked below run in a child scope and could read it - TODO confirm.
    $sessionName = 'HnsCapture'
    Write-Host "Starting HNS tracing"

    # The trace file is placed directly in the current working directory.
    $etlPath = Join-Path -Path (Get-Location).Path -ChildPath "HNSTrace.etl"
    .\starthnstrace.ps1 -NoPrompt -MaxFileSize 1024 -EtlFile $etlPath
}

function Stop-HNSTrace
{
    <#
    .SYNOPSIS
    Stops the HNS trace session, collects logs and dumps the HNS process.

    .DESCRIPTION
    Stops the 'HnsCapture' net event session, runs collectlogs.ps1 and
    collect-windows-logs.ps1 from the current directory, and finally takes a
    full memory dump of the HNS service process with ProcDump. The dump is
    skipped (with a warning) when the HNS service has no running process.
    #>

    # Stop the tracing
    $sessionName = 'HnsCapture'
    Write-Host "Stopping $sessionName."
    Stop-NetEventSession $sessionName

    # Collect logs
    .\collectlogs.ps1
    .\collect-windows-logs.ps1

    # Take a HNS Process dump
    $hnsProcessId = Get-WmiObject -Class Win32_Service -Filter "Name LIKE 'Hns'" |
        Select-Object -ExpandProperty ProcessId

    # Win32_Service reports ProcessId 0 for a stopped service, and the query
    # yields nothing when the service does not exist; neither can be dumped.
    if (-not $hnsProcessId)
    {
        Write-Warning "HNS service process not found; skipping the HNS process dump."
        return
    }

    # Options precede the process id, as in ProcDump's documented usage.
    .\Procdump\Procdump.exe -accepteula -ma $hnsProcessId
}

function Start-Monitoring
{
    <#
    .SYNOPSIS
    Monitors a Windows node for an error condition and gathers logs once it is faulted.

    .DESCRIPTION
    Creates a randomly named working directory under the current location,
    downloads the helper scripts, modules and ProcDump into it, imports the
    strategy module and starts the HNS trace. It then calls IsNodeFaulted
    every PollingInterval seconds. After FailureThreshold consecutive faulted
    results it stops the trace, collects logs, restores the original location,
    calls TerminateHandler with the working directory and returns.

    Setup is fail-fast: if a download, the archive extraction or a module import
    fails, the original location is restored and the error is re-thrown instead
    of entering the polling loop with missing commands.

    .PARAMETER StrategyModulePath
    Local path or URL of the strategy module. The module must provide
    LogMessage, StartHandler, TerminateHandler and IsNodeFaulted.

    .PARAMETER PollingInterval
    Interval to poll for failure, in seconds.

    .PARAMETER FailureThreshold
    Number of consecutive failures required to declare the node faulty.
    #>
    param
    (
        # Path with filename where the configuration module is located
        [string]
        $StrategyModulePath = "C:\k\debug\StrategyModule.psm1",

        # Interval to poll for failure in seconds
        [int]
        $PollingInterval = 15,

        # Number of consecutive failures to declare the node is faulty
        [int]
        $FailureThreshold = 3
    )

    # Anything of the form "<scheme>://..." is downloaded; everything else is
    # treated as a local file. A local path is resolved before the location
    # changes so that relative paths refer to the caller's directory.
    $strategyIsRemote = $StrategyModulePath -match '^[a-z][a-z0-9+.-]+://'
    if (-not $strategyIsRemote)
    {
        $StrategyModulePath = (Resolve-Path -LiteralPath $StrategyModulePath -ErrorAction Stop).Path
    }

    # Generate a random directory to capture all the logs
    $outDir = Join-Path -Path (Get-Location).Path -ChildPath ([io.Path]::GetRandomFileName())
    # Out-Null keeps the DirectoryInfo object out of this function's output.
    New-Item -ItemType Directory -Path $outDir | Out-Null
    Push-Location -Path $outDir

    try
    {
        # Download necessary files (target file name -> source URL)
        $downloads = [ordered]@{
            'collectlogs.ps1'          = 'https://raw.githubusercontent.com/microsoft/SDN/master/Kubernetes/windows/debug/collectlogs.ps1'
            'VFP.psm1'                 = 'https://raw.githubusercontent.com/microsoft/SDN/master/Kubernetes/windows/debug/VFP.psm1'
            'HNS.psm1'                 = 'https://raw.githubusercontent.com/microsoft/SDN/master/Kubernetes/windows/hns.psm1'
            'collect-windows-logs.ps1' = 'https://raw.githubusercontent.com/Azure/aks-engine/master/scripts/collect-windows-logs.ps1'
            'starthnstrace.ps1'        = 'https://raw.githubusercontent.com/microsoft/SDN/master/Kubernetes/windows/debug/starthnstrace.ps1'
            'Procdump.zip'             = 'https://download.sysinternals.com/files/Procdump.zip'
        }
        foreach ($fileName in $downloads.Keys)
        {
            Invoke-WebRequest -Uri $downloads[$fileName] -OutFile $fileName -UseBasicParsing -ErrorAction Stop
        }

        # Extracts into .\Procdump, which is where Stop-HNSTrace looks for Procdump.exe.
        Expand-Archive -Path .\Procdump.zip -ErrorAction Stop

        if ($strategyIsRemote)
        {
            Invoke-WebRequest -Uri $StrategyModulePath -OutFile StrategyModule.psm1 -UseBasicParsing -ErrorAction Stop
        }
        else
        {
            Copy-Item -LiteralPath $StrategyModulePath -Destination .\StrategyModule.psm1 -ErrorAction Stop
        }

        Import-Module .\VFP.psm1 -ErrorAction Stop
        Import-Module .\HNS.psm1 -ErrorAction Stop
        Import-Module .\StrategyModule.psm1 -ErrorAction Stop
    }
    catch
    {
        # Do not leave the caller stranded in the working directory.
        Pop-Location
        throw
    }

    Start-HNSTrace
    $consecutiveFailures = 0

    StartHandler

    LogMessage "Started Monitoring"

    while ($true)
    {
        if (IsNodeFaulted)
        {
            $consecutiveFailures++
            # Number of consecutive failures to confirm that the Windows node is faulted for real
            # and this is not an intermittent failure
            if ($consecutiveFailures -ge $FailureThreshold)
            {
                Stop-HNSTrace

                Pop-Location

                TerminateHandler -LogPath $outDir

                LogMessage "Diagnostic logs are available at $outDir"
                return
            }
        }
        else
        {
            $consecutiveFailures = 0
        }

        # Adjust the sleep time to lower the polling frequency
        Start-Sleep -Seconds $PollingInterval
    }
}

##### Start execution #########

# Forward the script-level parameter to the monitoring loop.
$monitoringArguments = @{ StrategyModulePath = $StrategyModulePath }
Start-Monitoring @monitoringArguments
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# Implement these 4 functions:
# 1. LogMessage - Logs a message. The default implementation appends to a file.
# 2. StartHandler - Handler invoked after the monitoring starts (before the node is in the repro state).
# 3. TerminateHandler - Handler invoked before the monitoring stops (after the node is in the repro state).
# 4. IsNodeFaulted - Returns $true when the node is in the repro state, $false otherwise.

function LogMessage
{
    <#
    .SYNOPSIS
    Appends the current date/time followed by the message to the monitoring log file.

    .PARAMETER Message
    Text to record. Defaults to an empty string.
    #>
    param
    (
        [string] $Message = ""
    )

    # Re-implement if a different log destination is needed.
    $logTarget = @{
        FilePath = "C:\k\debug\MonitorWindowsNode.txt"
        Append   = $true
    }

    Get-Date | Out-File @logTarget
    $Message | Out-File @logTarget
}

function StartHandler
{
    <#
    .SYNOPSIS
    Downloads the HNS v2 helper module into the current directory and imports it.

    .DESCRIPTION
    Uses full cmdlet and parameter names instead of the wget/ipmo aliases and the
    abbreviated -o switch, so the handler does not depend on alias availability.
    #>

    #download file
    Invoke-WebRequest -Uri https://raw.githubusercontent.com/microsoft/SDN/master/Kubernetes/windows/hns.v2.psm1 -OutFile HNS.V2.psm1 -UseBasicParsing

    Import-Module .\HNS.V2.psm1
}

function TerminateHandler
{
    <#
    .SYNOPSIS
    Records in the log that the node failed and where the collected information is.

    .PARAMETER LogPath
    Location of the collected logs; included verbatim in the log message.
    #>
    param
    (
        [string] $LogPath = ""
    )

    $entries = @(
        "Capturing information after node failure",
        "Information has been logged: $LogPath"
    )
    foreach ($entry in $entries)
    {
        LogMessage $entry
    }

    #TODO: add azure blob
    #TODO: add way to notify user of issue
}

function IsNodeFaulted
{
    <#
    .SYNOPSIS
    Returns $true when an endpoint port has at least 10% fewer VFP entries than expected.

    .DESCRIPTION
    The expected count is the number of IPSET policies on the second HNS network
    returned by Get-HnsNetwork. For every endpoint port, the number of
    "Friendly Name" lines printed by "vfpctrl /list-tag" is compared with that
    count. On the first port that falls short, a marker set policy is created
    on the endpoint's network (to simplify log lookup) and $true is returned.
    Returns $false when no IPSET policies are expected or no port falls short.

    .OUTPUTS
    System.Boolean. Output of New-HNSSetPolicy is discarded so that only the
    boolean is returned.
    #>

    #More specific lookup by azure name. Needs more testing before is used.
    #((get-hnsnetwork | ? name -like azure)[0].Policies | Where-Object PolicyType -eq IPSET).count
    $expectedNumPolicies = @(((Get-HnsNetwork | Select-Object Policies)[1].Policies) | Where-Object PolicyType -eq IPSET).Count
    if ($expectedNumPolicies -eq 0)
    {
        return $false
    }

    # A port is considered faulty at or below this count (10% or more missing).
    $faultThreshold = $expectedNumPolicies - $expectedNumPolicies * .1

    # Query the endpoints once; the same snapshot is used for the port list and
    # for the network lookup below.
    $endpoints = @(Get-HnsEndpoint)
    $EndpointPorts = $endpoints |
        ForEach-Object { $_.Resources.Allocators } |
        Where-Object Tag -eq "Endpoint Port" |
        Select-Object -ExpandProperty EndpointPortGuid

    foreach ($endPort in $EndpointPorts)
    {
        $currNumPolicies = @(vfpctrl /port $endPort /list-tag | Select-String "Friendly Name").Count
        #if difference is greater than or equal to 10%
        if ($currNumPolicies -le $faultThreshold)
        {
            #get the virtualNetwork
            $netId = $endpoints |
                Where-Object { $_.Resources.Allocators.EndPointPortGuid -eq $endPort } |
                Select-Object -ExpandProperty VirtualNetwork
            #send test policy to simplify log lookup
            New-HNSSetPolicy -NetworkId $netId -setType 0 -setValues "10.22.0.44" -setName "spTestName" -setId "spTestId" | Out-Null

            return $true
        }
    }
    return $false
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# Implement these 4 functions:
# 1. LogMessage - Logs a message. The default implementation appends to a file.
# 2. StartHandler - Handler invoked after the monitoring starts (before the node is in the repro state).
# 3. TerminateHandler - Handler invoked before the monitoring stops (after the node is in the repro state).
# 4. IsNodeFaulted - Returns $true when the node is in the repro state, $false otherwise.

function LogMessage
{
    <#
    .SYNOPSIS
    Appends the current date/time followed by the message to the monitoring log file.

    .PARAMETER Message
    Text to record. Defaults to an empty string.
    #>
    param
    (
        [string] $Message = ""
    )

    # Re-implement if a different log destination is needed.
    $logTarget = @{
        FilePath = "C:\k\debug\MonitorWindowsNode.txt"
        Append   = $true
    }

    Get-Date | Out-File @logTarget
    $Message | Out-File @logTarget
}

# Handler invoked after the monitoring starts.
# Placeholder: this strategy performs no start-up work.
function StartHandler
{
#logic here
}

function TerminateHandler
{
    <#
    .SYNOPSIS
    Zips the collected logs and uploads the archive to Azure blob storage with AzCopy.

    .DESCRIPTION
    Compresses LogPath into "<computername>-<timestamp>_logs.zip" in the current
    directory. When an upload destination is configured, AzCopy is downloaded,
    extracted and used to copy the archive to it. When no destination is
    configured the archive is kept locally and the upload is skipped.

    .PARAMETER LogPath
    Path of the collected logs to archive.

    .PARAMETER DestinationSasUrl
    Blob container URL including a SAS token that allows writing. Defaults to the
    MONITOR_LOGS_SAS_URL environment variable. The token is deliberately not
    stored in this file: a SAS URL is a credential and must not be committed to
    source control.
    #>
    param
    (
        [string] $LogPath = "",

        [string] $DestinationSasUrl = $env:MONITOR_LOGS_SAS_URL
    )

    # 24-hour clock (HH) so archives created in the morning and in the afternoon
    # cannot end up with the same name.
    $timeStamp = Get-Date -Format 'yyyyMMdd-HHmmss'
    $zipFileName = "$env:computername-$($timeStamp)_logs.zip"
    Compress-Archive -LiteralPath $LogPath -DestinationPath $zipFileName

    if ([string]::IsNullOrWhiteSpace($DestinationSasUrl))
    {
        LogMessage "No upload destination configured; logs were archived locally as $zipFileName"
        return
    }

    # copy the logs to Azure blob
    Invoke-WebRequest -Uri https://azcopyvnext.azureedge.net/release20211027/azcopy_windows_amd64_10.13.0.zip -OutFile azcopyv10.zip -UseBasicParsing
    Expand-Archive .\azcopyv10.zip -Force

    .\azcopyv10\azcopy_windows_amd64_10.13.0\azcopy.exe copy $zipFileName $DestinationSasUrl
    if ($LASTEXITCODE -ne 0)
    {
        LogMessage "Uploading $zipFileName failed: azcopy exited with code $LASTEXITCODE"
    }
}

# Reports whether the node is in the repro state.
# Placeholder: unconditionally returns $true, so every call reports the node
# as faulted until real detection logic is added.
function IsNodeFaulted
{
#logic here
return $true
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# IP of the Kubernetes service whose HNS policy is looked up by IsNodeFaulted
# (matched against the policy VIPs).
$ServiceIp = "192.168.0.10"
# Port of that service (matched against the policy ExternalPort).
$ServicePort = 53

# Implement these 4 functions:
# 1. LogMessage - Logs a message. The default implementation appends to a file.
# 2. StartHandler - Handler invoked after the monitoring starts (before the node is in the repro state).
# 3. TerminateHandler - Handler invoked before the monitoring stops (after the node is in the repro state).
# 4. IsNodeFaulted - Returns $true when the node is in the repro state, $false otherwise.

function LogMessage
{
    <#
    .SYNOPSIS
    Appends the current date/time followed by the message to the monitoring log file.

    .PARAMETER Message
    Text to record. Defaults to an empty string.
    #>
    param
    (
        [string] $Message = ""
    )

    # Re-implement if a different log destination is needed.
    $logTarget = @{
        FilePath = "C:\k\debug\MonitorWindowsNode.txt"
        Append   = $true
    }

    Get-Date | Out-File @logTarget
    $Message | Out-File @logTarget
}

function StartHandler
{
    <#
    .SYNOPSIS
    Logs the state of the HNS and Kubeproxy services before the repro.

    .DESCRIPTION
    Each Win32_Service object is rendered with Out-String before it is logged.
    Passing the object straight to LogMessage would coerce it to [string],
    which yields only the WMI object path and none of the service properties.
    #>

    LogMessage "Capturing some information before the repro."

    foreach ($serviceName in 'hns', 'Kubeproxy')
    {
        $serviceInfo = Get-WmiObject -Class Win32_Service -Filter "Name LIKE '$serviceName'"
        $serviceDetails = ($serviceInfo | Out-String).Trim()
        LogMessage $serviceDetails
    }
}

function TerminateHandler
{
    <#
    .SYNOPSIS
    Logs the state of the HNS and Kubeproxy services after the repro.

    .DESCRIPTION
    Each Win32_Service object is rendered with Out-String before it is logged.
    Passing the object straight to LogMessage would coerce it to [string],
    which yields only the WMI object path and none of the service properties.
    A final entry names the service IP and port whose HNS policy is missing.

    .PARAMETER LogPath
    Location of the collected logs. Not used by this strategy.
    #>
    param
    (
        [string] $LogPath = ""
    )

    LogMessage "Capturing some information after the repro."

    foreach ($serviceName in 'hns', 'Kubeproxy')
    {
        $serviceInfo = Get-WmiObject -Class Win32_Service -Filter "Name LIKE '$serviceName'"
        $serviceDetails = ($serviceInfo | Out-String).Trim()
        LogMessage $serviceDetails
    }

    LogMessage "HNS Policy for K8's Service with IP $ServiceIp and Port $ServicePort is missing"
}

function IsNodeFaulted
{
    <#
    .SYNOPSIS
    Returns $true when no HNS policy list exists for the configured service IP and port.

    .DESCRIPTION
    A policy list matches when its VIPs include $ServiceIp and its ExternalPort
    equals $ServicePort. VIPs are compared with -contains on an array so the
    whole address must match; calling .Contains() on a single VIP string would
    be a substring test (for example "192.168.0.100" would match "192.168.0.10").

    .OUTPUTS
    System.Boolean
    #>

    $matchingPolicyLists = @(Get-HnsPolicyList | Where-Object {
        $policies = $_.Policies
        (@($policies.VIPs) -contains $ServiceIp) -and (@($policies.ExternalPort) -contains $ServicePort)
    })

    # Faulted means the expected policy is absent.
    return ($matchingPolicyLists.Count -eq 0)
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Implement these 4 functions:
# 1. LogMessage - Logs a message. The default implementation appends to a file.
# 2. StartHandler - Handler invoked after the monitoring starts (before the node is in the repro state).
# 3. TerminateHandler - Handler invoked before the monitoring stops (after the node is in the repro state).
# 4. IsNodeFaulted - Returns $true when the node is in the repro state, $false otherwise.

function LogMessage
{
    <#
    .SYNOPSIS
    Appends the current date/time followed by the message to the monitoring log file.

    .PARAMETER Message
    Text to record. Defaults to an empty string.
    #>
    param
    (
        [string] $Message = ""
    )

    # Re-implement if a different log destination is needed.
    $logTarget = @{
        FilePath = "C:\k\debug\MonitorWindowsNode.txt"
        Append   = $true
    }

    Get-Date | Out-File @logTarget
    $Message | Out-File @logTarget
}

# Handler invoked after the monitoring starts.
# Template placeholder: add any setup needed before the repro here.
function StartHandler
{
#logic here
}

# Handler invoked before the monitoring stops.
# Template placeholder: add post-failure actions here.
function TerminateHandler
{
param
(
# Location of the collected logs; defaults to an empty string.
[string] $LogPath = ""
)
#logic here
}

# Reports whether the node is in the repro state.
# Template placeholder: unconditionally returns $true, so every call reports the
# node as faulted until real detection logic is added.
function IsNodeFaulted
{
#logic here
return $true
}