Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
131567: roachprod: add command to download the latest pprof file r=herkolategan a=nameisbhaskar

Currently, whenever we see a spike in the CPU for a specific cluster, we have to download the latest pprof file for analysis. We generally select the latest file across all nodes or a specific set of nodes. This becomes a very manual process as we need to check each cluster to find the suitable file to download.
This command helps in finding the latest pprof to download across all nodes or given nodes. The command also takes a time parameter as input with which we can download the latest pprof before a specific time.

Epic: none
Release note: None

Co-authored-by: Bhaskarjyoti Bora <[email protected]>
  • Loading branch information
craig[bot] and nameisbhaskar committed Dec 1, 2024
2 parents b1474fa + 51e4ba9 commit 749b563
Show file tree
Hide file tree
Showing 4 changed files with 321 additions and 1 deletion.
22 changes: 22 additions & 0 deletions pkg/cmd/roachprod/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -2019,6 +2019,7 @@ func main() {
opentelemetryStartCmd,
opentelemetryStopCmd,
fetchLogsCmd,
getLatestPProf,
)
loadBalancerCmd.AddCommand(createLoadBalancerCmd)
loadBalancerCmd.AddCommand(loadBalancerPGUrl)
Expand Down Expand Up @@ -2117,3 +2118,24 @@ The logs will be placed in the directory if specified or in the directory named
fetchLogsTimeout)
}),
}

var getLatestPProf = &cobra.Command{
Use: "get-latest-pprof <cluster> [time-before]",
Short: "downloads the latest pprof file which is created on or before the provided time-before.",
Long: `Downloads the latest pprof file which is created on or before the provided time-before.
The time should be of the format 2022-08-31T15:23:22Z for UTC or 2022-08-31T15:23:22+05:30 for time zone.
If the time is not provided, it downloads the latest pprof file across all clusters.
`,
Args: cobra.MinimumNArgs(1),
// Wraps the command execution with additional error handling
Run: wrap(func(cmd *cobra.Command, args []string) (retErr error) {
cluster := args[0]
pprofTimeBefore := ""
if len(args) == 2 {
// time-before is optional
pprofTimeBefore = args[1]
}
ctx := context.Background()
return roachprod.DownloadLatestPProfFile(ctx, config.Logger, cluster, pprofTimeBefore)
}),
}
8 changes: 7 additions & 1 deletion pkg/roachprod/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ go_library(
"clusters_cache.go",
"multitenant.go",
"roachprod.go",
"roachprod_pprof.go",
],
importpath = "github.com/cockroachdb/cockroach/pkg/roachprod",
visibility = ["//visibility:public"],
Expand Down Expand Up @@ -43,11 +44,16 @@ go_library(

go_test(
name = "roachprod_test",
srcs = ["roachprod_test.go"],
srcs = [
"getpprof_test.go",
"roachprod_test.go",
],
embed = [":roachprod"],
deps = [
"//pkg/roachprod/install",
"//pkg/roachprod/logger",
"//pkg/roachprod/vm",
"@com_github_stretchr_testify//assert",
"@com_github_stretchr_testify//require",
],
)
90 changes: 90 additions & 0 deletions pkg/roachprod/getpprof_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
// Copyright 2024 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.

package roachprod

import (
"fmt"
"regexp"
"strings"
"testing"
"time"

"github.com/cockroachdb/cockroach/pkg/roachprod/install"
"github.com/stretchr/testify/require"
)

func Test_buildClusterPProfInfo(t *testing.T) {
clusterToPProfInfo := make(map[install.Node][]*pprofInfo)
buildClusterPProfInfo(strings.Split(getListClusterMockResponse(1), "\n"), install.Node(1), clusterToPProfInfo)
buildClusterPProfInfo(strings.Split(getListClusterMockResponse(2), "\n"), install.Node(2), clusterToPProfInfo)
require.Equal(t, 2, len(clusterToPProfInfo))
pprofRegExp := regexp.MustCompile(`cpuprof\.(.*)\.pprof`)
for node, pprofInfos := range clusterToPProfInfo {
nodeNumber := int(node)
// fileCount represents the file index as returned by getListClusterMockResponse
fileCount := 0
for _, pi := range pprofInfos {
// fileDate is using the same logic as used in getListClusterMockResponse for calculating the date
fileDate := nodeNumber + 20 - fileCount*2
require.Equal(t, fmt.Sprintf("cpuprof.2024-09-%dT10_05_22.234.69.pprof", fileDate),
pi.filename)
pprofFileMatch := pprofRegExp.FindStringSubmatch(pi.filename)
require.Equal(t, 2, len(pprofFileMatch))
ct, err := time.Parse("2006-01-02T15:04:05",
strings.ReplaceAll(strings.Split(pprofFileMatch[1], ".")[0], "_", ":"))
require.Nil(t, err)
require.Equal(t, ct, pi.createdTime)
fileCount++
if fileCount == 1 {
// the second last file in the list returned is invalid. Since the files get reverse sorted by time, the invalid
// file entry becomes the second entry
fileCount++
}
}
}
}

func Test_getLatestPProfInfoBefore(t *testing.T) {
timeNow := time.Now()
fileInfoMap := map[install.Node][]*pprofInfo{
1: {
{createdTime: timeNow.Add(-10)},
{createdTime: timeNow.Add(-20)},
{createdTime: timeNow.Add(-30)},
},
2: {
{createdTime: timeNow.Add(-15)},
{createdTime: timeNow.Add(-25)},
{createdTime: timeNow.Add(-28)},
},
3: {
{createdTime: timeNow.Add(-5)},
{createdTime: timeNow.Add(-11)},
{createdTime: timeNow.Add(-15)},
},
}
require.Nil(t, getLatestPProfInfo(fileInfoMap, timeNow.Add(-31)))
require.Equal(t, fileInfoMap[3][1], getLatestPProfInfo(fileInfoMap, timeNow.Add(-11)))
require.Equal(t, fileInfoMap[3][0], getLatestPProfInfo(fileInfoMap, time.Now()))
require.Equal(t, fileInfoMap[1][0], getLatestPProfInfo(fileInfoMap, timeNow.Add(-6)))
require.Equal(t, fileInfoMap[2][2], getLatestPProfInfo(fileInfoMap, timeNow.Add(-26)))
}

// getListClusterMockResponse returns the mocked data for a node. The node number decides the dates.
// e.g. if n=1, it returns 6 records with dates:
// Sept 11, 13, 15, 17, 19, 21
// Similarly for n=2, the 6 records are:
// Sept 12, 14, 16, 18, 20, 22
// Records on day n+18 is invalid.
func getListClusterMockResponse(n int) string {
return fmt.Sprintf(`-rw-r----- 1 ubuntu ubuntu 219534 Sep %d 10:05 logs/pprof_dump/cpuprof.2024-09-%dT10_05_22.234.69.pprof
-rw-r----- 1 ubuntu ubuntu 211461 Sep %d 10:05 logs/pprof_dump/cpuprof.2024-09-%dT10_05_22.234.69.pprof
-rw-r----- 1 ubuntu ubuntu 207410 Sep %d 10:05 logs/pprof_dump/cpuprof.2024-09-%dT10_05_22.234.69.pprof
-rw-r----- 1 ubuntu ubuntu 248365 Sep %d 10:06 logs/pprof_dump/cpuprof.2024-09-%dT10_05_22.234.69.pprof
-rw-r----- 1 ubuntu ubuntu 248365 Sep %d 10:06 logs/pprof_dump/cpuprof.2024-09-Invalid_06_%d.042.82.pprof
-rw-r----- 1 ubuntu ubuntu 231194 Sep %d 10:07 logs/pprof_dump/cpuprof.2024-09-%dT10_05_22.234.69.pprof`,
n+10, n+10, n+12, n+12, n+14, n+14, n+16, n+16, n+18, n+18, n+20, n+20)
}
202 changes: 202 additions & 0 deletions pkg/roachprod/roachprod_pprof.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,202 @@
// Copyright 2024 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.

package roachprod

import (
"context"
"fmt"
"regexp"
"sort"
"strings"
"time"

"github.com/cockroachdb/cockroach/pkg/roachprod/install"
"github.com/cockroachdb/cockroach/pkg/roachprod/logger"
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
)

var (
// pprofFileListRegex extracts the ppprof file name of the "ls -ltr" console output. It also extracts the date
// and time from the file name. e.g.
// -rw-r----- 1 ubuntu ubuntu 221508 Sep 27 10:05 logs/pprof_dump/cpuprof.2024-09-27T10_05_19.067.67.pprof
// extracts cpuprof.2024-09-27T10_05_19.067.67.pprof and 2024-09-27T10_05_19.067.67
pprofFileListRegex = regexp.MustCompile(`.+ logs/pprof_dump/(cpuprof\.(.*)\.pprof)$`)
)

// pprofInfo maintains the information of a single pprof file in a cluster
type pprofInfo struct {
node install.Node
filename string
createdTime time.Time
}

// DownloadLatestPProfFile downloads the latest pprof file before beforeTime across all cluster nodes
// if the beforeTime is not set, it defaults to the current time
func DownloadLatestPProfFile(
ctx context.Context, l *logger.Logger, clusterName string, beforeTimeArg string,
) (err error) {
beforeTime := timeutil.Now()
if beforeTimeArg != "" {
beforeTime, err = time.Parse(time.RFC3339, beforeTimeArg)
if err != nil {
return err
}
}
c, err := getClusterFromCache(l, clusterName)
if err != nil {
return err
}

nodePProfInfo, err := getClusterToPProfInfoMap(ctx, l, c)
if err != nil {
return err
}

latestPProf := getLatestPProfInfo(nodePProfInfo, beforeTime)

if latestPProf != nil {
// download the file if found
err = c.Get(ctx, l, install.Nodes{latestPProf.node},
fmt.Sprintf("logs/pprof_dump/%s", latestPProf.filename),
// the downloaded file name has the cluster name and node number prefixed
fmt.Sprintf("%s_%d_%s", c.Name, latestPProf.node, latestPProf.filename),
)
if err != nil {
return err
}
} else {
fmt.Println("No file found to download.")
}
return nil
}

// getLatestPProfInfo gets the latest pprofInfo across all clusters before beforeTime
// it returns nil if no pprof were found.
func getLatestPProfInfo(
clusterToPProfInfo map[install.Node][]*pprofInfo, beforeTime time.Time,
) *pprofInfo {
var latestPProf *pprofInfo
allEligiblePProf := make([]*pprofInfo, 0)
for _, pprofInfos := range clusterToPProfInfo {
// data is sorted. so, binary search works
info := getLatestPProfInfoBefore(pprofInfos, beforeTime)
if info != nil {
allEligiblePProf = append(allEligiblePProf, info)
}
}
// check for the length as we may not find any file with the given criteria
if len(allEligiblePProf) > 0 {
latestPProf = allEligiblePProf[0]
for i := 1; i < len(allEligiblePProf); i++ {
if latestPProf.createdTime.Before(allEligiblePProf[i].createdTime) {
latestPProf = allEligiblePProf[i]
}
}
}
return latestPProf
}

// getClusterToPProfInfoMap run "ls -ltr logs/pprof_dump" on the cluster and creates a map where
// the key is the node number and value is a list of pprofInfo. e.g.
// Consider the following console output for different nodes:
// Node 1 output:
//
// -rw-r----- 1 ubuntu ubuntu 207410 Sep 27 10:05 logs/pprof_dump/cpuprof.2024-09-27T10_05_42.752.79.pprof
// -rw-r----- 1 ubuntu ubuntu 248365 Sep 27 10:06 logs/pprof_dump/cpuprof.2024-09-27T10_06_33.042.82.pprof
// -rw-r----- 1 ubuntu ubuntu 231194 Sep 27 10:07 logs/pprof_dump/cpuprof.2024-09-27T10_07_33.427.85.pprof
//
// Node 2 output:
//
// -rw-r----- 1 ubuntu ubuntu 255149 Sep 27 10:05 logs/pprof_dump/cpuprof.2024-09-27T10_05_39.811.84.pprof
// -rw-r----- 1 ubuntu ubuntu 237958 Sep 27 10:07 logs/pprof_dump/cpuprof.2024-09-27T10_06_50.128.87.pprof
//
// The above forms the following map:
//
// {
// 1: []*pprofInfo{
// {clusterNodeNumber: 1, filename: cpuprof.2024-09-27T10_07_33.427.85.pprof, createdTime: 2024-09-27T10:07:33Z},
// {clusterNodeNumber: 1, filename: cpuprof.2024-09-27T10_06_33.042.82.pprof, createdTime: 2024-09-27T10:06:33Z},
// },
// 2: []*pprofInfo{
// {clusterNodeNumber: 2, filename: cpuprof.2024-09-27T10_06_50.128.87.pprof, createdTime: 2024-09-27T10:06:50Z},
// {clusterNodeNumber: 2, filename: cpuprof.2024-09-27T10_05_39.811.84.pprof, createdTime: 2024-09-27T10:05:39Z},
// },
// }
//
// Here 1 amd 2 as key represents the node number and the values represent the files in each node
// Note that the lists per node are reverse sorted.
func getClusterToPProfInfoMap(
ctx context.Context, l *logger.Logger, c *install.SyncedCluster,
) (map[install.Node][]*pprofInfo, error) {
results, err := c.RunWithDetails(ctx, l, install.WithNodes(c.Nodes), "",
"ls -ltr logs/pprof_dump/cpuprof.*.pprof")
if err != nil {
return nil, err
}
clusterToPProfInfo := make(map[install.Node][]*pprofInfo)
for _, r := range results {
node := r.Node
output := r.Stdout
buildClusterPProfInfo(strings.Split(output, "\n"), node, clusterToPProfInfo)
}

return clusterToPProfInfo, nil
}

// getLatestPProfInfoBefore gets the last pprofInfo record before a certain time using binary search
func getLatestPProfInfoBefore(pprofList []*pprofInfo, beforeTime time.Time) *pprofInfo {
// Binary search to find the first element where createdTime is before the given time
index := sort.Search(len(pprofList), func(i int) bool {
// Since the list is reverse sorted, we check the opposite condition
// 1 is added to select the file with exact matching timestamp
return pprofList[i].createdTime.Before(beforeTime.Add(1))
})

// no files are found
if index == len(pprofList) {
return nil
}
// the pprofInfo at the index is returned
return pprofList[index]
}

// buildClusterPProfInfo scans through each line and builds the map of cluster node to the list of files
func buildClusterPProfInfo(
lines []string, node install.Node, clusterToPProfInfo map[install.Node][]*pprofInfo,
) {
// allPProf contains the list of pprofInfo for a node.
// As the lines are read in sequence, the files for a single node always come in one flow.
allPProf := make([]*pprofInfo, 0)
for index := 0; index < len(lines); index++ {
// One line at a time is read. The lines that are of our interest looks like the following:
// "-rw-r----- 1 ubuntu ubuntu 237958 Sep 27 10:07 logs/pprof_dump/<pprof file name"
// any other line is ignored
line := lines[index]
// it checks if the line matches any ls -ltr output of ppprof file
pprofFileMatch := pprofFileListRegex.FindStringSubmatch(line)
if len(pprofFileMatch) > 2 {
// this is a line with a matching pprof file name
// extract the date from the file name. To make it parsable replace _ with :
ct, err := time.Parse("2006-01-02T15:04:05",
strings.ReplaceAll(strings.Split(pprofFileMatch[2], ".")[0], "_", ":"))
if err != nil {
fmt.Println("Unsupported file name format:", pprofFileMatch[1], err)
continue
}
// append the new pprofInfo in the beginning
allPProf = append([]*pprofInfo{{
node: node,
filename: pprofFileMatch[1],
createdTime: ct,
}}, allPProf...)
}
// if the line does not match any of the above, continue to the next line
}
// we reach here once all the lines are read. if allPProf has any data, this is for the node
if len(allPProf) > 0 {
clusterToPProfInfo[node] = allPProf
}
}

0 comments on commit 749b563

Please sign in to comment.