From 80f08f97d5ddae60db02b8cc355be2d5c3201389 Mon Sep 17 00:00:00 2001 From: Vadym Fedorov Date: Tue, 6 Feb 2024 12:10:42 -0600 Subject: [PATCH] Added ability to enable DCGM library log output Signed-off-by: Vadym Fedorov --- .vscode/launch.json | 4 +- Makefile | 2 +- go.mod | 2 +- internal/pkg/logging/logger_adapter.go | 72 +++++ internal/pkg/logging/logger_adapter_test.go | 113 +++++++ pkg/cmd/app.go | 312 +++++++++++--------- pkg/dcgmexporter/config.go | 2 + pkg/dcgmexporter/const.go | 20 ++ pkg/dcgmexporter/server.go | 10 +- pkg/stdout/capture.go | 78 +++++ pkg/stdout/capture_test.go | 83 ++++++ pkg/stdout/capture_test_wrapper.go | 56 ++++ pkg/stdout/stdoutprocessor.go | 63 ++++ 13 files changed, 670 insertions(+), 147 deletions(-) create mode 100644 internal/pkg/logging/logger_adapter.go create mode 100644 internal/pkg/logging/logger_adapter_test.go create mode 100644 pkg/stdout/capture.go create mode 100644 pkg/stdout/capture_test.go create mode 100644 pkg/stdout/capture_test_wrapper.go create mode 100644 pkg/stdout/stdoutprocessor.go diff --git a/.vscode/launch.json b/.vscode/launch.json index a4f0acbf..83fd9326 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -14,7 +14,9 @@ "args": [ "-f", "./etc/default-counters.csv", - "--debug" + "--debug", + "--enable-dcgm-log", + "--dcgm-log-level=INFO" ] } ] diff --git a/Makefile b/Makefile index a0f8038c..c2308afd 100644 --- a/Makefile +++ b/Makefile @@ -94,4 +94,4 @@ validate-modules: tools: ## Install required tools and utilities go install github.com/golangci/golangci-lint/cmd/golangci-lint@v1.55.2 go install github.com/axw/gocov/gocov@latest - + diff --git a/go.mod b/go.mod index b53b855b..08406483 100644 --- a/go.mod +++ b/go.mod @@ -7,6 +7,7 @@ require ( github.com/NVIDIA/go-nvml v0.12.0-2 github.com/avast/retry-go/v4 v4.5.1 github.com/bits-and-blooms/bitset v1.13.0 + github.com/go-kit/log v0.2.1 github.com/gorilla/mux v1.8.1 github.com/prometheus/client_model v0.6.0 github.com/prometheus/common v0.47.0 @@ -31,7 +32,6 @@ require ( github.com/davecgh/go-spew v1.1.1 // indirect github.com/emicklei/go-restful/v3 v3.11.0 // indirect github.com/evanphx/json-patch v4.12.0+incompatible // indirect - github.com/go-kit/log v0.2.1 // indirect github.com/go-logfmt/logfmt v0.5.1 // indirect github.com/go-logr/logr v1.3.0 // indirect github.com/go-openapi/jsonpointer v0.19.6 // indirect diff --git a/internal/pkg/logging/logger_adapter.go b/internal/pkg/logging/logger_adapter.go new file mode 100644 index 00000000..dccc5a3a --- /dev/null +++ b/internal/pkg/logging/logger_adapter.go @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package logging + +import ( + "fmt" + "github.com/go-kit/log" + "github.com/go-kit/log/level" + "github.com/sirupsen/logrus" +) + +// LogrusAdapter is an adapter that allows logrus Logger to be used as a go-kit/log Logger. +type LogrusAdapter struct { + Logger *logrus.Logger +} + +// NewLogrusAdapter creates a new LogrusAdapter with the provided logrus.Logger. +func NewLogrusAdapter(logger *logrus.Logger) log.Logger { + return &LogrusAdapter{ + Logger: logger, + } +} + +// Log implements the go-kit/log Logger interface. +func (a *LogrusAdapter) Log(keyvals ...interface{}) error { + if len(keyvals)%2 != 0 { + keyvals = append(keyvals, "MISSING") + } + + fields := logrus.Fields{} + for i := 0; i < len(keyvals); i += 2 { + key, ok := keyvals[i].(string) + if !ok { + // If the key is not a string, use a default key + key = "missing_key" + } + fields[key] = keyvals[i+1] + } + + msg, exists := fields["msg"] + if exists { + delete(fields, "msg") + } + + lvl, exists := fields["level"] + if !exists { + fields["level"] = level.InfoValue() + } + delete(fields, "level") + parsedLvl, err := logrus.ParseLevel(fmt.Sprint(lvl)) + if err != nil { + parsedLvl = logrus.InfoLevel + } + + a.Logger.WithFields(fields).Log(parsedLvl, msg) + + return nil +} diff --git a/internal/pkg/logging/logger_adapter_test.go b/internal/pkg/logging/logger_adapter_test.go new file mode 100644 index 00000000..670ac989 --- /dev/null +++ b/internal/pkg/logging/logger_adapter_test.go @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package logging + +import ( + "github.com/go-kit/log/level" + "github.com/sirupsen/logrus" + "github.com/sirupsen/logrus/hooks/test" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "testing" +) + +func TestLogrusAdapter_Log(t *testing.T) { + type testCase struct { + name string + keyvals []interface{} + assert func(*testing.T, *logrus.Entry) + } + + //"msg", "Listening on", "address" + testCases := []testCase{ + { + name: "Success", + keyvals: []interface{}{ + "level", + level.InfoValue, + "msg", + "Listening on", + "address", + "127.0.0.0.1:8080", + }, + assert: func(t *testing.T, entry *logrus.Entry) { + t.Helper() + require.NotNil(t, entry) + assert.Equal(t, "Listening on", entry.Message) + require.Contains(t, entry.Data, "address") + assert.Equal(t, "127.0.0.0.1:8080", entry.Data["address"]) + }, + }, + { + name: "When no Level", + keyvals: []interface{}{ + "msg", + "Listening on", + "address", + "127.0.0.0.1:8080", + }, + assert: func(t *testing.T, entry *logrus.Entry) { + t.Helper() + require.NotNil(t, entry) + assert.Equal(t, "Listening on", entry.Message) + require.Contains(t, entry.Data, "address") + assert.Equal(t, "127.0.0.0.1:8080", entry.Data["address"]) + }, + }, + { + name: "When key is not string", + keyvals: []interface{}{ + "msg", + "Listening on", + 42, + "127.0.0.0.1:8080", + }, + assert: func(t *testing.T, entry *logrus.Entry) { + t.Helper() + require.NotNil(t, entry) + assert.Equal(t, "Listening on", entry.Message) + require.Contains(t, entry.Data, "missing_key") + assert.Equal(t, "127.0.0.0.1:8080", entry.Data["missing_key"]) + }, + }, + { + name: "When value is missing", + keyvals: []interface{}{ + "msg", + "Listening on", + "address", + }, + assert: func(t *testing.T, entry *logrus.Entry) { + t.Helper() + require.NotNil(t, entry) + assert.Equal(t, "Listening on", entry.Message) + require.Contains(t, entry.Data, "address") + assert.Equal(t, "MISSING", entry.Data["address"]) + }, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + logrusLogger, logHook := test.NewNullLogger() + logger := NewLogrusAdapter(logrusLogger) + err := logger.Log(tc.keyvals...) + require.NoError(t, err) + tc.assert(t, logHook.LastEntry()) + }) + } +} diff --git a/pkg/cmd/app.go b/pkg/cmd/app.go index 3fbce59f..579c328f 100644 --- a/pkg/cmd/app.go +++ b/pkg/cmd/app.go @@ -2,11 +2,13 @@ package cmd import ( "bytes" + "context" "fmt" "os" "os/signal" "runtime" "runtime/debug" + "slices" "strconv" "strings" "sync" @@ -14,11 +16,11 @@ import ( "text/template" "time" + "github.com/NVIDIA/dcgm-exporter/pkg/dcgmexporter" + "github.com/NVIDIA/dcgm-exporter/pkg/stdout" "github.com/NVIDIA/go-dcgm/pkg/dcgm" "github.com/sirupsen/logrus" "github.com/urfave/cli/v2" - - "github.com/NVIDIA/dcgm-exporter/pkg/dcgmexporter" ) const ( @@ -67,6 +69,8 @@ const ( CLIReplaceBlanksInModelName = "replace-blanks-in-model-name" CLIDebugMode = "debug" CLIClockEventsCountWindowSize = "clock-events-count-window-size" + CLIEnableDCGMLog = "enable-dcgm-log" + CLIDCGMLogLevel = "dcgm-log-level" ) func NewApp(buildVersion ...string) *cli.App { @@ -191,7 +195,7 @@ func NewApp(buildVersion ...string) *cli.App { Name: CLIReplaceBlanksInModelName, Aliases: []string{"rbmn"}, Value: false, - Usage: "Replaces every blank space in the GPU model name with a dash, ensuring a continuous, space-free identifier.", + Usage: "Replace every blank space in the GPU model name with a dash, ensuring a continuous, space-free identifier.", EnvVars: []string{"DCGM_EXPORTER_REPLACE_BLANKS_IN_MODEL_NAME"}, }, &cli.BoolFlag{ @@ -206,6 +210,18 @@ func NewApp(buildVersion ...string) *cli.App { Usage: "Set time window size in milliseconds (ms) for counting clock events in DCGM Exporter.", EnvVars: []string{"DCGM_EXPORTER_CLOCK_EVENTS_COUNT_WINDOW_SIZE"}, }, + &cli.BoolFlag{ + Name: CLIEnableDCGMLog, + Value: false, + Usage: "Enable writing DCGM logs to standard output (stdout).", + EnvVars: []string{"DCGM_EXPORTER_ENABLE_DCGM_LOG"}, + }, + &cli.StringFlag{ + Name: CLIDCGMLogLevel, + Value: dcgmexporter.DCGMDbgLvlNone, + Usage: "Specify the DCGM log verbosity level. This parameter is effective only when the '--enable-dcgm-log' option is set to 'true'. Possible values: NONE, FATAL, ERROR, WARN, INFO, DEBUG and VERB", + EnvVars: []string{"DCGM_EXPORTER_DCGM_LOG_LEVEL"}, + }, } if runtime.GOOS == "linux" { @@ -236,180 +252,188 @@ func newOSWatcher(sigs ...os.Signal) chan os.Signal { } func action(c *cli.Context) (err error) { -restart: - - // The purpose of this function is to capture any panic that may occur - // during initialization and return an error. - defer func() { - if r := recover(); r != nil { - logrus.WithField(dcgmexporter.LoggerStackTrace, string(debug.Stack())).Error("Encountered a failure.") - err = fmt.Errorf("encountered a failure; err: %v", r) - } - }() - - logrus.Info("Starting dcgm-exporter") - config, err := contextToConfig(c) - if err != nil { - return err - } - - if config.Debug { - // enable debug logging - logrus.SetLevel(logrus.DebugLevel) - logrus.Debug("Debug output is enabled") - } - - logrus.Debugf("Command line: %s", strings.Join(os.Args, " ")) - - logrus.WithField(dcgmexporter.LoggerDumpKey, fmt.Sprintf("%+v", config)).Debug("Loaded configuration") + ctx, cancel := context.WithCancel(context.Background()) + return stdout.Capture(ctx, func() error { + restart: + // The purpose of this function is to capture any panic that may occur + // during initialization and return an error. + defer func() { + if r := recover(); r != nil { + logrus.WithField(dcgmexporter.LoggerStackTrace, string(debug.Stack())).Error("Encountered a failure.") + err = fmt.Errorf("encountered a failure; err: %v", r) + } + }() - if config.UseRemoteHE { - logrus.Info("Attemping to connect to remote hostengine at ", config.RemoteHEInfo) - cleanup, err := dcgm.Init(dcgm.Standalone, config.RemoteHEInfo, "0") - defer cleanup() + logrus.Info("Starting dcgm-exporter") + config, err := contextToConfig(c) if err != nil { - logrus.Fatal(err) + return err } - } else { - cleanup, err := dcgm.Init(dcgm.Embedded) - defer cleanup() - if err != nil { - logrus.Fatal(err) + + if config.Debug { + // enable debug logging + logrus.SetLevel(logrus.DebugLevel) + logrus.Debug("Debug output is enabled") } - } - logrus.Info("DCGM successfully initialized!") - dcgm.FieldsInit() - defer dcgm.FieldsTerm() + logrus.Debugf("Command line: %s", strings.Join(os.Args, " ")) - var groups []dcgm.MetricGroup - groups, err = dcgm.GetSupportedMetricGroups(0) - if err != nil { - config.CollectDCP = false - logrus.Info("Not collecting DCP metrics: ", err) - } else { - logrus.Info("Collecting DCP Metrics") - config.MetricGroups = groups - } + logrus.WithField(dcgmexporter.LoggerDumpKey, fmt.Sprintf("%+v", config)).Debug("Loaded configuration") - cs, err := dcgmexporter.GetCounterSet(config) - - if err != nil { - logrus.Fatal(err) - } + if config.UseRemoteHE { + logrus.Info("Attemping to connect to remote hostengine at ", config.RemoteHEInfo) + cleanup, err := dcgm.Init(dcgm.Standalone, config.RemoteHEInfo, "0") + defer cleanup() + if err != nil { + logrus.Fatal(err) + } + } else { + if config.EnableDCGMLog { + os.Setenv("__DCGM_DBG_FILE", "-") + os.Setenv("__DCGM_DBG_LVL", config.DCGMLogLevel) + } - // Copy labels from DCGM Counters to ExporterCounters - for i := range cs.DCGMCounters { - if cs.DCGMCounters[i].PromType == "label" { - cs.ExporterCounters = append(cs.ExporterCounters, cs.DCGMCounters[i]) + cleanup, err := dcgm.Init(dcgm.Embedded) + defer cleanup() + if err != nil { + logrus.Fatal(err) + } } - } + logrus.Info("DCGM successfully initialized!") - hostname, err := dcgmexporter.GetHostname(config) - if err != nil { - return err - } + dcgm.FieldsInit() + defer dcgm.FieldsTerm() - allCounters := []dcgmexporter.Counter{} + var groups []dcgm.MetricGroup + groups, err = dcgm.GetSupportedMetricGroups(0) + if err != nil { + config.CollectDCP = false + logrus.Info("Not collecting DCP metrics: ", err) + } else { + logrus.Info("Collecting DCP Metrics") + config.MetricGroups = groups + } - allCounters = append(allCounters, cs.DCGMCounters...) - allCounters = append(allCounters, - dcgmexporter.Counter{ - FieldID: dcgm.DCGM_FI_DEV_CLOCK_THROTTLE_REASONS, - }, - dcgmexporter.Counter{ - FieldID: dcgm.DCGM_FI_DEV_XID_ERRORS, - }, - ) + cs, err := dcgmexporter.GetCounterSet(config) - fieldEntityGroupTypeSystemInfo := dcgmexporter.NewEntityGroupTypeSystemInfo(allCounters, config) + if err != nil { + logrus.Fatal(err) + } - for _, egt := range dcgmexporter.FieldEntityGroupTypeToMonitor { - err := fieldEntityGroupTypeSystemInfo.Load(egt) + // Copy labels from DCGM Counters to ExporterCounters + for i := range cs.DCGMCounters { + if cs.DCGMCounters[i].PromType == "label" { + cs.ExporterCounters = append(cs.ExporterCounters, cs.DCGMCounters[i]) + } + } + + hostname, err := dcgmexporter.GetHostname(config) if err != nil { - logrus.Infof("Not collecting %s metrics; %s", egt.String(), err) + return err } - } - ch := make(chan string, 10) + allCounters := []dcgmexporter.Counter{} - pipeline, cleanup, err := dcgmexporter.NewMetricsPipeline(config, - cs.DCGMCounters, - hostname, - dcgmexporter.NewDCGMCollector, - fieldEntityGroupTypeSystemInfo, - ) - defer cleanup() - if err != nil { - logrus.Fatal(err) - } + allCounters = append(allCounters, cs.DCGMCounters...) + allCounters = append(allCounters, + dcgmexporter.Counter{ + FieldID: dcgm.DCGM_FI_DEV_CLOCK_THROTTLE_REASONS, + }, + dcgmexporter.Counter{ + FieldID: dcgm.DCGM_FI_DEV_XID_ERRORS, + }, + ) - cRegistry := dcgmexporter.NewRegistry() + fieldEntityGroupTypeSystemInfo := dcgmexporter.NewEntityGroupTypeSystemInfo(allCounters, config) - if dcgmexporter.IsDCGMExpXIDErrorsCountEnabled(cs.ExporterCounters) { - item, exists := fieldEntityGroupTypeSystemInfo.Get(dcgm.FE_GPU) - if !exists { - logrus.Fatalf("%s collector cannot be initialized", dcgmexporter.DCGMXIDErrorsCount.String()) + for _, egt := range dcgmexporter.FieldEntityGroupTypeToMonitor { + err := fieldEntityGroupTypeSystemInfo.Load(egt) + if err != nil { + logrus.Infof("Not collecting %s metrics; %s", egt.String(), err) + } } - xidCollector, err := dcgmexporter.NewXIDCollector(cs.ExporterCounters, hostname, config, item) + ch := make(chan string, 10) + + pipeline, cleanup, err := dcgmexporter.NewMetricsPipeline(config, + cs.DCGMCounters, + hostname, + dcgmexporter.NewDCGMCollector, + fieldEntityGroupTypeSystemInfo, + ) + defer cleanup() if err != nil { logrus.Fatal(err) } - cRegistry.Register(xidCollector) + cRegistry := dcgmexporter.NewRegistry() - logrus.Infof("%s collector initialized", dcgmexporter.DCGMXIDErrorsCount.String()) - } + if dcgmexporter.IsDCGMExpXIDErrorsCountEnabled(cs.ExporterCounters) { + item, exists := fieldEntityGroupTypeSystemInfo.Get(dcgm.FE_GPU) + if !exists { + logrus.Fatalf("%s collector cannot be initialized", dcgmexporter.DCGMXIDErrorsCount.String()) + } - if dcgmexporter.IsDCGMExpClockEventsCountEnabled(cs.ExporterCounters) { - item, exists := fieldEntityGroupTypeSystemInfo.Get(dcgm.FE_GPU) - if !exists { - logrus.Fatalf("%s collector cannot be initialized", dcgmexporter.DCGMClockEventsCount.String()) - } - clocksThrottleReasonsCollector, err := dcgmexporter.NewClockEventsCollector( - cs.ExporterCounters, hostname, config, item) - if err != nil { - logrus.Fatal(err) + xidCollector, err := dcgmexporter.NewXIDCollector(cs.ExporterCounters, hostname, config, item) + if err != nil { + logrus.Fatal(err) + } + + cRegistry.Register(xidCollector) + + logrus.Infof("%s collector initialized", dcgmexporter.DCGMXIDErrorsCount.String()) } - cRegistry.Register(clocksThrottleReasonsCollector) + if dcgmexporter.IsDCGMExpClockEventsCountEnabled(cs.ExporterCounters) { + item, exists := fieldEntityGroupTypeSystemInfo.Get(dcgm.FE_GPU) + if !exists { + logrus.Fatalf("%s collector cannot be initialized", dcgmexporter.DCGMClockEventsCount.String()) + } + clocksThrottleReasonsCollector, err := dcgmexporter.NewClockEventsCollector( + cs.ExporterCounters, hostname, config, item) + if err != nil { + logrus.Fatal(err) + } - logrus.Infof("%s collector initialized", dcgmexporter.DCGMClockEventsCount.String()) - } + cRegistry.Register(clocksThrottleReasonsCollector) - defer func() { - cRegistry.Cleanup() - }() + logrus.Infof("%s collector initialized", dcgmexporter.DCGMClockEventsCount.String()) + } - server, cleanup, err := dcgmexporter.NewMetricsServer(config, ch, cRegistry) - defer cleanup() - if err != nil { - return err - } + defer func() { + cRegistry.Cleanup() + }() - var wg sync.WaitGroup - stop := make(chan interface{}) + server, cleanup, err := dcgmexporter.NewMetricsServer(config, ch, cRegistry) + defer cleanup() + if err != nil { + return err + } - wg.Add(1) - go pipeline.Run(ch, stop, &wg) + var wg sync.WaitGroup + stop := make(chan interface{}) - wg.Add(1) - go server.Run(stop, &wg) + wg.Add(1) + go pipeline.Run(ch, stop, &wg) - sigs := newOSWatcher(syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT, syscall.SIGHUP) - sig := <-sigs - close(stop) - err = dcgmexporter.WaitWithTimeout(&wg, time.Second*2) - if err != nil { - logrus.Fatal(err) - } + wg.Add(1) + go server.Run(stop, &wg) - if sig == syscall.SIGHUP { - goto restart - } + sigs := newOSWatcher(syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT, syscall.SIGHUP) + sig := <-sigs + close(stop) + cancel() + err = dcgmexporter.WaitWithTimeout(&wg, time.Second*2) + if err != nil { + logrus.Fatal(err) + } - return nil + if sig == syscall.SIGHUP { + goto restart + } + + return nil + }) } func parseDeviceOptions(devices string) (dcgmexporter.DeviceOptions, error) { @@ -418,8 +442,7 @@ func parseDeviceOptions(devices string) (dcgmexporter.DeviceOptions, error) { letterAndRange := strings.Split(devices, ":") count := len(letterAndRange) if count > 2 { - return dOpt, fmt.Errorf("invalid ranged device option '%s'; err: there can only be one specified range", - devices) + return dOpt, fmt.Errorf("Invalid ranged device option '%s': there can only be one specified range", devices) } letter := letterAndRange[0] @@ -470,7 +493,7 @@ func parseDeviceOptions(devices string) (dcgmexporter.DeviceOptions, error) { dOpt.MinorRange = indices } } else { - return dOpt, fmt.Errorf("valid options preceding ':' are 'g' or 'i', but found '%s'", letter) + return dOpt, fmt.Errorf("the only valid options preceding ':' are 'g' or 'i', but found '%s'", letter) } return dOpt, nil @@ -492,6 +515,11 @@ func contextToConfig(c *cli.Context) (*dcgmexporter.Config, error) { return nil, err } + dcgmLogLevel := c.String(CLIDCGMLogLevel) + if !slices.Contains(dcgmexporter.DCGMDbgLvlValues, dcgmLogLevel) { + return nil, fmt.Errorf("invalid %s parameter value: %s", CLIDCGMLogLevel, dcgmLogLevel) + } + return &dcgmexporter.Config{ CollectorsFile: c.String(CLIFieldsFile), Address: c.String(CLIAddress), @@ -514,5 +542,7 @@ func contextToConfig(c *cli.Context) (*dcgmexporter.Config, error) { ReplaceBlanksInModelName: c.Bool(CLIReplaceBlanksInModelName), Debug: c.Bool(CLIDebugMode), ClockEventsCountWindowSize: c.Int(CLIClockEventsCountWindowSize), + EnableDCGMLog: c.Bool(CLIEnableDCGMLog), + DCGMLogLevel: dcgmLogLevel, }, nil } diff --git a/pkg/dcgmexporter/config.go b/pkg/dcgmexporter/config.go index 0dacf770..7b702b8a 100644 --- a/pkg/dcgmexporter/config.go +++ b/pkg/dcgmexporter/config.go @@ -53,4 +53,6 @@ type Config struct { ReplaceBlanksInModelName bool Debug bool ClockEventsCountWindowSize int + EnableDCGMLog bool + DCGMLogLevel string } diff --git a/pkg/dcgmexporter/const.go b/pkg/dcgmexporter/const.go index a780367e..594356cc 100644 --- a/pkg/dcgmexporter/const.go +++ b/pkg/dcgmexporter/const.go @@ -31,3 +31,23 @@ const ( const ( windowSizeInMSLabel = "window_size_in_ms" ) + +// DCGMDbgLvl is a DCGM library debug level. +const ( + DCGMDbgLvlNone = "NONE" + DCGMDbgLvlFatal = "FATAL" + DCGMDbgLvlError = "ERROR" + DCGMDbgLvlWarn = "WARN" + DCGMDbgLvlInfo = "INFO" + DCGMDbgLvlDebug = "DEBUG" + DCGMDbgLvlVerb = "VERB" +) + +var DCGMDbgLvlValues = []string{DCGMDbgLvlNone, + DCGMDbgLvlFatal, + DCGMDbgLvlError, + DCGMDbgLvlWarn, + DCGMDbgLvlInfo, + DCGMDbgLvlDebug, + DCGMDbgLvlVerb, +} diff --git a/pkg/dcgmexporter/server.go b/pkg/dcgmexporter/server.go index 51099308..726ee356 100644 --- a/pkg/dcgmexporter/server.go +++ b/pkg/dcgmexporter/server.go @@ -22,8 +22,8 @@ import ( "sync" "time" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/logging" "github.com/gorilla/mux" - "github.com/prometheus/common/promlog" "github.com/prometheus/exporter-toolkit/web" "github.com/sirupsen/logrus" ) @@ -72,8 +72,12 @@ func NewMetricsServer(c *Config, metrics chan string, registry *Registry) (*Metr func (s *MetricsServer) Run(stop chan interface{}, wg *sync.WaitGroup) { defer wg.Done() - promlogConfig := &promlog.Config{} - logger := promlog.New(promlogConfig) + // Initialize your logrus logger + logrusLogger := logrus.New() + logrusLogger.SetLevel(logrus.GetLevel()) + + // Wrap the logrus logger with the LogrusAdapter + logger := logging.NewLogrusAdapter(logrusLogger) var httpwg sync.WaitGroup httpwg.Add(1) diff --git a/pkg/stdout/capture.go b/pkg/stdout/capture.go new file mode 100644 index 00000000..e46bf733 --- /dev/null +++ b/pkg/stdout/capture.go @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package stdout + +import ( + "bufio" + "context" + "os" + "syscall" + + "github.com/sirupsen/logrus" +) + +// Capture go and C stdout and stderr and writes to logrus.StandardLogger +func Capture(ctx context.Context, inner func() error) error { + stdout, err := syscall.Dup(syscall.Stdout) + if err != nil { + return err + } + + r, w, err := os.Pipe() + if err != nil { + return err + } + + err = syscall.Dup2(int(w.Fd()), syscall.Stdout) + if err != nil { + return err + } + + defer func() { + ierr := syscall.Close(syscall.Stdout) + if ierr != nil { + err = ierr + } + + ierr = syscall.Dup2(stdout, syscall.Stdout) + if ierr != nil { + err = ierr + } + }() + + scanner := bufio.NewScanner(r) + go func() { + for scanner.Scan() { + if ctx.Err() != nil { + return + } + logEntry := scanner.Text() + parsedLogEntry := parseLogEntry(logEntry) + if parsedLogEntry.IsRawString { + _, err := logrus.StandardLogger().Out.Write([]byte(parsedLogEntry.Message + "\n")) + if err != nil { + return + } + continue + } + logrus.WithField("dcgm_level", parsedLogEntry.Level).Info(parsedLogEntry.Message) + } + }() + + // Call function here + return inner() +} diff --git a/pkg/stdout/capture_test.go b/pkg/stdout/capture_test.go new file mode 100644 index 00000000..a7717dd1 --- /dev/null +++ b/pkg/stdout/capture_test.go @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package stdout + +import ( + "bytes" + "context" + "fmt" + "github.com/sirupsen/logrus" + "github.com/stretchr/testify/assert" + "strings" + "testing" + "time" +) + +func TestCapture(t *testing.T) { + type testCase struct { + name string + logMessage string + assert func(t *testing.T, str string) + } + + testCases := []testCase{ + { + name: "function writes an arbitrary string into /dev/stdout", + logMessage: "hello from dcgm", + assert: func(t *testing.T, str string) { + assert.Equal(t, "hello from dcgm", strings.TrimSpace(str)) + }, + }, + { + name: "function writes an DCGM log entry string into /dev/stdout", + logMessage: "2024-02-07 18:01:05.641 INFO [517155:517155] Linux 4.15.0-180-generic [{anonymous}::StartEmbeddedV2]", + assert: func(t *testing.T, str string) { + assert.Contains(t, strings.TrimSpace(str), "Linux 4.15.0-180-generic") + }, + }, + { + name: "function writes an DCGM log entry string with a valid date only", + logMessage: "2024-02-07 18:01:05.641", + assert: func(t *testing.T, str string) { + assert.Equal(t, "2024-02-07 18:01:05.641", strings.TrimSpace(str)) + }, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + ctx, cancel := context.WithCancel(context.Background()) + + buf := &bytes.Buffer{} + logrus.SetOutput(buf) + + err := Capture(ctx, func() error { + fmt.Println(tc.logMessage) + return nil + }) + + assert.NoError(t, err) + time.Sleep(1 * time.Millisecond) + tc.assert(t, buf.String()) + cancel() + }) + } +} + +func TestCaptureWithCGO(t *testing.T) { + testCaptureWithCGO(t) +} diff --git a/pkg/stdout/capture_test_wrapper.go b/pkg/stdout/capture_test_wrapper.go new file mode 100644 index 00000000..53955e13 --- /dev/null +++ b/pkg/stdout/capture_test_wrapper.go @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package stdout + +/* +#include +void printBoom() { + printf("Boom\n"); + fflush(stdout); +} +*/ +import "C" +import ( + "bytes" + "context" + "github.com/sirupsen/logrus" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "strings" + "testing" + "time" +) + +func testCaptureWithCGO(t *testing.T) { + t.Helper() + + ctx, cancel := context.WithCancel(context.Background()) + + buf := &bytes.Buffer{} + logrus.SetOutput(buf) + + err := Capture(ctx, func() error { + C.printBoom() + return nil + }) + assert.NoError(t, err) + + time.Sleep(10 * time.Millisecond) + require.Equal(t, "Boom", strings.TrimSpace(buf.String())) + + cancel() +} diff --git a/pkg/stdout/stdoutprocessor.go b/pkg/stdout/stdoutprocessor.go new file mode 100644 index 00000000..bee2a42f --- /dev/null +++ b/pkg/stdout/stdoutprocessor.go @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package stdout + +import ( + "strings" + "time" +) + +// outputEntry represents the structured form of the parsed log entry. +type outputEntry struct { + Timestamp time.Time + Level string + Message string + IsRawString bool +} + +// parseLogEntry takes a log entry string and returns a structured outputEntry object. +func parseLogEntry(entry string) outputEntry { + // Split the entry by spaces, taking care to not split the function call and its arguments. + fields := strings.Fields(entry) + + if len(fields) > 2 { + // Parse the timestamp. + timestamp, err := time.Parse("2006-01-02 15:04:05.000", fields[0]+" "+fields[1]) + if err != nil { + return outputEntry{ + Message: entry, + IsRawString: true, + } + } + + level := fields[2] + + // Reconstruct the string from the fourth field onwards to deal with function calls and arguments. + remainder := strings.Join(fields[4:], " ") + + return outputEntry{ + Timestamp: timestamp, + Level: level, + Message: remainder, + } + } + + return outputEntry{ + Message: entry, + IsRawString: true, + } +}