diff --git a/cli/src/commands/gputrace.rs b/cli/src/commands/gputrace.rs index c2e37af7..6270ba45 100644 --- a/cli/src/commands/gputrace.rs +++ b/cli/src/commands/gputrace.rs @@ -57,6 +57,11 @@ pub struct GpuTraceOptions { pub with_modules: bool, } +#[derive(Debug)] +pub struct GpuTraceCliConfig { + pub fail_on_no_process: bool, +} + impl GpuTraceOptions { fn config(&self) -> String { format!( @@ -100,6 +105,7 @@ pub fn run_gputrace( pids: &str, process_limit: u32, config: GpuTraceConfig, + cli_config: GpuTraceCliConfig, ) -> Result<()> { let kineto_config = config.config(); println!("Kineto config = \n{}", kineto_config); @@ -128,6 +134,9 @@ pub fn run_gputrace( if processes.is_empty() { println!("No processes were matched, please check --job-id or --pids flags"); + if cli_config.fail_on_no_process { + return Err(anyhow::anyhow!("No processes were matched")); + } } else { println!("Matched {} processes", processes.len()); println!("Trace output files will be written to:"); diff --git a/cli/src/main.rs b/cli/src/main.rs index b7c755e6..40b454e1 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -14,6 +14,7 @@ mod commands; use commands::gputrace::GpuTraceConfig; use commands::gputrace::GpuTraceOptions; use commands::gputrace::GpuTraceTriggerConfig; +use commands::gputrace::GpuTraceCliConfig; use commands::*; /// Instructions on adding a new Dyno CLI command: @@ -91,6 +92,9 @@ enum Command { /// Capture PyTorch operator modules in traces #[clap(long, action)] with_modules: bool, + /// Returns exit code 1 if no process is found + #[clap(long, action)] + fail_on_no_process: bool, }, /// Pause dcgm profiling. This enables running tools like Nsight compute and avoids conflicts. DcgmPause { @@ -139,6 +143,7 @@ fn main() -> Result<()> { with_stacks, with_flops, with_modules, + fail_on_no_process, } => { let trigger_config = if iterations > 0 { GpuTraceTriggerConfig::IterationBased { @@ -163,7 +168,10 @@ fn main() -> Result<()> { trigger_config, trace_options, }; - gputrace::run_gputrace(dyno_client, job_id, &pids, process_limit, trace_config) + let cli_config = GpuTraceCliConfig { + fail_on_no_process, + }; + gputrace::run_gputrace(dyno_client, job_id, &pids, process_limit, trace_config, cli_config) } Command::DcgmPause { duration_s } => dcgm::run_dcgm_pause(dyno_client, duration_s), Command::DcgmResume => dcgm::run_dcgm_resume(dyno_client),