Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add a --fail-on-no-process flag to return a non-zero exit code when no processes are matched by 'dyno gputrace' #268

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions cli/src/commands/gputrace.rs
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,11 @@ pub struct GpuTraceOptions {
pub with_modules: bool,
}

/// CLI-level behavior switches for the `gputrace` command.
///
/// This is passed to `run_gputrace` alongside the trace configuration.
/// `Default` yields a config with every switch turned off.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct GpuTraceCliConfig {
    /// When `true`, `run_gputrace` returns an error if no processes were
    /// matched, in addition to printing the "no processes were matched" notice.
    pub fail_on_no_process: bool,
}

impl GpuTraceOptions {
fn config(&self) -> String {
format!(
Expand Down Expand Up @@ -100,6 +105,7 @@ pub fn run_gputrace(
pids: &str,
process_limit: u32,
config: GpuTraceConfig,
cli_config: GpuTraceCliConfig,
) -> Result<()> {
let kineto_config = config.config();
println!("Kineto config = \n{}", kineto_config);
Expand Down Expand Up @@ -128,6 +134,9 @@ pub fn run_gputrace(

if processes.is_empty() {
println!("No processes were matched, please check --job-id or --pids flags");
if cli_config.fail_on_no_process {
return Err(anyhow::anyhow!("No processes were matched"));
}
} else {
println!("Matched {} processes", processes.len());
println!("Trace output files will be written to:");
Expand Down
10 changes: 9 additions & 1 deletion cli/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ mod commands;
use commands::gputrace::GpuTraceConfig;
use commands::gputrace::GpuTraceOptions;
use commands::gputrace::GpuTraceTriggerConfig;
use commands::gputrace::GpuTraceCliConfig;
use commands::*;

/// Instructions on adding a new Dyno CLI command:
Expand Down Expand Up @@ -91,6 +92,9 @@ enum Command {
/// Capture PyTorch operator modules in traces
#[clap(long, action)]
with_modules: bool,
/// Returns exit code 1 if no process is found
#[clap(long, action)]
fail_on_no_process: bool,
},
/// Pause dcgm profiling. This enables running tools like Nsight compute and avoids conflicts.
DcgmPause {
Expand Down Expand Up @@ -139,6 +143,7 @@ fn main() -> Result<()> {
with_stacks,
with_flops,
with_modules,
fail_on_no_process,
} => {
let trigger_config = if iterations > 0 {
GpuTraceTriggerConfig::IterationBased {
Expand All @@ -163,7 +168,10 @@ fn main() -> Result<()> {
trigger_config,
trace_options,
};
gputrace::run_gputrace(dyno_client, job_id, &pids, process_limit, trace_config)
let cli_config = GpuTraceCliConfig {
fail_on_no_process,
};
gputrace::run_gputrace(dyno_client, job_id, &pids, process_limit, trace_config, cli_config)
}
Command::DcgmPause { duration_s } => dcgm::run_dcgm_pause(dyno_client, duration_s),
Command::DcgmResume => dcgm::run_dcgm_resume(dyno_client),
Expand Down