From 17103dcd9df50d770a603b2d876a3c4c713238e1 Mon Sep 17 00:00:00 2001 From: Digant Desai Date: Fri, 6 Sep 2024 15:16:58 -0500 Subject: [PATCH] [Llama] Dump RSS info for Linux Differential Revision: D62222512 Pull Request resolved: https://github.com/pytorch/executorch/pull/5101 --- examples/models/llama2/runner/runner.cpp | 13 ++++++++++ examples/models/llava/runner/llava_runner.cpp | 21 ++++++++++++++-- extension/llm/runner/util.h | 25 +++++++++++++++++++ 3 files changed, 57 insertions(+), 2 deletions(-) diff --git a/examples/models/llama2/runner/runner.cpp b/examples/models/llama2/runner/runner.cpp index 2c72b4c724..b048604251 100644 --- a/examples/models/llama2/runner/runner.cpp +++ b/examples/models/llama2/runner/runner.cpp @@ -153,6 +153,11 @@ Error Runner::generate( stats_.model_load_end_ms = util::time_in_ms(); } + ET_LOG( + Info, + "RSS after loading model: %f MiB (0 if unsupported)", + util::get_rss_bytes() / 1024.0 / 1024.0); + // Wrap the token_callback with print function std::function wrapped_callback = [token_callback](const std::string& piece) { @@ -213,6 +218,10 @@ Error Runner::generate( // print the first token from prefill. No prev_token so use cur_token for it. 
wrapped_callback(ET_UNWRAP(tokenizer_->decode(cur_token, cur_token))); + ET_LOG( + Info, + "RSS after prompt prefill: %f MiB (0 if unsupported)", + util::get_rss_bytes() / 1024.0 / 1024.0); // start the main loop prompt_tokens.push_back(cur_token); @@ -221,6 +230,10 @@ Error Runner::generate( stats_.inference_end_ms = util::time_in_ms(); printf("\n"); + ET_LOG( + Info, + "RSS after finishing text generation: %f MiB (0 if unsupported)", + util::get_rss_bytes() / 1024.0 / 1024.0); if (num_prompt_tokens + num_generated_tokens == seq_len) { ET_LOG(Info, "Sequence length (%i tokens) reached!", seq_len); diff --git a/examples/models/llava/runner/llava_runner.cpp b/examples/models/llava/runner/llava_runner.cpp index 04c77a1064..64763c7257 100644 --- a/examples/models/llava/runner/llava_runner.cpp +++ b/examples/models/llava/runner/llava_runner.cpp @@ -131,6 +131,11 @@ Error LlavaRunner::generate( ET_CHECK_OK_OR_RETURN_ERROR(load()); } + ET_LOG( + Info, + "RSS after loading model: %f MiB (0 if unsupported)", + util::get_rss_bytes() / 1024.0 / 1024.0); + // Wrap the token_callback with print function std::function wrapped_callback = [token_callback](const std::string& piece) { @@ -149,9 +154,21 @@ Error LlavaRunner::generate( // prefill images prefill_images(images, pos); + ET_LOG( + Info, + "RSS after prompt and image prefill: %f MiB (0 if unsupported)", + util::get_rss_bytes() / 1024.0 / 1024.0); + // Generate tokens - return generate_from_pos( - prompt, seq_len, pos, wrapped_callback, stats_callback); + Error err = + generate_from_pos(prompt, seq_len, pos, wrapped_callback, stats_callback); + + ET_LOG( + Info, + "RSS after finishing text generation: %f MiB (0 if unsupported)", + util::get_rss_bytes() / 1024.0 / 1024.0); + + return err; } } // namespace torch::executor diff --git a/extension/llm/runner/util.h b/extension/llm/runner/util.h index baf6af328b..2f1d084811 100644 --- a/extension/llm/runner/util.h +++ b/extension/llm/runner/util.h @@ -10,6 +10,9 @@ #include 
#include
 #include
+#if defined(__linux__) || defined(__ANDROID__) || defined(__unix__)
+#include <sys/resource.h>
+#endif
 
 namespace executorch {
 namespace extension {
@@ -44,6 +47,27 @@ long inline time_in_ms() {
   return time.tv_sec * 1000 + time.tv_nsec / 1000000;
 }
 
+// ----------------------------------------------------------------------------
+// utilities: memory usage
+
+// Returns the peak RSS (Resident Set Size: the amount of this process's memory
+// held in RAM) in bytes, or 0 if unsupported. getrusage() reports ru_maxrss,
+// the high-water mark since process start, so the value never decreases.
+// Values are approximate and are only used for logging purposes.
+size_t inline get_rss_bytes() {
+#if defined(__linux__) || defined(__ANDROID__) || defined(__unix__)
+  struct rusage r_usage;
+  if (getrusage(RUSAGE_SELF, &r_usage) == 0) {
+    // ru_maxrss is a long (KiB); widen first to avoid signed overflow.
+    return static_cast<size_t>(r_usage.ru_maxrss) * 1024;
+  }
+#endif // __linux__ || __ANDROID__ || __unix__
+  // Unsupported platform like Windows, or getrusage() failed.
+  // __APPLE__ and __MACH__ are not supported because r_usage.ru_maxrss does not
+  // consistently return kbytes on macOS: older versions return bytes, newer
+  // versions return kbytes. Need to figure out when this changed.
+  return 0;
+}
 } // namespace llm
 } // namespace extension
 } // namespace executorch
@@ -53,6 +77,7 @@ namespace executor {
 namespace util {
 // TODO(T197294990): Remove these deprecated aliases once all users have moved
 // to the new `::executorch` namespaces.
+using ::executorch::extension::llm::get_rss_bytes;
 using ::executorch::extension::llm::safe_printf;
 using ::executorch::extension::llm::time_in_ms;
 } // namespace util