From 17103dcd9df50d770a603b2d876a3c4c713238e1 Mon Sep 17 00:00:00 2001
From: Digant Desai <digantdesai@meta.com>
Date: Fri, 6 Sep 2024 15:16:58 -0500
Subject: [PATCH] [Llama] Dump RSS info for Linux

Differential Revision: D62222512

Pull Request resolved: https://github.com/pytorch/executorch/pull/5101
---
 examples/models/llama2/runner/runner.cpp      | 13 ++++++++++
 examples/models/llava/runner/llava_runner.cpp | 21 ++++++++++++++--
 extension/llm/runner/util.h                   | 25 +++++++++++++++++++
 3 files changed, 57 insertions(+), 2 deletions(-)
diff --git a/examples/models/llama2/runner/runner.cpp b/examples/models/llama2/runner/runner.cpp
index 2c72b4c724..b048604251 100644
--- a/examples/models/llama2/runner/runner.cpp
+++ b/examples/models/llama2/runner/runner.cpp
@@ -153,6 +153,11 @@ Error Runner::generate(
     stats_.model_load_end_ms = util::time_in_ms();
   }
 
+  ET_LOG(
+      Info,
+      "RSS after loading model: %f MiB (0 if unsupported)",
+      util::get_rss_bytes() / 1024.0 / 1024.0);
+
   // Wrap the token_callback with print function
   std::function<void(const std::string&)> wrapped_callback =
       [token_callback](const std::string& piece) {
@@ -213,6 +218,10 @@ Error Runner::generate(
 
   // print the first token from prefill. No prev_token so use cur_token for it.
   wrapped_callback(ET_UNWRAP(tokenizer_->decode(cur_token, cur_token)));
+  ET_LOG(
+      Info,
+      "RSS after prompt prefill: %f MiB (0 if unsupported)",
+      util::get_rss_bytes() / 1024.0 / 1024.0);
 
   // start the main loop
   prompt_tokens.push_back(cur_token);
@@ -221,6 +230,10 @@ Error Runner::generate(
 
   stats_.inference_end_ms = util::time_in_ms();
   printf("\n");
+  ET_LOG(
+      Info,
+      "RSS after finishing text generation: %f MiB (0 if unsupported)",
+      util::get_rss_bytes() / 1024.0 / 1024.0);
 
   if (num_prompt_tokens + num_generated_tokens == seq_len) {
     ET_LOG(Info, "Sequence length (%i tokens) reached!", seq_len);
diff --git a/examples/models/llava/runner/llava_runner.cpp b/examples/models/llava/runner/llava_runner.cpp
index 04c77a1064..64763c7257 100644
--- a/examples/models/llava/runner/llava_runner.cpp
+++ b/examples/models/llava/runner/llava_runner.cpp
@@ -131,6 +131,11 @@ Error LlavaRunner::generate(
     ET_CHECK_OK_OR_RETURN_ERROR(load());
   }
 
+  ET_LOG(
+      Info,
+      "RSS after loading model: %f MiB (0 if unsupported)",
+      util::get_rss_bytes() / 1024.0 / 1024.0);
+
   // Wrap the token_callback with print function
   std::function<void(const std::string&)> wrapped_callback =
       [token_callback](const std::string& piece) {
@@ -149,9 +154,21 @@ Error LlavaRunner::generate(
   // prefill images
   prefill_images(images, pos);
 
+  ET_LOG(
+      Info,
+      "RSS after prompt and image prefill: %f MiB (0 if unsupported)",
+      util::get_rss_bytes() / 1024.0 / 1024.0);
+
   // Generate tokens
-  return generate_from_pos(
-      prompt, seq_len, pos, wrapped_callback, stats_callback);
+  Error err =
+      generate_from_pos(prompt, seq_len, pos, wrapped_callback, stats_callback);
+
+  ET_LOG(
+      Info,
+      "RSS after finishing text generation: %f MiB (0 if unsupported)",
+      util::get_rss_bytes() / 1024.0 / 1024.0);
+
+  return err;
 }
 
 } // namespace torch::executor
diff --git a/extension/llm/runner/util.h b/extension/llm/runner/util.h
index baf6af328b..2f1d084811 100644
--- a/extension/llm/runner/util.h
+++ b/extension/llm/runner/util.h
@@ -10,6 +10,9 @@
 #include <stdio.h>
 #include <time.h>
 #include <cctype>
+#if defined(__linux__) || defined(__ANDROID__) || defined(__unix__)
+#include <sys/resource.h>
+#endif
 
 namespace executorch {
 namespace extension {
@@ -44,6 +47,27 @@ long inline time_in_ms() {
   return time.tv_sec * 1000 + time.tv_nsec / 1000000;
 }
 
+// ----------------------------------------------------------------------------
+// utilities: memory usage
+
+// Returns the peak RSS of this process in bytes, or 0 if not supported.
+// RSS: Resident Set Size, the amount of memory held in RAM for this process.
+// The value comes from ru_maxrss (a high-water mark), is approximate, and is
+// only used for logging purposes.
+size_t inline get_rss_bytes() {
+#if defined(__linux__) || defined(__ANDROID__) || defined(__unix__)
+  struct rusage r_usage;
+  if (getrusage(RUSAGE_SELF, &r_usage) == 0 && r_usage.ru_maxrss > 0) {
+    // Cast first: scaling the signed long by 1024 could overflow (UB).
+    return static_cast<size_t>(r_usage.ru_maxrss) * 1024;
+  }
+#endif // __linux__ || __ANDROID__ || __unix__
+  // Unsupported platform like Windows, or getrusage() failed.
+  // __APPLE__ and __MACH__ are not supported because ru_maxrss is not reported
+  // in a consistent unit on macOS: older versions return bytes, newer versions
+  // return kbytes. Need to figure out when this changed.
+  return 0;
+}
 } // namespace llm
 } // namespace extension
 } // namespace executorch
@@ -53,6 +77,7 @@ namespace executor {
 namespace util {
 // TODO(T197294990): Remove these deprecated aliases once all users have moved
 // to the new `::executorch` namespaces.
+using ::executorch::extension::llm::get_rss_bytes;
 using ::executorch::extension::llm::safe_printf;
 using ::executorch::extension::llm::time_in_ms;
 } // namespace util