[SWAP] Implement inference mode #2696

Open · wants to merge 3 commits into base: main
3 changes: 3 additions & 0 deletions nntrainer.ini.in
@@ -16,3 +16,6 @@ memory_swap_path = @MEMORY_SWAP_PATH@

# look ahead window size
memory_swap_lookahead = @MEMORY_SWAP_LOOKAHEAD@

# swap mode ("inference" or "train")
memory_swap_mode = "train"
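
The generated ini only sets the default; at the API level the same switch is exposed through the new props::MemorySwapMode key below. A minimal usage sketch — the createModel/setProperty calls follow nntrainer's ccapi, but the exact sequence here is illustrative, not taken from this PR:

// Sketch: enabling inference-mode swap via model properties (assumed usage).
#include <model.h> // ml::train::createModel, from nntrainer's ccapi

int main() {
  auto model = ml::train::createModel(ml::train::ModelType::NEURAL_NET);
  // Keys mirror the properties added in this PR; values are illustrative.
  model->setProperty({"memory_swap=true",
                      "memory_swap_path=/tmp",
                      "memory_swap_mode=inference"}); // defaults to "train"
  // ... addLayer(...), compile(), initialize(), then run inference ...
  return 0;
}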
13 changes: 7 additions & 6 deletions nntrainer/graph/network_graph.h
@@ -58,12 +58,13 @@ class NetworkGraph {
* @param[in] enable_swap enable memory swap for tensor
* @param[in] swap_path memory swap file path when the swap is enabled
*/
NetworkGraph(bool enable_swap, const std::string &swap_path = "",
unsigned int lookahead = 0,
NetworkGraph(bool enable_swap, const std::string &swap_mode = "train",
const std::string &swap_path = "", unsigned int lookahead = 0,
const std::string &tensor_format_ = "NCHW",
const std::string &tensor_dtype_ = "FP32-FP32") :
tensor_manager(std::make_shared<Manager>(enable_swap, swap_path, lookahead,
tensor_format_, tensor_dtype_)),
tensor_manager(std::make_shared<Manager>(enable_swap, swap_mode, swap_path,
lookahead, tensor_format_,
tensor_dtype_)),
graph(),
compiled(false),
batch_size(0),
@@ -355,9 +356,9 @@ class NetworkGraph {
/**
* @brief Allocate memory for all the managed weights
*/
void allocateWeights() {
void allocateWeights(bool init = true) {
tensor_manager->allocateWeights(
std::get<3>(backward_iter_end->getExecutionOrder()));
std::get<3>(backward_iter_end->getExecutionOrder()), init);
}

/**
2 changes: 2 additions & 0 deletions nntrainer/models/model_common_properties.cpp
@@ -33,6 +33,8 @@ MemorySwap::MemorySwap(bool value) { set(value); }

MemorySwapPath::MemorySwapPath(const std::string &value) { set(value); }

MemorySwapMode::MemorySwapMode(const std::string &value) { set(value); }

MemorySwapLookahead::MemorySwapLookahead(const unsigned int &value) {
set(value);
}
18 changes: 18 additions & 0 deletions nntrainer/models/model_common_properties.h
@@ -179,6 +179,24 @@ class MemorySwapLookahead : public Property<unsigned int> {
MemorySwapLookahead(const unsigned int &value = 0);
};

/**
* @brief memory swap mode property
*
*/
class MemorySwapMode : public Property<std::string> {
public:
static constexpr const char *key =
"memory_swap_mode"; /**< unique key to access */
using prop_tag = str_prop_tag; /**< property type */

/**
* @brief Constructor
*
* @param value value to set, defaults to current directory
Review comment (Contributor) — suggested change:
-  * @param value value to set, defaults to current directory
+  * @param value value to set, defaults to "train" mode

*/
MemorySwapMode(const std::string &value = "train");
};

/**
* @brief Enumeration of Data Type for model & layer
*/
32 changes: 19 additions & 13 deletions nntrainer/models/neuralnet.cpp
@@ -67,11 +67,12 @@ namespace nntrainer {
NeuralNetwork::NeuralNetwork() :
model_props(props::LossType(), {}, {}, props::ClipGradByGlobalNorm(),
props::LossScale()),
model_flex_props(
props::Epochs(), props::TrainingBatchSize(), props::SavePath(),
props::ContinueTrain(), props::SaveBestPath(), props::MemoryOptimization(),
props::MemorySwap(), props::MemorySwapPath(), props::MemorySwapLookahead(),
props::TensorFormat(), props::ModelTensorDataType()),
model_flex_props(props::Epochs(), props::TrainingBatchSize(),
props::SavePath(), props::ContinueTrain(),
props::SaveBestPath(), props::MemoryOptimization(),
props::MemorySwap(), props::MemorySwapPath(),
props::MemorySwapLookahead(), props::TensorFormat(),
props::ModelTensorDataType(), props::MemorySwapMode()),
load_path(std::string()),
epoch_idx(0),
iter(0),
@@ -86,11 +87,12 @@ NeuralNetwork::NeuralNetwork() :
NeuralNetwork::NeuralNetwork(AppContext app_context_) :
model_props(props::LossType(), {}, {}, props::ClipGradByGlobalNorm(),
props::LossScale()),
model_flex_props(
props::Epochs(), props::TrainingBatchSize(), props::SavePath(),
props::ContinueTrain(), props::SaveBestPath(), props::MemoryOptimization(),
props::MemorySwap(), props::MemorySwapPath(), props::MemorySwapLookahead(),
props::TensorFormat(), props::ModelTensorDataType()),
model_flex_props(props::Epochs(), props::TrainingBatchSize(),
props::SavePath(), props::ContinueTrain(),
props::SaveBestPath(), props::MemoryOptimization(),
props::MemorySwap(), props::MemorySwapPath(),
props::MemorySwapLookahead(), props::TensorFormat(),
props::ModelTensorDataType(), props::MemorySwapMode()),
load_path(std::string()),
epoch_idx(0),
iter(0),
@@ -172,6 +174,8 @@ int NeuralNetwork::compile() {
bool memory_swap = std::get<props::MemorySwap>(model_flex_props);
const std::string memory_swap_path =
std::get<props::MemorySwapPath>(model_flex_props);
const std::string memory_swap_mode =
std::get<props::MemorySwapMode>(model_flex_props);
unsigned int lookahead =
std::get<props::MemorySwapLookahead>(model_flex_props);

@@ -181,8 +185,8 @@
const std::string tensor_type =
to_string(std::get<props::ModelTensorDataType>(model_flex_props));

model_graph = NetworkGraph(memory_swap, memory_swap_path, lookahead,
tensor_format, tensor_type);
model_graph = NetworkGraph(memory_swap, memory_swap_mode, memory_swap_path,
lookahead, tensor_format, tensor_type);

model_graph.setMemoryOptimizations(
std::get<props::MemoryOptimization>(model_flex_props));
@@ -260,7 +264,9 @@ int NeuralNetwork::initialize(ExecutionMode mode) {
}

// Allocate weights
model_graph.allocateWeights();
const std::string memory_swap_mode =
std::get<props::MemorySwapMode>(model_flex_props);
model_graph.allocateWeights(memory_swap_mode.compare("inference") != 0);

initialized = true;

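
A note on the boolean threaded through allocateWeights() above: compare() returns 0 on equality, so weights are allocated with init = false exactly when the swap mode is "inference" — presumably because the subsequent weight load repopulates them anyway (rationale assumed, not stated in the PR). A sketch of the predicate, as a hypothetical helper:

// Hypothetical helper mirroring the call above: skip weight initialization
// only in inference swap mode, where load() overwrites the tensors anyway.
#include <string>

bool shouldInitWeights(const std::string &memory_swap_mode) {
  return memory_swap_mode.compare("inference") != 0; // 0 means "equal"
}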
11 changes: 5 additions & 6 deletions nntrainer/models/neuralnet.h
@@ -624,12 +624,11 @@ * @retval shared_ptr<const Tensor>
const std::string file_path) override;

private:
using FlexiblePropTypes =
std::tuple<props::Epochs, props::TrainingBatchSize, props::SavePath,
props::ContinueTrain, props::SaveBestPath,
props::MemoryOptimization, props::MemorySwap,
props::MemorySwapPath, props::MemorySwapLookahead,
props::TensorFormat, props::ModelTensorDataType>;
using FlexiblePropTypes = std::tuple<
props::Epochs, props::TrainingBatchSize, props::SavePath,
props::ContinueTrain, props::SaveBestPath, props::MemoryOptimization,
props::MemorySwap, props::MemorySwapPath, props::MemorySwapLookahead,
props::TensorFormat, props::ModelTensorDataType, props::MemorySwapMode>;
using RigidPropTypes =
std::tuple<props::LossType, std::vector<props::InputConnection>,
std::vector<props::LabelLayer>, props::ClipGradByGlobalNorm,
23 changes: 17 additions & 6 deletions nntrainer/tensor/cache_elem.cpp
@@ -23,10 +23,15 @@ namespace nntrainer {
namespace {

std::map<CachePolicy, std::string> policyToStr = {
{WRITE_BACK, "WRITE_BACK"}, {NO_WRITE_BACK, "NO_WRITE_BACK"},
{READ_CONSIST, "READ_CONSIST"}, {NO_READ_CONSIST, "NO_READ_CONSIST"},
{ALWAYS_SYNCED, "ALWAYS_SYNCED"}, {TEMPORAL, "TEMPORAL"},
{FIRST_LAST_SKIP, "FIRST_LAST_SKIP"}, {ITERATION_CONSIST, "ITER_CONSIST"}};
{WRITE_BACK, "WRITE_BACK"},
{NO_WRITE_BACK, "NO_WRITE_BACK"},
{READ_CONSIST, "READ_CONSIST"},
{NO_READ_CONSIST, "NO_READ_CONSIST"},
{ALWAYS_SYNCED, "ALWAYS_SYNCED"},
{TEMPORAL, "TEMPORAL"},
{FIRST_LAST_SKIP, "FIRST_LAST_SKIP"},
{ITERATION_CONSIST, "ITER_CONSIST"},
{SYNC_ONCE, "SYNC_ONCE"}};

inline bool checkAllocOnly(CachePolicy policy, CacheElem::Options opt) {
return ((policy & CachePolicy::NO_READ_CONSIST) ||
@@ -37,7 +42,9 @@ inline bool checkAllocOnly(CachePolicy policy, CacheElem::Options opt) {
inline bool checkDeallocOnly(CachePolicy policy, CacheElem::Options opt) {
return ((policy & CachePolicy::NO_READ_CONSIST) ||
((opt & CacheElem::Options::LAST_ACCESS) &&
(policy & CachePolicy::FIRST_LAST_SKIP)));
(policy & CachePolicy::FIRST_LAST_SKIP)) ||
(policy & FIRST_WRITE_CONSIST) &&
!(opt & CacheElem::Options::FIRST_WRITE)));
}

} // namespace
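
To make the new clause concrete: with the SYNC_ONCE policy added in cache_elem.h below, checkDeallocOnly() now skips the device write on every swap-out except the first, which still carries the FIRST_WRITE bit. A self-contained sketch — bit values for the flags not visible in this hunk are assumed:

// Sketch of the extended dealloc-only decision (assumed flag values).
#include <cassert>

enum CachePolicy {
  WRITE_BACK = 0b000001,
  NO_WRITE_BACK = 0b000010,
  READ_CONSIST = 0b000100,
  NO_READ_CONSIST = 0b001000,
  FIRST_LAST_SKIP = 0b010000,
  FIRST_WRITE_CONSIST = 0b100000,
  SYNC_ONCE = FIRST_WRITE_CONSIST | READ_CONSIST | NO_WRITE_BACK,
};

enum Options {
  NONE = 0x0000,
  LAST_ACCESS = 0x0010,
  FIRST_WRITE = 0x0100,
};

// Mirrors checkDeallocOnly(): deallocate without writing back unless this
// is the element's very first swap-out (FIRST_WRITE still pending).
bool deallocOnly(CachePolicy policy, Options opt) {
  return (policy & NO_READ_CONSIST) ||
         ((opt & LAST_ACCESS) && (policy & FIRST_LAST_SKIP)) ||
         ((policy & FIRST_WRITE_CONSIST) && !(opt & FIRST_WRITE));
}

int main() {
  assert(!deallocOnly(SYNC_ONCE, FIRST_WRITE)); // first swap-out: write back
  assert(deallocOnly(SYNC_ONCE, NONE));         // later swap-outs: drop only
  return 0;
}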
@@ -49,7 +56,7 @@ void CacheElem::swapIn(Options opt) {
bool alloc_only = checkAllocOnly(policy, opt);
void *buf = device->getBuffer(offset, length, alloc_only);

initial_opt = Options::NONE;
initial_opt = static_cast<Options>(initial_opt & ~Options::FIRST_ACCESS);
mem_data->setAddr((void *)buf);
mem_data->setValid(true);
active = true;
@@ -63,8 +70,12 @@

void CacheElem::swapOut(Options opt) {
std::lock_guard<std::mutex> lock(device_mutex);

opt = static_cast<Options>(opt | initial_opt);
bool dealloc_only = checkDeallocOnly(policy, opt);
void *buf = (void *)mem_data->getAddr();

initial_opt = static_cast<Options>(initial_opt & ~Options::FIRST_WRITE);
device->putBuffer(buf, dealloc_only);
mem_data->setAddr(nullptr);
mem_data->setValid(false);
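
Taken together, the two hunks above split the old FIRST_ACCESS-only bookkeeping in two: swapIn() now consumes only the first-access bit, and swapOut() ORs any still-pending initial bits into the caller's opt before consuming the first-write bit, so the first swap-out always observes FIRST_WRITE. A compressed sketch of that state machine, using plain ints with the values from the Options enum in cache_elem.h below:

// Sketch: initial_opt over the first swap cycle (mirrors swapIn()/swapOut()).
#include <cassert>

int main() {
  const int FIRST_ACCESS = 0x0001, FIRST_WRITE = 0x0100, NONE = 0;

  int initial_opt = FIRST_ACCESS | FIRST_WRITE; // constructor / reset()

  // swapIn(): only the first-access bit is consumed.
  initial_opt &= ~FIRST_ACCESS;

  // swapOut(): pending initial bits are folded into the caller's opt,
  // so the first swap-out still sees FIRST_WRITE ...
  int opt = NONE | initial_opt;
  assert(opt & FIRST_WRITE);

  // ... and the first-write bit is consumed afterwards.
  initial_opt &= ~FIRST_WRITE;
  assert(initial_opt == NONE);
  return 0;
}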
10 changes: 8 additions & 2 deletions nntrainer/tensor/cache_elem.h
@@ -33,9 +33,12 @@ enum CachePolicy {
NO_WRITE_BACK), /**< Will not be synchronized with device */
FIRST_LAST_SKIP = 0b10000,
/**< Will skip first read and last write */
FIRST_WRITE_CONSIST = 0b100000, /**< First invalidate will write to device */
ITERATION_CONSIST = (FIRST_LAST_SKIP | ALWAYS_SYNCED),
/**< Will skip first read and last write. other behaviors will be same as
ALWAYS_SYNCED */
SYNC_ONCE = (FIRST_WRITE_CONSIST | READ_CONSIST | NO_WRITE_BACK),
/**< Syncs once from the device; the value then stays consistent */
};

/**
@@ -48,6 +51,9 @@ class CacheElem {
NONE = 0b0000, /**< No option */
FIRST_ACCESS = 0x0001, /**< First Access */
LAST_ACCESS = 0x0010, /**< Last Access */
FIRST_WRITE = 0x0100, /**< First Write */
FIRST_ACCESS_WRITE = FIRST_ACCESS | FIRST_WRITE,
/**< First access & write */
};

/**
@@ -57,7 +63,7 @@
explicit CacheElem(std::shared_ptr<SwapDevice> dev, unsigned int mem_id,
size_t off, size_t len, std::shared_ptr<MemoryData> data,
CachePolicy pol = CachePolicy::ALWAYS_SYNCED) :
initial_opt(Options::FIRST_ACCESS),
initial_opt(Options::FIRST_ACCESS_WRITE),
device(dev),
active(false),
id(mem_id),
@@ -114,7 +120,7 @@ class CacheElem {
* @brief reset access count
*
*/
void reset() { initial_opt = Options::FIRST_ACCESS; }
void reset() { initial_opt = Options::FIRST_ACCESS_WRITE; }

private:
Options initial_opt; /**< accessed */
3 changes: 3 additions & 0 deletions nntrainer/tensor/cache_pool.cpp
@@ -46,6 +46,9 @@ convertTensorLifespanToCachePolicy(const TensorLifespan lifespan) {
case TensorLifespan::FORWARD_FUNC_LIFESPAN:
policy = CachePolicy::TEMPORAL;
break;
case TensorLifespan::FORWARD_INFER_LIFESPAN:
policy = CachePolicy::SYNC_ONCE;
break;
case TensorLifespan::CALC_DERIV_LIFESPAN:
policy = CachePolicy::TEMPORAL;
break;
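
This new case is the glue between the model-level switch and the cache policy: in inference mode, requestWeights() in manager.cpp below assigns FORWARD_INFER_LIFESPAN to weight variables, and the cache pool maps that lifespan to SYNC_ONCE, i.e. the weight is read from the swap file once and never written back. A condensed sketch of the two mappings, with the enums simplified to the values involved:

// Sketch: swap mode -> tensor lifespan -> cache policy (simplified enums).
#include <string>

enum class TensorLifespan { FORWARD_INFER_LIFESPAN, MAX_LIFESPAN };
enum CachePolicy { SYNC_ONCE, ALWAYS_SYNCED };

// manager.cpp: inference-mode weights live only through forward().
TensorLifespan weightLifespan(const std::string &swap_mode) {
  return swap_mode == "inference" ? TensorLifespan::FORWARD_INFER_LIFESPAN
                                  : TensorLifespan::MAX_LIFESPAN;
}

// cache_pool.cpp: that lifespan selects the read-once, no-write-back policy.
CachePolicy policyFor(TensorLifespan ls) {
  return ls == TensorLifespan::FORWARD_INFER_LIFESPAN
           ? CachePolicy::SYNC_ONCE
           : CachePolicy::ALWAYS_SYNCED;
}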
8 changes: 5 additions & 3 deletions nntrainer/tensor/manager.cpp
@@ -144,10 +144,10 @@ void Manager::reinitialize() {
tensor_pool.reinitialize();
}

void Manager::allocateWeights(unsigned int max_exec_order_) {
void Manager::allocateWeights(unsigned int max_exec_order_, bool init) {
if (!weight_pool.isAllocated()) {
finalizeTensorPool(weight_pool, 0, max_exec_order_);
weight_pool.allocate();
weight_pool.allocate(init);
}
}

@@ -376,7 +376,9 @@ std::vector<Weight *> Manager::requestWeights(
* and therefore, if we remove the calcDerivative order, then tests fails.
*/

TensorLifespan var_ls = TensorLifespan::MAX_LIFESPAN;
TensorLifespan var_ls = swap_mode == "inference"
? TensorLifespan::FORWARD_INFER_LIFESPAN
: TensorLifespan::MAX_LIFESPAN;
TensorLifespan grad_ls = TensorLifespan::BACKWARD_FUNC_LIFESPAN;
Review comment (Contributor):
This is a pure question (it may be irrelevant to this PR, though). Do we need grad_ls for the inference swap mode? Is it related to the loss thing?
std::vector<Weight *> ret;
13 changes: 9 additions & 4 deletions nntrainer/tensor/manager.h
@@ -141,13 +141,16 @@ class Manager {
/**
* @brief Constructor of Manager
*/
Manager(bool enable_swap, const std::string &swap_path = "",
unsigned int lookahead = 0, const std::string tensor_format_ = "NCHW",
Manager(bool enable_swap, const std::string &swap_mode = "train",
const std::string &swap_path = "", unsigned int lookahead = 0,
const std::string tensor_format_ = "NCHW",
const std::string tensor_dtype_ = "FP32-FP32") :
weight_pool(enable_swap, swap_path, "weight_pool"),
tensor_pool(enable_swap, swap_path, "tensor_pool"),
tensor_pool(enable_swap && (swap_mode.compare("train") == 0), swap_path,
"tensor_pool"),
enable_optimizations(true),
swap_lookahead(lookahead),
swap_mode(swap_mode),
tensor_format(tensor_format_),
tensor_dtype(split(tensor_dtype_, getRegex("\\-"))),
exec_mode(ExecutionMode::TRAIN) {}
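
Note the asymmetry this constructor introduces: the weight pool keeps swapping in either mode, while the tensor (activation) pool only swaps when the mode is "train". A sketch of that decision, condensed into a hypothetical struct for illustration:

// Hypothetical condensation of the two pool constructor arguments above.
#include <string>

struct PoolSwapConfig {
  bool weight_pool_swap;
  bool tensor_pool_swap;
};

PoolSwapConfig poolSwap(bool enable_swap, const std::string &swap_mode) {
  // Activations are only swapped while training; inference swaps weights only.
  return {enable_swap, enable_swap && swap_mode.compare("train") == 0};
}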
@@ -381,7 +384,7 @@
* @note this will make requests to the tensor pool and allocate the
* corresponding weights
*/
void allocateWeights(unsigned int max_exec_order_);
void allocateWeights(unsigned int max_exec_order_, bool init = true);

/**
* @brief Deallocate memory for all the weights
@@ -523,6 +526,8 @@

unsigned int swap_lookahead; /** lookahead for memory swap */

std::string swap_mode; /** swap mode */

std::string tensor_format;

std::vector<std::string> tensor_dtype;
20 changes: 18 additions & 2 deletions nntrainer/tensor/swap_device.cpp
@@ -16,6 +16,7 @@
#include <profiler.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <unistd.h>

@@ -70,7 +71,7 @@ void *SwapDevice::getBuffer(off_t offset, size_t size, bool alloc_only) {
<< std::string(strerror_r(errno, error_buf, error_buflen));

void *buf = static_cast<void *>(ptr + diff);
mapped[buf] = std::make_pair(ptr, len);
mapped[buf] = std::make_tuple(ptr, len, offset, (ssize_t)size);

return buf;
#else
@@ -88,7 +89,7 @@ void *SwapDevice::getBuffer(off_t offset, size_t size, bool alloc_only) {
<< "SwapDevice: seek file: " << dev_path;

len = read(fd, ptr, size);
NNTR_THROW_IF(len != (ssize_t)size, std::runtime_error)
NNTR_THROW_IF(len != (size_t)size, std::runtime_error)
Review comment (Contributor):
Do we need to cast 'ssize_t' to 'size_t'? It looks like read() returns 'ssize_t'.
<< "SwapDevice: read file: " << dev_path;
}

@@ -107,7 +108,22 @@ void SwapDevice::putBuffer(void *ptr, bool dealloc_only) {
NNTR_THROW_IF(mapped.find(ptr) == mapped.end(), std::runtime_error)
<< "Couldn't find buffer";

off_t off;
ssize_t len;

auto info = mapped[ptr];
if (!dealloc_only) {
off = lseek(fd, std::get<2>(info), SEEK_SET);
NNTR_THROW_IF(off < 0, std::runtime_error)
<< "SwapDevice: seek file: " << dev_path;

ssize_t size = std::get<3>(info);
len = write(fd, ptr, size);
NNTR_THROW_IF(len != size, std::runtime_error)
<< "SwapDevice: write file: " << len << "::" << std::to_string(size)
<< dev_path;
}

ret = munmap(std::get<void *>(info), std::get<size_t>(info));
const size_t error_buflen = 100;
char error_buf[error_buflen];
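
This hunk turns putBuffer() from unmap-only into write-back-then-unmap: unless dealloc_only is set, it seeks to the element's file offset recorded in the mapping table and writes the buffer contents through the plain file descriptor before munmap(). A standalone sketch of that path, assuming a private (MAP_PRIVATE-style) mapping whose dirty pages are not visible through the fd; error handling is condensed:

// Sketch of the write-back performed above (fd/offset/size would come
// from the mapping bookkeeping in the real code).
#include <stdexcept>
#include <sys/mman.h>
#include <unistd.h>

void writeBackAndUnmap(int fd, void *buf, off_t offset, ssize_t size,
                       void *map_ptr, size_t map_len, bool dealloc_only) {
  if (!dealloc_only) {
    if (lseek(fd, offset, SEEK_SET) < 0)
      throw std::runtime_error("seek failed");
    if (write(fd, buf, size) != size)
      throw std::runtime_error("short write");
  }
  if (munmap(map_ptr, map_len) != 0)
    throw std::runtime_error("munmap failed");
}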
6 changes: 3 additions & 3 deletions nntrainer/tensor/swap_device.h
@@ -26,7 +26,7 @@
#include <utility>

/* Uncomment this to use mmap for swap data */
//#define USE_MMAP
#define USE_MMAP

namespace nntrainer {

@@ -119,8 +119,8 @@ class SwapDevice {
int fd; /**< device file description */

#ifdef USE_MMAP
std::map<void *, std::pair<void *, size_t>>
mapped; /**< <pointer, <orig_pointer, size>> */
std::map<void *, std::tuple<void *, size_t, off_t, ssize_t>>
mapped; /**< <pointer, <orig_pointer, size, offset, original size>> */
#else
std::map<void *, std::pair<off_t, ssize_t>>
allocated; /**< <pointer, <offset, size>> */