From 5c5e3911b2d509647611bbf81b55697e0a645ada Mon Sep 17 00:00:00 2001 From: Alexandra Sidorova Date: Tue, 7 May 2024 11:18:52 +0400 Subject: [PATCH] [Snippets] Added Dynamism support to intermediate memory [Snippets] Renamed BufferID to BufferRegisterGroup [Snippets] Changed allocation shape on size [Snippets] Added Buffer cluster_ID [Snippets][Tests] Fixed build insert_load_store test [Snippets] Split SolveBufferMemory into static and dynamic logic [Snippets] Rewrote ComputeBufferAllocationSize::get_allocation_size [Snippets] Added dynamism support to InitBuffersDefault [Snippets][Tests] Added tests for clusters [Snippets] Added buffer_expressions to ComputeBufferAllocationSize [Snippets] Added to LoopInfo for split loops: [Snippets] Removed copy from UpdateLoopInfo [Snippets] Moved UpdateLoopInfo to RuntimeConfigurator [Snippets] Add dynamic buffers support to Configurator [Snippets] Fixed Reduce decomp: add shape infer for outputs [snippets] Fixed broadcast_merge_dim in shape inference [Snippets][CPU][Tests] Enabled dynamic Softmax tests [Snippets] Removed useless function calculate_size [Snippets][CPU][Tests] Enabled dynamic reduce test [Snippets] Small fixes in solve_buffer_memory for dynamic nodes [CPU][Snippets] Removed useless emitters LoadConvert and StoreConvert [Snippets] Added missed consumers cloning [Snippets][CPU] Added buffer offsets to call_args [Snippets][CPU] Added dynamic offsets support to load and store emitters [CPU][UnitTests] Fixed build [Snippets][AArch64] Fixed build [Snippets] Small fixes --- .../snippets/docs/snippets_design_guide.md | 14 +- .../include/snippets/lowered/linear_ir.hpp | 9 +- .../snippets/lowered/linear_ir_builder.hpp | 15 +- .../include/snippets/lowered/loop_info.hpp | 26 +- .../include/snippets/lowered/loop_manager.hpp | 10 +- .../lowered/pass/allocate_buffers.hpp | 2 - .../pass/compute_buffer_allocation_size.hpp | 38 +++ .../lowered/pass/define_buffer_clusters.hpp | 14 +- .../lowered/pass/init_buffers_default.hpp 
| 2 +- .../snippets/lowered/pass/insert_buffers.hpp | 4 +- ...ds.hpp => normalize_buffer_reg_groups.hpp} | 14 +- ...y_buffers.hpp => set_buffer_reg_group.hpp} | 19 +- .../lowered/pass/solve_buffer_memory.hpp | 33 ++- .../lowered/pass/update_loop_info.hpp | 46 ---- .../snippets/include/snippets/op/buffer.hpp | 52 ++-- .../include/snippets/runtime_configurator.hpp | 27 +- .../snippets/include/snippets/utils.hpp | 39 ++- src/common/snippets/src/lowered/linear_ir.cpp | 7 + .../src/lowered/linear_ir_builder.cpp | 20 ++ src/common/snippets/src/lowered/loop_info.cpp | 38 +-- .../snippets/src/lowered/loop_manager.cpp | 7 +- .../src/lowered/pass/allocate_buffers.cpp | 23 +- .../src/lowered/pass/assign_registers.cpp | 8 +- .../pass/clean_repeated_ptr_shifts.cpp | 10 +- .../pass/compute_buffer_allocation_size.cpp | 101 ++++++++ .../lowered/pass/define_buffer_clusters.cpp | 41 +-- .../snippets/src/lowered/pass/fuse_loops.cpp | 3 +- .../src/lowered/pass/init_buffers_default.cpp | 17 +- .../snippets/src/lowered/pass/init_loops.cpp | 9 +- .../src/lowered/pass/insert_buffers.cpp | 86 +------ .../pass/insert_specific_iterations.cpp | 19 +- .../src/lowered/pass/iter_handler.cpp | 1 + .../src/lowered/pass/normalize_buffer_ids.cpp | 39 --- .../pass/normalize_buffer_reg_groups.cpp | 39 +++ .../src/lowered/pass/reduce_decomposition.cpp | 7 + ...y_buffers.cpp => set_buffer_reg_group.cpp} | 30 +-- .../src/lowered/pass/solve_buffer_memory.cpp | 172 +++++++++---- .../snippets/src/lowered/pass/split_loops.cpp | 1 + .../src/lowered/pass/update_loop_info.cpp | 92 ------- .../snippets/src/lowered/pass/validate.cpp | 1 + .../lowered/pass/validate_expanded_loops.cpp | 14 +- src/common/snippets/src/op/buffer.cpp | 52 ++-- src/common/snippets/src/op/loop.cpp | 9 +- src/common/snippets/src/op/memory_access.cpp | 28 ++- src/common/snippets/src/op/subgraph.cpp | 2 +- .../snippets/src/runtime_configurator.cpp | 166 +++++++++++-- .../shape_inference/shape_infer_instances.cpp | 14 +- 
src/common/snippets/src/utils.cpp | 7 +- .../lowered/pass/buffer_allocation.hpp | 13 +- .../src/lowered/pass/buffer_allocation.cpp | 121 ++------- .../src/lowered/pass/insert_load_store.cpp | 1 + .../snippets/tests/src/lowered/pass/loop.cpp | 2 +- .../snippets/aarch64/jit_kernel_emitter.cpp | 2 +- .../snippets/jit_snippets_call_args.hpp | 1 + .../emitters/snippets/x64/cpu_generator.cpp | 14 +- .../snippets/x64/jit_kernel_emitter.cpp | 6 +- .../snippets/x64/jit_memory_emitters.cpp | 233 ++++++++---------- .../snippets/x64/jit_memory_emitters.hpp | 57 ++--- .../src/emitters/snippets/x64/verbose.cpp | 22 +- src/plugins/intel_cpu/src/nodes/subgraph.cpp | 4 + .../snippets/x64/op/brgemm_cpu.cpp | 1 + .../x64/pass/brgemm_to_brgemm_cpu.cpp | 1 + .../set_brgemm_copy_b_buffers_shape.cpp | 4 +- .../skip_tests_config.cpp | 5 +- .../snippets/softmax.cpp | 19 +- .../x64/lowered/buffer_allocation.cpp | 149 +++++++++-- 66 files changed, 1176 insertions(+), 906 deletions(-) create mode 100644 src/common/snippets/include/snippets/lowered/pass/compute_buffer_allocation_size.hpp rename src/common/snippets/include/snippets/lowered/pass/{normalize_buffer_ids.hpp => normalize_buffer_reg_groups.hpp} (65%) rename src/common/snippets/include/snippets/lowered/pass/{identify_buffers.hpp => set_buffer_reg_group.hpp} (89%) delete mode 100644 src/common/snippets/include/snippets/lowered/pass/update_loop_info.hpp create mode 100644 src/common/snippets/src/lowered/pass/compute_buffer_allocation_size.cpp delete mode 100644 src/common/snippets/src/lowered/pass/normalize_buffer_ids.cpp create mode 100644 src/common/snippets/src/lowered/pass/normalize_buffer_reg_groups.cpp rename src/common/snippets/src/lowered/pass/{identify_buffers.cpp => set_buffer_reg_group.cpp} (88%) delete mode 100644 src/common/snippets/src/lowered/pass/update_loop_info.cpp diff --git a/src/common/snippets/docs/snippets_design_guide.md b/src/common/snippets/docs/snippets_design_guide.md index 3f44bde1cace3f..ce331c7c9fdc2b 
100644 --- a/src/common/snippets/docs/snippets_design_guide.md +++ b/src/common/snippets/docs/snippets_design_guide.md @@ -605,17 +605,17 @@ Again, the explicit operations are needed to emit appropriate instructions later As mentioned above the `op::Buffer` operations are managed by the pass `AllocateBuffers`. Before describing the algorithm, it is necessary to briefly consider the structure of `Buffer`: * All `Buffers` represent `Buffer scratchpad` together (a common memory that is needed for intermediate results storing). -* Each `Buffer` has an `offset` relative to the common data pointer (pointer of `Buffer scratchpad`) and `ID` (the `Buffers` with the same `ID` have the same assigned register). +* Each `Buffer` has an `offset` relative to the common data pointer (pointer of `Buffer scratchpad`), `RegGroup` (the `Buffers` with the same `RegGroup` have the same assigned register) and `ClusterID` (the buffers from the same cluster refer to the same memory area - they have the same `offset` relative to the `Buffer scratchpad` data pointer). The algorithm supports two modes: optimized and non-optimized. -The optimized one calculates minimal memory size and minimal unique `ID` count required to handle all the buffers. -The non-optimized version assigns each buffer an unique `ID` and `offset`. +The optimized one calculates minimal memory size and minimal unique `RegGroup` count required to handle all the buffers. +The non-optimized version assigns each buffer an unique `RegGroup`, `ClusterID` and `offset`. The first mode is the default one, while the second one might be used for debugging the optimized version. The optimized algorithm `AllocateBuffers` has the main following steps: -1. `IdentifyBuffers` - analyzes `Buffers` access patterns to avoid redundant pointer increments. A graph coloring algorithm is utilized for this purpose. -2. `DefineBufferClusters` - creates sets of `Buffer` ops - `BufferClusters`. 
-`Buffers` from one `BufferCluster` refer to the same memory area (they have the same `offset` relative to the `Buffer scratchpad` data pointer). -For example, there is a loop with `Buffer` ops on input and output. If the body of this loop can write data to the memory from which it was read, these `Buffers` are in one `BufferCluster`. +1. `SetBufferRegGroup` - analyzes `Buffers` access patterns to avoid redundant pointer increments. A graph coloring algorithm is utilized for this purpose. +2. `DefineBufferClusters` - creates sets of `Buffer` ops (buffer clusters) and set `ClusterID` value to `Buffer` ops. +As noticed above, `Buffers` from one cluster refer to the same memory area. +For example, there is a loop with `Buffer` ops on input and output. If the body of this loop can write data to the memory from which it was read, these `Buffers` are in one cluster. 3. `SolveBufferMemory` - calculate the most optimal memory size of `Buffer scratchpad` based on `BufferClusters` and life time of `Buffers`. More details on control flow optimization passes could be found in the `control_flow_transformations(...)` method inside [subgraph.cpp](../src/op/subgraph.cpp). 
diff --git a/src/common/snippets/include/snippets/lowered/linear_ir.hpp b/src/common/snippets/include/snippets/lowered/linear_ir.hpp index 5fd3984c430fe8..b8b11082c99eef 100644 --- a/src/common/snippets/include/snippets/lowered/linear_ir.hpp +++ b/src/common/snippets/include/snippets/lowered/linear_ir.hpp @@ -76,13 +76,14 @@ class LinearIR { ExpressionPtr create_expression(const std::shared_ptr& n, const std::vector& inputs) const; const container& get_ops() const { return m_expressions; } + const container& get_buffer_ops() const { return m_buffer_expressions; } const container& get_parameters() const { return m_parameter_expressions; } const container& get_results() const { return m_result_expressions; } const Config& get_config() const { return m_config; } - size_t get_buffer_scratchpad_size() const { return m_buffer_scratchpad_size; } + size_t get_static_buffer_scratchpad_size() const { return m_static_buffer_scratchpad_size; } void set_loop_depth(size_t loop_depth) { m_config.m_loop_depth = loop_depth; } - void set_buffer_scratchpad_size(size_t size) { m_buffer_scratchpad_size = size; } + void set_static_buffer_scratchpad_size(size_t size) { m_static_buffer_scratchpad_size = size; } const ExpressionPtr& get_expr_by_node(const std::shared_ptr& n) const; @@ -278,13 +279,15 @@ class LinearIR { std::unordered_map, std::shared_ptr> m_node2expression_map; container m_parameter_expressions{}; container m_result_expressions{}; + container m_buffer_expressions{}; Config m_config{}; LoopManagerPtr m_loop_manager; std::shared_ptr m_shape_infer_factory; std::shared_ptr m_shape_infer = nullptr; bool m_is_dynamic = false; - size_t m_buffer_scratchpad_size = 0; + // Size of static Buffer Scratchpad (Buffers with defined allocation size) + size_t m_static_buffer_scratchpad_size = 0; }; using LinearIRPtr = std::shared_ptr; diff --git a/src/common/snippets/include/snippets/lowered/linear_ir_builder.hpp b/src/common/snippets/include/snippets/lowered/linear_ir_builder.hpp index 
969bf21cd27480..b9cfb87af617d6 100644 --- a/src/common/snippets/include/snippets/lowered/linear_ir_builder.hpp +++ b/src/common/snippets/include/snippets/lowered/linear_ir_builder.hpp @@ -17,11 +17,24 @@ namespace lowered { class LinearIRBuilder { public: struct Config { - Config(bool deep_copy_of_shapes_ = true) : deep_copy_of_shapes(deep_copy_of_shapes_) {} + Config(bool deep_copy_of_shapes_ = true, bool copy_missed_consumers_ = true) + : deep_copy_of_shapes(deep_copy_of_shapes_), copy_missed_consumers(copy_missed_consumers_) {} // If True, copy of stored pointer in `PortDescriptor::m_tensor_shape`. // If False, copy shapes as shared pointers. const bool deep_copy_of_shapes = true; + // At the moment, input port of expression must have only one source. + // However, for example, after LinearIR range insertion to the LinearIR (InsertSpecificIteration pass) + // several operations can write to the same consumer: several `Store` ops from different loop bodies store to the same Buffer/Result. + // Since `clone` algorithm is linear and during expression cloning creates only input port connectors from sources, + // algorithm can miss some consumers. For example: + // The consumers of Store0 : Buffer0 + // The consumers of Store1 : Buffer0 + // The result: Buffer0 has only one source in input connector - Store1 + // Algorithm automatically doesn't add Buffer to consumers of Store0. Thus, + // If True, `clone` algorithm add missed consumers. + // If False, cloned LinearIR will be built by default (without extra consumers). 
+ const bool copy_missed_consumers = true; }; LinearIRBuilder(Config config = {}) : m_config(std::move(config)) {} diff --git a/src/common/snippets/include/snippets/lowered/loop_info.hpp b/src/common/snippets/include/snippets/lowered/loop_info.hpp index ca28b27a760ac7..595d93834bd97e 100644 --- a/src/common/snippets/include/snippets/lowered/loop_info.hpp +++ b/src/common/snippets/include/snippets/lowered/loop_info.hpp @@ -23,8 +23,9 @@ class LoopInfo { enum {UNDEFINED_DIM_IDX = std::numeric_limits::max()}; LoopInfo() = default; - LoopInfo(size_t work_amount, size_t increment, const std::vector& entries, const std::vector& exits); - LoopInfo(size_t work_amount, size_t increment, const std::vector& entries, const std::vector& exits); + LoopInfo(size_t work_amount, size_t increment, const std::vector& entries, const std::vector& exits, bool is_wa_const = false); + LoopInfo(size_t work_amount, size_t increment, const std::vector& entries, const std::vector& exits, + bool is_wa_const = false); virtual ~LoopInfo() = default; /** @@ -76,6 +77,11 @@ class LoopInfo { * @return m_output_ports */ const std::vector& get_output_ports() const; + /** + * @brief Returns True if `work_amount` cannot be rewritten/updated by passes. 
+ * @return m_is_work_amount_const + */ + bool is_work_amount_const() const; /** * @brief Set m_work_amount value @@ -92,6 +98,11 @@ class LoopInfo { * @param dim_idx - index */ void set_dim_idx(size_t dim_idx); + /** + * @brief Sets `value` to `m_is_work_amount_const` + * @param value - value of the attribute + */ + void set_work_amount_const(bool value); /** * @brief Replace the current LoopPort `actual_port` with new `target_ports` @@ -164,6 +175,9 @@ class LoopInfo { // Note: Scalars aren't input expressions but can be before first input expr in Linear IR std::vector m_input_ports = {}; std::vector m_output_ports = {}; + + // If True, no one pass can rewrite the value of `m_work_amount` + bool m_is_work_amount_const = false; }; using LoopInfoPtr = std::shared_ptr; @@ -197,13 +211,13 @@ class UnifiedLoopInfo : public LoopInfo { UnifiedLoopInfo(size_t work_amount, size_t increment, const std::vector& entries, const std::vector& exits, const std::vector& in_descs, const std::vector& out_descs, - const SpecificIterationHandlers& handlers = SpecificIterationHandlers()); + const SpecificIterationHandlers& handlers = SpecificIterationHandlers(), bool is_wa_const = false); UnifiedLoopInfo(size_t work_amount, size_t increment, const std::vector& entries, const std::vector& exits, - const SpecificIterationHandlers& handlers = SpecificIterationHandlers()); + const SpecificIterationHandlers& handlers = SpecificIterationHandlers(), bool is_wa_const = false); UnifiedLoopInfo(size_t work_amount, size_t increment, const std::vector& entries, const std::vector& exits, - const SpecificIterationHandlers& handlers = SpecificIterationHandlers()); + const SpecificIterationHandlers& handlers = SpecificIterationHandlers(), bool is_wa_const = false); /** * @brief Clone LoopInfo with new expressions @@ -365,7 +379,7 @@ class ExpandedLoopInfo : public LoopInfo { ExpandedLoopInfo(size_t work_amount, size_t increment, const std::vector& entries, const std::vector& exits, std::vector 
ptr_increments, std::vector final_offsets, std::vector data_sizes, - SpecificLoopIterType type, std::shared_ptr unified_loop_info); + SpecificLoopIterType type, std::shared_ptr unified_loop_info, bool is_wa_const = false); /** * @brief Clone LoopInfo with new expressions * @param expr_map map of new and old expressions diff --git a/src/common/snippets/include/snippets/lowered/loop_manager.hpp b/src/common/snippets/include/snippets/lowered/loop_manager.hpp index 570120408c37fb..f0718107ca30a2 100644 --- a/src/common/snippets/include/snippets/lowered/loop_manager.hpp +++ b/src/common/snippets/include/snippets/lowered/loop_manager.hpp @@ -99,12 +99,13 @@ class LoopManager { size_t increment, const std::vector& entries, const std::vector& exits, - bool set_default_handlers = true) { + bool set_default_handlers = true, + bool is_work_amount_const = false) { const auto normalized_increment = utils::is_dynamic_value(work_amount) || work_amount == 0 ? increment : std::min(increment, work_amount); const auto handlers = set_default_handlers ? 
SpecificIterationHandlers(work_amount, normalized_increment) : SpecificIterationHandlers(); - const auto loop_info = std::make_shared(work_amount, normalized_increment, entries, exits, handlers); + const auto loop_info = std::make_shared(work_amount, normalized_increment, entries, exits, handlers, is_work_amount_const); const auto loop_id = this->add_loop_info(loop_info); for (auto expr_it = loop_begin_pos; expr_it != loop_end_pos; ++expr_it) { insert_loop_id(*expr_it, loop_id); @@ -131,8 +132,9 @@ class LoopManager { size_t dim_idx, const std::vector& entries, const std::vector& exits, - bool set_default_handlers = true) { - const auto loop_id = mark_loop(loop_begin_pos, loop_end_pos, work_amount, increment, entries, exits, set_default_handlers); + bool set_default_handlers = true, + bool is_work_amount_const = false) { + const auto loop_id = mark_loop(loop_begin_pos, loop_end_pos, work_amount, increment, entries, exits, set_default_handlers, is_work_amount_const); const auto loop_info = get_loop_info(loop_id); loop_info->set_dim_idx(dim_idx); return loop_id; diff --git a/src/common/snippets/include/snippets/lowered/pass/allocate_buffers.hpp b/src/common/snippets/include/snippets/lowered/pass/allocate_buffers.hpp index 623c32c7ba1d39..f9a8331c65f3da 100644 --- a/src/common/snippets/include/snippets/lowered/pass/allocate_buffers.hpp +++ b/src/common/snippets/include/snippets/lowered/pass/allocate_buffers.hpp @@ -42,8 +42,6 @@ class AllocateBuffers: public RangedPass { */ static void set_buffer_offset(const ExpressionPtr& buffer_expr, const size_t offset); - using BufferCluster = std::set; - using BufferClusters = std::vector; private: bool m_is_optimized_mode = true; }; diff --git a/src/common/snippets/include/snippets/lowered/pass/compute_buffer_allocation_size.hpp b/src/common/snippets/include/snippets/lowered/pass/compute_buffer_allocation_size.hpp new file mode 100644 index 00000000000000..89769f150d1c8d --- /dev/null +++ 
b/src/common/snippets/include/snippets/lowered/pass/compute_buffer_allocation_size.hpp @@ -0,0 +1,38 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "pass.hpp" + +#include "snippets/lowered/loop_manager.hpp" + +namespace ov { +namespace snippets { +namespace lowered { +namespace pass { + +/** + * @interface ComputeBufferAllocationSize + * @brief The pass calculate allocation sizes of Buffers. + * @param m_buffer_allocation_rank - rank of shape for memory allocation: shape[m_allocation_rank : -1] + * @ingroup snippets + */ +class ComputeBufferAllocationSize : public RangedPass { +public: + OPENVINO_RTTI("ComputeBufferAllocationSize", "RangedPass") + ComputeBufferAllocationSize(size_t buffer_allocation_rank) : m_buffer_allocation_rank(buffer_allocation_rank) {} + + bool run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) override; + + static size_t get_allocation_size(const LoopManagerPtr& loop_manager, const ExpressionPtr& buffer_expr, size_t allocation_rank); + +private: + size_t m_buffer_allocation_rank = 0; +}; + +} // namespace pass +} // namespace lowered +} // namespace snippets +} // namespace ov diff --git a/src/common/snippets/include/snippets/lowered/pass/define_buffer_clusters.hpp b/src/common/snippets/include/snippets/lowered/pass/define_buffer_clusters.hpp index 67254d879f3351..824b0d4daea75d 100644 --- a/src/common/snippets/include/snippets/lowered/pass/define_buffer_clusters.hpp +++ b/src/common/snippets/include/snippets/lowered/pass/define_buffer_clusters.hpp @@ -6,8 +6,6 @@ #include "pass.hpp" -#include "allocate_buffers.hpp" - namespace ov { namespace snippets { namespace lowered { @@ -35,7 +33,7 @@ class DefineBufferClusters : public RangedPass { public: OPENVINO_RTTI("DefineBufferClusters", "RangedPass") - DefineBufferClusters(AllocateBuffers::BufferClusters& clusters) : m_clusters(clusters) {} + DefineBufferClusters() = 
default; /** * @brief Apply the pass to the Linear IR @@ -45,13 +43,15 @@ class DefineBufferClusters : public RangedPass { bool run(lowered::LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) override; private: + using BufferCluster = std::set; + using BufferClusters = std::vector; using BufferPorts = std::unordered_map>; /** * @brief Finds Buffer cluster in set of clusters which contains the target expression with Buffer * @param target target expression with Buffer op * @return vector iterator which refers to the found cluster */ - AllocateBuffers::BufferClusters::iterator find_cluster_by_expr(const ExpressionPtr& target); + BufferClusters::iterator find_cluster_by_expr(const ExpressionPtr& target); /** * @brief Returns True if Buffer is direct source for the target expr (there aren't other loop between the Buffer and target expr) * @param buffer_expr expression with assumed Buffer op @@ -70,7 +70,7 @@ class DefineBufferClusters : public RangedPass { * @param cluster set of Buffer expressions - cluster * @return common buffer ID or SIZE_MAX - size value */ - size_t get_cluster_buffer_id(const AllocateBuffers::BufferCluster& cluster) const; + size_t get_cluster_buffer_id(const BufferCluster& cluster) const; /** * @brief Analyzes Loop: if Loop has Buffer ops on inputs and outputs, Loop can read and write from/to the same memory. 
@@ -126,10 +126,10 @@ class DefineBufferClusters : public RangedPass { * @param is_outer_up true if outer buffer is upper in Linear IR than inner Buffers * @return Return True if clusters have been united */ - bool unite_nested_clusters(const AllocateBuffers::BufferClusters::iterator& inner_cluster_it, AllocateBuffers::BufferCluster& outer_cluster, + bool unite_nested_clusters(const BufferClusters::iterator& inner_cluster_it, BufferCluster& outer_cluster, const ExpressionPtr& outer_buffer, bool is_outer_up); - AllocateBuffers::BufferClusters& m_clusters; + BufferClusters m_clusters; }; } // namespace pass diff --git a/src/common/snippets/include/snippets/lowered/pass/init_buffers_default.hpp b/src/common/snippets/include/snippets/lowered/pass/init_buffers_default.hpp index 3b085ca2b32f80..5ddb2749d63998 100644 --- a/src/common/snippets/include/snippets/lowered/pass/init_buffers_default.hpp +++ b/src/common/snippets/include/snippets/lowered/pass/init_buffers_default.hpp @@ -13,7 +13,7 @@ namespace pass { /** * @interface InitBuffersDefault - * @brief The pass inits Buffer expressions in LinearIR default (non-optimized): sets unique offsets and ID to Buffers. + * @brief The pass inits Buffer expressions in LinearIR default (non-optimized): sets unique offsets and reg groups to Buffers. 
* @ingroup snippets */ diff --git a/src/common/snippets/include/snippets/lowered/pass/insert_buffers.hpp b/src/common/snippets/include/snippets/lowered/pass/insert_buffers.hpp index f38666cd4de1ba..40a2611b80ef48 100644 --- a/src/common/snippets/include/snippets/lowered/pass/insert_buffers.hpp +++ b/src/common/snippets/include/snippets/lowered/pass/insert_buffers.hpp @@ -24,7 +24,7 @@ namespace pass { class InsertBuffers : public RangedPass { public: OPENVINO_RTTI("InsertBuffers", "RangedPass") - InsertBuffers(int32_t buffer_allocation_rank); + InsertBuffers() = default; bool run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) override; private: @@ -39,8 +39,6 @@ class InsertBuffers : public RangedPass { const LoopManagerPtr& loop_manager, const ExpressionPtr& expr, const ExpressionPtr& down_expr); - - int32_t m_buffer_allocation_rank; }; } // namespace pass diff --git a/src/common/snippets/include/snippets/lowered/pass/normalize_buffer_ids.hpp b/src/common/snippets/include/snippets/lowered/pass/normalize_buffer_reg_groups.hpp similarity index 65% rename from src/common/snippets/include/snippets/lowered/pass/normalize_buffer_ids.hpp rename to src/common/snippets/include/snippets/lowered/pass/normalize_buffer_reg_groups.hpp index 81b7536b63edaa..e07d11da70d904 100644 --- a/src/common/snippets/include/snippets/lowered/pass/normalize_buffer_ids.hpp +++ b/src/common/snippets/include/snippets/lowered/pass/normalize_buffer_reg_groups.hpp @@ -12,20 +12,20 @@ namespace lowered { namespace pass { /** - * @interface NormalizeBufferIDs - * @brief After optimizations some Buffer IDs might be set unevenly: some numbers are missed. + * @interface NormalizeBufferRegisterGroups + * @brief After optimizations some Buffer RegGroups might be set unevenly: some numbers are missed. * For example, - * [Buffer -> ID] - * Buffer0 -> 0 Two Buffers have ID = 0, one has ID = 2. 
- * Buffer1 -> 2 Obviosly, we can normalize this IDs to set ID = 1 to Buffer1. + * [Buffer -> RegGroup] + * Buffer0 -> 0 Two Buffers have RegGroup = 0, one has RegGroup = 2. + * Buffer1 -> 2 Obviosly, we can normalize this IDs to set RegGroup = 1 to Buffer1. * Buffer2 -> 0 It helps to assign GPR registers in `AssignRegister` more effective. * Thus, the pass normalize IDs of Buffers in Linear IR. * @ingroup snippets */ -class NormalizeBufferIDs : public RangedPass { +class NormalizeBufferRegisterGroups : public RangedPass { public: - OPENVINO_RTTI("NormalizeBufferIDs", "RangedPass") + OPENVINO_RTTI("NormalizeBufferRegisterGroups", "RangedPass") /** * @brief Apply the pass to the Linear IR * @param linear_ir the target Linear IR diff --git a/src/common/snippets/include/snippets/lowered/pass/identify_buffers.hpp b/src/common/snippets/include/snippets/lowered/pass/set_buffer_reg_group.hpp similarity index 89% rename from src/common/snippets/include/snippets/lowered/pass/identify_buffers.hpp rename to src/common/snippets/include/snippets/lowered/pass/set_buffer_reg_group.hpp index 2289ef0246e8ff..f1f57afc6e2fd4 100644 --- a/src/common/snippets/include/snippets/lowered/pass/identify_buffers.hpp +++ b/src/common/snippets/include/snippets/lowered/pass/set_buffer_reg_group.hpp @@ -14,9 +14,9 @@ namespace lowered { namespace pass { /** - * @interface IdentifyBuffers - * @brief The pass set identifiers for Buffers in common Buffer system. - * The buffers with the same identifier will be assigned the same data register. + * @interface SetBufferRegGroup + * @brief The pass groups Buffers by Register groups. + * The buffers with the same RegGroup will be assigned the same data register. * The pass uses greedy graph coloring algorithm using adjacency matrix: * - Buffers - are vertices of graph; * - Loops, Brgemm (the same other ops) - are "edges" between Buffers (hub of edges). 
@@ -26,13 +26,12 @@ namespace pass { * or one of the Buffers is in some a Loop but another Buffer is not; * - Firstly, create adjacency matrix using the definition above; * - Secondly, assign the same color to non-adjacent vertices of graph (buffers), and use different colors otherwise. - * Note: should be called before ResetBuffer() pass to have correct offsets * @ingroup snippets */ -class IdentifyBuffers: public RangedPass { +class SetBufferRegGroup: public RangedPass { public: - OPENVINO_RTTI("IdentifyBuffers", "RangedPass") - IdentifyBuffers() = default; + OPENVINO_RTTI("SetBufferRegGroup", "RangedPass") + SetBufferRegGroup() = default; /** * @brief Apply the pass to the Linear IR @@ -57,12 +56,12 @@ class IdentifyBuffers: public RangedPass { }; /** - * @brief Check if two Buffers can reuse ID by ShiftPtrParams < data_size, ptr_increment, finalization_offset > + * @brief Check if two Buffers can be in one register group by ShiftPtrParams < data_size, ptr_increment, finalization_offset > * @param lhs Data pointer shift params for first Buffer * @param rhs Data pointer shift params for second Buffer - * @return Returns True if params are valid for reusing. Otherwise returns False + * @return Returns True if params are valid to reuse one register. 
Otherwise returns False */ - static bool can_reuse_id(const ShiftPtrParams& lhs, const ShiftPtrParams& rhs); + static bool can_be_in_one_group(const ShiftPtrParams& lhs, const ShiftPtrParams& rhs); private: using BufferPool = std::vector; diff --git a/src/common/snippets/include/snippets/lowered/pass/solve_buffer_memory.hpp b/src/common/snippets/include/snippets/lowered/pass/solve_buffer_memory.hpp index dfa5c3fc54d120..45eda9d4dc145f 100644 --- a/src/common/snippets/include/snippets/lowered/pass/solve_buffer_memory.hpp +++ b/src/common/snippets/include/snippets/lowered/pass/solve_buffer_memory.hpp @@ -6,7 +6,6 @@ #include "pass.hpp" -#include "allocate_buffers.hpp" #include "openvino/runtime/memory_solver.hpp" namespace ov { @@ -17,7 +16,9 @@ namespace pass { /** * @interface SolveBufferMemory * @brief The pass optimally calculates the common buffer scratchpad size and - * set the offsets relative to the common data pointer to all Buffers. The pass uses MemorySolver API. + * set the offsets relative to the common data pointer to all defined Buffers. + * The pass uses MemorySolver API to calculate common allocation size for static Buffers. + * If some Buffers have unknown allocation size, the pass set `dynamic` offset. * Note: The pass requires expression enumeration. It should be executed separately before this pass! 
* Note: this transformation works only with m_clusters, no lir or iterators are really needed * @ingroup snippets @@ -26,8 +27,7 @@ class SolveBufferMemory : public Pass { public: OPENVINO_RTTI("SolveBufferMemory", "Pass") - SolveBufferMemory(size_t& buffer_scratchpad_size, AllocateBuffers::BufferClusters& clusters) - : m_buffer_scratchpad_size(buffer_scratchpad_size), m_clusters(clusters) {} + SolveBufferMemory(size_t& static_buffer_scratchpad_size) : m_static_buffer_scratchpad_size(static_buffer_scratchpad_size) {} /** * @brief Apply the pass to the Linear IR * @param linear_ir the target Linear IR @@ -36,15 +36,32 @@ class SolveBufferMemory : public Pass { bool run(lowered::LinearIR& linear_ir) override; private: + /** + * @brief Split buffer expressions of Linear IR into + * static (with defined allocation size) and dynamic (with unknown size) buffers + * @param buffer_expressions buffer expressions + * @return the pair of static and dynamic buffer expressions + */ + std::pair extract_static_and_dynamic_buffers(const LinearIR::container& buffer_expressions); /** * @brief Initializes boxes for MemorySolver - * @param buffer_clusters buffer clusters. 
These clusters could be obtained using DefineBufferClusters pass + * @param buffer_expressions buffer expressions * @return vector of boxes for MemorySolver */ - std::vector init_boxes(const AllocateBuffers::BufferClusters& buffer_clusters); + std::vector init_boxes(const LinearIR::container& buffer_expressions); + /** + * @brief Calculate memory size and propagate offsets to MA ops for buffer with defined allocation size + * @param static_buffer_expressions static buffer expressions + */ + void solve_static_buffer_memory(const LinearIR::container& static_buffer_expressions); + /** + * @brief Propagate dynamic offset to MA ops for buffer with undefined allocation size + * Note: should be called after `solve_static_buffer_memory` + * @param dynamic_buffer_expressions dynamic buffer expressions + */ + void set_dynamic_buffer_offset(const LinearIR::container& dynamic_buffer_expressions); - size_t& m_buffer_scratchpad_size; - AllocateBuffers::BufferClusters& m_clusters; + size_t& m_static_buffer_scratchpad_size; constexpr static size_t m_alignment = 32; // 32 bytes for data alignment in allocated memory }; diff --git a/src/common/snippets/include/snippets/lowered/pass/update_loop_info.hpp b/src/common/snippets/include/snippets/lowered/pass/update_loop_info.hpp deleted file mode 100644 index 2b391251bbe8a4..00000000000000 --- a/src/common/snippets/include/snippets/lowered/pass/update_loop_info.hpp +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright (C) 2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma once - -#include "pass.hpp" - -#include "snippets/lowered/loop_info.hpp" - -namespace ov { -namespace snippets { -namespace lowered { -namespace pass { - -/** - * @interface UpdateLoopInfo - * @brief The pass update the existing UnifiedLoopInfo and the corresponding ExpandedLoopInfos. - * Notes: - * - LinearIR must have LoopManager only with ExpandedLoopInfo (LinearIR contains decomposed loops). - * Each of them has the pointer to UnifiedLoopInfo. 
- * - ExpandedLoopInfos` in LoopManager are sorted by execution order (NormalizeLoopIDs pas has been already passed). - * @ingroup snippets - */ - -class UpdateLoopInfo : public Pass { -public: - OPENVINO_RTTI("UpdateLoopInfo", "Pass") - UpdateLoopInfo() = default; - bool run(LinearIR& linear_ir) override; - -private: - /** - * @brief Initializes common ptr_increments and finalization offsets for ExpandedLoopInfo from ports of UnifiedLoopInfo - * @param unified_loop_info UnifiedLoopInfo - * @param ptr_increments ref of vector with ptr increments - * @param finalization_offsets ref of vector with finalization offsets - */ - static void init_data_ptr_shifts(const UnifiedLoopInfoPtr& unified_loop_info, std::vector& ptr_increments, - std::vector& finalization_offsets); -}; - -} // namespace pass -} // namespace lowered -} // namespace snippets -} // namespace ov \ No newline at end of file diff --git a/src/common/snippets/include/snippets/op/buffer.hpp b/src/common/snippets/include/snippets/op/buffer.hpp index 454fb5301adca4..199ebb99e8532b 100644 --- a/src/common/snippets/include/snippets/op/buffer.hpp +++ b/src/common/snippets/include/snippets/op/buffer.hpp @@ -6,6 +6,7 @@ #include "openvino/op/op.hpp" #include "snippets/shape_inference/shape_inference.hpp" +#include "snippets/utils.hpp" namespace ov { namespace snippets { @@ -15,35 +16,42 @@ namespace op { * @interface Buffer * @brief This is a base class for memory storage. * Notes: - * - All buffers with the same ID in a graph have the same memory pointer. So if we have a few buffers, + * - All buffers with the same reg_group in a graph have the same memory pointer. 
So if we have a few buffers, * each the corresponding MemoryAccess op for Buffer should have offset for common memory pointer of this Buffer * - Buffer should be a single consumer for operation output port - * @param m_shape - output allocation shape for Buffer with type NewMemory + * @param m_allocation_size - memory size for allocation in u8 data type. Dynamic value means undefined size. * @param m_offset - offset in common Buffer scratchpad - * @param m_id - Buffer ID in common Buffer system + * @param m_reg_group - number of register group. The Buffers from the same register group will have the same GPR + * @param m_cluster_id - number of cluster. The Buffers from the same cluster share memory between them and will have the same offset. * @ingroup snippets */ class Buffer : public ov::op::Op { public: OPENVINO_OP("Buffer", "SnippetsOpset"); Buffer() = default; - Buffer(const OutputVector& arguments, const ov::Shape& shape, size_t id, ov::element::Type element_type = ov::element::u8); + Buffer(const OutputVector& arguments, size_t allocation_size = utils::get_dynamic_value(), size_t reg_group = 0, size_t cluster_id = 0); bool visit_attributes(AttributeVisitor& visitor) override; - size_t get_id() const { return m_id; } - int64_t get_offset() const { return m_offset; } - void set_id(size_t id) { m_id = id; } - const ov::Shape& get_allocation_shape() const { return m_shape; } - void set_allocation_shape(const ov::Shape& allocation_shape) { m_shape = allocation_shape; } - void set_offset(int64_t offset) { m_offset = offset; } + size_t get_reg_group() const { return m_reg_group; } + size_t get_cluster_id() const { return m_cluster_id; } + size_t get_offset() const { return m_offset; } + size_t get_allocation_size() const { return m_allocation_size; } size_t get_byte_size() const; + void set_reg_group(size_t reg_group) { m_reg_group = reg_group; } + void set_cluster_id(size_t cluster) { m_cluster_id = cluster; } + void set_allocation_size(size_t allocation_size) {
m_allocation_size = allocation_size; } + void set_offset(size_t offset) { m_offset = offset; } + + // Returns True, if allocation size is known. Otherwise returns False - allocation size is undefined + bool is_defined() const; + protected: - ov::Shape m_shape = {}; - size_t m_id = 0; // Default ID - 0. All Buffers are from the same set - ov::element::Type m_element_type = ov::element::u8; // u8 - default 1 byte - int64_t m_offset = 0; + size_t m_allocation_size = utils::get_dynamic_value(); + size_t m_reg_group = 0; + size_t m_cluster_id = 0; + size_t m_offset = 0; }; /** @@ -56,14 +64,11 @@ class IntermediateMemoryBuffer : public Buffer { public: OPENVINO_OP("IntermediateMemoryBuffer", "SnippetsOpset", Buffer); IntermediateMemoryBuffer() = default; - IntermediateMemoryBuffer(const ov::Output& arg, const ov::Shape& shape, size_t id = 0); - IntermediateMemoryBuffer(const ov::Output& arg, int32_t allocation_rank = -1, size_t id = 0); + IntermediateMemoryBuffer(const ov::Output& arg, size_t allocation_size = utils::get_dynamic_value(), + size_t reg_group = 0, size_t cluster_id = 0); void validate_and_infer_types() override; std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; - -private: - ov::Shape compute_shape_from_allocation_rank(const ov::Output& arg, int32_t allocation_rank); }; /** @@ -76,18 +81,23 @@ class NewMemoryBuffer : public Buffer { public: OPENVINO_OP("NewMemoryBuffer", "SnippetsOpset", Buffer); NewMemoryBuffer() = default; - NewMemoryBuffer(const ov::Shape& shape, size_t id = 0, ov::element::Type element_type = ov::element::u8); + NewMemoryBuffer(const ov::Shape& shape, size_t reg_group = 0, size_t cluster_id = 0, ov::element::Type element_type = ov::element::u8); void validate_and_infer_types() override; - void set_element_type(ov::element::Type element_type); std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; + void set_element_type(ov::element::Type element_type); + class ShapeInfer : 
public IShapeInferSnippets { ov::Shape m_shape; public: explicit ShapeInfer(const std::shared_ptr& n); Result infer(const std::vector& input_shapes) override; }; + +private: + ov::Shape m_output_shape; + ov::element::Type m_element_type = ov::element::u8; // u8 - default 1 byte }; } // namespace op diff --git a/src/common/snippets/include/snippets/runtime_configurator.hpp b/src/common/snippets/include/snippets/runtime_configurator.hpp index 3f830ccc490664..10a2c26b1d843a 100644 --- a/src/common/snippets/include/snippets/runtime_configurator.hpp +++ b/src/common/snippets/include/snippets/runtime_configurator.hpp @@ -37,9 +37,12 @@ class RuntimeConfig { size_t tensor_rank = 0; size_t tile_rank = 0; + std::vector io_data_offsets = {}; ov::snippets::VectorDims master_shape = {}; + size_t buffer_scratchpad_size = 0; + std::vector buffer_cluster_offsets; }; /** @@ -65,7 +68,7 @@ class RuntimeConfigurator { */ virtual void update(const std::shared_ptr& linear_ir); /** - * @brief Allocate and intialize fields in RuntimeConfig + * @brief Allocate and initialize fields in RuntimeConfig and RuntimeConfigurator * @param linear_ir LinearIR */ virtual void initialization(const std::shared_ptr& linear_ir); @@ -76,11 +79,30 @@ class RuntimeConfigurator { * @param linear_ir LinearIR */ void init_data_info(const std::shared_ptr& linear_ir); + /** + * @brief Initializes information of buffers: + * - static buffer_scratchpad_size + * - offsets of static clusters (with static buffers) + * - clusters with dynamic buffers (`m_dynamic_buffer_clusters`) for quick access in `update()` + * @param linear_ir LinearIR + */ + void init_buffer_info(const std::shared_ptr& linear_ir); /** * @brief Initializes tensor rank of config * @param linear_ir LinearIR */ virtual void init_tensor_rank(const std::shared_ptr& linear_ir) const; + /** + * @brief Update Loop information in LinearIR: Unified and ExpandedLoopInfo + * @param linear_ir LinearIR + */ + void update_loop_info(const std::shared_ptr&
linear_ir) const; + /** + * @brief Update Buffer scratchpad size and offsets if needed + * Note: `update_loop_info` must be called before + * @param linear_ir LinearIR + */ + void update_buffer_scratchpad_size(const std::shared_ptr& linear_ir) const; /** * @brief Calculate data offsets of LinearIR and update these values in RuntimeConfig */ @@ -91,12 +113,13 @@ class RuntimeConfigurator { void update_latest_shapes(); std::shared_ptr m_config = nullptr; - lowered::pass::PassPipeline m_state_updater = {}; size_t m_io_num = 0; size_t m_in_num = 0; std::vector m_io_descs = {}; std::vector m_io_data_sizes = {}; + // [cluster_id -> buffer expressions ] + std::map> m_dynamic_buffer_clusters; std::vector m_latest_shapes = {}; }; diff --git a/src/common/snippets/include/snippets/utils.hpp b/src/common/snippets/include/snippets/utils.hpp index d0328d9d08a2c3..e5c7a443d34eb9 100644 --- a/src/common/snippets/include/snippets/utils.hpp +++ b/src/common/snippets/include/snippets/utils.hpp @@ -8,9 +8,13 @@ */ #pragma once -#include "snippets_isa.hpp" -#include "emitter.hpp" -#include "shape_types.hpp" +#include "snippets/emitter.hpp" +#include "snippets/shape_types.hpp" +#include "snippets/lowered/expression.hpp" +#include "snippets/lowered/expression_port.hpp" + +#include "openvino/op/fake_quantize.hpp" +#include "openvino/op/constant.hpp" namespace ov { @@ -19,10 +23,10 @@ namespace utils { // Get non-scalar Constant count that will be created after FakeQuantize decomposition. // This count is needed to know exact count of non-scalar Constants during tokenization. 
-auto get_non_scalar_constant_count_for_fq(const std::shared_ptr& fq) -> size_t; +auto get_non_scalar_constant_count_for_fq(const std::shared_ptr& fq) -> size_t; inline auto is_scalar_constant(const std::shared_ptr& source_output_node) -> bool { - return ov::is_type(source_output_node) && ov::shape_size(source_output_node->get_shape()) == 1; + return ov::is_type(source_output_node) && ov::shape_size(source_output_node->get_shape()) == 1; } inline auto normalize_rank(int32_t allocation_rank, const size_t shape_rank) -> int32_t { @@ -73,7 +77,30 @@ inline bool is_dynamic_vdims(const VectorDimsPtr& shape) { return is_dynamic_vdims(*shape); } -void broadcast_merge_dim(size_t& dst, const size_t& d1, const size_t& d2); +template +inline void dynamic_safe_add(T& lhs, const T& rhs) { + if (utils::is_dynamic_value(lhs) || utils::is_dynamic_value(rhs)) { + lhs = utils::get_dynamic_value(); + return; + } + lhs += rhs; +} + +template +inline void dynamic_safe_mul(T& lhs, const T& rhs) { + if (utils::is_dynamic_value(lhs) || utils::is_dynamic_value(rhs)) { + lhs = utils::get_dynamic_value(); + return; + } + lhs *= rhs; +} + +template +inline std::string value2str(const T& value) { + return utils::is_dynamic_value(value) ? "?" 
: std::to_string(value); +} + +bool broadcast_merge_dim(size_t& dst, const size_t& d1, const size_t& d2); VectorDims pshape_to_vdims(const PartialShape&); ov::PartialShape vdims_to_pshape(const VectorDims&); diff --git a/src/common/snippets/src/lowered/linear_ir.cpp b/src/common/snippets/src/lowered/linear_ir.cpp index f1ff133ab20d79..098d6e2e1d2f32 100644 --- a/src/common/snippets/src/lowered/linear_ir.cpp +++ b/src/common/snippets/src/lowered/linear_ir.cpp @@ -179,6 +179,8 @@ void LinearIR::register_expression(const ExpressionPtr& expr, bool io_allowed) { m_parameter_expressions.push_back(expr); if (ov::is_type(node)) m_result_expressions.push_back(expr); + if (ov::is_type(node)) + m_buffer_expressions.push_back(expr); } void LinearIR::unregister_expression(const ExpressionPtr& expr) { @@ -191,6 +193,11 @@ void LinearIR::unregister_expression(const ExpressionPtr& expr) { m_node2expression_map.erase(node); OPENVINO_ASSERT(!ov::is_type(node) && !ov::is_type(node), "unregister_expression mustn't be called for parameter or result expressions"); + if (ov::is_type(node)) { + const auto& it = std::find(m_buffer_expressions.cbegin(), m_buffer_expressions.cend(), expr); + OPENVINO_ASSERT(it != m_buffer_expressions.cend(), "Buffer Expression has not been found in the list of LinearIR Buffers!"); + m_buffer_expressions.erase(it); + } } LinearIR::exprIt LinearIR::insert(constExprIt pos, container::value_type&& value) { diff --git a/src/common/snippets/src/lowered/linear_ir_builder.cpp b/src/common/snippets/src/lowered/linear_ir_builder.cpp index 1aff2b9e99d07b..6054e94d26d4b9 100644 --- a/src/common/snippets/src/lowered/linear_ir_builder.cpp +++ b/src/common/snippets/src/lowered/linear_ir_builder.cpp @@ -111,12 +111,32 @@ LinearIR::container LinearIRBuilder::clone_range(LinearIR::container::const_iter result_expr->get_input_count() == original_expr->get_input_count() && result_expr->get_output_count() == original_expr->get_output_count(), "Expressions after copying aren't 
matched!"); + // Copy tensor shapes as shared pointer if needed if (!m_config.deep_copy_of_shapes) { for (size_t i = 0; i < original_expr->get_input_count(); ++i) result_expr->get_input_port_descriptor(i)->m_tensor_shape = original_expr->get_input_port_descriptor(i)->m_tensor_shape; for (size_t i = 0; i < original_expr->get_output_count(); ++i) result_expr->get_output_port_descriptor(i)->m_tensor_shape = original_expr->get_output_port_descriptor(i)->m_tensor_shape; } + + // Copy missed consumers if needed + if (m_config.copy_missed_consumers) { + for (size_t i = 0; i < original_expr->get_output_count(); i++) { + const auto& original_consumers = original_expr->get_output_port_connector(i)->get_consumers(); + for (const auto& original_consumer : original_consumers) { + const auto result_consumers = result_expr->get_output_port_connector(i)->get_consumers(); + // Check if consumer is from the cloned body + const auto original_expr_ptr = original_consumer.get_expr().get(); + if (expression_map.count(original_expr_ptr)) { + const auto target_consumer = expression_map[original_expr_ptr]->get_input_port(original_consumer.get_index()); + // If missed, add to existing consumers + if (std::find(result_consumers.cbegin(), result_consumers.cend(), target_consumer) == result_consumers.cend()) { + result_expr->get_output_port_connector(i)->add_consumer(target_consumer); + } + } + } + } + } } return result; diff --git a/src/common/snippets/src/lowered/loop_info.cpp b/src/common/snippets/src/lowered/loop_info.cpp index e26703c294881b..fd26d5d90cc278 100644 --- a/src/common/snippets/src/lowered/loop_info.cpp +++ b/src/common/snippets/src/lowered/loop_info.cpp @@ -11,11 +11,12 @@ namespace ov { namespace snippets { namespace lowered { -LoopInfo::LoopInfo(size_t work_amount, size_t increment, const std::vector& entries, const std::vector& exits) - : m_work_amount(work_amount), m_increment(increment), m_input_ports(entries), m_output_ports(exits) {} +LoopInfo::LoopInfo(size_t 
work_amount, size_t increment, const std::vector& entries, const std::vector& exits, bool is_wa_const) + : m_work_amount(work_amount), m_increment(increment), m_input_ports(entries), m_output_ports(exits), m_is_work_amount_const(is_wa_const) {} -LoopInfo::LoopInfo(size_t work_amount, size_t increment, const std::vector& entries, const std::vector& exits) - : m_work_amount(work_amount), m_increment(increment) { +LoopInfo::LoopInfo(size_t work_amount, size_t increment, const std::vector& entries, const std::vector& exits, + bool is_wa_const) + : m_work_amount(work_amount), m_increment(increment), m_is_work_amount_const(is_wa_const) { m_input_ports.reserve(entries.size()); m_output_ports.reserve(exits.size()); for (const auto& port : entries) @@ -68,6 +69,10 @@ const std::vector& LoopInfo::get_output_ports() const { return m_output_ports; } +bool LoopInfo::is_work_amount_const() const { + return m_is_work_amount_const; +} + void LoopInfo::set_work_amount(size_t work_amount) { m_work_amount = work_amount; } @@ -80,6 +85,10 @@ void LoopInfo::set_dim_idx(size_t dim_idx) { iterate_through_ports([dim_idx](LoopPort& port) { port.dim_idx = dim_idx; }); } +void LoopInfo::set_work_amount_const(bool value) { + m_is_work_amount_const = value; +} + template<> std::vector::iterator LoopInfo::find_loop_port(const LoopPort& loop_port) { auto& ports = loop_port.expr_port->get_type() == ExpressionPort::Input ? 
m_input_ports : m_output_ports; @@ -138,16 +147,16 @@ std::vector LoopInfo::clone_loop_ports(const ExpressionMap& expr_map, UnifiedLoopInfo::UnifiedLoopInfo(size_t work_amount, size_t increment, const std::vector& entries, const std::vector& exits, - const SpecificIterationHandlers& handlers) - : LoopInfo(work_amount, increment, entries, exits), m_handlers(handlers), + const SpecificIterationHandlers& handlers, bool is_wa_const) + : LoopInfo(work_amount, increment, entries, exits, is_wa_const), m_handlers(handlers), m_input_port_descs(std::vector(entries.size())), m_output_port_descs(std::vector(exits.size())) { validate(); } UnifiedLoopInfo::UnifiedLoopInfo(size_t work_amount, size_t increment, const std::vector& entries, const std::vector& exits, - const SpecificIterationHandlers& handlers) - : LoopInfo(work_amount, increment, entries, exits), m_handlers(handlers), + const SpecificIterationHandlers& handlers, bool is_wa_const) + : LoopInfo(work_amount, increment, entries, exits, is_wa_const), m_handlers(handlers), m_input_port_descs(std::vector(entries.size())), m_output_port_descs(std::vector(exits.size())) { validate(); } @@ -155,8 +164,8 @@ UnifiedLoopInfo::UnifiedLoopInfo(size_t work_amount, size_t increment, UnifiedLoopInfo::UnifiedLoopInfo(size_t work_amount, size_t increment, const std::vector& entries, const std::vector& exits, const std::vector& in_shifts, const std::vector& out_shifts, - const SpecificIterationHandlers& handlers) - : LoopInfo(work_amount, increment, entries, exits), m_handlers(handlers), m_input_port_descs(in_shifts), m_output_port_descs(out_shifts) { + const SpecificIterationHandlers& handlers, bool is_wa_const) + : LoopInfo(work_amount, increment, entries, exits, is_wa_const), m_handlers(handlers), m_input_port_descs(in_shifts), m_output_port_descs(out_shifts) { validate(); } @@ -170,7 +179,7 @@ std::shared_ptr UnifiedLoopInfo::clone_with_new_expr(const ExpressionM const auto& new_output_ports = clone_loop_ports(expr_map, 
m_output_ports); return std::make_shared(m_work_amount, m_increment, new_input_ports, new_output_ports, - m_input_port_descs, m_output_port_descs, m_handlers); + m_input_port_descs, m_output_port_descs, m_handlers, m_is_work_amount_const); } const SpecificIterationHandlers& UnifiedLoopInfo::get_handlers() const { @@ -294,8 +303,9 @@ void UnifiedLoopInfo::replace_with_new_ports(const ExpressionPort& actual_port, ExpandedLoopInfo::ExpandedLoopInfo(size_t work_amount, size_t increment, const std::vector& entries, const std::vector& exits, std::vector ptr_increments, std::vector final_offsets, std::vector data_sizes, - SpecificLoopIterType type, std::shared_ptr unified_loop_info) - : LoopInfo(work_amount, increment, entries, exits), m_ptr_increments(std::move(ptr_increments)), m_finalization_offsets(std::move(final_offsets)), + SpecificLoopIterType type, std::shared_ptr unified_loop_info, bool is_wa_const) + : LoopInfo(work_amount, increment, entries, exits, is_wa_const), + m_ptr_increments(std::move(ptr_increments)), m_finalization_offsets(std::move(final_offsets)), m_data_sizes(std::move(data_sizes)), m_type(type), m_unified_loop_info(std::move(unified_loop_info)) { validate(); } @@ -313,7 +323,7 @@ std::shared_ptr ExpandedLoopInfo::clone_with_new_expr(const Expression const auto& new_output_ports = clone_loop_ports(expr_map, m_output_ports); return std::make_shared(m_work_amount, m_increment, new_input_ports, new_output_ports, - m_ptr_increments, m_finalization_offsets, m_data_sizes, m_type, m_unified_loop_info); + m_ptr_increments, m_finalization_offsets, m_data_sizes, m_type, m_unified_loop_info, m_is_work_amount_const); } const std::shared_ptr& ExpandedLoopInfo::get_unified_loop_info() const { diff --git a/src/common/snippets/src/lowered/loop_manager.cpp b/src/common/snippets/src/lowered/loop_manager.cpp index 75a196e4f623bb..224e1add666948 100644 --- a/src/common/snippets/src/lowered/loop_manager.cpp +++ b/src/common/snippets/src/lowered/loop_manager.cpp @@ -5,6 
+5,7 @@ #include "snippets/lowered/loop_manager.hpp" #include "snippets/lowered/expression.hpp" +#include "snippets/op/loop.hpp" #include "snippets/utils.hpp" #include "openvino/core/graph_util.hpp" @@ -180,7 +181,8 @@ void LoopManager::mark_loop(LinearIR::constExprIt loop_begin_pos, OPENVINO_ASSERT(index < size, "Incorrect index for broadcasting"); const auto lhs_value = index < lhs_size ? *(lhs.crbegin() + index) : 1; const auto rhs_value = index < rhs_size ? *(rhs.crbegin() + index) : 1; - utils::broadcast_merge_dim(*(lhs.rbegin() + index), lhs_value, rhs_value); + OPENVINO_ASSERT(utils::broadcast_merge_dim(*(lhs.rbegin() + index), lhs_value, rhs_value), + "Failed to broadcast work amount in marking loop"); }; auto is_outside_loop = [&FULL_DIM](const std::vector& subtensor) { @@ -281,13 +283,14 @@ void LoopManager::fuse_loops(LinearIR::constExprIt loop_begin_target, LinearIR:: const auto work_amount = std::max(loop_info_upper->get_work_amount(), loop_info_lower->get_work_amount()); const auto increment = std::max(loop_info_upper->get_increment(), loop_info_lower->get_increment()); const auto handlers = SpecificIterationHandlers::merge_handlers(loop_info_upper->get_handlers(), loop_info_lower->get_handlers()); + const auto is_work_amount_const = loop_info_upper->is_work_amount_const() || loop_info_lower->is_work_amount_const(); auto new_entries = input_ports_upper; new_entries.insert(new_entries.end(), input_ports_lower.begin(), input_ports_lower.end()); auto new_exits = output_ports_upper; new_exits.insert(new_exits.end(), output_ports_lower.begin(), output_ports_lower.end()); - m_map[to] = std::make_shared(work_amount, increment, new_entries, new_exits, handlers); + m_map[to] = std::make_shared(work_amount, increment, new_entries, new_exits, handlers, is_work_amount_const); for (auto it = loop_begin_target; it != loop_end_target; ++it) { const auto& expr = *it; diff --git a/src/common/snippets/src/lowered/pass/allocate_buffers.cpp 
b/src/common/snippets/src/lowered/pass/allocate_buffers.cpp index a0c5328adb76fc..e830c7f9073206 100644 --- a/src/common/snippets/src/lowered/pass/allocate_buffers.cpp +++ b/src/common/snippets/src/lowered/pass/allocate_buffers.cpp @@ -6,11 +6,12 @@ #include "snippets/lowered/pass/allocate_buffers.hpp" #include "snippets/lowered/pass/enumerate_expressions.hpp" +#include "snippets/lowered/pass/compute_buffer_allocation_size.hpp" #include "snippets/lowered/pass/solve_buffer_memory.hpp" #include "snippets/lowered/pass/init_buffers_default.hpp" -#include "snippets/lowered/pass/identify_buffers.hpp" +#include "snippets/lowered/pass/set_buffer_reg_group.hpp" #include "snippets/lowered/pass/define_buffer_clusters.hpp" -#include "snippets/lowered/pass/normalize_buffer_ids.hpp" +#include "snippets/lowered/pass/normalize_buffer_reg_groups.hpp" #include "snippets/pass/tokenization.hpp" #include "snippets/itt.hpp" #include "snippets/utils.hpp" @@ -70,20 +71,20 @@ bool AllocateBuffers::run(lowered::LinearIR& linear_ir, lowered::LinearIR::const OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::AllocateBuffers"); size_t buffer_scratchpad_size = 0; + PassPipeline pipeline; + pipeline.register_pass(linear_ir.get_config().m_loop_depth); if (m_is_optimized_mode) { - BufferClusters buffer_clusters; - PassPipeline pipeline; pipeline.register_pass(); - pipeline.register_pass(); - pipeline.register_pass(buffer_clusters); - pipeline.register_pass(buffer_scratchpad_size, buffer_clusters); - pipeline.register_pass(); - pipeline.run(linear_ir); + pipeline.register_pass(); + pipeline.register_pass(); + pipeline.register_pass(buffer_scratchpad_size); + pipeline.register_pass(); } else { - InitBuffersDefault(buffer_scratchpad_size).run(linear_ir, linear_ir.cbegin(), linear_ir.cend()); + pipeline.register_pass(buffer_scratchpad_size); } + pipeline.run(linear_ir, linear_ir.cbegin(), linear_ir.cend()); - linear_ir.set_buffer_scratchpad_size(buffer_scratchpad_size); + 
linear_ir.set_static_buffer_scratchpad_size(buffer_scratchpad_size); return buffer_scratchpad_size > 0; } diff --git a/src/common/snippets/src/lowered/pass/assign_registers.cpp b/src/common/snippets/src/lowered/pass/assign_registers.cpp index 70da1a6cc17424..b81a1552f97b03 100644 --- a/src/common/snippets/src/lowered/pass/assign_registers.cpp +++ b/src/common/snippets/src/lowered/pass/assign_registers.cpp @@ -85,22 +85,22 @@ bool AssignRegisters::run(LinearIR& linear_ir) { for (const auto& expr : exprs) { auto op = expr->get_node(); if (const auto& buffer = ov::as_type_ptr(op)) { - const auto buffer_id = buffer->get_id(); + const auto reg_group = buffer->get_reg_group(); // All buffers have one common data pointer if (ov::is_type(buffer)) { manually_assigned_gprs[expr->get_input_port_connector(0)] = - static_cast(num_results + num_parameters + buffer_id); + static_cast(num_results + num_parameters + reg_group); // shape infer ops in the middle of subgraph. IntermediateMemoryBuffer is inserted before reshape as new loop should start. // child shape info ops share the same memory as IntermediateMemoryBuffer. const auto& shape_infer_consumers = utils::get_first_child_shape_infer_expr_seq(expr); for (const auto& child_shape_infer_expr : shape_infer_consumers) { manually_assigned_gprs[child_shape_infer_expr->get_input_port_connector(0)] = manually_assigned_gprs[child_shape_infer_expr->get_output_port_connector(0)] = - static_cast(num_results + num_parameters + buffer_id); + static_cast(num_results + num_parameters + reg_group); } } manually_assigned_gprs[expr->get_output_port_connector(0)] = - static_cast(num_results + num_parameters + buffer_id); + static_cast(num_results + num_parameters + reg_group); } else if (ov::is_type(op) || ov::is_type(op)) { // Only in ReduceDecomposition Reduce ops use HorizonMax/HorizonSum and VectorBuffer. 
// We should manually set the one vector register for VectorBuffer and Max/Sum output to simulate a accumulator diff --git a/src/common/snippets/src/lowered/pass/clean_repeated_ptr_shifts.cpp b/src/common/snippets/src/lowered/pass/clean_repeated_ptr_shifts.cpp index 9552cbfdfbee76..4cf201047d63f5 100644 --- a/src/common/snippets/src/lowered/pass/clean_repeated_ptr_shifts.cpp +++ b/src/common/snippets/src/lowered/pass/clean_repeated_ptr_shifts.cpp @@ -24,7 +24,7 @@ bool CleanRepeatedDataPointerShifts::reuse_increments(const LoopManagerPtr& loop const auto output_count = loop_end->get_output_num(); std::set resetting_data_indexes; - std::set buffers_ids; + std::set buffers_groups; // We count expressions only on inputs of Loop because we can only read from the same data but not write to the same data. // Parameter // | | @@ -34,8 +34,8 @@ bool CleanRepeatedDataPointerShifts::reuse_increments(const LoopManagerPtr& loop const auto& parent_output = loop_connectors[i]->get_source().get_expr(); if (const auto buffer = ov::as_type_ptr(parent_output->get_node())) { // If Buffer is missed in set, Just save - it's first meeting - if (buffers_ids.count(buffer->get_id()) == 0) { - buffers_ids.insert(buffer->get_id()); + if (buffers_groups.count(buffer->get_reg_group()) == 0) { + buffers_groups.insert(buffer->get_reg_group()); } else { // The Buffer with the same ID is in set - need to add this Buffer idx to set of Buffers for resetting resetting_data_indexes.insert(i); @@ -60,8 +60,8 @@ bool CleanRepeatedDataPointerShifts::reuse_increments(const LoopManagerPtr& loop if (const auto buffer = ov::as_type_ptr(child_node)) { buffer_count++; // If Buffer is missed in set, Just save - it's first meeting - if (buffers_ids.count(buffer->get_id()) == 0) { - buffers_ids.insert(buffer->get_id()); + if (buffers_groups.count(buffer->get_reg_group()) == 0) { + buffers_groups.insert(buffer->get_reg_group()); } else { // The Buffer with the same ID is in set - need to add this Buffer idx to set 
of Buffers for resetting resetting_data_indexes.insert(input_count + i); diff --git a/src/common/snippets/src/lowered/pass/compute_buffer_allocation_size.cpp b/src/common/snippets/src/lowered/pass/compute_buffer_allocation_size.cpp new file mode 100644 index 00000000000000..760606d8dc067c --- /dev/null +++ b/src/common/snippets/src/lowered/pass/compute_buffer_allocation_size.cpp @@ -0,0 +1,101 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/lowered/pass/compute_buffer_allocation_size.hpp" + +#include "snippets/op/buffer.hpp" +#include "snippets/utils.hpp" +#include "snippets/itt.hpp" + + +namespace ov { +namespace snippets { +namespace lowered { +namespace pass { + +namespace { +std::vector get_parent_inner_loops(const std::vector& parent_loops, const std::vector& current_loops) { + const auto common_rank = std::min(parent_loops.size(), current_loops.size()); + size_t i = 0; + while (i < common_rank && parent_loops[i] == current_loops[i]) + ++i; + return std::vector(parent_loops.cbegin() + i, parent_loops.cend()); +} +} // namespace + +// Ticket: 113744 +// TODO: This logic covers only several specific cases so it should be generalized. +size_t ComputeBufferAllocationSize::get_allocation_size(const LoopManagerPtr& loop_manager, const ExpressionPtr& buffer_expr, size_t allocation_rank) { + const auto& parent_port = buffer_expr->get_input_port_connector(0)->get_source(); + const auto& parent_loop_ids = get_parent_inner_loops(parent_port.get_expr()->get_loop_ids(), buffer_expr->get_loop_ids()); + const auto planar_shape = utils::get_preordered_vdims(parent_port); + + const size_t rank = allocation_rank >= 0 ? 
std::min(static_cast(allocation_rank), planar_shape.size()) + : planar_shape.size(); + + const auto& subtensor = parent_port.get_descriptor_ptr()->get_subtensor(); + + size_t allocation_size = 1; + std::set processed_dim_idxs; + for (const auto& parent_loop : parent_loop_ids) { + const auto loop_info = loop_manager->get_loop_info(parent_loop); + const auto& output_ports = loop_info->get_output_ports(); + auto it = std::find_if(output_ports.begin(), output_ports.end(), [&parent_port](const LoopPort& port) { return *port.expr_port == parent_port; }); + OPENVINO_ASSERT(it != output_ports.end(), "compute_allocation_shape: output port of parent loop can not be found"); + const auto& loop_port = *it; + const auto& dim_idx = loop_port.dim_idx; + if (loop_port.is_incremented && dim_idx < rank) { + if (const auto& unified_loop_info = ov::as_type_ptr(loop_info)) + utils::dynamic_safe_mul(allocation_size, unified_loop_info->get_work_amount()); + else if (const auto& expanded_loop_info = ov::as_type_ptr(loop_info)) + utils::dynamic_safe_mul(allocation_size, expanded_loop_info->get_unified_loop_info()->get_work_amount()); + else + OPENVINO_THROW("Unknown LoopInfo type"); + processed_dim_idxs.insert(dim_idx); + } + } + const auto processing_rank = !processed_dim_idxs.empty() ? 
std::max(*processed_dim_idxs.rbegin(), subtensor.size()) : subtensor.size(); + for (size_t i = 0; i < std::min(processing_rank, rank); ++i) { + if (processed_dim_idxs.count(i) == 0) { + if (i < subtensor.size()) + utils::dynamic_safe_mul(allocation_size, std::min(*(planar_shape.rbegin() + i), *(subtensor.rbegin() + i))); + else + utils::dynamic_safe_mul(allocation_size, *(planar_shape.rbegin() + i)); + } + } + + // Corner case when the current information is not enough + if (processing_rank == 0 && processed_dim_idxs.empty()) { + for (size_t i = 0; i < rank; ++i) { + utils::dynamic_safe_mul(allocation_size, *(planar_shape.rbegin() + i)); + } + } + + return allocation_size; +} + +bool ComputeBufferAllocationSize::run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) { + OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::ComputeBufferAllocationSize") + + const auto& loop_manager = linear_ir.get_loop_manager(); + + const auto& buffer_expressions = linear_ir.get_buffer_ops(); + for (const auto& buffer_expr : buffer_expressions) { + const auto node = buffer_expr->get_node(); + OPENVINO_ASSERT(ov::is_type(node), "Expected Buffer ops in Buffer expressions of LinearIR"); + if (const auto buffer = ov::as_type_ptr(node)) { + // If the current size is undefined, update it + if (!buffer->is_defined()) + buffer->set_allocation_size(get_allocation_size(loop_manager, buffer_expr, m_buffer_allocation_rank)); + } + } + + + return true; +} + +} // namespace pass +} // namespace lowered +} // namespace snippets +} // namespace ov diff --git a/src/common/snippets/src/lowered/pass/define_buffer_clusters.cpp b/src/common/snippets/src/lowered/pass/define_buffer_clusters.cpp index d093085dcc8922..41a13cadeb10e0 100644 --- a/src/common/snippets/src/lowered/pass/define_buffer_clusters.cpp +++ b/src/common/snippets/src/lowered/pass/define_buffer_clusters.cpp @@ -4,7 +4,7 @@ #include 
"snippets/lowered/pass/define_buffer_clusters.hpp" -#include "snippets/lowered/pass/identify_buffers.hpp" +#include "snippets/lowered/pass/set_buffer_reg_group.hpp" #include "snippets/pass/tokenization.hpp" #include "snippets/utils.hpp" #include "snippets/itt.hpp" @@ -14,11 +14,11 @@ namespace snippets { namespace lowered { namespace pass { -using ShiftPtrParams = IdentifyBuffers::ShiftPtrParams; +using ShiftPtrParams = SetBufferRegGroup::ShiftPtrParams; -AllocateBuffers::BufferClusters::iterator DefineBufferClusters::find_cluster_by_expr(const ExpressionPtr& target) { +DefineBufferClusters::BufferClusters::iterator DefineBufferClusters::find_cluster_by_expr(const ExpressionPtr& target) { return std::find_if(m_clusters.begin(), m_clusters.end(), - [&target](const AllocateBuffers::BufferCluster& cluster) { return cluster.count(target) > 0; }); + [&target](const BufferCluster& cluster) { return cluster.count(target) > 0; }); } bool DefineBufferClusters::is_direct_buffer(const ExpressionPtr& buffer_expr, const ExpressionPtr& target_expr) const { @@ -30,15 +30,15 @@ void DefineBufferClusters::create_new_cluster(const ExpressionPtr& buffer_expr) const auto cluster_it = find_cluster_by_expr(buffer_expr); // If Buffer is missed in clusters, create new cluster with the single Buffer node inside if (cluster_it == m_clusters.cend()) { - m_clusters.push_back(AllocateBuffers::BufferCluster{buffer_expr}); + m_clusters.push_back(BufferCluster{buffer_expr}); } } -size_t DefineBufferClusters::get_cluster_buffer_id(const AllocateBuffers::BufferCluster& cluster) const { +size_t DefineBufferClusters::get_cluster_buffer_id(const BufferCluster& cluster) const { OPENVINO_ASSERT(!cluster.empty(), "Buffer cluster is empty!"); - const auto id = (ov::as_type_ptr(cluster.cbegin()->get()->get_node()))->get_id(); + const auto id = (ov::as_type_ptr(cluster.cbegin()->get()->get_node()))->get_reg_group(); if (std::all_of(cluster.cbegin(), cluster.cend(), - [&id](const ExpressionPtr& expr) { 
return (ov::as_type_ptr(expr->get_node()))->get_id() == id; })) { + [&id](const ExpressionPtr& expr) { return (ov::as_type_ptr(expr->get_node()))->get_reg_group() == id; })) { return id; } return SIZE_MAX; @@ -148,7 +148,7 @@ void DefineBufferClusters::parse_loop(const LinearIR::constExprIt& expr_it) { if (has_been_added) break; } if (!has_been_added) { - m_clusters.push_back(AllocateBuffers::BufferCluster{output_buffer_expr}); + m_clusters.push_back(BufferCluster{output_buffer_expr}); } } @@ -248,8 +248,8 @@ int64_t DefineBufferClusters::get_buffer_finalization_offset(const ExpressionPtr return final_offset; } -bool DefineBufferClusters::unite_nested_clusters(const AllocateBuffers::BufferClusters::iterator& inner_cluster_it, - AllocateBuffers::BufferCluster& outer_cluster, +bool DefineBufferClusters::unite_nested_clusters(const BufferClusters::iterator& inner_cluster_it, + BufferCluster& outer_cluster, const ExpressionPtr& outer_buffer, bool is_outer_up) { for (const auto& inner_buffer : *inner_cluster_it) { ExpressionPtr common_loop_end_expr = nullptr; @@ -263,11 +263,11 @@ bool DefineBufferClusters::unite_nested_clusters(const AllocateBuffers::BufferCl const auto& inner_ptr_increments = common_loop_end->get_ptr_increments(); const auto& inner_final_offsets = common_loop_end->get_finalization_offsets(); const auto& inner_data_sizes = common_loop_end->get_element_type_sizes(); - if (IdentifyBuffers::can_reuse_id({ inner_data_sizes[up_idx], inner_ptr_increments[up_idx], inner_final_offsets[up_idx] }, - { inner_data_sizes[down_idx], inner_ptr_increments[down_idx], inner_final_offsets[down_idx] })) { - const auto buffer_id = ov::as_type_ptr(outer_buffer->get_node())->get_id(); + if (SetBufferRegGroup::can_be_in_one_group({ inner_data_sizes[up_idx], inner_ptr_increments[up_idx], inner_final_offsets[up_idx] }, + { inner_data_sizes[down_idx], inner_ptr_increments[down_idx], inner_final_offsets[down_idx] })) { + const auto buffer_reg_group = 
ov::as_type_ptr(outer_buffer->get_node())->get_reg_group(); for (const auto& inner_buffer : *inner_cluster_it) - ov::as_type_ptr(inner_buffer->get_node())->set_id(buffer_id); + ov::as_type_ptr(inner_buffer->get_node())->set_reg_group(buffer_reg_group); outer_cluster.insert(inner_cluster_it->cbegin(), inner_cluster_it->cend()); m_clusters.erase(inner_cluster_it); @@ -339,6 +339,8 @@ void DefineBufferClusters::parse_memory_access_op(const ExpressionPtr& expr) { bool DefineBufferClusters::run(lowered::LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) { OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::DefineBufferClusters"); + m_clusters.clear(); + for (auto expr_it = begin; expr_it != end; ++expr_it) { const auto& expr = *expr_it; const auto op = expr->get_node(); @@ -353,6 +355,15 @@ bool DefineBufferClusters::run(lowered::LinearIR& linear_ir, lowered::LinearIR:: } } + for (size_t cluster_id = 0; cluster_id < m_clusters.size(); ++cluster_id) { + const auto& cluster = m_clusters[cluster_id]; + std::for_each(cluster.cbegin(), cluster.cend(), [&cluster_id](const ExpressionPtr& buffer_expr) { + const auto& buffer = ov::as_type_ptr(buffer_expr->get_node()); + OPENVINO_ASSERT(buffer, "Buffer clusters expects Buffer nodes"); + buffer->set_cluster_id(cluster_id); + }); + } + return true; } diff --git a/src/common/snippets/src/lowered/pass/fuse_loops.cpp b/src/common/snippets/src/lowered/pass/fuse_loops.cpp index ec0743bf4df7d0..a06c58a21bf272 100644 --- a/src/common/snippets/src/lowered/pass/fuse_loops.cpp +++ b/src/common/snippets/src/lowered/pass/fuse_loops.cpp @@ -65,12 +65,13 @@ bool FuseLoops::can_be_fused(const UnifiedLoopInfoPtr& loop_upper, const Unified (work_amount_upper == work_amount_lower) && increment_upper == increment_lower; const bool bcastable_upper = work_amount_upper == 1 && increment_upper == 1; const bool bcastable_lower = work_amount_lower == 1 && increment_lower == 1; + const auto 
is_const_wa_equal = loop_upper->is_work_amount_const() == loop_lower->is_work_amount_const(); // WA: we can't fuse 2 loops if one of them has first iteration handler but second hasn't, // because in this case Main/Tail body handlers of the loop wo first iter handler must be reset with new parameters // (e.g. tail size). This logic is not implemented for now, so fusion for such loops is skipped. const bool first_iter_handlers_match = loop_upper->get_handlers().get_passes().empty() == loop_lower->get_handlers().get_passes().empty(); - return first_iter_handlers_match && (is_dynamic_case || equal_parameters || bcastable_upper || bcastable_lower); + return first_iter_handlers_match && is_const_wa_equal && (is_dynamic_case || equal_parameters || bcastable_upper || bcastable_lower); } void FuseLoops::move(LinearIR& linear_ir, const LoopManagerPtr& loop_manager, size_t loop_id, diff --git a/src/common/snippets/src/lowered/pass/init_buffers_default.cpp b/src/common/snippets/src/lowered/pass/init_buffers_default.cpp index 36cb41d3b9c96e..8ba9c39322fd66 100644 --- a/src/common/snippets/src/lowered/pass/init_buffers_default.cpp +++ b/src/common/snippets/src/lowered/pass/init_buffers_default.cpp @@ -17,17 +17,22 @@ namespace pass { bool InitBuffersDefault::run(lowered::LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) { OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::InitBuffersDefault"); - size_t id = 0; + size_t idx = 0; size_t offset = 0; for (auto expr_it = begin; expr_it != end; ++expr_it) { const auto& expr = *expr_it; const auto op = expr->get_node(); if (const auto buffer = ov::as_type_ptr(op)) { - AllocateBuffers::set_buffer_offset(expr, offset); - buffer->set_id(id); - - offset += buffer->get_byte_size(); - id++; + buffer->set_reg_group(idx); + buffer->set_cluster_id(idx); + + if (!buffer->is_defined()) { + AllocateBuffers::set_buffer_offset(expr, utils::get_dynamic_value()); + } else { + 
AllocateBuffers::set_buffer_offset(expr, offset); + offset += buffer->get_byte_size(); + } + idx++; } } diff --git a/src/common/snippets/src/lowered/pass/init_loops.cpp b/src/common/snippets/src/lowered/pass/init_loops.cpp index 1e0c556cff013f..c2360b4f2a54f4 100644 --- a/src/common/snippets/src/lowered/pass/init_loops.cpp +++ b/src/common/snippets/src/lowered/pass/init_loops.cpp @@ -6,7 +6,7 @@ #include "snippets/lowered/linear_ir.hpp" #include "snippets/lowered/loop_manager.hpp" -#include "snippets/op/memory_access.hpp" +#include "snippets/snippets_isa.hpp" #include "snippets/utils.hpp" #include "snippets/itt.hpp" @@ -123,8 +123,9 @@ inline void init_work_amount(const LoopInfoPtr& loop_info) { const auto& shape = desc->get_shape(); const auto& layout = desc->get_layout(); const auto is_input = loop_port.expr_port->get_type() == ExpressionPort::Input; - const auto dim_idx = is_input ? utils::get_input_dim_idx(layout, loop_port.dim_idx) : utils::get_input_dim_idx(layout, loop_port.dim_idx); - utils::broadcast_merge_dim(work_amount, work_amount, shape[dim_idx]); + const auto dim_idx = is_input ? 
utils::get_input_dim_idx(layout, loop_port.dim_idx) : utils::get_output_dim_idx(layout, loop_port.dim_idx); + OPENVINO_ASSERT(utils::broadcast_merge_dim(work_amount, work_amount, shape[dim_idx]), + "Failed to broadcast work_amount"); } }); loop_info->set_work_amount(work_amount); @@ -133,7 +134,7 @@ inline void init_work_amount(const LoopInfoPtr& loop_info) { void InitLoops::init_loop_info(const UnifiedLoopInfoPtr& loop_info, const size_t loop_id, bool only_runtime_args) { OPENVINO_ASSERT(loop_info != nullptr, "UnifiedLoopInfo is nullptr, nothing to initialize"); - if (utils::is_dynamic_value(loop_info->get_work_amount())) + if (!loop_info->is_work_amount_const()) init_work_amount(loop_info); const auto work_amount = loop_info->get_work_amount(); diff --git a/src/common/snippets/src/lowered/pass/insert_buffers.cpp b/src/common/snippets/src/lowered/pass/insert_buffers.cpp index 7c9ee6b8f1b000..87e5c489cb1029 100644 --- a/src/common/snippets/src/lowered/pass/insert_buffers.cpp +++ b/src/common/snippets/src/lowered/pass/insert_buffers.cpp @@ -29,82 +29,8 @@ std::vector get_buffer_loop_ids(const std::vector& lhs, const st } return buffer_loop_ids; } - -// Ticket: 113744 -// TODO: This logic covers only several specific cases so it should be generalized. -ov::Shape compute_allocation_shape(const LoopManagerPtr& loop_manager, - const std::vector& buffer_loop_ids, - const ExpressionPort& parent_expr_output, - const int allocation_rank) { - const auto& parent_expr = parent_expr_output.get_expr(); - const auto& parent_loop_ids = parent_expr->get_loop_ids(); - const auto planar_shape = utils::get_preordered_vdims(parent_expr_output); - - const size_t rank = allocation_rank >= 0 ? 
std::min(static_cast(allocation_rank), planar_shape.size()) - : planar_shape.size(); - ov::Shape allocation_shape(rank); - for (size_t i = 0; i < rank; ++i) { - *(allocation_shape.rbegin() + i) = *(planar_shape.rbegin() + i); - } - - if (buffer_loop_ids.empty() || parent_loop_ids.empty()) { - return allocation_shape; - } - - // If subtensor is set, its information is used for allocation shape computation. Two situations are possible: - // 1. Buffer is outside the parent loop: the corresponding subtensor value is ignored, parent loop work amount is set instead - // 2. Buffer is inside the parent loop: the corresponding subtensor value is used in allocation shape. - // Since we can defenitely know which subtensor value corresponds to the loop only for 1st case - // (we can extract this info from loop output port), we copy subtensor, and then replace subtensor values with parent loop work amount if needed. - // Example: - // Parent subtensor: [M_blk, N_blk] - // Buffer loop idces: [M_loop_idx], parent loop idces: [M_loop_idx, N_loop_idx] - // - // 1. Allocation shape is set to subtensor: [M_blk, N_blk] - // 2. Buffer is inside M_loop_idx loop => allocation shape is not changed - // 3. 
Buffer is outside N_loop_idx loop => the corresponding allocation shape value is replaced with N loop work amount - // So the result allocation shape is [M_blk, N_loop_work_amount] - const auto& subtensor = parent_expr_output.get_descriptor_ptr()->get_subtensor(); - if (!subtensor.empty()) { - for (size_t i = 0; i < std::min(rank, subtensor.size()); ++i) { - auto& cur_val = *(allocation_shape.rbegin() + i); - const auto& subtensor_val = *(subtensor.rbegin() + i); - cur_val = std::min(cur_val, subtensor_val); - } - for (const auto& parent_loop : parent_loop_ids) { - if (std::find(buffer_loop_ids.begin(), buffer_loop_ids.end(), parent_loop) == buffer_loop_ids.end()) { - const auto loop_info = loop_manager->get_loop_info(parent_loop); - const auto& output_ports = loop_info->get_output_ports(); - auto it = std::find_if(output_ports.begin(), - output_ports.end(), - [&parent_expr_output](const LoopPort& port) { - return *port.expr_port == parent_expr_output; - }); - OPENVINO_ASSERT(it != output_ports.end(), "compute_allocation_shape: output port of parent loop can not be found"); - const auto& loop_port = *it; - if (loop_port.is_incremented && loop_port.dim_idx < allocation_shape.size()) { - *(allocation_shape.rbegin() + loop_port.dim_idx) = loop_info->get_work_amount(); - } - } - } - } else { - // WA: In case of empty subtensors another information have to be used to update allocation shape. 
- for (size_t i = 0; i < std::min(rank, parent_loop_ids.size()); ++i) { - const auto loop = loop_manager->get_loop_info(*(parent_loop_ids.rbegin() + i)); - OPENVINO_ASSERT(loop->get_dim_idx() == i, "compute_allocation_shape: eltwise loop has unexpected dimension index"); - *(allocation_shape.rbegin() + i) = loop->get_work_amount(); - } - for (int i = 0; i < allocation_rank - static_cast(parent_loop_ids.size()); ++i) { - allocation_shape[i] = 1; - } - } - return allocation_shape; -} } // namespace -InsertBuffers::InsertBuffers(int32_t buffer_allocation_rank) - : RangedPass(), m_buffer_allocation_rank(buffer_allocation_rank) {} - LinearIR::constExprIt InsertBuffers::insertion_position(const LinearIR& linear_ir, const LoopManagerPtr& loop_manager, const ExpressionPtr& up_expr, const ExpressionPtr& down_expr) { const auto& up_loops = up_expr->get_loop_ids(); @@ -189,11 +115,7 @@ void InsertBuffers::insertion(LinearIR& linear_ir, // Current expr Loop identifies: 3, 4, 6 // Need to insert between 2nd and 4th Loops - after 2nd Loop const auto pos = insertion_position(linear_ir, loop_manager, parent_expr, expr); - const auto allocation_shape = compute_allocation_shape(loop_manager, - buffer_loop_ids, - parent_expr_output, - m_buffer_allocation_rank); - const auto buffer = std::make_shared(parent->output(parent_port), allocation_shape); + const auto buffer = std::make_shared(parent->output(parent_port)); const auto buffer_consumer = has_shape_infer_parent ? 
top_shape_infer_expr->get_input_port(0) : *entry_port; linear_ir.insert_node(buffer, std::vector{ parent_expr_output }, buffer_loop_ids, false, pos, { buffer_consumer }); } @@ -276,11 +198,7 @@ void InsertBuffers::insertion(LinearIR& linear_ir, // Note: All potential consumers must have the same count of first equal Loop identifies and the same count of different last identifies const auto pos = insertion_position(linear_ir, loop_manager, expr, consumer_expr); - const auto allocation_shape = compute_allocation_shape(loop_manager, - buffer_loop_ids, - *exit_port, - m_buffer_allocation_rank); - auto buffer = std::make_shared(node->output(port_idx), allocation_shape); + auto buffer = std::make_shared(node->output(port_idx)); // We cannot insert Node output connector on Buffer output because not all consumers of Node needs Buffer // Example: // Add diff --git a/src/common/snippets/src/lowered/pass/insert_specific_iterations.cpp b/src/common/snippets/src/lowered/pass/insert_specific_iterations.cpp index e89c711627a911..dd418839ca84cc 100644 --- a/src/common/snippets/src/lowered/pass/insert_specific_iterations.cpp +++ b/src/common/snippets/src/lowered/pass/insert_specific_iterations.cpp @@ -75,12 +75,29 @@ LoopManager::LoopBounds InsertSpecificIterations::insert_copy_loop(LinearIR& lin std::vector& new_entry_ports, std::vector& new_exit_ports) { const auto& loop_manager = linear_ir.get_loop_manager(); const auto loop_bounds = loop_manager->get_loop_bounds(linear_ir, loop_id); + const auto loop_begin_pos = loop_bounds.first; + const auto loop_end_pos = std::next(loop_bounds.second); + ExpressionMap expression_map; const auto& cloning_config = LinearIRBuilder::Config(false); - const auto& loop_copy_range = LinearIRBuilder(cloning_config).clone_range(loop_bounds.first, std::next(loop_bounds.second), expression_map); + const auto& loop_copy_range = LinearIRBuilder(cloning_config).clone_range(loop_begin_pos, loop_end_pos, expression_map); const auto new_loop_begin_pos = 
linear_ir.insert(insert_pos, loop_copy_range.begin(), loop_copy_range.end()); const auto new_loop_end_pos = std::prev(insert_pos); + // Added connections between output of cloned bodies and the current LinearIR + for (LinearIR::constExprIt result_it = new_loop_begin_pos, original_it = loop_begin_pos; original_it != loop_end_pos; ++result_it, ++original_it) { + const auto result_expr = *result_it; + const auto original_expr = *original_it; + for (size_t i = 0; i < original_expr->get_output_count(); i++) { + const auto& consumers = original_expr->get_output_port_connector(i)->get_consumers(); + for (const auto& consumer : consumers) { + if (std::find(loop_begin_pos, loop_end_pos, consumer.get_expr()) == loop_end_pos) { + result_expr->get_output_port_connector(i)->add_consumer(consumer); + } + } + } + } + auto clone_ports = [&expression_map](const std::vector& ports, std::vector& new_ports) { new_ports.resize(ports.size()); for (size_t i = 0; i < ports.size(); ++i) { diff --git a/src/common/snippets/src/lowered/pass/iter_handler.cpp b/src/common/snippets/src/lowered/pass/iter_handler.cpp index dd2d601366cb1a..3a928819e6c85d 100644 --- a/src/common/snippets/src/lowered/pass/iter_handler.cpp +++ b/src/common/snippets/src/lowered/pass/iter_handler.cpp @@ -115,6 +115,7 @@ bool TransformInnerSplitLoop::run(LinearIR& linear_ir, LinearIR::constExprIt beg offset = offset / inner_loop_work_amount * static_cast(m_tail_size); } inner_loop_end->set_work_amount(m_tail_size); + inner_loop_info->set_work_amount_const(true); // TODO: if m_tail_size more than inner loop increment, // handlers of the inner loop must be reset with new tail size inner_loop_end->set_increment(std::min(inner_loop_increment, m_tail_size)); diff --git a/src/common/snippets/src/lowered/pass/normalize_buffer_ids.cpp b/src/common/snippets/src/lowered/pass/normalize_buffer_ids.cpp deleted file mode 100644 index 76ef3562760daa..00000000000000 --- a/src/common/snippets/src/lowered/pass/normalize_buffer_ids.cpp +++ 
/dev/null @@ -1,39 +0,0 @@ -// Copyright (C) 2023 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "snippets/lowered/pass/normalize_buffer_ids.hpp" - -#include "snippets/op/buffer.hpp" -#include "snippets/itt.hpp" - - -namespace ov { -namespace snippets { -namespace lowered { -namespace pass { - -bool NormalizeBufferIDs::run(lowered::LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) { - OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::NormalizeBufferIDs"); - - // [ original Buffer ID -> normalized ] - std::map buffer_ids; - for (auto expr_it = begin; expr_it != end; ++expr_it) { - const auto& expr = *expr_it; - const auto op = expr->get_node(); - if (const auto buffer = ov::as_type_ptr(op)) { - const auto buffer_id = buffer->get_id(); - if (buffer_ids.count(buffer_id) == 0) { - const auto new_id = buffer_ids.size(); - buffer_ids[buffer_id] = new_id; - } - buffer->set_id(buffer_ids[buffer_id]); - } - } - return buffer_ids.size(); -} - -} // namespace pass -} // namespace lowered -} // namespace snippets -} // namespace ov diff --git a/src/common/snippets/src/lowered/pass/normalize_buffer_reg_groups.cpp b/src/common/snippets/src/lowered/pass/normalize_buffer_reg_groups.cpp new file mode 100644 index 00000000000000..3e235749ce7ca2 --- /dev/null +++ b/src/common/snippets/src/lowered/pass/normalize_buffer_reg_groups.cpp @@ -0,0 +1,39 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/lowered/pass/normalize_buffer_reg_groups.hpp" + +#include "snippets/op/buffer.hpp" +#include "snippets/itt.hpp" + + +namespace ov { +namespace snippets { +namespace lowered { +namespace pass { + +bool NormalizeBufferRegisterGroups::run(lowered::LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) { + OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, 
"Snippets::NormalizeBufferRegisterGroups"); + + // [ original Buffer reg group -> normalized ] + std::map buffer_reg_groups; + for (auto expr_it = begin; expr_it != end; ++expr_it) { + const auto& expr = *expr_it; + const auto op = expr->get_node(); + if (const auto buffer = ov::as_type_ptr(op)) { + const auto group = buffer->get_reg_group(); + if (buffer_reg_groups.count(group) == 0) { + const auto new_id = buffer_reg_groups.size(); + buffer_reg_groups[group] = new_id; + } + buffer->set_reg_group(buffer_reg_groups[group]); + } + } + return buffer_reg_groups.size(); +} + +} // namespace pass +} // namespace lowered +} // namespace snippets +} // namespace ov diff --git a/src/common/snippets/src/lowered/pass/reduce_decomposition.cpp b/src/common/snippets/src/lowered/pass/reduce_decomposition.cpp index 017704adf28089..9a89edd24767a6 100644 --- a/src/common/snippets/src/lowered/pass/reduce_decomposition.cpp +++ b/src/common/snippets/src/lowered/pass/reduce_decomposition.cpp @@ -10,6 +10,7 @@ #include "snippets/op/reduce.hpp" #include "snippets/op/horizon_max.hpp" #include "snippets/op/horizon_sum.hpp" +#include "snippets/snippets_isa.hpp" #include "snippets/utils.hpp" #include "snippets/itt.hpp" @@ -101,6 +102,12 @@ bool ReduceDecomposition::run(LinearIR& linear_ir, LinearIR::constExprIt begin, replace_input_port_connectors({fill.first->get()->get_input_port(0)}, reduce_expr->get_input_port_connector(0)); replace_input_port_connectors(reduce_expr->get_output_port_connector(0)->get_consumers(), horizon.first->get()->get_output_port_connector(0)); + // Update input shapes of consumers + const auto reduce_consumers = horizon.first->get()->get_output_port_connector(0)->get_consumers(); + for (const auto& consumer : reduce_consumers) { + consumer.get_expr()->updateShapes(); + } + // Update Loop info for outer loops const std::vector input_ports{(*fill.first)->get_input_port(0)}; const std::vector output_ports{(*horizon.first)->get_output_port(0)}; diff --git 
a/src/common/snippets/src/lowered/pass/identify_buffers.cpp b/src/common/snippets/src/lowered/pass/set_buffer_reg_group.cpp similarity index 88% rename from src/common/snippets/src/lowered/pass/identify_buffers.cpp rename to src/common/snippets/src/lowered/pass/set_buffer_reg_group.cpp index 7e859ce8b1b173..59c9bf21a0894a 100644 --- a/src/common/snippets/src/lowered/pass/identify_buffers.cpp +++ b/src/common/snippets/src/lowered/pass/set_buffer_reg_group.cpp @@ -2,7 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "snippets/lowered/pass/identify_buffers.hpp" +#include "snippets/lowered/pass/set_buffer_reg_group.hpp" #include "snippets/lowered/linear_ir.hpp" #include "snippets/snippets_isa.hpp" @@ -19,22 +19,22 @@ inline size_t index(size_t col_num, size_t row, size_t col) { } } // namespace -bool operator==(const IdentifyBuffers::ShiftPtrParams& lhs, const IdentifyBuffers::ShiftPtrParams& rhs) { +bool operator==(const SetBufferRegGroup::ShiftPtrParams& lhs, const SetBufferRegGroup::ShiftPtrParams& rhs) { if (&lhs == &rhs) return true; return lhs.ptr_increment == rhs.ptr_increment && lhs.finalization_offset == rhs.finalization_offset && lhs.data_size == rhs.data_size; } -bool operator!=(const IdentifyBuffers::ShiftPtrParams& lhs, const IdentifyBuffers::ShiftPtrParams& rhs) { +bool operator!=(const SetBufferRegGroup::ShiftPtrParams& lhs, const SetBufferRegGroup::ShiftPtrParams& rhs) { return !(rhs == lhs); } -size_t IdentifyBuffers::get_buffer_idx(const ExpressionPtr& target, const BufferPool& pool) { +size_t SetBufferRegGroup::get_buffer_idx(const ExpressionPtr& target, const BufferPool& pool) { const auto iter = std::find(pool.cbegin(), pool.cend(), target); OPENVINO_ASSERT(iter != pool.cend(), "Buffer wasn't find in Buffer system of Subgraph"); return std::distance(pool.cbegin(), iter); } -bool IdentifyBuffers::can_reuse_id(const ShiftPtrParams& lhs, const ShiftPtrParams& rhs) { +bool SetBufferRegGroup::can_be_in_one_group(const ShiftPtrParams& lhs, 
const ShiftPtrParams& rhs) { // If data pointer shift parameters are unknown on model compilation stage (dynamic), // we cannot be sure that these data pointers will be proportionally shifted. // Then we force `false` value here to set unique registers for these buffers @@ -44,13 +44,13 @@ bool IdentifyBuffers::can_reuse_id(const ShiftPtrParams& lhs, const ShiftPtrPara return are_static && equal_ptr_params_shifting && (equal_element_type_sizes || (lhs.ptr_increment == 0 && lhs.finalization_offset == 0)); } -bool IdentifyBuffers::are_adjacent(const std::pair& lhs, +bool SetBufferRegGroup::are_adjacent(const std::pair& lhs, const std::pair& rhs) { const auto& lhs_ids = lhs.first->get_loop_ids(); const auto& rhs_ids = rhs.first->get_loop_ids(); const auto equal_loop_ids = lhs_ids == rhs_ids; if (equal_loop_ids) { // Buffers are connected to the same Loop and have the same outer Loops - return !can_reuse_id(lhs.second, rhs.second); + return !can_be_in_one_group(lhs.second, rhs.second); } else { // Buffers are connected to the same Loop, but one of Buffers - inside this Loop, another - outside // Buffers are adjacent if outer Buffer has not zero data shift params if (lhs_ids.size() == rhs_ids.size()) // If the count of outer Loops are equal, it means that outer loops are already different @@ -64,7 +64,7 @@ bool IdentifyBuffers::are_adjacent(const std::pair& lhs, +void SetBufferRegGroup::update_adj_matrix(const std::pair& lhs, const std::pair& rhs, const BufferPool& buffers, std::vector& adj) { @@ -80,7 +80,7 @@ void IdentifyBuffers::update_adj_matrix(const std::pair IdentifyBuffers::create_adjacency_matrix(LinearIR::constExprIt begin, LinearIR::constExprIt end, const BufferPool& pool) { +std::vector SetBufferRegGroup::create_adjacency_matrix(LinearIR::constExprIt begin, LinearIR::constExprIt end, const BufferPool& pool) { // The sync point to check for adjacency is Loop because only in Loop we increment pointers. 
// So if some Buffers in the one Loop have conflict (cannot be inplace: the different ptr increment and data sizes) // they are called as adjacent @@ -113,7 +113,7 @@ std::vector IdentifyBuffers::create_adjacency_matrix(LinearIR::constExprIt return adj; } -IdentifyBuffers::BufferMap IdentifyBuffers::get_buffer_loop_neighbours(const ExpressionPtr& loop_end_expr) { +SetBufferRegGroup::BufferMap SetBufferRegGroup::get_buffer_loop_neighbours(const ExpressionPtr& loop_end_expr) { const auto& loop_end = ov::as_type_ptr(loop_end_expr->get_node()); const auto input_count = loop_end->get_input_num(); const auto output_count = loop_end->get_output_num(); @@ -157,7 +157,7 @@ IdentifyBuffers::BufferMap IdentifyBuffers::get_buffer_loop_neighbours(const Exp return buffer_neighbours; } -IdentifyBuffers::BufferMap IdentifyBuffers::get_buffer_loop_inside(const LinearIR::constExprIt& loop_end_it) { +SetBufferRegGroup::BufferMap SetBufferRegGroup::get_buffer_loop_inside(const LinearIR::constExprIt& loop_end_it) { const auto& loop_end = ov::as_type_ptr((*loop_end_it)->get_node()); const auto loop_begin = loop_end->get_loop_begin(); BufferMap inner_buffers; @@ -172,7 +172,7 @@ IdentifyBuffers::BufferMap IdentifyBuffers::get_buffer_loop_inside(const LinearI return inner_buffers; } -auto IdentifyBuffers::coloring(BufferPool& buffers, std::vector& adj) -> std::map { +auto SetBufferRegGroup::coloring(BufferPool& buffers, std::vector& adj) -> std::map { size_t color = 0; std::map color_groups; const auto size = buffers.size(); @@ -217,8 +217,8 @@ auto IdentifyBuffers::coloring(BufferPool& buffers, std::vector& adj) -> s return color_groups; } -bool IdentifyBuffers::run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) { - OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::IdentifyBuffers") +bool SetBufferRegGroup::run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) { + 
OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::SetBufferRegGroup") // Identify Buffers using Graph coloring algorithm. BufferPool buffer_pool; @@ -239,7 +239,7 @@ bool IdentifyBuffers::run(LinearIR& linear_ir, lowered::LinearIR::constExprIt be const auto color = pair.first; const auto& united_buffers = pair.second; for (const auto& buffer_expr : united_buffers) { - ov::as_type_ptr(buffer_expr->get_node())->set_id(color); + ov::as_type_ptr(buffer_expr->get_node())->set_reg_group(color); } } diff --git a/src/common/snippets/src/lowered/pass/solve_buffer_memory.cpp b/src/common/snippets/src/lowered/pass/solve_buffer_memory.cpp index 5b7d7e07714b64..f8416f2ea7326e 100644 --- a/src/common/snippets/src/lowered/pass/solve_buffer_memory.cpp +++ b/src/common/snippets/src/lowered/pass/solve_buffer_memory.cpp @@ -4,6 +4,7 @@ #include "snippets/lowered/pass/solve_buffer_memory.hpp" +#include "snippets/lowered/pass/allocate_buffers.hpp" #include "snippets/pass/tokenization.hpp" #include "snippets/utils.hpp" #include "snippets/itt.hpp" @@ -14,73 +15,144 @@ namespace snippets { namespace lowered { namespace pass { -std::vector SolveBufferMemory::init_boxes(const AllocateBuffers::BufferClusters& buffer_clusters) { - std::vector boxes; - const auto count = static_cast(buffer_clusters.size()); - for (int i = 0; i < count; i++) { - ov::MemorySolver::Box box = { std::numeric_limits::max(), 0, 0, i }; - int64_t box_size = 0; - for (const auto& buffer_expr : buffer_clusters[i]) { - int e_start = 0, e_finish = 0; - const auto buffer = ov::as_type_ptr(buffer_expr->get_node()); - OPENVINO_ASSERT(buffer != nullptr, "BufferSolver expects Buffer ops in clusters"); - - // life finish time - order of LoopEnd / MemoryAccess ops - const auto& buffer_outs = buffer_expr->get_output_port_connectors(); - for (const auto& buffer_out : buffer_outs) { - const auto consumers = buffer_out->get_consumers(); - for (const auto& consumer : consumers) { - const auto consumer_order = 
static_cast(ov::snippets::pass::GetTopologicalOrder(consumer.get_expr()->get_node())); - e_finish = std::max(e_finish, consumer_order); // the last consumer - } +std::pair SolveBufferMemory::extract_static_and_dynamic_buffers(const LinearIR::container& buffer_expressions) { + LinearIR::container static_buffer_exprs, dynamic_buffer_exprs; + for (const auto& buffer_expr : buffer_expressions) { + const auto& buffer = ov::as_type_ptr(buffer_expr->get_node()); + OPENVINO_ASSERT(buffer, "Buffer clusters expects Buffer nodes"); + + auto& clusters = buffer->is_defined() ? static_buffer_exprs : dynamic_buffer_exprs; + clusters.push_back(buffer_expr); + } + + // Validation check that buffer cluster has only static or dynamic buffers. + for (const auto& static_buffer : static_buffer_exprs) { + const auto static_cluster_id = ov::as_type_ptr(static_buffer->get_node())->get_cluster_id(); + auto is_cluster_ids_the_same = [&static_cluster_id](const ExpressionPtr& expr) { + return static_cluster_id == ov::as_type_ptr(expr->get_node())->get_cluster_id(); + }; + OPENVINO_ASSERT(std::none_of(dynamic_buffer_exprs.cbegin(), dynamic_buffer_exprs.cend(), is_cluster_ids_the_same), + "There is Buffer cluster with buffers which has defined and undefined allocation sizes"); + } + + return { static_buffer_exprs, dynamic_buffer_exprs }; +} + +std::vector SolveBufferMemory::init_boxes(const LinearIR::container& buffer_expressions) { + std::map map_boxes; + for (const auto& buffer_expr : buffer_expressions) { + const auto& buffer = ov::as_type_ptr(buffer_expr->get_node()); + OPENVINO_ASSERT(buffer, "Buffer clusters expects Buffer nodes"); + auto cluster_id = static_cast(buffer->get_cluster_id()); + + if (map_boxes.count(cluster_id) == 0) { + map_boxes[cluster_id] = { std::numeric_limits::max(), 0, 0, cluster_id }; + } + + auto& box = map_boxes.at(cluster_id); + + int e_start = 0, e_finish = 0; + + // life finish time - order of LoopEnd / MemoryAccess ops + const auto& buffer_outs = 
buffer_expr->get_output_port_connectors(); + for (const auto& buffer_out : buffer_outs) { + const auto consumers = buffer_out->get_consumers(); + for (const auto& consumer : consumers) { + const auto consumer_order = static_cast(ov::snippets::pass::GetTopologicalOrder(consumer.get_expr()->get_node())); + e_finish = std::max(e_finish, consumer_order); // the last consumer } - e_start = e_finish; - - const auto& buffer_ins = buffer_expr->get_input_port_connectors(); - for (const auto& buffer_in : buffer_ins) { - const auto& source = buffer_in->get_source(); - e_start = static_cast(ov::snippets::pass::GetTopologicalOrder(source.get_expr()->get_node())); - - const auto buffer_siblings = buffer_in->get_consumers(); - for (const auto& sibling : buffer_siblings) { - if (const auto loop_end = ov::as_type_ptr(sibling.get_expr()->get_node())) { - e_start = std::min(e_start, static_cast(ov::snippets::pass::GetTopologicalOrder(loop_end->get_loop_begin()))); - } + } + e_start = e_finish; + + const auto& buffer_ins = buffer_expr->get_input_port_connectors(); + for (const auto& buffer_in : buffer_ins) { + const auto& source = buffer_in->get_source(); + e_start = static_cast(ov::snippets::pass::GetTopologicalOrder(source.get_expr()->get_node())); + + const auto buffer_siblings = buffer_in->get_consumers(); + for (const auto& sibling : buffer_siblings) { + if (const auto loop_end = ov::as_type_ptr(sibling.get_expr()->get_node())) { + e_start = std::min(e_start, static_cast(ov::snippets::pass::GetTopologicalOrder(loop_end->get_loop_begin()))); } } - OPENVINO_ASSERT(e_start <= e_finish, "Incorrect life time of buffer!"); + } + OPENVINO_ASSERT(e_start <= e_finish, "Incorrect life time of buffer!"); - auto buffer_size = static_cast(buffer->get_byte_size()); - box_size = std::max(buffer_size, box_size); + auto buffer_size = static_cast(buffer->get_byte_size()); + box.size = std::max(buffer_size, box.size); - box.start = std::min(e_start, box.start); - box.finish = std::max(e_finish, 
box.finish); - } + box.start = std::min(e_start, box.start); + box.finish = std::max(e_finish, box.finish); + } + + std::vector boxes(map_boxes.size()); + for (const auto& p : map_boxes) { + const auto& buffer_id = static_cast(p.first); + OPENVINO_ASSERT(buffer_id < boxes.size(), "Incorrect Buffer Cluster ID"); + boxes[buffer_id] = p.second; // We use data alignment to put data in the line cache - box.size = utils::div_up(box_size, m_alignment); - boxes.push_back(box); + boxes.at(buffer_id).size = utils::div_up(boxes.at(buffer_id).size, m_alignment); } + return boxes; } - -bool SolveBufferMemory::run(LinearIR& linear_ir) { - OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::SolveBufferMemory"); - - const auto boxes = init_boxes(m_clusters); +void SolveBufferMemory::solve_static_buffer_memory(const LinearIR::container& static_buffer_expressions) { + const auto boxes = init_boxes(static_buffer_expressions); ov::MemorySolver memSolver(boxes); - m_buffer_scratchpad_size = static_cast(memSolver.solve()) * m_alignment; // alignment in byte + m_static_buffer_scratchpad_size = static_cast(memSolver.solve()) * m_alignment; // alignment in byte // Set offsets for Buffers - for (const auto& box : boxes) { - for (const auto& buffer : m_clusters[box.id]) { - const auto offset = static_cast(memSolver.get_offset(static_cast(box.id))); - AllocateBuffers::set_buffer_offset(buffer, offset * m_alignment); // alignment in byte + for (const auto& buffer_expr : static_buffer_expressions) { + const auto& buffer = ov::as_type_ptr(buffer_expr->get_node()); + OPENVINO_ASSERT(buffer, "Buffer clusters expects Buffer nodes"); + auto cluster_id = static_cast(buffer->get_cluster_id()); + + const auto offset = static_cast(memSolver.get_offset(static_cast(boxes[cluster_id].id))); + AllocateBuffers::set_buffer_offset(buffer_expr, offset * m_alignment); // alignment in byte + } +} + +void SolveBufferMemory::set_dynamic_buffer_offset(const LinearIR::container& 
dynamic_buffer_expressions) { + size_t offset = utils::get_dynamic_value(); + + // If there are not allocated memory for static buffers in LinearIR and there is only one cluster of dynamic buffer exprs, + // we can force offset = 0 + if (m_static_buffer_scratchpad_size == 0) { + std::set dynamic_clusters; + for (const auto& dynamic_buffer_expr : dynamic_buffer_expressions) { + const auto& buffer = ov::as_type_ptr(dynamic_buffer_expr->get_node()); + OPENVINO_ASSERT(buffer, "Buffer clusters expects Buffer nodes"); + dynamic_clusters.insert(buffer->get_cluster_id()); } + if (dynamic_clusters.size() == 1) + offset = 0; + } + + // Set offsets for Buffers + for (const auto& buffer_expr : dynamic_buffer_expressions) { + const auto& buffer = ov::as_type_ptr(buffer_expr->get_node()); + OPENVINO_ASSERT(buffer, "Buffer clusters expects Buffer nodes"); + + AllocateBuffers::set_buffer_offset(buffer_expr, offset); } - return m_buffer_scratchpad_size > 0; +} + +bool SolveBufferMemory::run(LinearIR& linear_ir) { + OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::SolveBufferMemory"); + + LinearIR::container static_buffer_exprs, dynamic_buffer_exprs; + std::tie(static_buffer_exprs, dynamic_buffer_exprs) = extract_static_and_dynamic_buffers(linear_ir.get_buffer_ops()); + + if (!static_buffer_exprs.empty()) + solve_static_buffer_memory(static_buffer_exprs); + + if (!dynamic_buffer_exprs.empty()) + set_dynamic_buffer_offset(dynamic_buffer_exprs); + + return !static_buffer_exprs.empty() && !dynamic_buffer_exprs.empty(); } } // namespace pass diff --git a/src/common/snippets/src/lowered/pass/split_loops.cpp b/src/common/snippets/src/lowered/pass/split_loops.cpp index 163980a21e5f72..365bcd250b4556 100644 --- a/src/common/snippets/src/lowered/pass/split_loops.cpp +++ b/src/common/snippets/src/lowered/pass/split_loops.cpp @@ -70,6 +70,7 @@ bool SplitLoops::run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, if (FuseLoops::can_be_fused(upper_loop, 
lower_loop) && can_be_split(loop_to_split, loop_to_fuse)) { loop_was_split = true; loop_to_split->set_work_amount(loop_to_fuse->get_increment()); + loop_to_split->set_work_amount_const(true); const auto& loop_to_split_id = split_parent ? parent_loop_id : loop_id; const auto loop_bounds = LoopManager::get_loop_bounds(linear_ir, loop_to_split_id, diff --git a/src/common/snippets/src/lowered/pass/update_loop_info.cpp b/src/common/snippets/src/lowered/pass/update_loop_info.cpp deleted file mode 100644 index 3112701a737dc0..00000000000000 --- a/src/common/snippets/src/lowered/pass/update_loop_info.cpp +++ /dev/null @@ -1,92 +0,0 @@ -// Copyright (C) 2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "snippets/lowered/pass/update_loop_info.hpp" - -#include "snippets/lowered/pass/init_loops.hpp" -#include "snippets/lowered/pass/insert_specific_iterations.hpp" -#include "snippets/lowered/loop_manager.hpp" -#include "snippets/itt.hpp" - -namespace ov { -namespace snippets { -namespace lowered { -namespace pass { - -void UpdateLoopInfo::init_data_ptr_shifts(const UnifiedLoopInfoPtr& unified_loop_info, std::vector& ptr_increments, - std::vector& finalization_offsets) { - const auto count = unified_loop_info->get_input_count() + unified_loop_info->get_output_count(); - ptr_increments.resize(count); - finalization_offsets.resize(count); - - size_t idx = 0; - unified_loop_info->iterate_through_descs( - [&ptr_increments, &finalization_offsets, &idx](const UnifiedLoopInfo::LoopPortDesc& desc) { - ptr_increments[idx] = desc.ptr_increment; - finalization_offsets[idx] = desc.finalization_offset; - ++idx; - }); -} - -bool UpdateLoopInfo::run(LinearIR& linear_ir) { - OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::UpdateLoopInfo") - - // Initialized UnifiedLoopInfo - struct CurrentUnifiedLoopInfo { - UnifiedLoopInfoPtr updated_unified_loop_info = nullptr; - size_t current_work_amount = 0; - std::vector ptr_increments; - std::vector 
finalization_offsets; - }; - std::unordered_map initializated_info_map; - - const auto& loop_map = linear_ir.get_loop_manager()->get_map(); - for (const auto& p : loop_map) { - const auto& expanded_loop_info = ov::as_type_ptr(p.second); - OPENVINO_ASSERT(expanded_loop_info, "UpdateLoopInfo expects ExpandedLoopInfo in LoopManager"); - - // First visiting of unified (whole) loop - const auto& current_unified_loop_info = expanded_loop_info->get_unified_loop_info(); - if (initializated_info_map.count(current_unified_loop_info) == 0) { - auto& current_info = initializated_info_map[current_unified_loop_info]; - // make a copy to avoid original loop info corruption - current_info.updated_unified_loop_info = std::make_shared(*current_unified_loop_info); - InitLoops::init_loop_info(current_info.updated_unified_loop_info, true); - - current_info.current_work_amount = current_info.updated_unified_loop_info->get_work_amount(); - init_data_ptr_shifts(current_info.updated_unified_loop_info, current_info.ptr_increments, current_info.finalization_offsets); - } - - auto& initializated_info = initializated_info_map.at(current_unified_loop_info); - auto& current_work_amount = initializated_info.current_work_amount; - const auto& updated_unified_loop_info = initializated_info.updated_unified_loop_info; - const auto& ptr_increments = initializated_info.ptr_increments; - const auto& finalization_offsets = initializated_info.finalization_offsets; - - const auto& decomposed_loop_type = expanded_loop_info->get_type(); - - // If the specific iteration is not needed, we skip loop evaluation - set zero as work amount is enough - if (!InsertSpecificIterations::is_decomposed_loop_needed(updated_unified_loop_info, decomposed_loop_type, current_work_amount)) { - expanded_loop_info->set_work_amount(0); - continue; - } - - expanded_loop_info->set_work_amount( - InsertSpecificIterations::get_decomposed_loop_work_amount(updated_unified_loop_info, decomposed_loop_type, current_work_amount)); - // 
Update remaining Loop work amount - current_work_amount -= expanded_loop_info->get_work_amount(); - - expanded_loop_info->update_ptr_increments(ptr_increments); - if (current_work_amount > 0) { - expanded_loop_info->update_finalization_offsets(std::vector(finalization_offsets.size(), 0)); - } else { - expanded_loop_info->update_finalization_offsets(finalization_offsets); - } - } - return true; -} -} // namespace pass -} // namespace lowered -} // namespace snippets -} // namespace ov diff --git a/src/common/snippets/src/lowered/pass/validate.cpp b/src/common/snippets/src/lowered/pass/validate.cpp index dc9dbdea76b5c8..c0d605e0dfa1c4 100644 --- a/src/common/snippets/src/lowered/pass/validate.cpp +++ b/src/common/snippets/src/lowered/pass/validate.cpp @@ -5,6 +5,7 @@ #include "snippets/lowered/pass/validate.hpp" #include "snippets/lowered/loop_manager.hpp" +#include "snippets/snippets_isa.hpp" #include "snippets/utils.hpp" #include "snippets/itt.hpp" diff --git a/src/common/snippets/src/lowered/pass/validate_expanded_loops.cpp b/src/common/snippets/src/lowered/pass/validate_expanded_loops.cpp index 1653d9da993f6d..9c62907e569670 100644 --- a/src/common/snippets/src/lowered/pass/validate_expanded_loops.cpp +++ b/src/common/snippets/src/lowered/pass/validate_expanded_loops.cpp @@ -5,6 +5,7 @@ #include "snippets/lowered/pass/validate_expanded_loops.hpp" #include "snippets/lowered/loop_manager.hpp" +#include "snippets/op/loop.hpp" #include "snippets/utils.hpp" #include "snippets/itt.hpp" @@ -17,15 +18,6 @@ namespace pass { OPENVINO_ASSERT((cond), "Failed to validate ExpandedLoops: ", __VA_ARGS__) namespace { -template -void dynamic_safe_add(T& lhs, const T& rhs) { - if (utils::is_dynamic_value(lhs) || utils::is_dynamic_value(rhs)) { - lhs = utils::get_dynamic_value(); - return; - } - lhs += rhs; -} - bool is_inner_splitted_tail(const ExpressionPtr& loop_expr, const LoopManagerPtr& loop_manager) { const auto loop_end = ov::as_type_ptr(loop_expr->get_node()); 
INFORMATIVE_ASSERT(loop_end, "expects LoopEnd"); @@ -81,7 +73,7 @@ void ValidateExpandedLoops::validate_loop_information(const LinearIR& linear_ir) total_finalization_offsets.resize(num_ports, 0); } - dynamic_safe_add(current_work_amount, expanded_loop_info->get_work_amount()); + utils::dynamic_safe_add(current_work_amount, expanded_loop_info->get_work_amount()); INFORMATIVE_ASSERT(current_unified_loop_info->get_ptr_increments() == expanded_loop_info->get_ptr_increments(), "incompatible pointer increments with UnifiedLoopInfo"); @@ -89,7 +81,7 @@ void ValidateExpandedLoops::validate_loop_information(const LinearIR& linear_ir) INFORMATIVE_ASSERT(finalization_offsets.size() == total_finalization_offsets.size(), "incompatible finalization offset count"); for (size_t i = 0; i < num_ports; ++i) - dynamic_safe_add(total_finalization_offsets[i], finalization_offsets[i]); + utils::dynamic_safe_add(total_finalization_offsets[i], finalization_offsets[i]); } } diff --git a/src/common/snippets/src/op/buffer.cpp b/src/common/snippets/src/op/buffer.cpp index 424063d68d5a59..dc455300522ba1 100644 --- a/src/common/snippets/src/op/buffer.cpp +++ b/src/common/snippets/src/op/buffer.cpp @@ -13,75 +13,67 @@ namespace ov { namespace snippets { namespace op { -Buffer::Buffer(const OutputVector& arguments, const ov::Shape& shape, size_t id, ov::element::Type element_type) - : Op(arguments), m_shape(shape), m_id(id), m_element_type(std::move(element_type)), m_offset(0) { +Buffer::Buffer(const OutputVector& arguments, size_t allocation_size, size_t reg_group, size_t cluster_id) + : Op(arguments), m_allocation_size(allocation_size), m_reg_group(reg_group), m_cluster_id(cluster_id), m_offset(0) { constructor_validate_and_infer_types(); } bool Buffer::visit_attributes(AttributeVisitor& visitor) { INTERNAL_OP_SCOPE(Buffer_visit_attributes); - visitor.on_attribute("allocation_shape", m_shape); + auto element_type = get_element_type(); + auto allocation_size = 
utils::value2str(m_allocation_size); + visitor.on_attribute("allocation_size", allocation_size); visitor.on_attribute("offset", m_offset); - visitor.on_attribute("id", m_id); - visitor.on_attribute("element_type", m_element_type); + visitor.on_attribute("reg_group", m_reg_group); + visitor.on_attribute("cluster_id", m_cluster_id); + visitor.on_attribute("element_type", element_type); return true; } -size_t Buffer::get_byte_size() const { - const auto shape = get_allocation_shape(); - return ov::shape_size(shape) * m_element_type.size(); +bool Buffer::is_defined() const { + return !utils::is_dynamic_value(m_allocation_size); } -IntermediateMemoryBuffer::IntermediateMemoryBuffer(const ov::Output& arg, const ov::Shape& shape, size_t id) - : Buffer({arg}, shape, id) { - constructor_validate_and_infer_types(); +size_t Buffer::get_byte_size() const { + if (is_defined()) + return m_allocation_size * get_element_type().size(); + return utils::get_dynamic_value(); } -IntermediateMemoryBuffer::IntermediateMemoryBuffer(const ov::Output& arg, int32_t allocation_rank, size_t id) - : Buffer({arg}, compute_shape_from_allocation_rank(arg, allocation_rank), id) { +IntermediateMemoryBuffer::IntermediateMemoryBuffer(const ov::Output& arg, size_t allocation_size, size_t reg_group, size_t cluster_id) + : Buffer({arg}, allocation_size, reg_group, cluster_id) { constructor_validate_and_infer_types(); } -ov::Shape IntermediateMemoryBuffer::compute_shape_from_allocation_rank(const ov::Output& arg, int32_t allocation_rank) { - const auto& pshape = arg.get_partial_shape(); - OPENVINO_ASSERT(pshape.is_static(), "Buffer supports only static input shape"); - const auto shape = pshape.get_shape(); - const auto normalize_rank = utils::normalize_rank(static_cast(allocation_rank), shape.size()); - const auto offset = static_cast(shape.size()) - normalize_rank; - return ov::Shape{shape.begin() + offset, shape.end()}; -} - void IntermediateMemoryBuffer::validate_and_infer_types() { 
INTERNAL_OP_SCOPE(Buffer_validate_and_infer_types); ov::PartialShape output_shape; - m_element_type = get_input_element_type(0); - output_shape = get_input_partial_shape(0); - set_output_type(0, m_element_type, output_shape); + set_output_type(0, get_input_element_type(0), get_input_partial_shape(0)); } std::shared_ptr IntermediateMemoryBuffer::clone_with_new_inputs(const OutputVector& new_args) const { INTERNAL_OP_SCOPE(Buffer_clone_with_new_inputs); check_new_args_count(this, new_args); - auto new_buffer = std::make_shared(new_args.at(0), m_shape, m_id); + auto new_buffer = std::make_shared(new_args.at(0), m_allocation_size, m_reg_group, m_cluster_id); new_buffer->set_offset(m_offset); return new_buffer; } -NewMemoryBuffer::NewMemoryBuffer(const ov::Shape& shape, size_t id, ov::element::Type element_type) - : Buffer({}, shape, id, element_type) { +NewMemoryBuffer::NewMemoryBuffer(const ov::Shape& shape, size_t reg_group, size_t cluster_id, ov::element::Type element_type) + : Buffer({}, ov::shape_size(shape), reg_group, cluster_id), m_output_shape(shape), m_element_type(element_type) { constructor_validate_and_infer_types(); } void NewMemoryBuffer::validate_and_infer_types() { INTERNAL_OP_SCOPE(Buffer_validate_and_infer_types); OPENVINO_ASSERT(get_input_size() == 0, "Buffer with new allocated memory mustn't have arguments!"); - set_output_type(0, m_element_type, m_shape); + set_output_type(0, m_element_type, m_output_shape); } std::shared_ptr NewMemoryBuffer::clone_with_new_inputs(const OutputVector& new_args) const { INTERNAL_OP_SCOPE(Buffer_clone_with_new_inputs); check_new_args_count(this, new_args); - auto new_buffer = std::make_shared(m_shape, m_id, m_element_type); + auto new_buffer = std::make_shared(m_output_shape, m_reg_group, m_cluster_id, m_element_type); new_buffer->set_offset(m_offset); return new_buffer; } diff --git a/src/common/snippets/src/op/loop.cpp b/src/common/snippets/src/op/loop.cpp index 39c93a304a74d0..26b1dcb008244d 100644 --- 
a/src/common/snippets/src/op/loop.cpp +++ b/src/common/snippets/src/op/loop.cpp @@ -83,11 +83,14 @@ void LoopEnd::validate_and_infer_types() { bool LoopEnd::visit_attributes(AttributeVisitor &visitor) { std::vector int_incremented(m_is_incremented.cbegin(), m_is_incremented.cend()); + auto work_amount = utils::value2str(m_work_amount); + auto ptr_increments = ov::PartialShape(m_ptr_increments); + auto final_offsets = ov::PartialShape(m_finalization_offsets); visitor.on_attribute("is_incremented", int_incremented); - visitor.on_attribute("ptr_incr", m_ptr_increments); - visitor.on_attribute("fin_offset", m_finalization_offsets); + visitor.on_attribute("ptr_incr", ptr_increments); + visitor.on_attribute("fin_offset", final_offsets); visitor.on_attribute("data_sizes", m_element_type_sizes); - visitor.on_attribute("work_amount", m_work_amount); + visitor.on_attribute("work_amount", work_amount); visitor.on_attribute("increment", m_work_amount_increment); visitor.on_attribute("input_num", m_input_num); visitor.on_attribute("output_num", m_output_num); diff --git a/src/common/snippets/src/op/memory_access.cpp b/src/common/snippets/src/op/memory_access.cpp index f3fa1f90c206ae..352e355ef75df5 100644 --- a/src/common/snippets/src/op/memory_access.cpp +++ b/src/common/snippets/src/op/memory_access.cpp @@ -2,8 +2,9 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "snippets/itt.hpp" #include "snippets/op/memory_access.hpp" +#include "snippets/utils.hpp" +#include "snippets/itt.hpp" namespace ov { namespace snippets { @@ -49,20 +50,33 @@ bool MemoryAccess::is_full_memory_access_op(const std::shared_ptr& op) } bool MemoryAccess::visit_attributes(AttributeVisitor& visitor) { + bool is_dynamic = false; for (const auto& p : m_input_ports) { auto idx = p.first; auto port = p.second; - visitor.on_attribute("count_in_" + std::to_string(idx), port.count); - visitor.on_attribute("offset_in_" + std::to_string(idx), port.offset); - visitor.on_attribute("stride_in_" + 
std::to_string(idx), port.stride); + auto count = utils::value2str(port.count); + auto offset = utils::value2str(port.offset); + auto stride = utils::value2str(port.stride); + visitor.on_attribute("count_in_" + std::to_string(idx), count); + visitor.on_attribute("offset_in_" + std::to_string(idx), offset); + visitor.on_attribute("stride_in_" + std::to_string(idx), stride); + is_dynamic |= utils::is_dynamic_value(port.count) || utils::is_dynamic_value(port.offset) || utils::is_dynamic_value(port.stride); } for (const auto& p : m_output_ports) { auto idx = p.first; auto port = p.second; - visitor.on_attribute("count_out_" + std::to_string(idx), port.count); - visitor.on_attribute("offset_out_" + std::to_string(idx), port.offset); - visitor.on_attribute("stride_out_" + std::to_string(idx), port.stride); + auto count = utils::value2str(port.count); + auto offset = utils::value2str(port.offset); + auto stride = utils::value2str(port.stride); + visitor.on_attribute("count_out_" + std::to_string(idx), count); + visitor.on_attribute("offset_out_" + std::to_string(idx), offset); + visitor.on_attribute("stride_out_" + std::to_string(idx), stride); + is_dynamic |= utils::is_dynamic_value(port.count) || utils::is_dynamic_value(port.offset) || utils::is_dynamic_value(port.stride); } + + std::string dynamic_status = is_dynamic ? 
"DYNAMIC" : "STATIC"; + visitor.on_attribute("dynamic_status", dynamic_status); + return true; } diff --git a/src/common/snippets/src/op/subgraph.cpp b/src/common/snippets/src/op/subgraph.cpp index 8d888d5a75e7c2..ab793c722d1e3e 100644 --- a/src/common/snippets/src/op/subgraph.cpp +++ b/src/common/snippets/src/op/subgraph.cpp @@ -454,7 +454,7 @@ void Subgraph::control_flow_transformations(size_t min_parallel_work_amount, siz pipeline.register_pass(); pipeline.register_pass(); pipeline.register_pass(); - pipeline.register_pass(static_cast(loop_depth)); + pipeline.register_pass(); pipeline.register_pass(vector_size); pipeline.register_pass(); pipeline.register_pass(); diff --git a/src/common/snippets/src/runtime_configurator.cpp b/src/common/snippets/src/runtime_configurator.cpp index 60ea8dd1a35e02..06e7ebaf74944e 100644 --- a/src/common/snippets/src/runtime_configurator.cpp +++ b/src/common/snippets/src/runtime_configurator.cpp @@ -4,17 +4,34 @@ #include "snippets/runtime_configurator.hpp" +#include "snippets/lowered/pass/init_loops.hpp" +#include "snippets/lowered/pass/insert_specific_iterations.hpp" +#include "snippets/lowered/pass/compute_buffer_allocation_size.hpp" +#include "snippets/snippets_isa.hpp" #include "snippets/utils.hpp" -#include "snippets/lowered/pass/update_loop_info.hpp" namespace ov { namespace snippets { +namespace { +void init_data_ptr_shifts(const lowered::UnifiedLoopInfoPtr& unified_loop_info, std::vector& ptr_increments, + std::vector& finalization_offsets) { + const auto count = unified_loop_info->get_input_count() + unified_loop_info->get_output_count(); + ptr_increments.resize(count); + finalization_offsets.resize(count); + + size_t idx = 0; + unified_loop_info->iterate_through_descs( + [&ptr_increments, &finalization_offsets, &idx](const lowered::UnifiedLoopInfo::LoopPortDesc& desc) { + ptr_increments[idx] = desc.ptr_increment; + finalization_offsets[idx] = desc.finalization_offset; + ++idx; + }); +} +} // namespace + 
RuntimeConfigurator::RuntimeConfigurator(std::shared_ptr c) : m_config(std::move(c)) { OPENVINO_ASSERT(m_config, "Runtime config is nullptr!"); - - // Init LinearIR StateUpdater: some passes to update LoopInfo, BufferInfo etc - m_state_updater.register_pass(); } const std::shared_ptr& RuntimeConfigurator::get_updated_config(const std::shared_ptr& linear_ir) { @@ -26,21 +43,10 @@ const std::shared_ptr& RuntimeConfigurator::get_updated_config(co return m_config; } -void RuntimeConfigurator::update(const std::shared_ptr& linear_ir) { - if (linear_ir->is_dynamic()) { - m_state_updater.run(*linear_ir); - } - - m_config->master_shape = linear_ir->get_master_shape(); - m_config->buffer_scratchpad_size = linear_ir->get_buffer_scratchpad_size(); - - update_data_offsets(); - update_latest_shapes(); -} - void RuntimeConfigurator::initialization(const std::shared_ptr& linear_ir) { init_data_info(linear_ir); init_tensor_rank(linear_ir); + init_buffer_info(linear_ir); OPENVINO_ASSERT(m_io_num > 0, "LinearIR must have parameters and results"); m_latest_shapes.resize(m_io_num); @@ -48,6 +54,18 @@ void RuntimeConfigurator::initialization(const std::shared_ptrtile_rank = linear_ir->get_config().m_loop_depth; } +void RuntimeConfigurator::update(const std::shared_ptr& linear_ir) { + if (linear_ir->is_dynamic()) { + update_loop_info(linear_ir); + update_buffer_scratchpad_size(linear_ir); + } + + m_config->master_shape = linear_ir->get_master_shape(); + + update_data_offsets(); + update_latest_shapes(); +} + void RuntimeConfigurator::init_tensor_rank(const std::shared_ptr& linear_ir) const { m_config->tensor_rank = linear_ir->get_master_shape().size(); } @@ -94,6 +112,122 @@ void RuntimeConfigurator::init_data_info(const std::shared_ptr& linear_ir) { + std::set cluster_ids; + std::map> dynamic_buffer_clusters, static_buffer_clusters; + + const auto& buffer_expressions = linear_ir->get_buffer_ops(); + for (const auto& buffer_expr : buffer_expressions) { + const auto buffer = 
ov::as_type_ptr(buffer_expr->get_node()); + OPENVINO_ASSERT(buffer, "Expected Buffer ops in Buffer expressions of LinearIR"); + + auto& clusters = buffer->is_defined() ? static_buffer_clusters : dynamic_buffer_clusters; + clusters[buffer->get_cluster_id()].insert(buffer_expr); + cluster_ids.insert(buffer->get_cluster_id()); + } + + OPENVINO_ASSERT(cluster_ids.size() == dynamic_buffer_clusters.size() + static_buffer_clusters.size(), "Incorrect count of Buffer clusters"); + OPENVINO_ASSERT(cluster_ids.empty() || (*cluster_ids.cbegin() == 0 && *cluster_ids.crbegin() == (cluster_ids.size() - 1)), + "Incorrect indetifiers of Buffer clusters"); + + m_config->buffer_scratchpad_size = linear_ir->get_static_buffer_scratchpad_size(); + m_config->buffer_cluster_offsets.resize(cluster_ids.size(), utils::get_dynamic_value()); + + for (const auto& p : static_buffer_clusters) { + const auto& cluster_id = p.first; + const auto& cluster = p.second; + OPENVINO_ASSERT(dynamic_buffer_clusters.count(cluster_id) == 0, "Buffers from the same cluster must be only static or dynamic"); + + OPENVINO_ASSERT(cluster.size() > 0, "Incorrect size of buffer cluster"); + size_t cluster_offset = ov::as_type_ptr((*cluster.cbegin())->get_node())->get_offset(); + for (const auto& buffer_expr : cluster) { + OPENVINO_ASSERT(cluster_offset == ov::as_type_ptr(buffer_expr->get_node())->get_offset(), + "Static Buffers from the same cluster must have the same offset!"); + } + + m_config->buffer_cluster_offsets[cluster_id] = cluster_offset; + } + + m_dynamic_buffer_clusters = std::move(dynamic_buffer_clusters); +} + +void RuntimeConfigurator::update_loop_info(const std::shared_ptr& linear_ir) const { + // Initialized UnifiedLoopInfo + struct CurrentUnifiedLoopInfo { + size_t current_work_amount = 0; + std::vector ptr_increments; + std::vector finalization_offsets; + }; + std::unordered_map initializated_info_map; + + const auto& loop_map = linear_ir->get_loop_manager()->get_map(); + for (const auto& p : 
loop_map) { + const auto& expanded_loop_info = ov::as_type_ptr(p.second); + OPENVINO_ASSERT(expanded_loop_info, "UpdateLoopInfo expects ExpandedLoopInfo in LoopManager"); + + // First visiting of unified (whole) loop + const auto& current_unified_loop_info = expanded_loop_info->get_unified_loop_info(); + if (initializated_info_map.count(current_unified_loop_info) == 0) { + auto& current_info = initializated_info_map[current_unified_loop_info]; + lowered::pass::InitLoops::init_loop_info(current_unified_loop_info, true); + + current_info.current_work_amount = current_unified_loop_info->get_work_amount(); + init_data_ptr_shifts(current_unified_loop_info, current_info.ptr_increments, current_info.finalization_offsets); + } + + auto& initializated_info = initializated_info_map.at(current_unified_loop_info); + auto& current_work_amount = initializated_info.current_work_amount; + const auto& ptr_increments = initializated_info.ptr_increments; + const auto& finalization_offsets = initializated_info.finalization_offsets; + + const auto& decomposed_loop_type = expanded_loop_info->get_type(); + + // If the specific iteration is not needed, we skip loop evaluation - set zero as work amount is enough + if (!lowered::pass::InsertSpecificIterations::is_decomposed_loop_needed(current_unified_loop_info, decomposed_loop_type, current_work_amount)) { + expanded_loop_info->set_work_amount(0); + continue; + } + + expanded_loop_info->set_work_amount( + lowered::pass::InsertSpecificIterations::get_decomposed_loop_work_amount(current_unified_loop_info, decomposed_loop_type, current_work_amount)); + // Update remaining Loop work amount + current_work_amount -= expanded_loop_info->get_work_amount(); + + expanded_loop_info->update_ptr_increments(ptr_increments); + if (current_work_amount > 0) { + expanded_loop_info->update_finalization_offsets(std::vector(finalization_offsets.size(), 0)); + } else { + expanded_loop_info->update_finalization_offsets(finalization_offsets); + } + } +} + +void 
RuntimeConfigurator::update_buffer_scratchpad_size(const std::shared_ptr& linear_ir) const { + const auto& loop_manager = linear_ir->get_loop_manager(); + m_config->buffer_scratchpad_size = linear_ir->get_static_buffer_scratchpad_size(); + + for (const auto& p : m_dynamic_buffer_clusters) { + const auto& cluster_id = p.first; + const auto& cluster = p.second; + + auto& cluster_offset = m_config->buffer_cluster_offsets[cluster_id]; + cluster_offset = utils::get_dynamic_value(); + + size_t additional_size = 0; + for (const auto& buffer_expr : cluster) { + const auto& allocation_size = lowered::pass::ComputeBufferAllocationSize::get_allocation_size(loop_manager, buffer_expr, m_config->tile_rank); + additional_size = std::max(allocation_size * buffer_expr->get_node()->get_element_type().size(), additional_size); + } + + cluster_offset = m_config->buffer_scratchpad_size; + OPENVINO_ASSERT(!utils::is_dynamic_value(cluster_offset), "Offset of the cluster must be defined!"); + OPENVINO_ASSERT(!utils::is_dynamic_value(additional_size), "Buffer scratchpad size must be defined!"); + m_config->buffer_scratchpad_size += additional_size; + } + + OPENVINO_ASSERT(!utils::is_dynamic_value(m_config->buffer_scratchpad_size), "Buffer scratchpad size must be defined!"); +} + void RuntimeConfigurator::update_data_offsets() const { for (size_t i = 0; i < m_io_num; ++i) { // offsets represent distance between consecutive elements of corresponding dimension. 
diff --git a/src/common/snippets/src/shape_inference/shape_infer_instances.cpp b/src/common/snippets/src/shape_inference/shape_infer_instances.cpp index 49cc1a379c8b18..d56c28acf66a28 100644 --- a/src/common/snippets/src/shape_inference/shape_infer_instances.cpp +++ b/src/common/snippets/src/shape_inference/shape_infer_instances.cpp @@ -12,16 +12,6 @@ using Result = IShapeInferSnippets::Result; * Merge SRC to DST with broadcasting rules defined by the Autobroadcast specifier */ bool broadcast_merge_into(VectorDims& dst, const VectorDims& src, const ov::op::AutoBroadcastSpec& autob) { - auto broadcast_merge_dim = [](size_t& dst, const size_t& d1, const size_t& d2) { - if (d1 == d2 || d1 == 1 || utils::is_dynamic_value(d1)) { - dst = d2; - } else if (d2 == 1 || utils::is_dynamic_value(d2)) { - dst = d1; - } else { - return false; - } - return true; - }; // Ranks are both static. const auto dst_rank = static_cast(dst.size()); const auto src_rank = static_cast(src.size()); @@ -35,7 +25,7 @@ bool broadcast_merge_into(VectorDims& dst, const VectorDims& src, const ov::op:: for (int64_t i = 0; i < new_rank; i++) { auto dsti = i < (new_rank - dst_rank) ? 1 : dst[i - (new_rank - dst_rank)]; auto srci = i < (new_rank - src_rank) ? 
1 : src[i - (new_rank - src_rank)]; - success &= broadcast_merge_dim(dims[i], dsti, srci); + success &= utils::broadcast_merge_dim(dims[i], dsti, srci); } dst = std::move(dims); return success; @@ -55,7 +45,7 @@ bool broadcast_merge_into(VectorDims& dst, const VectorDims& src, const ov::op:: if (src[i] > dst[axis + i]) return false; } - success &= broadcast_merge_dim(dst[axis + i], dst[axis + i], src[i]); + success &= utils::broadcast_merge_dim(dst[axis + i], dst[axis + i], src[i]); } return success; } diff --git a/src/common/snippets/src/utils.cpp b/src/common/snippets/src/utils.cpp index 9d67248efc079f..a7f00bbfebcb9c 100644 --- a/src/common/snippets/src/utils.cpp +++ b/src/common/snippets/src/utils.cpp @@ -102,14 +102,15 @@ auto get_non_scalar_constant_count_for_fq(const std::shared_ptr BufferAllocationParams; class BufferAllocationTest : public testing::TestWithParam { @@ -38,7 +39,8 @@ class BufferAllocationTest : public testing::TestWithParam GetModel() const override; }; -class MHABufferAllocationTest : public BufferAllocationTest { -protected: - std::shared_ptr GetModel() const override; - - static void MarkBrgemm(const std::shared_ptr& node, const std::vector& subtensor); -}; - } // namespace snippets } // namespace test } // namespace ov diff --git a/src/common/snippets/tests/src/lowered/pass/buffer_allocation.cpp b/src/common/snippets/tests/src/lowered/pass/buffer_allocation.cpp index 26f63454318d37..e56a31a8e92a4c 100644 --- a/src/common/snippets/tests/src/lowered/pass/buffer_allocation.cpp +++ b/src/common/snippets/tests/src/lowered/pass/buffer_allocation.cpp @@ -26,20 +26,21 @@ namespace snippets { std::string BufferAllocationTest::getTestCaseName(testing::TestParamInfo obj) { bool is_optimized, with_split_loops; - size_t expected_size, expected_count; + size_t expected_size, expected_reg_group_count, expected_cluster_count; - std::tie(is_optimized, with_split_loops, expected_size, expected_count) = obj.param; + std::tie(is_optimized, 
with_split_loops, expected_size, expected_reg_group_count, expected_cluster_count) = obj.param; std::ostringstream result; result << "Opt=" << ov::test::utils::bool2str(is_optimized) << "_"; result << "Split=" << ov::test::utils::bool2str(with_split_loops) << "_"; result << "ExpBufferSize=" << expected_size << "_"; - result << "ExpBufferNum=" << expected_count; + result << "ExpBufferRegGroupCount=" << expected_reg_group_count << "_"; + result << "ExpBufferClustersCount=" << expected_reg_group_count << "_"; return result.str(); } void BufferAllocationTest::SetUp() { - std::tie(m_is_buffer_optimized, m_with_split_loops, m_expected_size, m_expected_count) = this->GetParam(); + std::tie(m_is_buffer_optimized, m_with_split_loops, m_expected_size, m_expected_reg_group_count, m_expected_cluster_count) = this->GetParam(); const auto body = GetModel(); m_linear_ir = ov::snippets::lowered::LinearIR(body, std::make_shared()); @@ -71,7 +72,7 @@ void BufferAllocationTest::ApplyTransformations(const std::shared_ptr(m_vector_size); pipeline.register_pass(); pipeline.register_pass(); - pipeline.register_pass(2); + pipeline.register_pass(); pipeline.register_pass(m_vector_size); pipeline.register_pass(); pipeline.register_pass(); @@ -80,14 +81,16 @@ void BufferAllocationTest::ApplyTransformations(const std::shared_ptr gprs; + std::set reg_groups, clusters; for (const auto& expr : m_linear_ir) { if (const auto buffer = ov::as_type_ptr(expr->get_node())) { - gprs.insert(buffer->get_id()); + reg_groups.insert(buffer->get_reg_group()); + clusters.insert(buffer->get_cluster_id()); } } - EXPECT_EQ(gprs.size(), m_expected_count); - EXPECT_EQ(m_linear_ir.get_buffer_scratchpad_size(), m_expected_size); + EXPECT_EQ(reg_groups.size(), m_expected_reg_group_count); + EXPECT_EQ(clusters.size(), m_expected_cluster_count); + EXPECT_EQ(m_linear_ir.get_static_buffer_scratchpad_size(), m_expected_size); } std::shared_ptr EltwiseBufferAllocationTest::GetModel() const { @@ -98,9 +101,9 @@ 
std::shared_ptr EltwiseBufferAllocationTest::GetModel() const { const auto parameter0 = std::make_shared(ov::element::f32, ov::PartialShape({1, 3, 100, 100})); const auto parameter1 = std::make_shared(ov::element::f32, ov::PartialShape({1, 3, 100, 100})); const auto add = std::make_shared(parameter0, parameter1); - const auto buffer0 = std::make_shared(add, static_cast(subtensor_buffer.size())); + const auto buffer0 = std::make_shared(add); const auto relu = std::make_shared(buffer0); - const auto buffer1 = std::make_shared(relu, static_cast(subtensor_buffer.size())); + const auto buffer1 = std::make_shared(relu); const auto exp = std::make_shared(buffer1); const auto body = std::make_shared(std::make_shared(exp), ov::ParameterVector{parameter0, parameter1}); @@ -113,65 +116,9 @@ std::shared_ptr EltwiseBufferAllocationTest::GetModel() const { return body; } -void MHABufferAllocationTest::MarkBrgemm(const std::shared_ptr& node, const std::vector& subtensor) { - const auto subtensor_full = std::vector{ov::snippets::lowered::PortDescriptor::ServiceDimensions::FULL_DIM, - ov::snippets::lowered::PortDescriptor::ServiceDimensions::FULL_DIM}; - ov::snippets::lowered::PortDescriptorUtils::set_port_descriptor_ptr( - node->input(0), std::make_shared(node->input(0), subtensor)); - ov::snippets::lowered::PortDescriptorUtils::set_port_descriptor_ptr( - node->input(1), std::make_shared(node->input(1), subtensor_full)); - ov::snippets::lowered::PortDescriptorUtils::set_port_descriptor_ptr( - node->output(0), std::make_shared(node->output(0), subtensor)); -} - -std::shared_ptr MHABufferAllocationTest::GetModel() const { - const auto subtensor_scalar = std::vector{1}; - const auto subtensor_eltwise = std::vector{1, m_vector_size}; - const auto subtensor_brgemm = std::vector{32, ov::snippets::lowered::PortDescriptor::ServiceDimensions::FULL_DIM}; - const auto subtensor_power = std::vector{1, ov::snippets::lowered::PortDescriptor::ServiceDimensions::FULL_DIM}; - - const auto 
parameter0 = std::make_shared(ov::element::f32, ov::PartialShape({1, 12, 128, 64})); - const auto parameter1 = std::make_shared(ov::element::f32, ov::PartialShape({1, 128, 12, 64})); - const auto parameter2 = std::make_shared(ov::element::f32, ov::PartialShape({1, 12, 128, 64})); - - const auto load_reshape = std::make_shared(parameter1, 1, 0, std::vector{0, 2, 3, 1}); - const auto store = std::make_shared(load_reshape); - const auto relu0 = std::make_shared(store); - const auto matmul0 = std::make_shared(parameter0, relu0); - const auto relu1 = std::make_shared(matmul0); - - // Decomposed Softmax - const auto reduce_max = std::make_shared(relu1, 3); - ov::snippets::op::ReduceBase::compute_and_set_reduce_subtensors(reduce_max); - const auto subtract = std::make_shared(relu1, reduce_max); - const auto exp = std::make_shared(subtract); - - const auto reduce_sum = std::make_shared(exp, 3); - ov::snippets::op::ReduceBase::compute_and_set_reduce_subtensors(reduce_sum); - const auto power = std::make_shared(reduce_sum, -1.f); - const auto multiply = std::make_shared(exp, power); - - const auto matmul1 = std::make_shared(multiply, parameter2); - const auto relu2 = std::make_shared(matmul1); - - const auto body = std::make_shared(std::make_shared(relu2), ov::ParameterVector{parameter0, parameter1, parameter2}); - - MarkOp(load_reshape, subtensor_scalar); - MarkOp(store, subtensor_scalar); - MarkOp(power, subtensor_power); - - MarkBrgemm(matmul0, subtensor_brgemm); - MarkBrgemm(matmul1, subtensor_brgemm); - - return body; -} - TEST_P(EltwiseBufferAllocationTest, BufferAllocation) { Validate(); } -TEST_P(MHABufferAllocationTest, BufferAllocation) { - Validate(); -} namespace BufferAllocationTest_Instances { @@ -179,8 +126,9 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_EltwiseNotOptimized, El ::testing::Combine( ::testing::Values(false), ::testing::Values(false), // in this test it doesn't make sense - ::testing::Values(80000), // Each Buffer has own allocated 
memory - ::testing::Values(2)), // Each Buffer has unique ID + ::testing::Values(80000), // Each Buffer has own allocated memory + ::testing::Values(2), // Each Buffer has unique reg group + ::testing::Values(2)), // Each Buffer has unique cluster ID BufferAllocationTest::getTestCaseName); INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_EltwiseOptimized, EltwiseBufferAllocationTest, @@ -188,39 +136,8 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_EltwiseOptimized, Eltwi ::testing::Values(true), ::testing::Values(false), // in this test it doesn't make sense ::testing::Values(40000), // Two Buffer reuse memory - ::testing::Values(1)), // Two Buffers reuse IDs - BufferAllocationTest::getTestCaseName); - -INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_MHANotOptimizedWSplit, MHABufferAllocationTest, - ::testing::Combine( - ::testing::Values(false), - ::testing::Values(true), - ::testing::Values(139264), // Each Buffer has own allocated memory - ::testing::Values(7)), // Each Buffer has unique ID - BufferAllocationTest::getTestCaseName); - -INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_MHAOptimizedWSplit, MHABufferAllocationTest, - ::testing::Combine( - ::testing::Values(true), - ::testing::Values(true), - ::testing::Values(57344), // (Buffer before brgemm) + (between brgemms) + (after brgemm) - ::testing::Values(2)), // (Buffer before brgemm0 and after brgemm1) + (between brgemms) - BufferAllocationTest::getTestCaseName); - -INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_MHANotOptimizedWOSplit, MHABufferAllocationTest, - ::testing::Combine( - ::testing::Values(false), - ::testing::Values(false), - ::testing::Values(360448), // Each Buffer has own allocated memory - ::testing::Values(7)), // Each Buffer has unique ID - BufferAllocationTest::getTestCaseName); - -INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_MHAOptimizedWOSplit, MHABufferAllocationTest, - ::testing::Combine( - ::testing::Values(true), - 
::testing::Values(false), - ::testing::Values(98304), // (between brgemms) + (Buffer before brgemm0 and after brgemm1) - ::testing::Values(2)), // (Buffer before brgemm0 and after brgemm1) + (between brgemms) + ::testing::Values(1), // Two Buffers reuse IDs + ::testing::Values(1)), // Two Buffers are from the same cluster BufferAllocationTest::getTestCaseName); } // namespace BufferAllocationTest_Instances diff --git a/src/common/snippets/tests/src/lowered/pass/insert_load_store.cpp b/src/common/snippets/tests/src/lowered/pass/insert_load_store.cpp index 4df2aa7c56033f..f799e5d38e1ab3 100644 --- a/src/common/snippets/tests/src/lowered/pass/insert_load_store.cpp +++ b/src/common/snippets/tests/src/lowered/pass/insert_load_store.cpp @@ -6,6 +6,7 @@ #include "openvino/opsets/opset10.hpp" #include "snippets/lowered/pass/insert_load_store.hpp" +#include "snippets/snippets_isa.hpp" namespace ov { namespace test { diff --git a/src/common/snippets/tests/src/lowered/pass/loop.cpp b/src/common/snippets/tests/src/lowered/pass/loop.cpp index 0169201e0aee60..c86be368a5ab1b 100644 --- a/src/common/snippets/tests/src/lowered/pass/loop.cpp +++ b/src/common/snippets/tests/src/lowered/pass/loop.cpp @@ -47,7 +47,7 @@ static void init_linear_ir(const std::vector& in_shapes, Linea const auto outer_wa = std::max(*(in_shape0.rbegin() + 1), *(in_shape1.rbegin() + 1)); const auto outer_inc = blocked_wa; loop_manager->mark_loop(expr_it, std::next(expr_it), inner_wa, inner_inc, 0, loop_input_ports, loop_output_ports); - loop_manager->mark_loop(expr_it, std::next(expr_it), blocked_wa, blocked_inc, 1, loop_input_ports, loop_output_ports); + loop_manager->mark_loop(expr_it, std::next(expr_it), blocked_wa, blocked_inc, 1, loop_input_ports, loop_output_ports, true, true); const auto loop_id = loop_manager->mark_loop(expr_it, std::next(expr_it), outer_wa, outer_inc, 1, loop_input_ports, loop_output_ports); const auto& outer_loop_info = loop_manager->get_loop_info(loop_id); const auto
outer_tail_size = outer_wa % outer_inc; diff --git a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_kernel_emitter.cpp b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_kernel_emitter.cpp index 71e6b3294e1773..cd36de7847bdbe 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_kernel_emitter.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_kernel_emitter.cpp @@ -48,7 +48,7 @@ jit_kernel_emitter::jit_kernel_emitter(jit_generator* h, cpu_isa_t isa, const ov std::set unique_buffers; for (const auto& expr : *body) { if (const auto buffer = ov::as_type_ptr(expr->get_node())) { - const auto buffer_id = buffer->get_id(); + const auto buffer_id = buffer->get_cluster_id(); if (unique_buffers.count(buffer_id) == 0) { mem_access_exprs.push_back(expr); unique_buffers.insert(buffer_id); diff --git a/src/plugins/intel_cpu/src/emitters/snippets/jit_snippets_call_args.hpp b/src/plugins/intel_cpu/src/emitters/snippets/jit_snippets_call_args.hpp index 2a34ca9fc50e00..f04720466a4631 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/jit_snippets_call_args.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/jit_snippets_call_args.hpp @@ -46,6 +46,7 @@ struct jit_snippets_call_args { // for all non-static data members. 
So we can keep them public or friend all control-flow emitters loop_args_t* loop_args = nullptr; amx_tile_config_t amx_tile_config; + size_t memory_access_offsets[SNIPPETS_MAX_DATA_PTR_COUNT] = {}; }; struct jit_snippets_call_args::loop_args_t { diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp index a1fde3bf28f3bf..e444d65bab774b 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp @@ -61,8 +61,7 @@ namespace ov { static bool is_load_emitter(const intel_cpu::jit_emitter *emitter) { bool ret = false; if (dynamic_cast(emitter) || - dynamic_cast(emitter) || - dynamic_cast(emitter)) { + dynamic_cast(emitter)) { return true; } return ret; @@ -70,8 +69,7 @@ static bool is_load_emitter(const intel_cpu::jit_emitter *emitter) { static bool is_store_emitter(const intel_cpu::jit_emitter *emitter) { bool ret = false; - if (dynamic_cast(emitter) || - dynamic_cast(emitter)) { + if (dynamic_cast(emitter)) { return true; } return ret; @@ -171,12 +169,12 @@ intel_cpu::CPUTargetMachine::CPUTargetMachine(dnnl::impl::cpu::x64::cpu_isa_t ho jitters[snippets::op::Load::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_load_memory_emitter); jitters[snippets::op::LoadReshape::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_load_memory_emitter); jitters[snippets::op::BroadcastLoad::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_load_broadcast_emitter); - jitters[intel_cpu::LoadConvertSaturation::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_load_convert_emitter); - jitters[intel_cpu::LoadConvertTruncation::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_load_convert_emitter); + jitters[intel_cpu::LoadConvertSaturation::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_load_memory_emitter); + 
jitters[intel_cpu::LoadConvertTruncation::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_load_memory_emitter); jitters[snippets::op::Store::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_store_memory_emitter); - jitters[intel_cpu::StoreConvertSaturation::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_store_convert_emitter); - jitters[intel_cpu::StoreConvertTruncation::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_store_convert_emitter); + jitters[intel_cpu::StoreConvertSaturation::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_store_memory_emitter); + jitters[intel_cpu::StoreConvertTruncation::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_store_memory_emitter); jitters[snippets::op::Scalar::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_scalar_emitter); jitters[snippets::op::BroadcastMove::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_broadcast_move_emitter); diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_kernel_emitter.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_kernel_emitter.cpp index 42f8d61c669dd5..fe1864ab8467c2 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_kernel_emitter.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_kernel_emitter.cpp @@ -44,10 +44,10 @@ jit_kernel_emitter::jit_kernel_emitter(jit_generator* h, cpu_isa_t isa, const ov std::set unique_buffers; for (const auto& expr : *body) { if (const auto buffer = ov::as_type_ptr(expr->get_node())) { - const auto buffer_id = buffer->get_id(); - if (unique_buffers.count(buffer_id) == 0) { + const auto buffer_reg_group = buffer->get_reg_group(); + if (unique_buffers.count(buffer_reg_group) == 0) { mem_access_exprs.push_back(expr); - unique_buffers.insert(buffer_id); + unique_buffers.insert(buffer_reg_group); } } else { if (std::find(parameters.cbegin(), parameters.cend(), expr) == parameters.cend() 
&& diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_memory_emitters.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_memory_emitters.cpp index 21ba08422a1665..df888b63b3d601 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_memory_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_memory_emitters.cpp @@ -4,8 +4,10 @@ #include "jit_memory_emitters.hpp" +#include "emitters/snippets/jit_snippets_call_args.hpp" #include "transformations/snippets/x64/op/load_convert.hpp" #include "transformations/snippets/x64/op/store_convert.hpp" +#include "snippets/op/buffer.hpp" using namespace Xbyak; @@ -19,143 +21,122 @@ using jit_generator = dnnl::impl::cpu::x64::jit_generator; using cpu_isa_t = dnnl::impl::cpu::x64::cpu_isa_t; using ExpressionPtr = ov::snippets::lowered::ExpressionPtr; -jit_memory_emitter::jit_memory_emitter(jit_generator* h, cpu_isa_t isa, const ExpressionPtr& expr) : jit_emitter(h, isa) { +jit_memory_emitter::jit_memory_emitter(jit_generator* h, cpu_isa_t isa, const ExpressionPtr& expr, emitter_in_out_map in_out_type) + : jit_emitter(h, isa) { + in_out_type_ = in_out_type; + const auto n = expr->get_node(); src_prc = n->get_input_element_type(0); dst_prc = n->get_output_element_type(0); -} -jit_load_memory_emitter::jit_load_memory_emitter(jit_generator* h, cpu_isa_t isa, const ExpressionPtr& expr) - : jit_memory_emitter(h, isa, expr) { - if (src_prc != dst_prc) - OV_CPU_JIT_EMITTER_THROW("supports only equal input and output types but gets: ", - src_prc.get_type_name(), - " and ", - dst_prc.get_type_name()); + const auto& memory_access = std::dynamic_pointer_cast(expr->get_node()); + if (in_out_type_ == emitter_in_out_map::gpr_to_vec) { + OV_CPU_JIT_EMITTER_ASSERT(memory_access->is_memory_access_input_port(0), "must be input port - memory access"); + count = memory_access->get_input_count(); + compiled_byte_offset = memory_access->get_input_offset(); + runtime_args_offset = 
get_parent_buffer_cluster_id(expr); + } else if (in_out_type_ == emitter_in_out_map::vec_to_gpr) { + OV_CPU_JIT_EMITTER_ASSERT(memory_access->is_memory_access_output_port(0), "must be output port - memory access"); + count = memory_access->get_output_count(); + compiled_byte_offset = memory_access->get_output_offset(); + runtime_args_offset = get_consumer_buffer_cluster_id(expr); + } else { + OV_CPU_JIT_EMITTER_THROW("unsupported in_out_type"); + } - const auto load = std::dynamic_pointer_cast(expr->get_node()); - count = load->get_count(); - byte_offset = load->get_offset(); - in_out_type_ = emitter_in_out_map::gpr_to_vec; - load_emitter.reset(new jit_load_emitter(h, isa, src_prc, dst_prc, count)); + if (ov::snippets::utils::is_dynamic_value(compiled_byte_offset)) { + is_offset_runtime = true; + // Compiled byte offset is zero to manually `add` runtime offset before operation and `sub` after to reset pointer in the register + compiled_byte_offset = 0; + OPENVINO_ASSERT(runtime_args_offset != SIZE_MAX, "Incorrect buffer offset in call_args"); + } } -void jit_load_memory_emitter::emit_impl(const std::vector& in, - const std::vector& out) const { - if (host_isa_ == dnnl::impl::cpu::x64::sse41) { - emit_isa(in, out); - } else if (host_isa_ == dnnl::impl::cpu::x64::avx2) { - emit_isa(in, out); - } else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) { - emit_isa(in, out); - } else { - OV_CPU_JIT_EMITTER_THROW("Unsupported ISA ", host_isa_); +size_t jit_memory_emitter::aux_gprs_count() const { + // for runtime arguments + return is_offset_runtime ? 
1 : 0; +} + +size_t jit_memory_emitter::get_parent_buffer_cluster_id(const ov::snippets::lowered::ExpressionPtr& expr) { + OPENVINO_ASSERT(expr->get_input_port_connectors().size() == 1, "MemoryAccess must have one parent"); + const auto& parent_expr = expr->get_input_port_connector(0)->get_source().get_expr(); + if (const auto buffer = ov::as_type_ptr(parent_expr->get_node())) { + return buffer->get_cluster_id(); } + return SIZE_MAX; } -template -void jit_load_memory_emitter::emit_isa(const std::vector &in, const std::vector &out) const { - if (!load_emitter) - OV_CPU_JIT_EMITTER_THROW("Load CPU emitter isn't initialized!"); - load_emitter->emit_code({in[0], byte_offset}, {out[0]}, aux_vec_idxs, aux_gpr_idxs); +size_t jit_memory_emitter::get_consumer_buffer_cluster_id(const ov::snippets::lowered::ExpressionPtr& expr) { + OPENVINO_ASSERT(expr->get_output_port_connectors().size() == 1, "MemoryAccess must have one consumer"); + const auto& consumers = expr->get_output_port_connector(0)->get_consumers(); + for (const auto& consumer : consumers) + if (const auto buffer = ov::as_type_ptr(consumer.get_expr()->get_node())) + return buffer->get_cluster_id(); + return SIZE_MAX; } -void jit_load_memory_emitter::emit_data() const { - load_emitter->emit_data(); +std::vector jit_memory_emitter::get_available_aux_gprs() const { + if (aux_gpr_idxs.empty()) + return aux_gpr_idxs; + return std::vector(aux_gpr_idxs.cbegin() + static_cast(is_offset_runtime), aux_gpr_idxs.cend()); } -jit_load_broadcast_emitter::jit_load_broadcast_emitter(jit_generator* h, cpu_isa_t isa, const ExpressionPtr& expr) - : jit_memory_emitter(h, isa, expr) { - if (src_prc != dst_prc) - OV_CPU_JIT_EMITTER_THROW("supports only equal input and output types but gets: ", - src_prc.get_type_name(), - " and ", - dst_prc.get_type_name()); +void jit_memory_emitter::emit_code(const std::vector &in_idxs, const std::vector &out_idxs, + const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs) const { + 
emitter_preamble(in_idxs, out_idxs, pool_vec_idxs, pool_gpr_idxs); - const auto broadcast_load = std::dynamic_pointer_cast(expr->get_node()); - byte_offset = broadcast_load->get_offset(); - in_out_type_ = emitter_in_out_map::gpr_to_vec; -} + Reg64 reg_runtime_params = abi_param1; // defined by jit_kernel_emitter + Reg64 aux_gpr = is_offset_runtime ? Reg64(static_cast(aux_gpr_idxs[0])) : Reg64(); -void jit_load_broadcast_emitter::emit_impl(const std::vector& in, - const std::vector& out) const { - if (host_isa_ == dnnl::impl::cpu::x64::sse41) { - emit_isa(in, out); - } else if (host_isa_ == dnnl::impl::cpu::x64::avx2) { - emit_isa(in, out); - } else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) { - emit_isa(in, out); + Reg64 data_reg; + if (in_out_type_ == emitter_in_out_map::gpr_to_vec) { + data_reg = Reg64(in_idxs[0]); + } else if (in_out_type_ == emitter_in_out_map::vec_to_gpr) { + data_reg = Reg64(out_idxs[0]); } else { - OV_CPU_JIT_EMITTER_THROW("Unsupported ISA ", host_isa_); + OV_CPU_JIT_EMITTER_THROW("unsupported in_out_type"); } -} -template -void jit_load_broadcast_emitter::emit_isa(const std::vector &in, const std::vector &out) const { - using Vmm = typename dnnl::impl::utils::conditional3::type; - Reg64 in_reg(in[0]); - Vmm vmm_dst = Vmm(out[0]); + if (is_offset_runtime) { + h->mov(aux_gpr, h->ptr[reg_runtime_params + GET_OFF(memory_access_offsets) + runtime_args_offset * sizeof(size_t)]); + h->add(data_reg, aux_gpr); + } - // In doesn't really matter if we broadcast or `movss` for vector tails so keep only one version for `BroadcastLoad`, - // key point here is not to add post-increment, it might be fixed by some other approach in future - switch (src_prc.size()) { - case 4: h->uni_vbroadcastss(vmm_dst, h->ptr[in_reg + byte_offset]); break; - case 2: h->vpbroadcastw(vmm_dst, h->ptr[in_reg + byte_offset]); break; - case 1: h->vpbroadcastb(vmm_dst, h->ptr[in_reg + byte_offset]); break; - default: OV_CPU_JIT_EMITTER_THROW("Unsupported data type"); + 
emit_impl(in_idxs, out_idxs); + + if (is_offset_runtime) { + h->sub(data_reg, aux_gpr); } -} -jit_load_convert_emitter::jit_load_convert_emitter(jit_generator* h, cpu_isa_t isa, const ExpressionPtr& expr) - : jit_memory_emitter(h, isa, expr) { - const auto load = ov::as_type_ptr(expr->get_node()); - count = load->get_count(); - byte_offset = load->get_offset(); - in_out_type_ = emitter_in_out_map::gpr_to_vec; - load_emitter.reset(new jit_load_emitter(h, isa, src_prc, dst_prc, count)); + emitter_postamble(); } -void jit_load_convert_emitter::emit_impl(const std::vector& in, - const std::vector& out) const { - if (host_isa_ == dnnl::impl::cpu::x64::sse41) { - emit_isa(in, out); - } else if (host_isa_ == dnnl::impl::cpu::x64::avx2) { - emit_isa(in, out); - } else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) { - emit_isa(in, out); - } else { - OV_CPU_JIT_EMITTER_THROW("Unsupported ISA ", host_isa_); - } +jit_load_memory_emitter::jit_load_memory_emitter(jit_generator* h, cpu_isa_t isa, const ExpressionPtr& expr) + : jit_memory_emitter(h, isa, expr, emitter_in_out_map::gpr_to_vec) { + OV_CPU_JIT_EMITTER_ASSERT(ov::is_type(expr->get_node()), "expects Load node"); + load_emitter.reset(new jit_load_emitter(h, isa, src_prc, dst_prc, count)); } -template -void jit_load_convert_emitter::emit_isa(const std::vector &in, const std::vector &out) const { - if (!load_emitter) - OV_CPU_JIT_EMITTER_THROW("Load CPU emitter isn't initialized!"); - load_emitter->emit_code({in[0], byte_offset}, {out[0]}, aux_vec_idxs, aux_gpr_idxs); +void jit_load_memory_emitter::emit_impl(const std::vector& in, const std::vector& out) const { + OV_CPU_JIT_EMITTER_ASSERT(load_emitter, "Load CPU emitter isn't initialized!"); + load_emitter->emit_code({in[0], compiled_byte_offset}, {out[0]}, aux_vec_idxs, get_available_aux_gprs()); } -void jit_load_convert_emitter::emit_data() const { +void jit_load_memory_emitter::emit_data() const { load_emitter->emit_data(); } 
-jit_store_memory_emitter::jit_store_memory_emitter(jit_generator* h, cpu_isa_t isa, const ExpressionPtr& expr) : jit_memory_emitter(h, isa, expr) { +jit_load_broadcast_emitter::jit_load_broadcast_emitter(jit_generator* h, cpu_isa_t isa, const ExpressionPtr& expr) + : jit_memory_emitter(h, isa, expr, emitter_in_out_map::gpr_to_vec) { + OV_CPU_JIT_EMITTER_ASSERT(ov::is_type(expr->get_node()), "expects BroadcastLoad node"); if (src_prc != dst_prc) OV_CPU_JIT_EMITTER_THROW("supports only equal input and output types but gets: ", src_prc.get_type_name(), " and ", dst_prc.get_type_name()); - - const auto store = ov::as_type_ptr(expr->get_node()); - count = store->get_count(); - byte_offset = store->get_offset(); - in_out_type_ = emitter_in_out_map::vec_to_gpr; - store_emitter.reset(new jit_store_emitter(h, isa, src_prc, dst_prc, count)); } -void jit_store_memory_emitter::emit_impl(const std::vector& in, - const std::vector& out) const { +void jit_load_broadcast_emitter::emit_impl(const std::vector& in, const std::vector& out) const { if (host_isa_ == dnnl::impl::cpu::x64::sse41) { emit_isa(in, out); } else if (host_isa_ == dnnl::impl::cpu::x64::avx2) { @@ -168,51 +149,41 @@ void jit_store_memory_emitter::emit_impl(const std::vector& in, } template -void jit_store_memory_emitter::emit_isa(const std::vector &in, const std::vector &out) const { - if (!store_emitter) - OV_CPU_JIT_EMITTER_THROW("Store CPU emitter isn't initialized!"); - store_emitter->emit_code({in[0], byte_offset}, {out[0]}, aux_vec_idxs, aux_gpr_idxs); -} +void jit_load_broadcast_emitter::emit_isa(const std::vector &in, const std::vector &out) const { + using Vmm = typename dnnl::impl::utils::conditional3::type; + Reg64 in_reg(in[0]); + Vmm vmm_dst = Vmm(out[0]); -void jit_store_memory_emitter::emit_data() const { - store_emitter->emit_data(); + // In doesn't really matter if we broadcast or `movss` for vector tails so keep only one version for `BroadcastLoad`, + // key point here is not to add 
post-increment, it might be fixed by some other approach in future + switch (src_prc.size()) { + case 4: h->uni_vbroadcastss(vmm_dst, h->ptr[in_reg + compiled_byte_offset]); break; + case 2: h->vpbroadcastw(vmm_dst, h->ptr[in_reg + compiled_byte_offset]); break; + case 1: h->vpbroadcastb(vmm_dst, h->ptr[in_reg + compiled_byte_offset]); break; + default: OV_CPU_JIT_EMITTER_THROW("Unsupported data type"); + } } -jit_store_convert_emitter::jit_store_convert_emitter(jit_generator* h, cpu_isa_t isa, const ExpressionPtr& expr) - : jit_memory_emitter(h, isa, expr) { - const auto store = ov::as_type_ptr(expr->get_node()); - count = store->get_count(); - byte_offset = store->get_offset(); - in_out_type_ = emitter_in_out_map::vec_to_gpr; - +jit_store_memory_emitter::jit_store_memory_emitter(jit_generator* h, cpu_isa_t isa, const ExpressionPtr& expr) + : jit_memory_emitter(h, isa, expr, emitter_in_out_map::vec_to_gpr) { if (ov::is_type(expr->get_node())) { store_emitter.reset(new jit_store_emitter(h, isa, src_prc, dst_prc, count, arithmetic_mode::truncation)); } else if (ov::is_type(expr->get_node())) { store_emitter.reset(new jit_store_emitter(h, isa, src_prc, dst_prc, count, arithmetic_mode::saturation)); - } -} - -void jit_store_convert_emitter::emit_impl(const std::vector& in, - const std::vector& out) const { - if (host_isa_ == dnnl::impl::cpu::x64::sse41) { - emit_isa(in, out); - } else if (host_isa_ == dnnl::impl::cpu::x64::avx2) { - emit_isa(in, out); - } else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) { - emit_isa(in, out); + } else if (ov::is_type(expr->get_node())) { + store_emitter.reset(new jit_store_emitter(h, isa, src_prc, dst_prc, count)); } else { - OV_CPU_JIT_EMITTER_THROW("Unsupported ISA ", host_isa_); + OV_CPU_JIT_EMITTER_THROW("expects Store node"); } } -template -void jit_store_convert_emitter::emit_isa(const std::vector &in, const std::vector &out) const { - if (!store_emitter) - OV_CPU_JIT_EMITTER_THROW("Store CPU emitter isn't 
initialized!"); - store_emitter->emit_code({in[0], byte_offset}, {out[0]}, aux_vec_idxs, aux_gpr_idxs); +void jit_store_memory_emitter::emit_impl(const std::vector& in, const std::vector& out) const { + OV_CPU_JIT_EMITTER_ASSERT(store_emitter, "Store CPU emitter isn't initialized!"); + store_emitter->emit_code({in[0], compiled_byte_offset}, {out[0]}, aux_vec_idxs, get_available_aux_gprs()); } -void jit_store_convert_emitter::emit_data() const { +void jit_store_memory_emitter::emit_data() const { store_emitter->emit_data(); } diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_memory_emitters.hpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_memory_emitters.hpp index 50276d9d9e2f1b..50315ec298a2d4 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_memory_emitters.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_memory_emitters.hpp @@ -14,14 +14,27 @@ namespace intel_cpu { class jit_memory_emitter : public jit_emitter { public: jit_memory_emitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, - const ov::snippets::lowered::ExpressionPtr& expr); + const ov::snippets::lowered::ExpressionPtr& expr, emitter_in_out_map in_out_type); + + void emit_code(const std::vector &in_idxs, const std::vector &out_idxs, + const std::vector &pool_vec_idxs = {}, const std::vector &pool_gpr_idxs = {}) const override; protected: + static size_t get_parent_buffer_cluster_id(const ov::snippets::lowered::ExpressionPtr& expr); + static size_t get_consumer_buffer_cluster_id(const ov::snippets::lowered::ExpressionPtr& expr); + + size_t aux_gprs_count() const override; + + std::vector get_available_aux_gprs() const; + ov::element::Type src_prc; ov::element::Type dst_prc; size_t count = 0; - size_t byte_offset = 0; + size_t compiled_byte_offset = 0; + size_t runtime_args_offset = 0; + bool is_offset_runtime = false; + #ifdef SNIPPETS_DEBUG_CAPS friend std::string init_info_jit_memory_emitter(const jit_memory_emitter 
*emitter); #endif @@ -37,8 +50,6 @@ class jit_load_memory_emitter : public jit_memory_emitter { private: void emit_impl(const std::vector& in, const std::vector& out) const override; - template - void emit_isa(const std::vector &in, const std::vector &out) const; void emit_data() const override; private: @@ -59,24 +70,6 @@ class jit_load_broadcast_emitter : public jit_memory_emitter { void emit_isa(const std::vector &in, const std::vector &out) const; }; -class jit_load_convert_emitter : public jit_memory_emitter { -public: - jit_load_convert_emitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, - const ov::snippets::lowered::ExpressionPtr& expr); - - size_t get_inputs_num() const override {return 0;} - -private: - void emit_impl(const std::vector& in, const std::vector& out) const override; - - template - void emit_isa(const std::vector &in, const std::vector &out) const; - void emit_data() const override; - -private: - std::unique_ptr load_emitter = nullptr; -}; - class jit_store_memory_emitter : public jit_memory_emitter { public: jit_store_memory_emitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, @@ -87,26 +80,6 @@ class jit_store_memory_emitter : public jit_memory_emitter { private: void emit_impl(const std::vector& in, const std::vector& out) const override; - template - void emit_isa(const std::vector &in, const std::vector &out) const; - void emit_data() const override; - -private: - std::unique_ptr store_emitter = nullptr; -}; - -class jit_store_convert_emitter : public jit_memory_emitter { -public: - jit_store_convert_emitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, - const ov::snippets::lowered::ExpressionPtr& expr); - - size_t get_inputs_num() const override {return 1;} - -private: - void emit_impl(const std::vector& in, const std::vector& out) const override; - - template - void emit_isa(const std::vector &in, const std::vector &out) const; void 
emit_data() const override; private: diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/verbose.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/verbose.cpp index 6bc410b1b042ee..d9c87dbf8b3ae3 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/verbose.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/verbose.cpp @@ -56,7 +56,7 @@ std::string init_info_jit_memory_emitter(const jit_memory_emitter *emitter) { ss << " src_precision:" << emitter->src_prc << " dst_precision:" << emitter->dst_prc << " load/store_element_number:" << emitter->count - << " byte_offset:" << emitter->byte_offset; + << " byte_offset:" << emitter->compiled_byte_offset; return ss.str(); } @@ -76,14 +76,6 @@ static std::string init_info_jit_load_broadcast_emitter(const jit_load_broadcast return ss.str(); } -static std::string init_info_jit_load_convert_emitter(const jit_load_convert_emitter *emitter) { - std::stringstream ss; - std::string memory_emitter_info = init_info_jit_memory_emitter(emitter); - ss << "Emitter_type_name:jit_load_convert_emitter" - << memory_emitter_info; - return ss.str(); -} - static std::string init_info_jit_store_memory_emitter(const jit_store_memory_emitter *emitter) { std::stringstream ss; std::string memory_emitter_info = init_info_jit_memory_emitter(emitter); @@ -92,14 +84,6 @@ static std::string init_info_jit_store_memory_emitter(const jit_store_memory_emi return ss.str(); } -static std::string init_info_jit_store_convert_emitter(const jit_store_convert_emitter *emitter) { - std::stringstream ss; - std::string memory_emitter_info = init_info_jit_memory_emitter(emitter); - ss << "Emitter_type_name:jit_store_convert_emitter" - << memory_emitter_info; - return ss.str(); -} - std::string init_info_jit_brgemm_emitter(const jit_brgemm_emitter *emitter) { std::stringstream ss; ss << "Emitter_type_name:jit_brgemm_emitter" @@ -190,12 +174,8 @@ void jit_emitter_info_t::init(const jit_emitter *emitter) { str_ = 
init_info_jit_load_memory_emitter(e_type); } else if (auto e_type = dynamic_cast(emitter)) { str_ = init_info_jit_load_broadcast_emitter(e_type); - } else if (auto e_type = dynamic_cast(emitter)) { - str_ = init_info_jit_load_convert_emitter(e_type); } else if (auto e_type = dynamic_cast(emitter)) { str_ = init_info_jit_store_memory_emitter(e_type); - } else if (auto e_type = dynamic_cast(emitter)) { - str_ = init_info_jit_store_convert_emitter(e_type); } else if (auto e_type = dynamic_cast(emitter)) { str_ = init_info_jit_brgemm_emitter(e_type); } else if (auto e_type = dynamic_cast(emitter)) { diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.cpp b/src/plugins/intel_cpu/src/nodes/subgraph.cpp index 01a7e2eedb967e..5949e2a755d782 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.cpp +++ b/src/plugins/intel_cpu/src/nodes/subgraph.cpp @@ -151,6 +151,7 @@ class SubgraphDynamicSpecializedExecutor : public Subgraph::SubgraphExecutor { inline void init_call_args(jit_snippets_call_args& call_args) { call_args.register_loops(loop_args); + std::copy(buffer_offsets.cbegin(), buffer_offsets.cend(), call_args.memory_access_offsets); if (m_buffer_scratchpad_size > 0) call_args.buffer_scratchpad_ptr = @@ -191,10 +192,12 @@ class SubgraphDynamicSpecializedExecutor : public Subgraph::SubgraphExecutor { void init_runtime_params(const std::shared_ptr& snippet_config) override { SubgraphExecutor::init_runtime_params(snippet_config); + buffer_offsets = snippet_config->buffer_cluster_offsets; data_offsets = snippet_config->io_data_offsets; loop_args = snippet_config->loop_args; } + std::vector buffer_offsets = {}; std::vector> data_offsets = {}; std::vector loop_args = {}; }; @@ -846,6 +849,7 @@ void Subgraph::SubgraphExecutor::init_runtime_params(const std::shared_ptrbuffer_scratchpad_size; + OPENVINO_ASSERT(!ov::snippets::utils::is_dynamic_value(m_buffer_scratchpad_size), "Undefined buffer scratchpad size!"); m_buffer_scratchpad.resize(m_buffer_scratchpad_size * 
parallel_get_max_threads(), 0); init_parallel_domain(snippet_config, m_parallel_exec_domain); diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_cpu.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_cpu.cpp index 9c3cf3dca21ab6..d0265b8606d286 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_cpu.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_cpu.cpp @@ -5,6 +5,7 @@ #include "brgemm_cpu.hpp" #include "snippets/itt.hpp" #include "snippets/utils.hpp" +#include "snippets/snippets_isa.hpp" #include "snippets/lowered/port_descriptor.hpp" #include "utils/general_utils.h" #include "snippets/utils.hpp" diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/brgemm_to_brgemm_cpu.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/brgemm_to_brgemm_cpu.cpp index bf8f635bd2fe1b..80fde9c733ba18 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/brgemm_to_brgemm_cpu.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/brgemm_to_brgemm_cpu.cpp @@ -8,6 +8,7 @@ #include "snippets/utils.hpp" #include "snippets/op/brgemm.hpp" +#include "snippets/op/buffer.hpp" #include "transformations/snippets/x64/op/brgemm_copy_b.hpp" #include "transformations/snippets/x64/op/brgemm_cpu.hpp" #include "transformations/tpp/x64/op/modifiers.hpp" diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/set_brgemm_copy_b_buffers_shape.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/set_brgemm_copy_b_buffers_shape.cpp index 31b135a77da3e9..7e4ec11d8bd532 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/set_brgemm_copy_b_buffers_shape.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/set_brgemm_copy_b_buffers_shape.cpp @@ -29,10 +29,10 @@ bool ov::intel_cpu::pass::SetBrgemmCopyBBuffersShape::run(snippets::lowered::Lin if (auto 
copy_b = ov::as_type_ptr(expr->get_node())) { const auto buffer = get_buffer_from_output(expr, 0); const auto buffer_shape = copy_b->get_repacking_buffer_shape(); - buffer->set_allocation_shape(buffer_shape); + buffer->set_allocation_size(ov::shape_size(buffer_shape)); if (copy_b->is_with_compensations()) { const auto compensations_buffer = get_buffer_from_output(expr, 1); - compensations_buffer->set_allocation_shape(copy_b->get_compensations_buffer_shape()); + compensations_buffer->set_allocation_size(ov::shape_size(copy_b->get_compensations_buffer_shape())); } modified = true; } diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp index 85e8c2e10615b7..6ccd9ec5c7c484 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp @@ -306,9 +306,8 @@ std::vector disabledTestPatterns() { R"(.*FQLayerDQBias.smoke_CompareWithRefs.*)", R"(.*smoke_matmulBrgemmInt8/MatmulBrgemmInt8Test.CompareWithRefs.*MatMul.*InputType=i8_OutputType=i8.*)", R"(.*smoke_Snippets_MHAWOTransposeOnInputs_4D/MHAWOTransposeOnInputs.CompareWithRefImpl.*)", - // Issue: 123274 (Dynamic Softmax aren't supported) - R"(smoke_Snippets_(Softmax|AddSoftmax|Reduce).*\[.*\?.*\].*)", - R"(smoke_Snippets_BroadcastSelect_Dynamic.*)" + // Issue: 142448 + R"(smoke_Snippets_BroadcastSelect_Dynamic.*)", // Issue: 141705 R"(.*smoke_LoopForCommon/LoopLayerCPUTest.CompareWithRefs/.*trip_count=5_exec_cond=1_netType=i8.*)", R"(.*smoke_LoopForCommon/LoopLayerCPUTest.CompareWithRefs/Input0_IS=\[\?.1.\?\]_TS=\(10.1.10\)_\(1.1.1\)_\(1.1.1\)_\(5.1.3\)_Input1_IS=\[\?.\?.\?\]_TS=.*_Input2_IS=\[\?.1.\?\]_.*_types=0_0_1_trip_count_type=.*_trip_count=(1|5)_exec_cond=1_netType=i8.*)", diff --git 
a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/softmax.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/softmax.cpp index 11a959b0a70f47..eba3e0db1f08ce 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/softmax.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/softmax.cpp @@ -39,7 +39,10 @@ const std::vector inputShape = { {{}, {{1, 3, 128, 20}}}, // DS {{-1, -1}, {{1, 16}, {1, 32}, {1, 1}, {1, 9}, {1, 17}, {1, 19}, {1, 49}, {1, 50}, {5, 16}, {1, 16}, {1, 9}}}, - {{-1, -1, -1, -1}, {{1, 3, 128, 128}, {1, 3, 128, 129}, {1, 3, 128, 130}, {1, 3, 128, 1}, {1, 3, 128, 16}, {1, 3, 128, 1}}} + {{-1, -1, -1, -1}, {{1, 3, 128, 128}, {1, 3, 128, 129}, {1, 3, 128, 130}, {1, 3, 128, 1}, {1, 3, 128, 16}, {1, 3, 128, 1}}}, + {{-1, -1, -1, 128}, {{1, 3, 128, 128}, {1, 3, 128, 128}, {1, 3, 64, 128}, {1, 3, 32, 128}, {1, 3, 64, 128}, {1, 3, 32, 128}}}, + {{-1, -1, -1, 130}, {{1, 3, 8, 130}, {1, 3, 18, 130}, {1, 3, 8, 130}, {1, 3, 32, 130}, {1, 3, 18, 130}, {1, 3, 32, 130}}}, + {{-1, -1, 128, -1}, {{1, 3, 128, 128}, {1, 3, 128, 129}, {1, 3, 128, 130}, {1, 3, 128, 1}, {1, 3, 128, 16}, {1, 3, 128, 1}}}, }; INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Softmax, Softmax, @@ -59,8 +62,18 @@ const std::vector> inputShapesPair = { {{{}, {{1, 5, 16, 35}}}, {{}, {{1, 5, 1, 35}}}}, {{{}, {{1, 5, 1, 35}}}, {{}, {{1, 5, 1, 35}}}}, // DS - {{{-1, -1, -1, -1}, {{1, 5, 16, 35}, {1, 5, 16, 1}, {1, 5, 16, 35}}}, {{-1, -1, -1, -1}, {{1, 5, 16, 35}, {1, 5, 16, 35}, {1, 5, 16, 35}}}}, - {{{-1, {1, 8}, {1, 16}, {1, 16}}, {{1, 3, 1, 8}, {1, 8, 16, 16}, {1, 3, 1, 8}}}, {{-1, {1, 8}, -1, {1, 8}}, {{1, 3, 2, 8}, {2, 1, 1, 1}, {1, 3, 2, 8}}}} + {{{-1, -1, -1, -1}, {{1, 5, 16, 35}, {1, 5, 16, 32}, {1, 5, 16, 35}}}, + {{-1, -1, -1, -1}, {{1, 5, 16, 35}, {1, 5, 16, 32}, {1, 5, 16, 35}}}}, + {{{-1, {1, 8}, {1, 16}, {1, 16}}, {{1, 3, 1, 8}, {1, 8, 16, 16}, {1, 3, 1, 8}}}, + {{-1, {1, 8}, -1, {1, 16}}, 
{{1, 3, 2, 8}, {2, 1, 1, 16}, {1, 3, 2, 8}}}}, + {{{-1, -1, -1, 128}, {{1, 5, 32, 128}, {1, 5, 16, 128}, {1, 5, 32, 128}}}, + {{-1, -1, -1, 128}, {{1, 5, 32, 128}, {1, 5, 16, 128}, {1, 5, 1, 128}}}}, + {{{-1, -1, -1, 130}, {{1, 5, 16, 130}, {1, 5, 32, 130}, {1, 5, 32, 130}}}, + {{-1, -1, -1, 130}, {{1, 1, 1, 130}, {1, 1, 1, 130}, {1, 5, 32, 130}}}}, + {{{-1, -1, 32, -1}, {{1, 5, 32, 35}, {1, 5, 32, 32}, {1, 5, 32, 35}, {1, 5, 32, 35}}}, + {{-1, -1, -1, -1}, {{1, 5, 32, 35}, {1, 5, 32, 32}, {1, 5, 32, 35}, {1, 5, 32, 35}}}}, + {{{-1, -1, 5, -1}, {{1, 1, 5, 35}, {1, 3, 5, 32}, {1, 5, 5, 18}, {1, 2, 5, 35}}}, + {{-1, -1, 5, -1}, {{1, 1, 5, 35}, {1, 1, 5, 32}, {1, 5, 5, 18}, {1, 2, 5, 35}}}}, }; INSTANTIATE_TEST_SUITE_P(smoke_Snippets_AddSoftmax, AddSoftmax, diff --git a/src/plugins/intel_cpu/tests/unit/snippets_transformations/x64/lowered/buffer_allocation.cpp b/src/plugins/intel_cpu/tests/unit/snippets_transformations/x64/lowered/buffer_allocation.cpp index c05d4fc712d05b..8e188b0dbf332c 100644 --- a/src/plugins/intel_cpu/tests/unit/snippets_transformations/x64/lowered/buffer_allocation.cpp +++ b/src/plugins/intel_cpu/tests/unit/snippets_transformations/x64/lowered/buffer_allocation.cpp @@ -38,7 +38,8 @@ typedef std::tuple< bool, // Optimized pipeline bool, // With SplitLoops opt size_t, // Expected Buffer size in bytes - size_t // Expected unique Buffer IDs count + size_t, // Expected unique Buffer reg group count + size_t // Expected unique Buffer cluster count > BufferAllocationCPUParams; class BufferAllocationCPUTest : public testing::TestWithParam { @@ -46,19 +47,20 @@ class BufferAllocationCPUTest : public testing::TestWithParam obj) { bool is_optimized, with_split_loops; - size_t expected_size, expected_count; - std::tie(is_optimized, with_split_loops, expected_size, expected_count) = obj.param; + size_t expected_size, expected_reg_group_count, expected_cluster_count; + std::tie(is_optimized, with_split_loops, expected_size, expected_reg_group_count, 
expected_cluster_count) = obj.param; std::ostringstream result; result << "Opt=" << ov::test::utils::bool2str(is_optimized) << "_"; result << "Split=" << ov::test::utils::bool2str(with_split_loops) << "_"; result << "ExpBufferSize=" << expected_size << "_"; - result << "ExpBufferNum=" << expected_count; + result << "ExpBufferRegGroupCount=" << expected_reg_group_count << "_"; + result << "ExpBufferClustersCount=" << expected_reg_group_count << "_"; return result.str(); } protected: void SetUp() override { - std::tie(m_is_buffer_optimized, m_with_split_loops, m_expected_size, m_expected_count) = this->GetParam(); + std::tie(m_is_buffer_optimized, m_with_split_loops, m_expected_size, m_expected_reg_group_count, m_expected_cluster_count) = this->GetParam(); const auto body = GetModel(); m_linear_ir = ov::snippets::lowered::LinearIR(body, std::make_shared()); @@ -82,7 +84,7 @@ class BufferAllocationCPUTest : public testing::TestWithParam(m_vector_size); pipeline.register_pass(); pipeline.register_pass(); - pipeline.register_pass(2); + pipeline.register_pass(); pipeline.register_pass(m_vector_size); pipeline.register_pass(); pipeline.register_pass(); @@ -92,14 +94,16 @@ class BufferAllocationCPUTest : public testing::TestWithParam gprs; + std::set reg_groups, clusters; for (const auto& expr : m_linear_ir) { if (const auto buffer = ov::as_type_ptr(expr->get_node())) { - gprs.insert(buffer->get_id()); + reg_groups.insert(buffer->get_reg_group()); + clusters.insert(buffer->get_cluster_id()); } } - EXPECT_EQ(gprs.size(), m_expected_count); - EXPECT_EQ(m_linear_ir.get_buffer_scratchpad_size(), m_expected_size); + EXPECT_EQ(reg_groups.size(), m_expected_reg_group_count); + EXPECT_EQ(clusters.size(), m_expected_cluster_count); + EXPECT_EQ(m_linear_ir.get_static_buffer_scratchpad_size(), m_expected_size); } virtual std::shared_ptr GetModel() const = 0; @@ -116,7 +120,8 @@ class BufferAllocationCPUTest : public testing::TestWithParam GetModel() const override { + const size_t 
m_blk = 32; + const size_t k_blk = 16; + const size_t n_blk = 64; + const auto subtensor_scalar = std::vector{1}; + const auto subtensor_power = std::vector{1, ov::snippets::lowered::PortDescriptor::ServiceDimensions::FULL_DIM}; + const auto subtensor_full = std::vector(2, ov::snippets::lowered::PortDescriptor::ServiceDimensions::FULL_DIM); + + const auto parameter0 = std::make_shared(ov::element::f32, ov::PartialShape({1, 12, 128, 64})); + const auto parameter1 = std::make_shared(ov::element::f32, ov::PartialShape({1, 128, 12, 64})); + const auto parameter2 = std::make_shared(ov::element::f32, ov::PartialShape({1, 12, 128, 64})); + + const auto order = std::vector{0, 2, 3, 1}; + const auto load_reshape = std::make_shared(parameter1, 1, 0, order); + const auto store = std::make_shared(load_reshape); + const auto relu0 = std::make_shared(store); + const auto brgemm_cpu0 = std::make_shared(parameter0, relu0, ov::intel_cpu::BrgemmCPU::Type::Floating); + brgemm_cpu0->set_m_block_size(m_blk); + brgemm_cpu0->set_k_block_size(k_blk); + brgemm_cpu0->set_n_block_size(n_blk); + + const auto relu1 = std::make_shared(brgemm_cpu0); + + // Decomposed Softmax + const auto reduce_max = std::make_shared(relu1, 3); + ov::snippets::op::ReduceBase::compute_and_set_reduce_subtensors(reduce_max); + const auto subtract = std::make_shared(relu1, reduce_max); + const auto exp = std::make_shared(subtract); + + const auto reduce_sum = std::make_shared(exp, 3); + ov::snippets::op::ReduceBase::compute_and_set_reduce_subtensors(reduce_sum); + const auto power = std::make_shared(reduce_sum, -1.f); + const auto multiply = std::make_shared(exp, power); + + const auto brgemm_cpu1 = std::make_shared(multiply, parameter2, ov::intel_cpu::BrgemmCPU::Type::Floating); + brgemm_cpu1->set_m_block_size(m_blk); + brgemm_cpu1->set_k_block_size(k_blk); + brgemm_cpu1->set_n_block_size(n_blk); + + const auto relu2 = std::make_shared(brgemm_cpu1); + + const auto body = std::make_shared(std::make_shared(relu2), 
ov::ParameterVector{parameter0, parameter1, parameter2}); + + MarkOp(load_reshape, subtensor_scalar); + MarkOp(store, subtensor_scalar); + MarkOp(power, subtensor_power); + + MarkOp(brgemm_cpu0, subtensor_full); + MarkOp(brgemm_cpu1, subtensor_full); + + ov::snippets::lowered::PortDescriptorUtils::get_port_descriptor_ptr(load_reshape->input(0))->set_layout(order); + + return body; + } +}; + class MHABF16AMXBufferAllocationTest : public BufferAllocationCPUTest { protected: std::shared_ptr GetModel() const override { @@ -139,7 +202,8 @@ class MHABF16AMXBufferAllocationTest : public BufferAllocationCPUTest { const auto parameter1 = std::make_shared(ov::element::bf16, ov::PartialShape({1, 128, 12, 64})); const auto parameter2 = std::make_shared(ov::element::bf16, ov::PartialShape({1, 12, 128, 64})); - const auto load_reshape = std::make_shared(parameter1, 1, 0, std::vector{0, 2, 3, 1}); + const auto order = std::vector{0, 2, 3, 1}; + const auto load_reshape = std::make_shared(parameter1, 1, 0, order); const auto store = std::make_shared(load_reshape); const auto convert0 = std::make_shared(store, ov::element::f32); const auto relu0 = std::make_shared(convert0); @@ -197,47 +261,92 @@ class MHABF16AMXBufferAllocationTest : public BufferAllocationCPUTest { MarkOp(scratch0, subtensor_full); MarkOp(scratch1, subtensor_full); + ov::snippets::lowered::PortDescriptorUtils::get_port_descriptor_ptr(load_reshape->input(0))->set_layout(order); + return body; } }; -TEST_P(MHABF16AMXBufferAllocationTest, BufferAllocationCPU) { +TEST_P(MHAFP32BufferAllocationTest, BufferAllocationCPU) { Validate(); } +TEST_P(MHABF16AMXBufferAllocationTest, BufferAllocationCPU) { + Validate(); +} namespace BufferAllocationCPUTest_Instances { +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_MHANotOptimizedWSplit, MHAFP32BufferAllocationTest, + ::testing::Combine( + ::testing::Values(false), + ::testing::Values(true), + ::testing::Values(75264), // Each Buffer has own allocated memory + 
::testing::Values(7), // Each Buffer has unique ID + ::testing::Values(7)), // Each Buffer has unique cluster ID + BufferAllocationCPUTest::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_MHAOptimizedWSplit, MHAFP32BufferAllocationTest, + ::testing::Combine( + ::testing::Values(true), + ::testing::Values(true), + ::testing::Values(57344), // (Buffer before brgemm) + (between brgemms) + (after brgemm) + ::testing::Values(2), // (Buffer before brgemm0 and after brgemm1) + (between brgemms) + ::testing::Values(3)), // (Buffer before brgemm0) + (between brgemms) + (after brgemm1) + BufferAllocationCPUTest::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_MHANotOptimizedWOSplit, MHAFP32BufferAllocationTest, + ::testing::Combine( + ::testing::Values(false), + ::testing::Values(false), + ::testing::Values(198144), // Each Buffer has own allocated memory + ::testing::Values(7), // Each Buffer has unique ID + ::testing::Values(7)), // Each Buffer has unique cluster ID + BufferAllocationCPUTest::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_MHAOptimizedWOSplit, MHAFP32BufferAllocationTest, + ::testing::Combine( + ::testing::Values(true), + ::testing::Values(false), + ::testing::Values(98304), // (between brgemms) + (Buffer before brgemm0 and after brgemm1) + ::testing::Values(2), // (Buffer before brgemm0 and after brgemm1) + (between brgemms) + ::testing::Values(3)), // (Buffer before brgemm0) + (between brgemms) + (after brgemm1) + BufferAllocationCPUTest::getTestCaseName); + INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_MHABF16AMXNotOptimizedWSplit, MHABF16AMXBufferAllocationTest, ::testing::Combine( ::testing::Values(false), ::testing::Values(true), - ::testing::Values(167936), + ::testing::Values(120064), + ::testing::Values(11), ::testing::Values(11)), BufferAllocationCPUTest::getTestCaseName); 
-INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_MHAOptimizedWSplit, MHABF16AMXBufferAllocationTest, +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_MHABF16AMXOptimizedWSplit, MHABF16AMXBufferAllocationTest, ::testing::Combine( ::testing::Values(true), ::testing::Values(true), ::testing::Values(73728), - ::testing::Values(3)), + ::testing::Values(3), + ::testing::Values(8)), BufferAllocationCPUTest::getTestCaseName); -INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_MHANotOptimizedWOSplit, MHABF16AMXBufferAllocationTest, +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_MHABF16AMXNotOptimizedWOSplit, MHABF16AMXBufferAllocationTest, ::testing::Combine( ::testing::Values(false), ::testing::Values(false), - ::testing::Values(364544), + ::testing::Values(218368), + ::testing::Values(11), ::testing::Values(11)), BufferAllocationCPUTest::getTestCaseName); -INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_MHAOptimizedWOSplit, MHABF16AMXBufferAllocationTest, +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_MHABF16AMXOptimizedWOSplit, MHABF16AMXBufferAllocationTest, ::testing::Combine( ::testing::Values(true), ::testing::Values(false), ::testing::Values(116736), - ::testing::Values(3)), + ::testing::Values(3), + ::testing::Values(8)), BufferAllocationCPUTest::getTestCaseName); } // namespace BufferAllocationCPUTest_Instances