From 5c5e3911b2d509647611bbf81b55697e0a645ada Mon Sep 17 00:00:00 2001 From: Alexandra Sidorova Date: Tue, 7 May 2024 11:18:52 +0400 Subject: [PATCH] [Snippets] Added Dynamism support to intermediate memory [Snippets] Renamed BufferID to BufferRegisterGroup [Snippets] Changed allocation shape on size [Snippets] Added Buffer cluster_ID [Snippets][Tests] Fixed build insert_load_store test [Snippets] Split SolveBufferMemory into static and dynamic logic [Snippets] Rewrote ComputeBufferAllocationSize::get_allocation_size [Snippets] Added dynamism support to InitBuffersDefault [Snippets][Tests] Added tests for clusters [Snippets] Added buffer_expressions to ComputeBufferAllocationSize [Snippets] Added to LoopInfo for split loops: [Snippets] Removed copy from UpdateLoopInfo [Snippets] Moved UpdateLoopInfo to RuntimeConfigurator [Snippets] Add dynamic buffers support to Configurator [Snippets] Fixed Reduce decomp: add shape infer for outputs [snippets] Fixed broadcast_merge_dim in shape inference [Snippets][CPU][Tests] Enabled dynamic Softmax tests [Snippets] Removed useless function calculate_size [Snippets][CPU][Tests] Enabled dynamic reduce test [Snippets] Small fixes in solve_buffer_memory for dynamic nodes [CPU][Snippets] Removed useless emitters LoadConvert and StoreConvert [Snippets] Added missed consumers cloning [Snippets][CPU] Added buffer offsets to call_args [Snippets][CPU] Added dynamic offsets support to load and store emitters [CPU][UnitTests] Fixed build [Snippets][AArch64] Fixed build [Snippets] Small fixes --- .../snippets/docs/snippets_design_guide.md | 14 +- .../include/snippets/lowered/linear_ir.hpp | 9 +- .../snippets/lowered/linear_ir_builder.hpp | 15 +- .../include/snippets/lowered/loop_info.hpp | 26 +- .../include/snippets/lowered/loop_manager.hpp | 10 +- .../lowered/pass/allocate_buffers.hpp | 2 - .../pass/compute_buffer_allocation_size.hpp | 38 +++ .../lowered/pass/define_buffer_clusters.hpp | 14 +- .../lowered/pass/init_buffers_default.hpp 
| 2 +- .../snippets/lowered/pass/insert_buffers.hpp | 4 +- ...ds.hpp => normalize_buffer_reg_groups.hpp} | 14 +- ...y_buffers.hpp => set_buffer_reg_group.hpp} | 19 +- .../lowered/pass/solve_buffer_memory.hpp | 33 ++- .../lowered/pass/update_loop_info.hpp | 46 ---- .../snippets/include/snippets/op/buffer.hpp | 52 ++-- .../include/snippets/runtime_configurator.hpp | 27 +- .../snippets/include/snippets/utils.hpp | 39 ++- src/common/snippets/src/lowered/linear_ir.cpp | 7 + .../src/lowered/linear_ir_builder.cpp | 20 ++ src/common/snippets/src/lowered/loop_info.cpp | 38 +-- .../snippets/src/lowered/loop_manager.cpp | 7 +- .../src/lowered/pass/allocate_buffers.cpp | 23 +- .../src/lowered/pass/assign_registers.cpp | 8 +- .../pass/clean_repeated_ptr_shifts.cpp | 10 +- .../pass/compute_buffer_allocation_size.cpp | 101 ++++++++ .../lowered/pass/define_buffer_clusters.cpp | 41 +-- .../snippets/src/lowered/pass/fuse_loops.cpp | 3 +- .../src/lowered/pass/init_buffers_default.cpp | 17 +- .../snippets/src/lowered/pass/init_loops.cpp | 9 +- .../src/lowered/pass/insert_buffers.cpp | 86 +------ .../pass/insert_specific_iterations.cpp | 19 +- .../src/lowered/pass/iter_handler.cpp | 1 + .../src/lowered/pass/normalize_buffer_ids.cpp | 39 --- .../pass/normalize_buffer_reg_groups.cpp | 39 +++ .../src/lowered/pass/reduce_decomposition.cpp | 7 + ...y_buffers.cpp => set_buffer_reg_group.cpp} | 30 +-- .../src/lowered/pass/solve_buffer_memory.cpp | 172 +++++++++---- .../snippets/src/lowered/pass/split_loops.cpp | 1 + .../src/lowered/pass/update_loop_info.cpp | 92 ------- .../snippets/src/lowered/pass/validate.cpp | 1 + .../lowered/pass/validate_expanded_loops.cpp | 14 +- src/common/snippets/src/op/buffer.cpp | 52 ++-- src/common/snippets/src/op/loop.cpp | 9 +- src/common/snippets/src/op/memory_access.cpp | 28 ++- src/common/snippets/src/op/subgraph.cpp | 2 +- .../snippets/src/runtime_configurator.cpp | 166 +++++++++++-- .../shape_inference/shape_infer_instances.cpp | 14 +- 
src/common/snippets/src/utils.cpp | 7 +- .../lowered/pass/buffer_allocation.hpp | 13 +- .../src/lowered/pass/buffer_allocation.cpp | 121 ++------- .../src/lowered/pass/insert_load_store.cpp | 1 + .../snippets/tests/src/lowered/pass/loop.cpp | 2 +- .../snippets/aarch64/jit_kernel_emitter.cpp | 2 +- .../snippets/jit_snippets_call_args.hpp | 1 + .../emitters/snippets/x64/cpu_generator.cpp | 14 +- .../snippets/x64/jit_kernel_emitter.cpp | 6 +- .../snippets/x64/jit_memory_emitters.cpp | 233 ++++++++---------- .../snippets/x64/jit_memory_emitters.hpp | 57 ++--- .../src/emitters/snippets/x64/verbose.cpp | 22 +- src/plugins/intel_cpu/src/nodes/subgraph.cpp | 4 + .../snippets/x64/op/brgemm_cpu.cpp | 1 + .../x64/pass/brgemm_to_brgemm_cpu.cpp | 1 + .../set_brgemm_copy_b_buffers_shape.cpp | 4 +- .../skip_tests_config.cpp | 5 +- .../snippets/softmax.cpp | 19 +- .../x64/lowered/buffer_allocation.cpp | 149 +++++++++-- 66 files changed, 1176 insertions(+), 906 deletions(-) create mode 100644 src/common/snippets/include/snippets/lowered/pass/compute_buffer_allocation_size.hpp rename src/common/snippets/include/snippets/lowered/pass/{normalize_buffer_ids.hpp => normalize_buffer_reg_groups.hpp} (65%) rename src/common/snippets/include/snippets/lowered/pass/{identify_buffers.hpp => set_buffer_reg_group.hpp} (89%) delete mode 100644 src/common/snippets/include/snippets/lowered/pass/update_loop_info.hpp create mode 100644 src/common/snippets/src/lowered/pass/compute_buffer_allocation_size.cpp delete mode 100644 src/common/snippets/src/lowered/pass/normalize_buffer_ids.cpp create mode 100644 src/common/snippets/src/lowered/pass/normalize_buffer_reg_groups.cpp rename src/common/snippets/src/lowered/pass/{identify_buffers.cpp => set_buffer_reg_group.cpp} (88%) delete mode 100644 src/common/snippets/src/lowered/pass/update_loop_info.cpp diff --git a/src/common/snippets/docs/snippets_design_guide.md b/src/common/snippets/docs/snippets_design_guide.md index 3f44bde1cace3f..ce331c7c9fdc2b 
100644 --- a/src/common/snippets/docs/snippets_design_guide.md +++ b/src/common/snippets/docs/snippets_design_guide.md @@ -605,17 +605,17 @@ Again, the explicit operations are needed to emit appropriate instructions later As mentioned above the `op::Buffer` operations are managed by the pass `AllocateBuffers`. Before describing the algorithm, it is necessary to briefly consider the structure of `Buffer`: * All `Buffers` represent `Buffer scratchpad` together (a common memory that is needed for intermediate results storing). -* Each `Buffer` has an `offset` relative to the common data pointer (pointer of `Buffer scratchpad`) and `ID` (the `Buffers` with the same `ID` have the same assigned register). +* Each `Buffer` has an `offset` relative to the common data pointer (pointer of `Buffer scratchpad`), `RegGroup` (the `Buffers` with the same `RegGroup` have the same assigned register) and `ClusterID` (the buffers from the same cluster refer to the same memory area - they have the same `offset` relative to the `Buffer scratchpad` data pointer). The algorithm supports two modes: optimized and non-optimized. -The optimized one calculates minimal memory size and minimal unique `ID` count required to handle all the buffers. -The non-optimized version assigns each buffer an unique `ID` and `offset`. +The optimized one calculates minimal memory size and minimal unique `RegGroup` count required to handle all the buffers. +The non-optimized version assigns each buffer an unique `RegGroup`, `ClusterID` and `offset`. The first mode is the default one, while the second one might be used for debugging the optimized version. The optimized algorithm `AllocateBuffers` has the main following steps: -1. `IdentifyBuffers` - analyzes `Buffers` access patterns to avoid redundant pointer increments. A graph coloring algorithm is utilized for this purpose. -2. `DefineBufferClusters` - creates sets of `Buffer` ops - `BufferClusters`. 
-`Buffers` from one `BufferCluster` refer to the same memory area (they have the same `offset` relative to the `Buffer scratchpad` data pointer). -For example, there is a loop with `Buffer` ops on input and output. If the body of this loop can write data to the memory from which it was read, these `Buffers` are in one `BufferCluster`. +1. `SetBufferRegGroup` - analyzes `Buffers` access patterns to avoid redundant pointer increments. A graph coloring algorithm is utilized for this purpose. +2. `DefineBufferClusters` - creates sets of `Buffer` ops (buffer clusters) and set `ClusterID` value to `Buffer` ops. +As noticed above, `Buffers` from one cluster refer to the same memory area. +For example, there is a loop with `Buffer` ops on input and output. If the body of this loop can write data to the memory from which it was read, these `Buffers` are in one cluster. 3. `SolveBufferMemory` - calculate the most optimal memory size of `Buffer scratchpad` based on `BufferClusters` and life time of `Buffers`. More details on control flow optimization passes could be found in the `control_flow_transformations(...)` method inside [subgraph.cpp](../src/op/subgraph.cpp). 
diff --git a/src/common/snippets/include/snippets/lowered/linear_ir.hpp b/src/common/snippets/include/snippets/lowered/linear_ir.hpp index 5fd3984c430fe8..b8b11082c99eef 100644 --- a/src/common/snippets/include/snippets/lowered/linear_ir.hpp +++ b/src/common/snippets/include/snippets/lowered/linear_ir.hpp @@ -76,13 +76,14 @@ class LinearIR { ExpressionPtr create_expression(const std::shared_ptr& n, const std::vector& inputs) const; const container& get_ops() const { return m_expressions; } + const container& get_buffer_ops() const { return m_buffer_expressions; } const container& get_parameters() const { return m_parameter_expressions; } const container& get_results() const { return m_result_expressions; } const Config& get_config() const { return m_config; } - size_t get_buffer_scratchpad_size() const { return m_buffer_scratchpad_size; } + size_t get_static_buffer_scratchpad_size() const { return m_static_buffer_scratchpad_size; } void set_loop_depth(size_t loop_depth) { m_config.m_loop_depth = loop_depth; } - void set_buffer_scratchpad_size(size_t size) { m_buffer_scratchpad_size = size; } + void set_static_buffer_scratchpad_size(size_t size) { m_static_buffer_scratchpad_size = size; } const ExpressionPtr& get_expr_by_node(const std::shared_ptr& n) const; @@ -278,13 +279,15 @@ class LinearIR { std::unordered_map, std::shared_ptr> m_node2expression_map; container m_parameter_expressions{}; container m_result_expressions{}; + container m_buffer_expressions{}; Config m_config{}; LoopManagerPtr m_loop_manager; std::shared_ptr m_shape_infer_factory; std::shared_ptr m_shape_infer = nullptr; bool m_is_dynamic = false; - size_t m_buffer_scratchpad_size = 0; + // Size of static Buffer Scratchpad (Buffers with defined allocation size) + size_t m_static_buffer_scratchpad_size = 0; }; using LinearIRPtr = std::shared_ptr; diff --git a/src/common/snippets/include/snippets/lowered/linear_ir_builder.hpp b/src/common/snippets/include/snippets/lowered/linear_ir_builder.hpp index 
969bf21cd27480..b9cfb87af617d6 100644 --- a/src/common/snippets/include/snippets/lowered/linear_ir_builder.hpp +++ b/src/common/snippets/include/snippets/lowered/linear_ir_builder.hpp @@ -17,11 +17,24 @@ namespace lowered { class LinearIRBuilder { public: struct Config { - Config(bool deep_copy_of_shapes_ = true) : deep_copy_of_shapes(deep_copy_of_shapes_) {} + Config(bool deep_copy_of_shapes_ = true, bool copy_missed_consumers_ = true) + : deep_copy_of_shapes(deep_copy_of_shapes_), copy_missed_consumers(copy_missed_consumers_) {} // If True, copy of stored pointer in `PortDescriptor::m_tensor_shape`. // If False, copy shapes as shared pointers. const bool deep_copy_of_shapes = true; + // At the moment, input port of expression must have only one source. + // However, for example, after LinearIR range insertion to the LinearIR (InsertSpecificIteration pass) + // several operations can write to the same consumer: several `Store` ops from different loop bodies store to the same Buffer/Result. + // Since `clone` algorithm is linear and during expression cloning creates only input port connectors from sources, + // algorithm can miss some consumers. For example: + // The consumers of Store0 : Buffer0 + // The consumers of Store1 : Buffer0 + // The result: Buffer0 has only one source in input connector - Store1 + // Algorithm automatically doesn't add Buffer to consumers of Store0. Thus, + // If True, `clone` algorithm add missed consumers. + // If False, cloned LinearIR will be built by default (without extra consumers). 
+ const bool copy_missed_consumers = true; }; LinearIRBuilder(Config config = {}) : m_config(std::move(config)) {} diff --git a/src/common/snippets/include/snippets/lowered/loop_info.hpp b/src/common/snippets/include/snippets/lowered/loop_info.hpp index ca28b27a760ac7..595d93834bd97e 100644 --- a/src/common/snippets/include/snippets/lowered/loop_info.hpp +++ b/src/common/snippets/include/snippets/lowered/loop_info.hpp @@ -23,8 +23,9 @@ class LoopInfo { enum {UNDEFINED_DIM_IDX = std::numeric_limits::max()}; LoopInfo() = default; - LoopInfo(size_t work_amount, size_t increment, const std::vector& entries, const std::vector& exits); - LoopInfo(size_t work_amount, size_t increment, const std::vector& entries, const std::vector& exits); + LoopInfo(size_t work_amount, size_t increment, const std::vector& entries, const std::vector& exits, bool is_wa_const = false); + LoopInfo(size_t work_amount, size_t increment, const std::vector& entries, const std::vector& exits, + bool is_wa_const = false); virtual ~LoopInfo() = default; /** @@ -76,6 +77,11 @@ class LoopInfo { * @return m_output_ports */ const std::vector& get_output_ports() const; + /** + * @brief Returns True if `work_amount` cannot be rewritten/updated by passes. 
+ * @return m_is_work_amount_const + */ + bool is_work_amount_const() const; /** * @brief Set m_work_amount value @@ -92,6 +98,11 @@ class LoopInfo { * @param dim_idx - index */ void set_dim_idx(size_t dim_idx); + /** + * @brief Sets `value` to `m_is_work_amount_const` + * @param value - value of the attribute + */ + void set_work_amount_const(bool value); /** * @brief Replace the current LoopPort `actual_port` with new `target_ports` @@ -164,6 +175,9 @@ class LoopInfo { // Note: Scalars aren't input expressions but can be before first input expr in Linear IR std::vector m_input_ports = {}; std::vector m_output_ports = {}; + + // If True, no one pass can rewrite the value of `m_work_amount` + bool m_is_work_amount_const = false; }; using LoopInfoPtr = std::shared_ptr; @@ -197,13 +211,13 @@ class UnifiedLoopInfo : public LoopInfo { UnifiedLoopInfo(size_t work_amount, size_t increment, const std::vector& entries, const std::vector& exits, const std::vector& in_descs, const std::vector& out_descs, - const SpecificIterationHandlers& handlers = SpecificIterationHandlers()); + const SpecificIterationHandlers& handlers = SpecificIterationHandlers(), bool is_wa_const = false); UnifiedLoopInfo(size_t work_amount, size_t increment, const std::vector& entries, const std::vector& exits, - const SpecificIterationHandlers& handlers = SpecificIterationHandlers()); + const SpecificIterationHandlers& handlers = SpecificIterationHandlers(), bool is_wa_const = false); UnifiedLoopInfo(size_t work_amount, size_t increment, const std::vector& entries, const std::vector& exits, - const SpecificIterationHandlers& handlers = SpecificIterationHandlers()); + const SpecificIterationHandlers& handlers = SpecificIterationHandlers(), bool is_wa_const = false); /** * @brief Clone LoopInfo with new expressions @@ -365,7 +379,7 @@ class ExpandedLoopInfo : public LoopInfo { ExpandedLoopInfo(size_t work_amount, size_t increment, const std::vector& entries, const std::vector& exits, std::vector 
ptr_increments, std::vector final_offsets, std::vector data_sizes, - SpecificLoopIterType type, std::shared_ptr unified_loop_info); + SpecificLoopIterType type, std::shared_ptr unified_loop_info, bool is_wa_const = false); /** * @brief Clone LoopInfo with new expressions * @param expr_map map of new and old expressions diff --git a/src/common/snippets/include/snippets/lowered/loop_manager.hpp b/src/common/snippets/include/snippets/lowered/loop_manager.hpp index 570120408c37fb..f0718107ca30a2 100644 --- a/src/common/snippets/include/snippets/lowered/loop_manager.hpp +++ b/src/common/snippets/include/snippets/lowered/loop_manager.hpp @@ -99,12 +99,13 @@ class LoopManager { size_t increment, const std::vector& entries, const std::vector& exits, - bool set_default_handlers = true) { + bool set_default_handlers = true, + bool is_work_amount_const = false) { const auto normalized_increment = utils::is_dynamic_value(work_amount) || work_amount == 0 ? increment : std::min(increment, work_amount); const auto handlers = set_default_handlers ? 
SpecificIterationHandlers(work_amount, normalized_increment) : SpecificIterationHandlers(); - const auto loop_info = std::make_shared(work_amount, normalized_increment, entries, exits, handlers); + const auto loop_info = std::make_shared(work_amount, normalized_increment, entries, exits, handlers, is_work_amount_const); const auto loop_id = this->add_loop_info(loop_info); for (auto expr_it = loop_begin_pos; expr_it != loop_end_pos; ++expr_it) { insert_loop_id(*expr_it, loop_id); @@ -131,8 +132,9 @@ class LoopManager { size_t dim_idx, const std::vector& entries, const std::vector& exits, - bool set_default_handlers = true) { - const auto loop_id = mark_loop(loop_begin_pos, loop_end_pos, work_amount, increment, entries, exits, set_default_handlers); + bool set_default_handlers = true, + bool is_work_amount_const = false) { + const auto loop_id = mark_loop(loop_begin_pos, loop_end_pos, work_amount, increment, entries, exits, set_default_handlers, is_work_amount_const); const auto loop_info = get_loop_info(loop_id); loop_info->set_dim_idx(dim_idx); return loop_id; diff --git a/src/common/snippets/include/snippets/lowered/pass/allocate_buffers.hpp b/src/common/snippets/include/snippets/lowered/pass/allocate_buffers.hpp index 623c32c7ba1d39..f9a8331c65f3da 100644 --- a/src/common/snippets/include/snippets/lowered/pass/allocate_buffers.hpp +++ b/src/common/snippets/include/snippets/lowered/pass/allocate_buffers.hpp @@ -42,8 +42,6 @@ class AllocateBuffers: public RangedPass { */ static void set_buffer_offset(const ExpressionPtr& buffer_expr, const size_t offset); - using BufferCluster = std::set; - using BufferClusters = std::vector; private: bool m_is_optimized_mode = true; }; diff --git a/src/common/snippets/include/snippets/lowered/pass/compute_buffer_allocation_size.hpp b/src/common/snippets/include/snippets/lowered/pass/compute_buffer_allocation_size.hpp new file mode 100644 index 00000000000000..89769f150d1c8d --- /dev/null +++ 
b/src/common/snippets/include/snippets/lowered/pass/compute_buffer_allocation_size.hpp @@ -0,0 +1,38 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "pass.hpp" + +#include "snippets/lowered/loop_manager.hpp" + +namespace ov { +namespace snippets { +namespace lowered { +namespace pass { + +/** + * @interface ComputeBufferAllocationSize + * @brief The pass calculate allocation sizes of Buffers. + * @param m_buffer_allocation_rank - rank of shape for memory allocation: shape[m_allocation_rank : -1] + * @ingroup snippets + */ +class ComputeBufferAllocationSize : public RangedPass { +public: + OPENVINO_RTTI("ComputeBufferAllocationSize", "RangedPass") + ComputeBufferAllocationSize(size_t buffer_allocation_rank) : m_buffer_allocation_rank(buffer_allocation_rank) {} + + bool run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) override; + + static size_t get_allocation_size(const LoopManagerPtr& loop_manager, const ExpressionPtr& buffer_expr, size_t allocation_rank); + +private: + size_t m_buffer_allocation_rank = 0; +}; + +} // namespace pass +} // namespace lowered +} // namespace snippets +} // namespace ov diff --git a/src/common/snippets/include/snippets/lowered/pass/define_buffer_clusters.hpp b/src/common/snippets/include/snippets/lowered/pass/define_buffer_clusters.hpp index 67254d879f3351..824b0d4daea75d 100644 --- a/src/common/snippets/include/snippets/lowered/pass/define_buffer_clusters.hpp +++ b/src/common/snippets/include/snippets/lowered/pass/define_buffer_clusters.hpp @@ -6,8 +6,6 @@ #include "pass.hpp" -#include "allocate_buffers.hpp" - namespace ov { namespace snippets { namespace lowered { @@ -35,7 +33,7 @@ class DefineBufferClusters : public RangedPass { public: OPENVINO_RTTI("DefineBufferClusters", "RangedPass") - DefineBufferClusters(AllocateBuffers::BufferClusters& clusters) : m_clusters(clusters) {} + DefineBufferClusters() = 
default; /** * @brief Apply the pass to the Linear IR @@ -45,13 +43,15 @@ class DefineBufferClusters : public RangedPass { bool run(lowered::LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) override; private: + using BufferCluster = std::set; + using BufferClusters = std::vector; using BufferPorts = std::unordered_map>; /** * @brief Finds Buffer cluster in set of clusters which contains the target expression with Buffer * @param target target expression with Buffer op * @return vector iterator which refers to the found cluster */ - AllocateBuffers::BufferClusters::iterator find_cluster_by_expr(const ExpressionPtr& target); + BufferClusters::iterator find_cluster_by_expr(const ExpressionPtr& target); /** * @brief Returns True if Buffer is direct source for the target expr (there aren't other loop between the Buffer and target expr) * @param buffer_expr expression with assumed Buffer op @@ -70,7 +70,7 @@ class DefineBufferClusters : public RangedPass { * @param cluster set of Buffer expressions - cluster * @return common buffer ID or SIZE_MAX - size value */ - size_t get_cluster_buffer_id(const AllocateBuffers::BufferCluster& cluster) const; + size_t get_cluster_buffer_id(const BufferCluster& cluster) const; /** * @brief Analyzes Loop: if Loop has Buffer ops on inputs and outputs, Loop can read and write from/to the same memory. 
@@ -126,10 +126,10 @@ class DefineBufferClusters : public RangedPass { * @param is_outer_up true if outer buffer is upper in Linear IR than inner Buffers * @return Return True if clusters have been united */ - bool unite_nested_clusters(const AllocateBuffers::BufferClusters::iterator& inner_cluster_it, AllocateBuffers::BufferCluster& outer_cluster, + bool unite_nested_clusters(const BufferClusters::iterator& inner_cluster_it, BufferCluster& outer_cluster, const ExpressionPtr& outer_buffer, bool is_outer_up); - AllocateBuffers::BufferClusters& m_clusters; + BufferClusters m_clusters; }; } // namespace pass diff --git a/src/common/snippets/include/snippets/lowered/pass/init_buffers_default.hpp b/src/common/snippets/include/snippets/lowered/pass/init_buffers_default.hpp index 3b085ca2b32f80..5ddb2749d63998 100644 --- a/src/common/snippets/include/snippets/lowered/pass/init_buffers_default.hpp +++ b/src/common/snippets/include/snippets/lowered/pass/init_buffers_default.hpp @@ -13,7 +13,7 @@ namespace pass { /** * @interface InitBuffersDefault - * @brief The pass inits Buffer expressions in LinearIR default (non-optimized): sets unique offsets and ID to Buffers. + * @brief The pass inits Buffer expressions in LinearIR default (non-optimized): sets unique offsets and reg groups to Buffers. 
* @ingroup snippets */ diff --git a/src/common/snippets/include/snippets/lowered/pass/insert_buffers.hpp b/src/common/snippets/include/snippets/lowered/pass/insert_buffers.hpp index f38666cd4de1ba..40a2611b80ef48 100644 --- a/src/common/snippets/include/snippets/lowered/pass/insert_buffers.hpp +++ b/src/common/snippets/include/snippets/lowered/pass/insert_buffers.hpp @@ -24,7 +24,7 @@ namespace pass { class InsertBuffers : public RangedPass { public: OPENVINO_RTTI("InsertBuffers", "RangedPass") - InsertBuffers(int32_t buffer_allocation_rank); + InsertBuffers() = default; bool run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) override; private: @@ -39,8 +39,6 @@ class InsertBuffers : public RangedPass { const LoopManagerPtr& loop_manager, const ExpressionPtr& expr, const ExpressionPtr& down_expr); - - int32_t m_buffer_allocation_rank; }; } // namespace pass diff --git a/src/common/snippets/include/snippets/lowered/pass/normalize_buffer_ids.hpp b/src/common/snippets/include/snippets/lowered/pass/normalize_buffer_reg_groups.hpp similarity index 65% rename from src/common/snippets/include/snippets/lowered/pass/normalize_buffer_ids.hpp rename to src/common/snippets/include/snippets/lowered/pass/normalize_buffer_reg_groups.hpp index 81b7536b63edaa..e07d11da70d904 100644 --- a/src/common/snippets/include/snippets/lowered/pass/normalize_buffer_ids.hpp +++ b/src/common/snippets/include/snippets/lowered/pass/normalize_buffer_reg_groups.hpp @@ -12,20 +12,20 @@ namespace lowered { namespace pass { /** - * @interface NormalizeBufferIDs - * @brief After optimizations some Buffer IDs might be set unevenly: some numbers are missed. + * @interface NormalizeBufferRegisterGroups + * @brief After optimizations some Buffer RegGroups might be set unevenly: some numbers are missed. * For example, - * [Buffer -> ID] - * Buffer0 -> 0 Two Buffers have ID = 0, one has ID = 2. 
- * Buffer1 -> 2 Obviosly, we can normalize this IDs to set ID = 1 to Buffer1. + * [Buffer -> RegGroup] + * Buffer0 -> 0 Two Buffers have RegGroup = 0, one has RegGroup = 2. + * Buffer1 -> 2 Obviosly, we can normalize this IDs to set RegGroup = 1 to Buffer1. * Buffer2 -> 0 It helps to assign GPR registers in `AssignRegister` more effective. * Thus, the pass normalize IDs of Buffers in Linear IR. * @ingroup snippets */ -class NormalizeBufferIDs : public RangedPass { +class NormalizeBufferRegisterGroups : public RangedPass { public: - OPENVINO_RTTI("NormalizeBufferIDs", "RangedPass") + OPENVINO_RTTI("NormalizeBufferRegisterGroups", "RangedPass") /** * @brief Apply the pass to the Linear IR * @param linear_ir the target Linear IR diff --git a/src/common/snippets/include/snippets/lowered/pass/identify_buffers.hpp b/src/common/snippets/include/snippets/lowered/pass/set_buffer_reg_group.hpp similarity index 89% rename from src/common/snippets/include/snippets/lowered/pass/identify_buffers.hpp rename to src/common/snippets/include/snippets/lowered/pass/set_buffer_reg_group.hpp index 2289ef0246e8ff..f1f57afc6e2fd4 100644 --- a/src/common/snippets/include/snippets/lowered/pass/identify_buffers.hpp +++ b/src/common/snippets/include/snippets/lowered/pass/set_buffer_reg_group.hpp @@ -14,9 +14,9 @@ namespace lowered { namespace pass { /** - * @interface IdentifyBuffers - * @brief The pass set identifiers for Buffers in common Buffer system. - * The buffers with the same identifier will be assigned the same data register. + * @interface SetBufferRegGroup + * @brief The pass groups Buffers by Register groups. + * The buffers with the same RegGroup will be assigned the same data register. * The pass uses greedy graph coloring algorithm using adjacency matrix: * - Buffers - are vertices of graph; * - Loops, Brgemm (the same other ops) - are "edges" between Buffers (hub of edges). 
@@ -26,13 +26,12 @@ namespace pass { * or one of the Buffers is in some a Loop but another Buffer is not; * - Firstly, create adjacency matrix using the definition above; * - Secondly, assign the same color to non-adjacent vertices of graph (buffers), and use different colors otherwise. - * Note: should be called before ResetBuffer() pass to have correct offsets * @ingroup snippets */ -class IdentifyBuffers: public RangedPass { +class SetBufferRegGroup: public RangedPass { public: - OPENVINO_RTTI("IdentifyBuffers", "RangedPass") - IdentifyBuffers() = default; + OPENVINO_RTTI("SetBufferRegGroup", "RangedPass") + SetBufferRegGroup() = default; /** * @brief Apply the pass to the Linear IR @@ -57,12 +56,12 @@ class IdentifyBuffers: public RangedPass { }; /** - * @brief Check if two Buffers can reuse ID by ShiftPtrParams < data_size, ptr_increment, finalization_offset > + * @brief Check if two Buffers can be in one register group by ShiftPtrParams < data_size, ptr_increment, finalization_offset > * @param lhs Data pointer shift params for first Buffer * @param rhs Data pointer shift params for second Buffer - * @return Returns True if params are valid for reusing. Otherwise returns False + * @return Returns True if params are valid to reuse one register. 
Otherwise returns False */ - static bool can_reuse_id(const ShiftPtrParams& lhs, const ShiftPtrParams& rhs); + static bool can_be_in_one_group(const ShiftPtrParams& lhs, const ShiftPtrParams& rhs); private: using BufferPool = std::vector; diff --git a/src/common/snippets/include/snippets/lowered/pass/solve_buffer_memory.hpp b/src/common/snippets/include/snippets/lowered/pass/solve_buffer_memory.hpp index dfa5c3fc54d120..45eda9d4dc145f 100644 --- a/src/common/snippets/include/snippets/lowered/pass/solve_buffer_memory.hpp +++ b/src/common/snippets/include/snippets/lowered/pass/solve_buffer_memory.hpp @@ -6,7 +6,6 @@ #include "pass.hpp" -#include "allocate_buffers.hpp" #include "openvino/runtime/memory_solver.hpp" namespace ov { @@ -17,7 +16,9 @@ namespace pass { /** * @interface SolveBufferMemory * @brief The pass optimally calculates the common buffer scratchpad size and - * set the offsets relative to the common data pointer to all Buffers. The pass uses MemorySolver API. + * set the offsets relative to the common data pointer to all defined Buffers. + * The pass uses MemorySolver API to calculate common allocation size for static Buffers. + * If some Buffers have unknown allocation size, the pass set `dynamic` offset. * Note: The pass requires expression enumeration. It should be executed separately before this pass! 
* Note: this transformation works only with m_clusters, no lir or iterators are really needed * @ingroup snippets @@ -26,8 +27,7 @@ class SolveBufferMemory : public Pass { public: OPENVINO_RTTI("SolveBufferMemory", "Pass") - SolveBufferMemory(size_t& buffer_scratchpad_size, AllocateBuffers::BufferClusters& clusters) - : m_buffer_scratchpad_size(buffer_scratchpad_size), m_clusters(clusters) {} + SolveBufferMemory(size_t& static_buffer_scratchpad_size) : m_static_buffer_scratchpad_size(static_buffer_scratchpad_size) {} /** * @brief Apply the pass to the Linear IR * @param linear_ir the target Linear IR @@ -36,15 +36,32 @@ class SolveBufferMemory : public Pass { bool run(lowered::LinearIR& linear_ir) override; private: + /** + * @brief Split buffer expressions of Linear IR into + * static (with defined allocation size) and dynamic (with unknown size) buffers + * @param buffer_expressions buffer expressions + * @return the pair of static and dynamic buffer expressions + */ + std::pair extract_static_and_dynamic_buffers(const LinearIR::container& buffer_expressions); /** * @brief Initializes boxes for MemorySolver - * @param buffer_clusters buffer clusters. 
These clusters could be obtained using DefineBufferClusters pass + * @param buffer_expressions buffer expressions * @return vector of boxes for MemorySolver */ - std::vector init_boxes(const AllocateBuffers::BufferClusters& buffer_clusters); + std::vector init_boxes(const LinearIR::container& buffer_expressions); + /** + * @brief Calculate memory size and propagate offsets to MA ops for buffer with defined allocation size + * @param static_buffer_expressions static buffer expressions + */ + void solve_static_buffer_memory(const LinearIR::container& static_buffer_expressions); + /** + * @brief Propagate dynamic offset to MA ops for buffer with undefined allocation size + * Note: should be called after `solve_static_buffer_memory` + * @param dynamic_buffer_expressions dynamic buffer expressions + */ + void set_dynamic_buffer_offset(const LinearIR::container& dynamic_buffer_expressions); - size_t& m_buffer_scratchpad_size; - AllocateBuffers::BufferClusters& m_clusters; + size_t& m_static_buffer_scratchpad_size; constexpr static size_t m_alignment = 32; // 32 bytes for data alignment in allocated memory }; diff --git a/src/common/snippets/include/snippets/lowered/pass/update_loop_info.hpp b/src/common/snippets/include/snippets/lowered/pass/update_loop_info.hpp deleted file mode 100644 index 2b391251bbe8a4..00000000000000 --- a/src/common/snippets/include/snippets/lowered/pass/update_loop_info.hpp +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright (C) 2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma once - -#include "pass.hpp" - -#include "snippets/lowered/loop_info.hpp" - -namespace ov { -namespace snippets { -namespace lowered { -namespace pass { - -/** - * @interface UpdateLoopInfo - * @brief The pass update the existing UnifiedLoopInfo and the corresponding ExpandedLoopInfos. - * Notes: - * - LinearIR must have LoopManager only with ExpandedLoopInfo (LinearIR contains decomposed loops). - * Each of them has the pointer to UnifiedLoopInfo. 
- * - ExpandedLoopInfos` in LoopManager are sorted by execution order (NormalizeLoopIDs pas has been already passed). - * @ingroup snippets - */ - -class UpdateLoopInfo : public Pass { -public: - OPENVINO_RTTI("UpdateLoopInfo", "Pass") - UpdateLoopInfo() = default; - bool run(LinearIR& linear_ir) override; - -private: - /** - * @brief Initializes common ptr_increments and finalization offsets for ExpandedLoopInfo from ports of UnifiedLoopInfo - * @param unified_loop_info UnifiedLoopInfo - * @param ptr_increments ref of vector with ptr increments - * @param finalization_offsets ref of vector with finalization offsets - */ - static void init_data_ptr_shifts(const UnifiedLoopInfoPtr& unified_loop_info, std::vector& ptr_increments, - std::vector& finalization_offsets); -}; - -} // namespace pass -} // namespace lowered -} // namespace snippets -} // namespace ov \ No newline at end of file diff --git a/src/common/snippets/include/snippets/op/buffer.hpp b/src/common/snippets/include/snippets/op/buffer.hpp index 454fb5301adca4..199ebb99e8532b 100644 --- a/src/common/snippets/include/snippets/op/buffer.hpp +++ b/src/common/snippets/include/snippets/op/buffer.hpp @@ -6,6 +6,7 @@ #include "openvino/op/op.hpp" #include "snippets/shape_inference/shape_inference.hpp" +#include "snippets/utils.hpp" namespace ov { namespace snippets { @@ -15,35 +16,42 @@ namespace op { * @interface Buffer * @brief This is a base class for memory storage. * Notes: - * - All buffers with the same ID in a graph have the same memory pointer. So if we have a few buffers, + * - All buffers with the same reg_group in a graph have the same memory pointer. 
So if we have a few buffers, * each the corresponding MemoryAccess op for Buffer should have offset for common memory pointer of this Buffer * - Buffer should be a single consumer for operation output port - * @param m_shape - output allocation shape for Buffer with type NewMemory + * @param m_allocation_size - memory size for allocation in u8 data type. Dynamic value means undefined size. * @param m_offset - offset in common Buffer scratchpad - * @param m_id - Buffer ID in common Buffer system + * @param m_reg_group - number of register group. The Buffers from the same register group will have the same GPR + * @param m_cluster_id - number of cluster. The Buffers from the same cluster share memory between them and will have the same offset. * @ingroup snippets */ class Buffer : public ov::op::Op { public: OPENVINO_OP("Buffer", "SnippetsOpset"); Buffer() = default; - Buffer(const OutputVector& arguments, const ov::Shape& shape, size_t id, ov::element::Type element_type = ov::element::u8); + Buffer(const OutputVector& arguments, size_t allocation_size = utils::get_dynamic_value(), size_t reg_group = 0, size_t cluster_id = 0); bool visit_attributes(AttributeVisitor& visitor) override; - size_t get_id() const { return m_id; } - int64_t get_offset() const { return m_offset; } - void set_id(size_t id) { m_id = id; } - const ov::Shape& get_allocation_shape() const { return m_shape; } - void set_allocation_shape(const ov::Shape& allocation_shape) { m_shape = allocation_shape; } - void set_offset(int64_t offset) { m_offset = offset; } + size_t get_reg_group() const { return m_reg_group; } + size_t get_cluster_id() const { return m_cluster_id; } + size_t get_offset() const { return m_offset; } + size_t get_allocation_size() const { return m_allocation_size; } size_t get_byte_size() const; + void set_reg_group(size_t reg_group) { m_reg_group = reg_group; } + void set_cluster_id(size_t cluster) { m_cluster_id = cluster; } + void set_allocation_size(size_t allocation_size) {
m_allocation_size = allocation_size; } + void set_offset(size_t offset) { m_offset = offset; } + + // Returns True, if allocation size is known. Otherwise returns False - allocation size is undefined + bool is_defined() const; + protected: - ov::Shape m_shape = {}; - size_t m_id = 0; // Default ID - 0. All Buffers are from the same set - ov::element::Type m_element_type = ov::element::u8; // u8 - default 1 byte - int64_t m_offset = 0; + size_t m_allocation_size = utils::get_dynamic_value(); + size_t m_reg_group = 0; + size_t m_cluster_id = 0; + size_t m_offset = 0; }; /** @@ -56,14 +64,11 @@ class IntermediateMemoryBuffer : public Buffer { public: OPENVINO_OP("IntermediateMemoryBuffer", "SnippetsOpset", Buffer); IntermediateMemoryBuffer() = default; - IntermediateMemoryBuffer(const ov::Output& arg, const ov::Shape& shape, size_t id = 0); - IntermediateMemoryBuffer(const ov::Output& arg, int32_t allocation_rank = -1, size_t id = 0); + IntermediateMemoryBuffer(const ov::Output& arg, size_t allocation_size = utils::get_dynamic_value(), + size_t reg_group = 0, size_t cluster_id = 0); void validate_and_infer_types() override; std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; - -private: - ov::Shape compute_shape_from_allocation_rank(const ov::Output& arg, int32_t allocation_rank); }; /** @@ -76,18 +81,23 @@ class NewMemoryBuffer : public Buffer { public: OPENVINO_OP("NewMemoryBuffer", "SnippetsOpset", Buffer); NewMemoryBuffer() = default; - NewMemoryBuffer(const ov::Shape& shape, size_t id = 0, ov::element::Type element_type = ov::element::u8); + NewMemoryBuffer(const ov::Shape& shape, size_t reg_group = 0, size_t cluster_id = 0, ov::element::Type element_type = ov::element::u8); void validate_and_infer_types() override; - void set_element_type(ov::element::Type element_type); std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; + void set_element_type(ov::element::Type element_type); + class ShapeInfer : 
public IShapeInferSnippets { ov::Shape m_shape; public: explicit ShapeInfer(const std::shared_ptr& n); Result infer(const std::vector& input_shapes) override; }; + +private: + ov::Shape m_output_shape; + ov::element::Type m_element_type = ov::element::u8; // u8 - default 1 byte }; } // namespace op diff --git a/src/common/snippets/include/snippets/runtime_configurator.hpp b/src/common/snippets/include/snippets/runtime_configurator.hpp index 3f830ccc490664..10a2c26b1d843a 100644 --- a/src/common/snippets/include/snippets/runtime_configurator.hpp +++ b/src/common/snippets/include/snippets/runtime_configurator.hpp @@ -37,9 +37,12 @@ class RuntimeConfig { size_t tensor_rank = 0; size_t tile_rank = 0; + std::vector io_data_offsets = {}; ov::snippets::VectorDims master_shape = {}; + size_t buffer_scratchpad_size = 0; + std::vector buffer_cluster_offsets; }; /** @@ -65,7 +68,7 @@ class RuntimeConfigurator { */ virtual void update(const std::shared_ptr& linear_ir); /** - * @brief Allocate and intialize fields in RuntimeConfig + * @brief Allocate and initialize fields in RuntimeConfig and RuntimeConfigurator * @param linear_ir LinearIR */ virtual void initialization(const std::shared_ptr& linear_ir); @@ -76,11 +79,30 @@ class RuntimeConfigurator { * @param linear_ir LinearIR */ void init_data_info(const std::shared_ptr& linear_ir); + /** + * @brief Initializes information of buffers: + * - static buffer_scratchpad_size + * - offsets of static clusters (with static buffers) + * - clusters with dynamic buffers (`m_dynamic_buffer_clusters`) for quick access in `update()` + * @param linear_ir LinearIR + */ + void init_buffer_info(const std::shared_ptr& linear_ir); /** * @brief Initializes tensor rank of config * @param linear_ir LinearIR */ virtual void init_tensor_rank(const std::shared_ptr& linear_ir) const; + /** + * @brief Update Loop information in LinearIR: Unified and ExpandedLoopInfo + * @param linear_ir LinearIR + */ + void update_loop_info(const std::shared_ptr&
linear_ir) const; + /** + * @brief Update Buffer scratchpad size and offsets if needed + * Note: `update_loop_info` must be called before + * @param linear_ir LinearIR + */ + void update_buffer_scratchpad_size(const std::shared_ptr& linear_ir) const; /** * @brief Calculate data offsets of LinearIR and update these values in RuntimeConfig */ @@ -91,12 +113,13 @@ class RuntimeConfigurator { void update_latest_shapes(); std::shared_ptr m_config = nullptr; - lowered::pass::PassPipeline m_state_updater = {}; size_t m_io_num = 0; size_t m_in_num = 0; std::vector m_io_descs = {}; std::vector m_io_data_sizes = {}; + // [cluster_id -> buffer expressions ] + std::map> m_dynamic_buffer_clusters; std::vector m_latest_shapes = {}; }; diff --git a/src/common/snippets/include/snippets/utils.hpp b/src/common/snippets/include/snippets/utils.hpp index d0328d9d08a2c3..e5c7a443d34eb9 100644 --- a/src/common/snippets/include/snippets/utils.hpp +++ b/src/common/snippets/include/snippets/utils.hpp @@ -8,9 +8,13 @@ */ #pragma once -#include "snippets_isa.hpp" -#include "emitter.hpp" -#include "shape_types.hpp" +#include "snippets/emitter.hpp" +#include "snippets/shape_types.hpp" +#include "snippets/lowered/expression.hpp" +#include "snippets/lowered/expression_port.hpp" + +#include "openvino/op/fake_quantize.hpp" +#include "openvino/op/constant.hpp" namespace ov { @@ -19,10 +23,10 @@ namespace utils { // Get non-scalar Constant count that will be created after FakeQuantize decomposition. // This count is needed to know exact count of non-scalar Constants during tokenization. 
-auto get_non_scalar_constant_count_for_fq(const std::shared_ptr& fq) -> size_t; +auto get_non_scalar_constant_count_for_fq(const std::shared_ptr& fq) -> size_t; inline auto is_scalar_constant(const std::shared_ptr& source_output_node) -> bool { - return ov::is_type(source_output_node) && ov::shape_size(source_output_node->get_shape()) == 1; + return ov::is_type(source_output_node) && ov::shape_size(source_output_node->get_shape()) == 1; } inline auto normalize_rank(int32_t allocation_rank, const size_t shape_rank) -> int32_t { @@ -73,7 +77,30 @@ inline bool is_dynamic_vdims(const VectorDimsPtr& shape) { return is_dynamic_vdims(*shape); } -void broadcast_merge_dim(size_t& dst, const size_t& d1, const size_t& d2); +template +inline void dynamic_safe_add(T& lhs, const T& rhs) { + if (utils::is_dynamic_value(lhs) || utils::is_dynamic_value(rhs)) { + lhs = utils::get_dynamic_value(); + return; + } + lhs += rhs; +} + +template +inline void dynamic_safe_mul(T& lhs, const T& rhs) { + if (utils::is_dynamic_value(lhs) || utils::is_dynamic_value(rhs)) { + lhs = utils::get_dynamic_value(); + return; + } + lhs *= rhs; +} + +template +inline std::string value2str(const T& value) { + return utils::is_dynamic_value(value) ? "?" 
: std::to_string(value); +} + +bool broadcast_merge_dim(size_t& dst, const size_t& d1, const size_t& d2); VectorDims pshape_to_vdims(const PartialShape&); ov::PartialShape vdims_to_pshape(const VectorDims&); diff --git a/src/common/snippets/src/lowered/linear_ir.cpp b/src/common/snippets/src/lowered/linear_ir.cpp index f1ff133ab20d79..098d6e2e1d2f32 100644 --- a/src/common/snippets/src/lowered/linear_ir.cpp +++ b/src/common/snippets/src/lowered/linear_ir.cpp @@ -179,6 +179,8 @@ void LinearIR::register_expression(const ExpressionPtr& expr, bool io_allowed) { m_parameter_expressions.push_back(expr); if (ov::is_type(node)) m_result_expressions.push_back(expr); + if (ov::is_type(node)) + m_buffer_expressions.push_back(expr); } void LinearIR::unregister_expression(const ExpressionPtr& expr) { @@ -191,6 +193,11 @@ void LinearIR::unregister_expression(const ExpressionPtr& expr) { m_node2expression_map.erase(node); OPENVINO_ASSERT(!ov::is_type(node) && !ov::is_type(node), "unregister_expression mustn't be called for parameter or result expressions"); + if (ov::is_type(node)) { + const auto& it = std::find(m_buffer_expressions.cbegin(), m_buffer_expressions.cend(), expr); + OPENVINO_ASSERT(it != m_buffer_expressions.cend(), "Buffer Expression has not been found in the list of LinearIR Buffers!"); + m_buffer_expressions.erase(it); + } } LinearIR::exprIt LinearIR::insert(constExprIt pos, container::value_type&& value) { diff --git a/src/common/snippets/src/lowered/linear_ir_builder.cpp b/src/common/snippets/src/lowered/linear_ir_builder.cpp index 1aff2b9e99d07b..6054e94d26d4b9 100644 --- a/src/common/snippets/src/lowered/linear_ir_builder.cpp +++ b/src/common/snippets/src/lowered/linear_ir_builder.cpp @@ -111,12 +111,32 @@ LinearIR::container LinearIRBuilder::clone_range(LinearIR::container::const_iter result_expr->get_input_count() == original_expr->get_input_count() && result_expr->get_output_count() == original_expr->get_output_count(), "Expressions after copying aren't 
matched!"); + // Copy tensor shapes as shared pointer if needed if (!m_config.deep_copy_of_shapes) { for (size_t i = 0; i < original_expr->get_input_count(); ++i) result_expr->get_input_port_descriptor(i)->m_tensor_shape = original_expr->get_input_port_descriptor(i)->m_tensor_shape; for (size_t i = 0; i < original_expr->get_output_count(); ++i) result_expr->get_output_port_descriptor(i)->m_tensor_shape = original_expr->get_output_port_descriptor(i)->m_tensor_shape; } + + // Copy missed consumers if needed + if (m_config.copy_missed_consumers) { + for (size_t i = 0; i < original_expr->get_output_count(); i++) { + const auto& original_consumers = original_expr->get_output_port_connector(i)->get_consumers(); + for (const auto& original_consumer : original_consumers) { + const auto result_consumers = result_expr->get_output_port_connector(i)->get_consumers(); + // Check if consumer is from the cloned body + const auto original_expr_ptr = original_consumer.get_expr().get(); + if (expression_map.count(original_expr_ptr)) { + const auto target_consumer = expression_map[original_expr_ptr]->get_input_port(original_consumer.get_index()); + // If missed, add to existing consumers + if (std::find(result_consumers.cbegin(), result_consumers.cend(), target_consumer) == result_consumers.cend()) { + result_expr->get_output_port_connector(i)->add_consumer(target_consumer); + } + } + } + } + } } return result; diff --git a/src/common/snippets/src/lowered/loop_info.cpp b/src/common/snippets/src/lowered/loop_info.cpp index e26703c294881b..fd26d5d90cc278 100644 --- a/src/common/snippets/src/lowered/loop_info.cpp +++ b/src/common/snippets/src/lowered/loop_info.cpp @@ -11,11 +11,12 @@ namespace ov { namespace snippets { namespace lowered { -LoopInfo::LoopInfo(size_t work_amount, size_t increment, const std::vector& entries, const std::vector& exits) - : m_work_amount(work_amount), m_increment(increment), m_input_ports(entries), m_output_ports(exits) {} +LoopInfo::LoopInfo(size_t 
work_amount, size_t increment, const std::vector& entries, const std::vector& exits, bool is_wa_const) + : m_work_amount(work_amount), m_increment(increment), m_input_ports(entries), m_output_ports(exits), m_is_work_amount_const(is_wa_const) {} -LoopInfo::LoopInfo(size_t work_amount, size_t increment, const std::vector& entries, const std::vector& exits) - : m_work_amount(work_amount), m_increment(increment) { +LoopInfo::LoopInfo(size_t work_amount, size_t increment, const std::vector& entries, const std::vector& exits, + bool is_wa_const) + : m_work_amount(work_amount), m_increment(increment), m_is_work_amount_const(is_wa_const) { m_input_ports.reserve(entries.size()); m_output_ports.reserve(exits.size()); for (const auto& port : entries) @@ -68,6 +69,10 @@ const std::vector& LoopInfo::get_output_ports() const { return m_output_ports; } +bool LoopInfo::is_work_amount_const() const { + return m_is_work_amount_const; +} + void LoopInfo::set_work_amount(size_t work_amount) { m_work_amount = work_amount; } @@ -80,6 +85,10 @@ void LoopInfo::set_dim_idx(size_t dim_idx) { iterate_through_ports([dim_idx](LoopPort& port) { port.dim_idx = dim_idx; }); } +void LoopInfo::set_work_amount_const(bool value) { + m_is_work_amount_const = value; +} + template<> std::vector::iterator LoopInfo::find_loop_port(const LoopPort& loop_port) { auto& ports = loop_port.expr_port->get_type() == ExpressionPort::Input ? 
m_input_ports : m_output_ports; @@ -138,16 +147,16 @@ std::vector LoopInfo::clone_loop_ports(const ExpressionMap& expr_map, UnifiedLoopInfo::UnifiedLoopInfo(size_t work_amount, size_t increment, const std::vector& entries, const std::vector& exits, - const SpecificIterationHandlers& handlers) - : LoopInfo(work_amount, increment, entries, exits), m_handlers(handlers), + const SpecificIterationHandlers& handlers, bool is_wa_const) + : LoopInfo(work_amount, increment, entries, exits, is_wa_const), m_handlers(handlers), m_input_port_descs(std::vector(entries.size())), m_output_port_descs(std::vector(exits.size())) { validate(); } UnifiedLoopInfo::UnifiedLoopInfo(size_t work_amount, size_t increment, const std::vector& entries, const std::vector& exits, - const SpecificIterationHandlers& handlers) - : LoopInfo(work_amount, increment, entries, exits), m_handlers(handlers), + const SpecificIterationHandlers& handlers, bool is_wa_const) + : LoopInfo(work_amount, increment, entries, exits, is_wa_const), m_handlers(handlers), m_input_port_descs(std::vector(entries.size())), m_output_port_descs(std::vector(exits.size())) { validate(); } @@ -155,8 +164,8 @@ UnifiedLoopInfo::UnifiedLoopInfo(size_t work_amount, size_t increment, UnifiedLoopInfo::UnifiedLoopInfo(size_t work_amount, size_t increment, const std::vector& entries, const std::vector& exits, const std::vector& in_shifts, const std::vector& out_shifts, - const SpecificIterationHandlers& handlers) - : LoopInfo(work_amount, increment, entries, exits), m_handlers(handlers), m_input_port_descs(in_shifts), m_output_port_descs(out_shifts) { + const SpecificIterationHandlers& handlers, bool is_wa_const) + : LoopInfo(work_amount, increment, entries, exits, is_wa_const), m_handlers(handlers), m_input_port_descs(in_shifts), m_output_port_descs(out_shifts) { validate(); } @@ -170,7 +179,7 @@ std::shared_ptr UnifiedLoopInfo::clone_with_new_expr(const ExpressionM const auto& new_output_ports = clone_loop_ports(expr_map, 
m_output_ports); return std::make_shared(m_work_amount, m_increment, new_input_ports, new_output_ports, - m_input_port_descs, m_output_port_descs, m_handlers); + m_input_port_descs, m_output_port_descs, m_handlers, m_is_work_amount_const); } const SpecificIterationHandlers& UnifiedLoopInfo::get_handlers() const { @@ -294,8 +303,9 @@ void UnifiedLoopInfo::replace_with_new_ports(const ExpressionPort& actual_port, ExpandedLoopInfo::ExpandedLoopInfo(size_t work_amount, size_t increment, const std::vector& entries, const std::vector& exits, std::vector ptr_increments, std::vector final_offsets, std::vector data_sizes, - SpecificLoopIterType type, std::shared_ptr unified_loop_info) - : LoopInfo(work_amount, increment, entries, exits), m_ptr_increments(std::move(ptr_increments)), m_finalization_offsets(std::move(final_offsets)), + SpecificLoopIterType type, std::shared_ptr unified_loop_info, bool is_wa_const) + : LoopInfo(work_amount, increment, entries, exits, is_wa_const), + m_ptr_increments(std::move(ptr_increments)), m_finalization_offsets(std::move(final_offsets)), m_data_sizes(std::move(data_sizes)), m_type(type), m_unified_loop_info(std::move(unified_loop_info)) { validate(); } @@ -313,7 +323,7 @@ std::shared_ptr ExpandedLoopInfo::clone_with_new_expr(const Expression const auto& new_output_ports = clone_loop_ports(expr_map, m_output_ports); return std::make_shared(m_work_amount, m_increment, new_input_ports, new_output_ports, - m_ptr_increments, m_finalization_offsets, m_data_sizes, m_type, m_unified_loop_info); + m_ptr_increments, m_finalization_offsets, m_data_sizes, m_type, m_unified_loop_info, m_is_work_amount_const); } const std::shared_ptr& ExpandedLoopInfo::get_unified_loop_info() const { diff --git a/src/common/snippets/src/lowered/loop_manager.cpp b/src/common/snippets/src/lowered/loop_manager.cpp index 75a196e4f623bb..224e1add666948 100644 --- a/src/common/snippets/src/lowered/loop_manager.cpp +++ b/src/common/snippets/src/lowered/loop_manager.cpp @@ -5,6 
+5,7 @@ #include "snippets/lowered/loop_manager.hpp" #include "snippets/lowered/expression.hpp" +#include "snippets/op/loop.hpp" #include "snippets/utils.hpp" #include "openvino/core/graph_util.hpp" @@ -180,7 +181,8 @@ void LoopManager::mark_loop(LinearIR::constExprIt loop_begin_pos, OPENVINO_ASSERT(index < size, "Incorrect index for broadcasting"); const auto lhs_value = index < lhs_size ? *(lhs.crbegin() + index) : 1; const auto rhs_value = index < rhs_size ? *(rhs.crbegin() + index) : 1; - utils::broadcast_merge_dim(*(lhs.rbegin() + index), lhs_value, rhs_value); + OPENVINO_ASSERT(utils::broadcast_merge_dim(*(lhs.rbegin() + index), lhs_value, rhs_value), + "Failed to broadcast work amount in marking loop"); }; auto is_outside_loop = [&FULL_DIM](const std::vector& subtensor) { @@ -281,13 +283,14 @@ void LoopManager::fuse_loops(LinearIR::constExprIt loop_begin_target, LinearIR:: const auto work_amount = std::max(loop_info_upper->get_work_amount(), loop_info_lower->get_work_amount()); const auto increment = std::max(loop_info_upper->get_increment(), loop_info_lower->get_increment()); const auto handlers = SpecificIterationHandlers::merge_handlers(loop_info_upper->get_handlers(), loop_info_lower->get_handlers()); + const auto is_work_amount_const = loop_info_upper->is_work_amount_const() || loop_info_lower->is_work_amount_const(); auto new_entries = input_ports_upper; new_entries.insert(new_entries.end(), input_ports_lower.begin(), input_ports_lower.end()); auto new_exits = output_ports_upper; new_exits.insert(new_exits.end(), output_ports_lower.begin(), output_ports_lower.end()); - m_map[to] = std::make_shared(work_amount, increment, new_entries, new_exits, handlers); + m_map[to] = std::make_shared(work_amount, increment, new_entries, new_exits, handlers, is_work_amount_const); for (auto it = loop_begin_target; it != loop_end_target; ++it) { const auto& expr = *it; diff --git a/src/common/snippets/src/lowered/pass/allocate_buffers.cpp 
b/src/common/snippets/src/lowered/pass/allocate_buffers.cpp index a0c5328adb76fc..e830c7f9073206 100644 --- a/src/common/snippets/src/lowered/pass/allocate_buffers.cpp +++ b/src/common/snippets/src/lowered/pass/allocate_buffers.cpp @@ -6,11 +6,12 @@ #include "snippets/lowered/pass/allocate_buffers.hpp" #include "snippets/lowered/pass/enumerate_expressions.hpp" +#include "snippets/lowered/pass/compute_buffer_allocation_size.hpp" #include "snippets/lowered/pass/solve_buffer_memory.hpp" #include "snippets/lowered/pass/init_buffers_default.hpp" -#include "snippets/lowered/pass/identify_buffers.hpp" +#include "snippets/lowered/pass/set_buffer_reg_group.hpp" #include "snippets/lowered/pass/define_buffer_clusters.hpp" -#include "snippets/lowered/pass/normalize_buffer_ids.hpp" +#include "snippets/lowered/pass/normalize_buffer_reg_groups.hpp" #include "snippets/pass/tokenization.hpp" #include "snippets/itt.hpp" #include "snippets/utils.hpp" @@ -70,20 +71,20 @@ bool AllocateBuffers::run(lowered::LinearIR& linear_ir, lowered::LinearIR::const OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::AllocateBuffers"); size_t buffer_scratchpad_size = 0; + PassPipeline pipeline; + pipeline.register_pass(linear_ir.get_config().m_loop_depth); if (m_is_optimized_mode) { - BufferClusters buffer_clusters; - PassPipeline pipeline; pipeline.register_pass(); - pipeline.register_pass(); - pipeline.register_pass(buffer_clusters); - pipeline.register_pass(buffer_scratchpad_size, buffer_clusters); - pipeline.register_pass(); - pipeline.run(linear_ir); + pipeline.register_pass(); + pipeline.register_pass(); + pipeline.register_pass(buffer_scratchpad_size); + pipeline.register_pass(); } else { - InitBuffersDefault(buffer_scratchpad_size).run(linear_ir, linear_ir.cbegin(), linear_ir.cend()); + pipeline.register_pass(buffer_scratchpad_size); } + pipeline.run(linear_ir, linear_ir.cbegin(), linear_ir.cend()); - linear_ir.set_buffer_scratchpad_size(buffer_scratchpad_size); + 
linear_ir.set_static_buffer_scratchpad_size(buffer_scratchpad_size); return buffer_scratchpad_size > 0; } diff --git a/src/common/snippets/src/lowered/pass/assign_registers.cpp b/src/common/snippets/src/lowered/pass/assign_registers.cpp index 70da1a6cc17424..b81a1552f97b03 100644 --- a/src/common/snippets/src/lowered/pass/assign_registers.cpp +++ b/src/common/snippets/src/lowered/pass/assign_registers.cpp @@ -85,22 +85,22 @@ bool AssignRegisters::run(LinearIR& linear_ir) { for (const auto& expr : exprs) { auto op = expr->get_node(); if (const auto& buffer = ov::as_type_ptr(op)) { - const auto buffer_id = buffer->get_id(); + const auto reg_group = buffer->get_reg_group(); // All buffers have one common data pointer if (ov::is_type(buffer)) { manually_assigned_gprs[expr->get_input_port_connector(0)] = - static_cast(num_results + num_parameters + buffer_id); + static_cast(num_results + num_parameters + reg_group); // shape infer ops in the middle of subgraph. IntermediateMemoryBuffer is inserted before reshape as new loop should start. // child shape info ops share the same memory as IntermediateMemoryBuffer. const auto& shape_infer_consumers = utils::get_first_child_shape_infer_expr_seq(expr); for (const auto& child_shape_infer_expr : shape_infer_consumers) { manually_assigned_gprs[child_shape_infer_expr->get_input_port_connector(0)] = manually_assigned_gprs[child_shape_infer_expr->get_output_port_connector(0)] = - static_cast(num_results + num_parameters + buffer_id); + static_cast(num_results + num_parameters + reg_group); } } manually_assigned_gprs[expr->get_output_port_connector(0)] = - static_cast(num_results + num_parameters + buffer_id); + static_cast(num_results + num_parameters + reg_group); } else if (ov::is_type(op) || ov::is_type(op)) { // Only in ReduceDecomposition Reduce ops use HorizonMax/HorizonSum and VectorBuffer. 
// We should manually set the one vector register for VectorBuffer and Max/Sum output to simulate a accumulator diff --git a/src/common/snippets/src/lowered/pass/clean_repeated_ptr_shifts.cpp b/src/common/snippets/src/lowered/pass/clean_repeated_ptr_shifts.cpp index 9552cbfdfbee76..4cf201047d63f5 100644 --- a/src/common/snippets/src/lowered/pass/clean_repeated_ptr_shifts.cpp +++ b/src/common/snippets/src/lowered/pass/clean_repeated_ptr_shifts.cpp @@ -24,7 +24,7 @@ bool CleanRepeatedDataPointerShifts::reuse_increments(const LoopManagerPtr& loop const auto output_count = loop_end->get_output_num(); std::set resetting_data_indexes; - std::set buffers_ids; + std::set buffers_groups; // We count expressions only on inputs of Loop because we can only read from the same data but not write to the same data. // Parameter // | | @@ -34,8 +34,8 @@ bool CleanRepeatedDataPointerShifts::reuse_increments(const LoopManagerPtr& loop const auto& parent_output = loop_connectors[i]->get_source().get_expr(); if (const auto buffer = ov::as_type_ptr(parent_output->get_node())) { // If Buffer is missed in set, Just save - it's first meeting - if (buffers_ids.count(buffer->get_id()) == 0) { - buffers_ids.insert(buffer->get_id()); + if (buffers_groups.count(buffer->get_reg_group()) == 0) { + buffers_groups.insert(buffer->get_reg_group()); } else { // The Buffer with the same ID is in set - need to add this Buffer idx to set of Buffers for resetting resetting_data_indexes.insert(i); @@ -60,8 +60,8 @@ bool CleanRepeatedDataPointerShifts::reuse_increments(const LoopManagerPtr& loop if (const auto buffer = ov::as_type_ptr(child_node)) { buffer_count++; // If Buffer is missed in set, Just save - it's first meeting - if (buffers_ids.count(buffer->get_id()) == 0) { - buffers_ids.insert(buffer->get_id()); + if (buffers_groups.count(buffer->get_reg_group()) == 0) { + buffers_groups.insert(buffer->get_reg_group()); } else { // The Buffer with the same ID is in set - need to add this Buffer idx to set 
of Buffers for resetting resetting_data_indexes.insert(input_count + i); diff --git a/src/common/snippets/src/lowered/pass/compute_buffer_allocation_size.cpp b/src/common/snippets/src/lowered/pass/compute_buffer_allocation_size.cpp new file mode 100644 index 00000000000000..760606d8dc067c --- /dev/null +++ b/src/common/snippets/src/lowered/pass/compute_buffer_allocation_size.cpp @@ -0,0 +1,101 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/lowered/pass/compute_buffer_allocation_size.hpp" + +#include "snippets/op/buffer.hpp" +#include "snippets/utils.hpp" +#include "snippets/itt.hpp" + + +namespace ov { +namespace snippets { +namespace lowered { +namespace pass { + +namespace { +std::vector get_parent_inner_loops(const std::vector& parent_loops, const std::vector& current_loops) { + const auto common_rank = std::min(parent_loops.size(), current_loops.size()); + size_t i = 0; + while (i < common_rank && parent_loops[i] == current_loops[i]) + ++i; + return std::vector(parent_loops.cbegin() + i, parent_loops.cend()); +} +} // namespace + +// Ticket: 113744 +// TODO: This logic covers only several specific cases so it should be generalized. +size_t ComputeBufferAllocationSize::get_allocation_size(const LoopManagerPtr& loop_manager, const ExpressionPtr& buffer_expr, size_t allocation_rank) { + const auto& parent_port = buffer_expr->get_input_port_connector(0)->get_source(); + const auto& parent_loop_ids = get_parent_inner_loops(parent_port.get_expr()->get_loop_ids(), buffer_expr->get_loop_ids()); + const auto planar_shape = utils::get_preordered_vdims(parent_port); + + const size_t rank = allocation_rank >= 0 ? 
std::min(static_cast(allocation_rank), planar_shape.size()) + : planar_shape.size(); + + const auto& subtensor = parent_port.get_descriptor_ptr()->get_subtensor(); + + size_t allocation_size = 1; + std::set processed_dim_idxs; + for (const auto& parent_loop : parent_loop_ids) { + const auto loop_info = loop_manager->get_loop_info(parent_loop); + const auto& output_ports = loop_info->get_output_ports(); + auto it = std::find_if(output_ports.begin(), output_ports.end(), [&parent_port](const LoopPort& port) { return *port.expr_port == parent_port; }); + OPENVINO_ASSERT(it != output_ports.end(), "compute_allocation_shape: output port of parent loop can not be found"); + const auto& loop_port = *it; + const auto& dim_idx = loop_port.dim_idx; + if (loop_port.is_incremented && dim_idx < rank) { + if (const auto& unified_loop_info = ov::as_type_ptr(loop_info)) + utils::dynamic_safe_mul(allocation_size, unified_loop_info->get_work_amount()); + else if (const auto& expanded_loop_info = ov::as_type_ptr(loop_info)) + utils::dynamic_safe_mul(allocation_size, expanded_loop_info->get_unified_loop_info()->get_work_amount()); + else + OPENVINO_THROW("Unknown LoopInfo type"); + processed_dim_idxs.insert(dim_idx); + } + } + const auto processing_rank = !processed_dim_idxs.empty() ? 
std::max(*processed_dim_idxs.rbegin(), subtensor.size()) : subtensor.size(); + for (size_t i = 0; i < std::min(processing_rank, rank); ++i) { + if (processed_dim_idxs.count(i) == 0) { + if (i < subtensor.size()) + utils::dynamic_safe_mul(allocation_size, std::min(*(planar_shape.rbegin() + i), *(subtensor.rbegin() + i))); + else + utils::dynamic_safe_mul(allocation_size, *(planar_shape.rbegin() + i)); + } + } + + // Corner case when the current information is not enough + if (processing_rank == 0 && processed_dim_idxs.empty()) { + for (size_t i = 0; i < rank; ++i) { + utils::dynamic_safe_mul(allocation_size, *(planar_shape.rbegin() + i)); + } + } + + return allocation_size; +} + +bool ComputeBufferAllocationSize::run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) { + OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::ComputeBufferAllocationSize") + + const auto& loop_manager = linear_ir.get_loop_manager(); + + const auto& buffer_expressions = linear_ir.get_buffer_ops(); + for (const auto& buffer_expr : buffer_expressions) { + const auto node = buffer_expr->get_node(); + OPENVINO_ASSERT(ov::is_type(node), "Expected Buffer ops in Buffer expressions of LinearIR"); + if (const auto buffer = ov::as_type_ptr(node)) { + // If the current size is undefined, update it + if (!buffer->is_defined()) + buffer->set_allocation_size(get_allocation_size(loop_manager, buffer_expr, m_buffer_allocation_rank)); + } + } + + + return true; +} + +} // namespace pass +} // namespace lowered +} // namespace snippets +} // namespace ov diff --git a/src/common/snippets/src/lowered/pass/define_buffer_clusters.cpp b/src/common/snippets/src/lowered/pass/define_buffer_clusters.cpp index d093085dcc8922..41a13cadeb10e0 100644 --- a/src/common/snippets/src/lowered/pass/define_buffer_clusters.cpp +++ b/src/common/snippets/src/lowered/pass/define_buffer_clusters.cpp @@ -4,7 +4,7 @@ #include 
"snippets/lowered/pass/define_buffer_clusters.hpp" -#include "snippets/lowered/pass/identify_buffers.hpp" +#include "snippets/lowered/pass/set_buffer_reg_group.hpp" #include "snippets/pass/tokenization.hpp" #include "snippets/utils.hpp" #include "snippets/itt.hpp" @@ -14,11 +14,11 @@ namespace snippets { namespace lowered { namespace pass { -using ShiftPtrParams = IdentifyBuffers::ShiftPtrParams; +using ShiftPtrParams = SetBufferRegGroup::ShiftPtrParams; -AllocateBuffers::BufferClusters::iterator DefineBufferClusters::find_cluster_by_expr(const ExpressionPtr& target) { +DefineBufferClusters::BufferClusters::iterator DefineBufferClusters::find_cluster_by_expr(const ExpressionPtr& target) { return std::find_if(m_clusters.begin(), m_clusters.end(), - [&target](const AllocateBuffers::BufferCluster& cluster) { return cluster.count(target) > 0; }); + [&target](const BufferCluster& cluster) { return cluster.count(target) > 0; }); } bool DefineBufferClusters::is_direct_buffer(const ExpressionPtr& buffer_expr, const ExpressionPtr& target_expr) const { @@ -30,15 +30,15 @@ void DefineBufferClusters::create_new_cluster(const ExpressionPtr& buffer_expr) const auto cluster_it = find_cluster_by_expr(buffer_expr); // If Buffer is missed in clusters, create new cluster with the single Buffer node inside if (cluster_it == m_clusters.cend()) { - m_clusters.push_back(AllocateBuffers::BufferCluster{buffer_expr}); + m_clusters.push_back(BufferCluster{buffer_expr}); } } -size_t DefineBufferClusters::get_cluster_buffer_id(const AllocateBuffers::BufferCluster& cluster) const { +size_t DefineBufferClusters::get_cluster_buffer_id(const BufferCluster& cluster) const { OPENVINO_ASSERT(!cluster.empty(), "Buffer cluster is empty!"); - const auto id = (ov::as_type_ptr(cluster.cbegin()->get()->get_node()))->get_id(); + const auto id = (ov::as_type_ptr(cluster.cbegin()->get()->get_node()))->get_reg_group(); if (std::all_of(cluster.cbegin(), cluster.cend(), - [&id](const ExpressionPtr& expr) { 
return (ov::as_type_ptr(expr->get_node()))->get_id() == id; })) { + [&id](const ExpressionPtr& expr) { return (ov::as_type_ptr(expr->get_node()))->get_reg_group() == id; })) { return id; } return SIZE_MAX; @@ -148,7 +148,7 @@ void DefineBufferClusters::parse_loop(const LinearIR::constExprIt& expr_it) { if (has_been_added) break; } if (!has_been_added) { - m_clusters.push_back(AllocateBuffers::BufferCluster{output_buffer_expr}); + m_clusters.push_back(BufferCluster{output_buffer_expr}); } } @@ -248,8 +248,8 @@ int64_t DefineBufferClusters::get_buffer_finalization_offset(const ExpressionPtr return final_offset; } -bool DefineBufferClusters::unite_nested_clusters(const AllocateBuffers::BufferClusters::iterator& inner_cluster_it, - AllocateBuffers::BufferCluster& outer_cluster, +bool DefineBufferClusters::unite_nested_clusters(const BufferClusters::iterator& inner_cluster_it, + BufferCluster& outer_cluster, const ExpressionPtr& outer_buffer, bool is_outer_up) { for (const auto& inner_buffer : *inner_cluster_it) { ExpressionPtr common_loop_end_expr = nullptr; @@ -263,11 +263,11 @@ bool DefineBufferClusters::unite_nested_clusters(const AllocateBuffers::BufferCl const auto& inner_ptr_increments = common_loop_end->get_ptr_increments(); const auto& inner_final_offsets = common_loop_end->get_finalization_offsets(); const auto& inner_data_sizes = common_loop_end->get_element_type_sizes(); - if (IdentifyBuffers::can_reuse_id({ inner_data_sizes[up_idx], inner_ptr_increments[up_idx], inner_final_offsets[up_idx] }, - { inner_data_sizes[down_idx], inner_ptr_increments[down_idx], inner_final_offsets[down_idx] })) { - const auto buffer_id = ov::as_type_ptr(outer_buffer->get_node())->get_id(); + if (SetBufferRegGroup::can_be_in_one_group({ inner_data_sizes[up_idx], inner_ptr_increments[up_idx], inner_final_offsets[up_idx] }, + { inner_data_sizes[down_idx], inner_ptr_increments[down_idx], inner_final_offsets[down_idx] })) { + const auto buffer_reg_group = 
ov::as_type_ptr(outer_buffer->get_node())->get_reg_group(); for (const auto& inner_buffer : *inner_cluster_it) - ov::as_type_ptr(inner_buffer->get_node())->set_id(buffer_id); + ov::as_type_ptr(inner_buffer->get_node())->set_reg_group(buffer_reg_group); outer_cluster.insert(inner_cluster_it->cbegin(), inner_cluster_it->cend()); m_clusters.erase(inner_cluster_it); @@ -339,6 +339,8 @@ void DefineBufferClusters::parse_memory_access_op(const ExpressionPtr& expr) { bool DefineBufferClusters::run(lowered::LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) { OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::DefineBufferClusters"); + m_clusters.clear(); + for (auto expr_it = begin; expr_it != end; ++expr_it) { const auto& expr = *expr_it; const auto op = expr->get_node(); @@ -353,6 +355,15 @@ bool DefineBufferClusters::run(lowered::LinearIR& linear_ir, lowered::LinearIR:: } } + for (size_t cluster_id = 0; cluster_id < m_clusters.size(); ++cluster_id) { + const auto& cluster = m_clusters[cluster_id]; + std::for_each(cluster.cbegin(), cluster.cend(), [&cluster_id](const ExpressionPtr& buffer_expr) { + const auto& buffer = ov::as_type_ptr(buffer_expr->get_node()); + OPENVINO_ASSERT(buffer, "Buffer clusters expects Buffer nodes"); + buffer->set_cluster_id(cluster_id); + }); + } + return true; } diff --git a/src/common/snippets/src/lowered/pass/fuse_loops.cpp b/src/common/snippets/src/lowered/pass/fuse_loops.cpp index ec0743bf4df7d0..a06c58a21bf272 100644 --- a/src/common/snippets/src/lowered/pass/fuse_loops.cpp +++ b/src/common/snippets/src/lowered/pass/fuse_loops.cpp @@ -65,12 +65,13 @@ bool FuseLoops::can_be_fused(const UnifiedLoopInfoPtr& loop_upper, const Unified (work_amount_upper == work_amount_lower) && increment_upper == increment_lower; const bool bcastable_upper = work_amount_upper == 1 && increment_upper == 1; const bool bcastable_lower = work_amount_lower == 1 && increment_lower == 1; + const auto 
is_const_wa_equal = loop_upper->is_work_amount_const() == loop_lower->is_work_amount_const(); // WA: we can't fuse 2 loops if one of them has first iteration handler but second hasn't, // because in this case Main/Tail body handlers of the loop wo first iter handler must be reset with new parameters // (e.g. tail size). This logic is not implemented for now, so fusion for such loops is skipped. const bool first_iter_handlers_match = loop_upper->get_handlers().get_passes().empty() == loop_lower->get_handlers().get_passes().empty(); - return first_iter_handlers_match && (is_dynamic_case || equal_parameters || bcastable_upper || bcastable_lower); + return first_iter_handlers_match && is_const_wa_equal && (is_dynamic_case || equal_parameters || bcastable_upper || bcastable_lower); } void FuseLoops::move(LinearIR& linear_ir, const LoopManagerPtr& loop_manager, size_t loop_id, diff --git a/src/common/snippets/src/lowered/pass/init_buffers_default.cpp b/src/common/snippets/src/lowered/pass/init_buffers_default.cpp index 36cb41d3b9c96e..8ba9c39322fd66 100644 --- a/src/common/snippets/src/lowered/pass/init_buffers_default.cpp +++ b/src/common/snippets/src/lowered/pass/init_buffers_default.cpp @@ -17,17 +17,22 @@ namespace pass { bool InitBuffersDefault::run(lowered::LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) { OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::InitBuffersDefault"); - size_t id = 0; + size_t idx = 0; size_t offset = 0; for (auto expr_it = begin; expr_it != end; ++expr_it) { const auto& expr = *expr_it; const auto op = expr->get_node(); if (const auto buffer = ov::as_type_ptr(op)) { - AllocateBuffers::set_buffer_offset(expr, offset); - buffer->set_id(id); - - offset += buffer->get_byte_size(); - id++; + buffer->set_reg_group(idx); + buffer->set_cluster_id(idx); + + if (!buffer->is_defined()) { + AllocateBuffers::set_buffer_offset(expr, utils::get_dynamic_value()); + } else { + 
AllocateBuffers::set_buffer_offset(expr, offset); + offset += buffer->get_byte_size(); + } + idx++; } } diff --git a/src/common/snippets/src/lowered/pass/init_loops.cpp b/src/common/snippets/src/lowered/pass/init_loops.cpp index 1e0c556cff013f..c2360b4f2a54f4 100644 --- a/src/common/snippets/src/lowered/pass/init_loops.cpp +++ b/src/common/snippets/src/lowered/pass/init_loops.cpp @@ -6,7 +6,7 @@ #include "snippets/lowered/linear_ir.hpp" #include "snippets/lowered/loop_manager.hpp" -#include "snippets/op/memory_access.hpp" +#include "snippets/snippets_isa.hpp" #include "snippets/utils.hpp" #include "snippets/itt.hpp" @@ -123,8 +123,9 @@ inline void init_work_amount(const LoopInfoPtr& loop_info) { const auto& shape = desc->get_shape(); const auto& layout = desc->get_layout(); const auto is_input = loop_port.expr_port->get_type() == ExpressionPort::Input; - const auto dim_idx = is_input ? utils::get_input_dim_idx(layout, loop_port.dim_idx) : utils::get_input_dim_idx(layout, loop_port.dim_idx); - utils::broadcast_merge_dim(work_amount, work_amount, shape[dim_idx]); + const auto dim_idx = is_input ? 
utils::get_input_dim_idx(layout, loop_port.dim_idx) : utils::get_output_dim_idx(layout, loop_port.dim_idx); + OPENVINO_ASSERT(utils::broadcast_merge_dim(work_amount, work_amount, shape[dim_idx]), + "Failed to broadcast work_amount"); } }); loop_info->set_work_amount(work_amount); @@ -133,7 +134,7 @@ inline void init_work_amount(const LoopInfoPtr& loop_info) { void InitLoops::init_loop_info(const UnifiedLoopInfoPtr& loop_info, const size_t loop_id, bool only_runtime_args) { OPENVINO_ASSERT(loop_info != nullptr, "UnifiedLoopInfo is nullptr, nothing to initialize"); - if (utils::is_dynamic_value(loop_info->get_work_amount())) + if (!loop_info->is_work_amount_const()) init_work_amount(loop_info); const auto work_amount = loop_info->get_work_amount(); diff --git a/src/common/snippets/src/lowered/pass/insert_buffers.cpp b/src/common/snippets/src/lowered/pass/insert_buffers.cpp index 7c9ee6b8f1b000..87e5c489cb1029 100644 --- a/src/common/snippets/src/lowered/pass/insert_buffers.cpp +++ b/src/common/snippets/src/lowered/pass/insert_buffers.cpp @@ -29,82 +29,8 @@ std::vector get_buffer_loop_ids(const std::vector& lhs, const st } return buffer_loop_ids; } - -// Ticket: 113744 -// TODO: This logic covers only several specific cases so it should be generalized. -ov::Shape compute_allocation_shape(const LoopManagerPtr& loop_manager, - const std::vector& buffer_loop_ids, - const ExpressionPort& parent_expr_output, - const int allocation_rank) { - const auto& parent_expr = parent_expr_output.get_expr(); - const auto& parent_loop_ids = parent_expr->get_loop_ids(); - const auto planar_shape = utils::get_preordered_vdims(parent_expr_output); - - const size_t rank = allocation_rank >= 0 ? 
std::min(static_cast(allocation_rank), planar_shape.size()) - : planar_shape.size(); - ov::Shape allocation_shape(rank); - for (size_t i = 0; i < rank; ++i) { - *(allocation_shape.rbegin() + i) = *(planar_shape.rbegin() + i); - } - - if (buffer_loop_ids.empty() || parent_loop_ids.empty()) { - return allocation_shape; - } - - // If subtensor is set, its information is used for allocation shape computation. Two situations are possible: - // 1. Buffer is outside the parent loop: the corresponding subtensor value is ignored, parent loop work amount is set instead - // 2. Buffer is inside the parent loop: the corresponding subtensor value is used in allocation shape. - // Since we can defenitely know which subtensor value corresponds to the loop only for 1st case - // (we can extract this info from loop output port), we copy subtensor, and then replace subtensor values with parent loop work amount if needed. - // Example: - // Parent subtensor: [M_blk, N_blk] - // Buffer loop idces: [M_loop_idx], parent loop idces: [M_loop_idx, N_loop_idx] - // - // 1. Allocation shape is set to subtensor: [M_blk, N_blk] - // 2. Buffer is inside M_loop_idx loop => allocation shape is not changed - // 3. 
Buffer is outside N_loop_idx loop => the corresponding allocation shape value is replaced with N loop work amount - // So the result allocation shape is [M_blk, N_loop_work_amount] - const auto& subtensor = parent_expr_output.get_descriptor_ptr()->get_subtensor(); - if (!subtensor.empty()) { - for (size_t i = 0; i < std::min(rank, subtensor.size()); ++i) { - auto& cur_val = *(allocation_shape.rbegin() + i); - const auto& subtensor_val = *(subtensor.rbegin() + i); - cur_val = std::min(cur_val, subtensor_val); - } - for (const auto& parent_loop : parent_loop_ids) { - if (std::find(buffer_loop_ids.begin(), buffer_loop_ids.end(), parent_loop) == buffer_loop_ids.end()) { - const auto loop_info = loop_manager->get_loop_info(parent_loop); - const auto& output_ports = loop_info->get_output_ports(); - auto it = std::find_if(output_ports.begin(), - output_ports.end(), - [&parent_expr_output](const LoopPort& port) { - return *port.expr_port == parent_expr_output; - }); - OPENVINO_ASSERT(it != output_ports.end(), "compute_allocation_shape: output port of parent loop can not be found"); - const auto& loop_port = *it; - if (loop_port.is_incremented && loop_port.dim_idx < allocation_shape.size()) { - *(allocation_shape.rbegin() + loop_port.dim_idx) = loop_info->get_work_amount(); - } - } - } - } else { - // WA: In case of empty subtensors another information have to be used to update allocation shape. 
- for (size_t i = 0; i < std::min(rank, parent_loop_ids.size()); ++i) { - const auto loop = loop_manager->get_loop_info(*(parent_loop_ids.rbegin() + i)); - OPENVINO_ASSERT(loop->get_dim_idx() == i, "compute_allocation_shape: eltwise loop has unexpected dimension index"); - *(allocation_shape.rbegin() + i) = loop->get_work_amount(); - } - for (int i = 0; i < allocation_rank - static_cast(parent_loop_ids.size()); ++i) { - allocation_shape[i] = 1; - } - } - return allocation_shape; -} } // namespace -InsertBuffers::InsertBuffers(int32_t buffer_allocation_rank) - : RangedPass(), m_buffer_allocation_rank(buffer_allocation_rank) {} - LinearIR::constExprIt InsertBuffers::insertion_position(const LinearIR& linear_ir, const LoopManagerPtr& loop_manager, const ExpressionPtr& up_expr, const ExpressionPtr& down_expr) { const auto& up_loops = up_expr->get_loop_ids(); @@ -189,11 +115,7 @@ void InsertBuffers::insertion(LinearIR& linear_ir, // Current expr Loop identifies: 3, 4, 6 // Need to insert between 2nd and 4th Loops - after 2nd Loop const auto pos = insertion_position(linear_ir, loop_manager, parent_expr, expr); - const auto allocation_shape = compute_allocation_shape(loop_manager, - buffer_loop_ids, - parent_expr_output, - m_buffer_allocation_rank); - const auto buffer = std::make_shared(parent->output(parent_port), allocation_shape); + const auto buffer = std::make_shared(parent->output(parent_port)); const auto buffer_consumer = has_shape_infer_parent ? 
top_shape_infer_expr->get_input_port(0) : *entry_port; linear_ir.insert_node(buffer, std::vector{ parent_expr_output }, buffer_loop_ids, false, pos, { buffer_consumer }); } @@ -276,11 +198,7 @@ void InsertBuffers::insertion(LinearIR& linear_ir, // Note: All potential consumers must have the same count of first equal Loop identifies and the same count of different last identifies const auto pos = insertion_position(linear_ir, loop_manager, expr, consumer_expr); - const auto allocation_shape = compute_allocation_shape(loop_manager, - buffer_loop_ids, - *exit_port, - m_buffer_allocation_rank); - auto buffer = std::make_shared(node->output(port_idx), allocation_shape); + auto buffer = std::make_shared(node->output(port_idx)); // We cannot insert Node output connector on Buffer output because not all consumers of Node needs Buffer // Example: // Add diff --git a/src/common/snippets/src/lowered/pass/insert_specific_iterations.cpp b/src/common/snippets/src/lowered/pass/insert_specific_iterations.cpp index e89c711627a911..dd418839ca84cc 100644 --- a/src/common/snippets/src/lowered/pass/insert_specific_iterations.cpp +++ b/src/common/snippets/src/lowered/pass/insert_specific_iterations.cpp @@ -75,12 +75,29 @@ LoopManager::LoopBounds InsertSpecificIterations::insert_copy_loop(LinearIR& lin std::vector& new_entry_ports, std::vector& new_exit_ports) { const auto& loop_manager = linear_ir.get_loop_manager(); const auto loop_bounds = loop_manager->get_loop_bounds(linear_ir, loop_id); + const auto loop_begin_pos = loop_bounds.first; + const auto loop_end_pos = std::next(loop_bounds.second); + ExpressionMap expression_map; const auto& cloning_config = LinearIRBuilder::Config(false); - const auto& loop_copy_range = LinearIRBuilder(cloning_config).clone_range(loop_bounds.first, std::next(loop_bounds.second), expression_map); + const auto& loop_copy_range = LinearIRBuilder(cloning_config).clone_range(loop_begin_pos, loop_end_pos, expression_map); const auto new_loop_begin_pos = 
linear_ir.insert(insert_pos, loop_copy_range.begin(), loop_copy_range.end()); const auto new_loop_end_pos = std::prev(insert_pos); + // Added connections between output of cloned bodies and the current LinearIR + for (LinearIR::constExprIt result_it = new_loop_begin_pos, original_it = loop_begin_pos; original_it != loop_end_pos; ++result_it, ++original_it) { + const auto result_expr = *result_it; + const auto original_expr = *original_it; + for (size_t i = 0; i < original_expr->get_output_count(); i++) { + const auto& consumers = original_expr->get_output_port_connector(i)->get_consumers(); + for (const auto& consumer : consumers) { + if (std::find(loop_begin_pos, loop_end_pos, consumer.get_expr()) == loop_end_pos) { + result_expr->get_output_port_connector(i)->add_consumer(consumer); + } + } + } + } + auto clone_ports = [&expression_map](const std::vector& ports, std::vector& new_ports) { new_ports.resize(ports.size()); for (size_t i = 0; i < ports.size(); ++i) { diff --git a/src/common/snippets/src/lowered/pass/iter_handler.cpp b/src/common/snippets/src/lowered/pass/iter_handler.cpp index dd2d601366cb1a..3a928819e6c85d 100644 --- a/src/common/snippets/src/lowered/pass/iter_handler.cpp +++ b/src/common/snippets/src/lowered/pass/iter_handler.cpp @@ -115,6 +115,7 @@ bool TransformInnerSplitLoop::run(LinearIR& linear_ir, LinearIR::constExprIt beg offset = offset / inner_loop_work_amount * static_cast(m_tail_size); } inner_loop_end->set_work_amount(m_tail_size); + inner_loop_info->set_work_amount_const(true); // TODO: if m_tail_size more than inner loop increment, // handlers of the inner loop must be reset with new tail size inner_loop_end->set_increment(std::min(inner_loop_increment, m_tail_size)); diff --git a/src/common/snippets/src/lowered/pass/normalize_buffer_ids.cpp b/src/common/snippets/src/lowered/pass/normalize_buffer_ids.cpp deleted file mode 100644 index 76ef3562760daa..00000000000000 --- a/src/common/snippets/src/lowered/pass/normalize_buffer_ids.cpp +++ 
/dev/null @@ -1,39 +0,0 @@ -// Copyright (C) 2023 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "snippets/lowered/pass/normalize_buffer_ids.hpp" - -#include "snippets/op/buffer.hpp" -#include "snippets/itt.hpp" - - -namespace ov { -namespace snippets { -namespace lowered { -namespace pass { - -bool NormalizeBufferIDs::run(lowered::LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) { - OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::NormalizeBufferIDs"); - - // [ original Buffer ID -> normalized ] - std::map buffer_ids; - for (auto expr_it = begin; expr_it != end; ++expr_it) { - const auto& expr = *expr_it; - const auto op = expr->get_node(); - if (const auto buffer = ov::as_type_ptr(op)) { - const auto buffer_id = buffer->get_id(); - if (buffer_ids.count(buffer_id) == 0) { - const auto new_id = buffer_ids.size(); - buffer_ids[buffer_id] = new_id; - } - buffer->set_id(buffer_ids[buffer_id]); - } - } - return buffer_ids.size(); -} - -} // namespace pass -} // namespace lowered -} // namespace snippets -} // namespace ov diff --git a/src/common/snippets/src/lowered/pass/normalize_buffer_reg_groups.cpp b/src/common/snippets/src/lowered/pass/normalize_buffer_reg_groups.cpp new file mode 100644 index 00000000000000..3e235749ce7ca2 --- /dev/null +++ b/src/common/snippets/src/lowered/pass/normalize_buffer_reg_groups.cpp @@ -0,0 +1,39 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/lowered/pass/normalize_buffer_reg_groups.hpp" + +#include "snippets/op/buffer.hpp" +#include "snippets/itt.hpp" + + +namespace ov { +namespace snippets { +namespace lowered { +namespace pass { + +bool NormalizeBufferRegisterGroups::run(lowered::LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) { + OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, 
"Snippets::NormalizeBufferRegisterGroups"); + + // [ original Buffer reg group -> normalized ] + std::map buffer_reg_groups; + for (auto expr_it = begin; expr_it != end; ++expr_it) { + const auto& expr = *expr_it; + const auto op = expr->get_node(); + if (const auto buffer = ov::as_type_ptr(op)) { + const auto group = buffer->get_reg_group(); + if (buffer_reg_groups.count(group) == 0) { + const auto new_id = buffer_reg_groups.size(); + buffer_reg_groups[group] = new_id; + } + buffer->set_reg_group(buffer_reg_groups[group]); + } + } + return buffer_reg_groups.size(); +} + +} // namespace pass +} // namespace lowered +} // namespace snippets +} // namespace ov diff --git a/src/common/snippets/src/lowered/pass/reduce_decomposition.cpp b/src/common/snippets/src/lowered/pass/reduce_decomposition.cpp index 017704adf28089..9a89edd24767a6 100644 --- a/src/common/snippets/src/lowered/pass/reduce_decomposition.cpp +++ b/src/common/snippets/src/lowered/pass/reduce_decomposition.cpp @@ -10,6 +10,7 @@ #include "snippets/op/reduce.hpp" #include "snippets/op/horizon_max.hpp" #include "snippets/op/horizon_sum.hpp" +#include "snippets/snippets_isa.hpp" #include "snippets/utils.hpp" #include "snippets/itt.hpp" @@ -101,6 +102,12 @@ bool ReduceDecomposition::run(LinearIR& linear_ir, LinearIR::constExprIt begin, replace_input_port_connectors({fill.first->get()->get_input_port(0)}, reduce_expr->get_input_port_connector(0)); replace_input_port_connectors(reduce_expr->get_output_port_connector(0)->get_consumers(), horizon.first->get()->get_output_port_connector(0)); + // Update input shapes of consumers + const auto reduce_consumers = horizon.first->get()->get_output_port_connector(0)->get_consumers(); + for (const auto& consumer : reduce_consumers) { + consumer.get_expr()->updateShapes(); + } + // Update Loop info for outer loops const std::vector input_ports{(*fill.first)->get_input_port(0)}; const std::vector output_ports{(*horizon.first)->get_output_port(0)}; diff --git 
a/src/common/snippets/src/lowered/pass/identify_buffers.cpp b/src/common/snippets/src/lowered/pass/set_buffer_reg_group.cpp similarity index 88% rename from src/common/snippets/src/lowered/pass/identify_buffers.cpp rename to src/common/snippets/src/lowered/pass/set_buffer_reg_group.cpp index 7e859ce8b1b173..59c9bf21a0894a 100644 --- a/src/common/snippets/src/lowered/pass/identify_buffers.cpp +++ b/src/common/snippets/src/lowered/pass/set_buffer_reg_group.cpp @@ -2,7 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "snippets/lowered/pass/identify_buffers.hpp" +#include "snippets/lowered/pass/set_buffer_reg_group.hpp" #include "snippets/lowered/linear_ir.hpp" #include "snippets/snippets_isa.hpp" @@ -19,22 +19,22 @@ inline size_t index(size_t col_num, size_t row, size_t col) { } } // namespace -bool operator==(const IdentifyBuffers::ShiftPtrParams& lhs, const IdentifyBuffers::ShiftPtrParams& rhs) { +bool operator==(const SetBufferRegGroup::ShiftPtrParams& lhs, const SetBufferRegGroup::ShiftPtrParams& rhs) { if (&lhs == &rhs) return true; return lhs.ptr_increment == rhs.ptr_increment && lhs.finalization_offset == rhs.finalization_offset && lhs.data_size == rhs.data_size; } -bool operator!=(const IdentifyBuffers::ShiftPtrParams& lhs, const IdentifyBuffers::ShiftPtrParams& rhs) { +bool operator!=(const SetBufferRegGroup::ShiftPtrParams& lhs, const SetBufferRegGroup::ShiftPtrParams& rhs) { return !(rhs == lhs); } -size_t IdentifyBuffers::get_buffer_idx(const ExpressionPtr& target, const BufferPool& pool) { +size_t SetBufferRegGroup::get_buffer_idx(const ExpressionPtr& target, const BufferPool& pool) { const auto iter = std::find(pool.cbegin(), pool.cend(), target); OPENVINO_ASSERT(iter != pool.cend(), "Buffer wasn't find in Buffer system of Subgraph"); return std::distance(pool.cbegin(), iter); } -bool IdentifyBuffers::can_reuse_id(const ShiftPtrParams& lhs, const ShiftPtrParams& rhs) { +bool SetBufferRegGroup::can_be_in_one_group(const ShiftPtrParams& lhs, 
const ShiftPtrParams& rhs) { // If data pointer shift parameters are unknown on model compilation stage (dynamic), // we cannot be sure that these data pointers will be proportionally shifted. // Then we force `false` value here to set unique registers for these buffers @@ -44,13 +44,13 @@ bool IdentifyBuffers::can_reuse_id(const ShiftPtrParams& lhs, const ShiftPtrPara return are_static && equal_ptr_params_shifting && (equal_element_type_sizes || (lhs.ptr_increment == 0 && lhs.finalization_offset == 0)); } -bool IdentifyBuffers::are_adjacent(const std::pair& lhs, +bool SetBufferRegGroup::are_adjacent(const std::pair& lhs, const std::pair& rhs) { const auto& lhs_ids = lhs.first->get_loop_ids(); const auto& rhs_ids = rhs.first->get_loop_ids(); const auto equal_loop_ids = lhs_ids == rhs_ids; if (equal_loop_ids) { // Buffers are connected to the same Loop and have the same outer Loops - return !can_reuse_id(lhs.second, rhs.second); + return !can_be_in_one_group(lhs.second, rhs.second); } else { // Buffers are connected to the same Loop, but one of Buffers - inside this Loop, another - outside // Buffers are adjacent if outer Buffer has not zero data shift params if (lhs_ids.size() == rhs_ids.size()) // If the count of outer Loops are equal, it means that outer loops are already different @@ -64,7 +64,7 @@ bool IdentifyBuffers::are_adjacent(const std::pair& lhs, +void SetBufferRegGroup::update_adj_matrix(const std::pair& lhs, const std::pair& rhs, const BufferPool& buffers, std::vector& adj) { @@ -80,7 +80,7 @@ void IdentifyBuffers::update_adj_matrix(const std::pair IdentifyBuffers::create_adjacency_matrix(LinearIR::constExprIt begin, LinearIR::constExprIt end, const BufferPool& pool) { +std::vector SetBufferRegGroup::create_adjacency_matrix(LinearIR::constExprIt begin, LinearIR::constExprIt end, const BufferPool& pool) { // The sync point to check for adjacency is Loop because only in Loop we increment pointers. 
// So if some Buffers in the one Loop have conflict (cannot be inplace: the different ptr increment and data sizes) // they are called as adjacent @@ -113,7 +113,7 @@ std::vector IdentifyBuffers::create_adjacency_matrix(LinearIR::constExprIt return adj; } -IdentifyBuffers::BufferMap IdentifyBuffers::get_buffer_loop_neighbours(const ExpressionPtr& loop_end_expr) { +SetBufferRegGroup::BufferMap SetBufferRegGroup::get_buffer_loop_neighbours(const ExpressionPtr& loop_end_expr) { const auto& loop_end = ov::as_type_ptr(loop_end_expr->get_node()); const auto input_count = loop_end->get_input_num(); const auto output_count = loop_end->get_output_num(); @@ -157,7 +157,7 @@ IdentifyBuffers::BufferMap IdentifyBuffers::get_buffer_loop_neighbours(const Exp return buffer_neighbours; } -IdentifyBuffers::BufferMap IdentifyBuffers::get_buffer_loop_inside(const LinearIR::constExprIt& loop_end_it) { +SetBufferRegGroup::BufferMap SetBufferRegGroup::get_buffer_loop_inside(const LinearIR::constExprIt& loop_end_it) { const auto& loop_end = ov::as_type_ptr((*loop_end_it)->get_node()); const auto loop_begin = loop_end->get_loop_begin(); BufferMap inner_buffers; @@ -172,7 +172,7 @@ IdentifyBuffers::BufferMap IdentifyBuffers::get_buffer_loop_inside(const LinearI return inner_buffers; } -auto IdentifyBuffers::coloring(BufferPool& buffers, std::vector& adj) -> std::map { +auto SetBufferRegGroup::coloring(BufferPool& buffers, std::vector& adj) -> std::map { size_t color = 0; std::map color_groups; const auto size = buffers.size(); @@ -217,8 +217,8 @@ auto IdentifyBuffers::coloring(BufferPool& buffers, std::vector& adj) -> s return color_groups; } -bool IdentifyBuffers::run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) { - OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::IdentifyBuffers") +bool SetBufferRegGroup::run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) { + 
OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::SetBufferRegGroup") // Identify Buffers using Graph coloring algorithm. BufferPool buffer_pool; @@ -239,7 +239,7 @@ bool IdentifyBuffers::run(LinearIR& linear_ir, lowered::LinearIR::constExprIt be const auto color = pair.first; const auto& united_buffers = pair.second; for (const auto& buffer_expr : united_buffers) { - ov::as_type_ptr(buffer_expr->get_node())->set_id(color); + ov::as_type_ptr(buffer_expr->get_node())->set_reg_group(color); } } diff --git a/src/common/snippets/src/lowered/pass/solve_buffer_memory.cpp b/src/common/snippets/src/lowered/pass/solve_buffer_memory.cpp index 5b7d7e07714b64..f8416f2ea7326e 100644 --- a/src/common/snippets/src/lowered/pass/solve_buffer_memory.cpp +++ b/src/common/snippets/src/lowered/pass/solve_buffer_memory.cpp @@ -4,6 +4,7 @@ #include "snippets/lowered/pass/solve_buffer_memory.hpp" +#include "snippets/lowered/pass/allocate_buffers.hpp" #include "snippets/pass/tokenization.hpp" #include "snippets/utils.hpp" #include "snippets/itt.hpp" @@ -14,73 +15,144 @@ namespace snippets { namespace lowered { namespace pass { -std::vector SolveBufferMemory::init_boxes(const AllocateBuffers::BufferClusters& buffer_clusters) { - std::vector boxes; - const auto count = static_cast(buffer_clusters.size()); - for (int i = 0; i < count; i++) { - ov::MemorySolver::Box box = { std::numeric_limits::max(), 0, 0, i }; - int64_t box_size = 0; - for (const auto& buffer_expr : buffer_clusters[i]) { - int e_start = 0, e_finish = 0; - const auto buffer = ov::as_type_ptr(buffer_expr->get_node()); - OPENVINO_ASSERT(buffer != nullptr, "BufferSolver expects Buffer ops in clusters"); - - // life finish time - order of LoopEnd / MemoryAccess ops - const auto& buffer_outs = buffer_expr->get_output_port_connectors(); - for (const auto& buffer_out : buffer_outs) { - const auto consumers = buffer_out->get_consumers(); - for (const auto& consumer : consumers) { - const auto consumer_order = 
static_cast(ov::snippets::pass::GetTopologicalOrder(consumer.get_expr()->get_node())); - e_finish = std::max(e_finish, consumer_order); // the last consumer - } +std::pair SolveBufferMemory::extract_static_and_dynamic_buffers(const LinearIR::container& buffer_expressions) { + LinearIR::container static_buffer_exprs, dynamic_buffer_exprs; + for (const auto& buffer_expr : buffer_expressions) { + const auto& buffer = ov::as_type_ptr(buffer_expr->get_node()); + OPENVINO_ASSERT(buffer, "Buffer clusters expects Buffer nodes"); + + auto& clusters = buffer->is_defined() ? static_buffer_exprs : dynamic_buffer_exprs; + clusters.push_back(buffer_expr); + } + + // Validation check that buffer cluster has only static or dynamic buffers. + for (const auto& static_buffer : static_buffer_exprs) { + const auto static_cluster_id = ov::as_type_ptr(static_buffer->get_node())->get_cluster_id(); + auto is_cluster_ids_the_same = [&static_cluster_id](const ExpressionPtr& expr) { + return static_cluster_id == ov::as_type_ptr(expr->get_node())->get_cluster_id(); + }; + OPENVINO_ASSERT(std::none_of(dynamic_buffer_exprs.cbegin(), dynamic_buffer_exprs.cend(), is_cluster_ids_the_same), + "There is Buffer cluster with buffers which has defined and undefined allocation sizes"); + } + + return { static_buffer_exprs, dynamic_buffer_exprs }; +} + +std::vector SolveBufferMemory::init_boxes(const LinearIR::container& buffer_expressions) { + std::map map_boxes; + for (const auto& buffer_expr : buffer_expressions) { + const auto& buffer = ov::as_type_ptr(buffer_expr->get_node()); + OPENVINO_ASSERT(buffer, "Buffer clusters expects Buffer nodes"); + auto cluster_id = static_cast(buffer->get_cluster_id()); + + if (map_boxes.count(cluster_id) == 0) { + map_boxes[cluster_id] = { std::numeric_limits::max(), 0, 0, cluster_id }; + } + + auto& box = map_boxes.at(cluster_id); + + int e_start = 0, e_finish = 0; + + // life finish time - order of LoopEnd / MemoryAccess ops + const auto& buffer_outs = 
buffer_expr->get_output_port_connectors(); + for (const auto& buffer_out : buffer_outs) { + const auto consumers = buffer_out->get_consumers(); + for (const auto& consumer : consumers) { + const auto consumer_order = static_cast(ov::snippets::pass::GetTopologicalOrder(consumer.get_expr()->get_node())); + e_finish = std::max(e_finish, consumer_order); // the last consumer } - e_start = e_finish; - - const auto& buffer_ins = buffer_expr->get_input_port_connectors(); - for (const auto& buffer_in : buffer_ins) { - const auto& source = buffer_in->get_source(); - e_start = static_cast(ov::snippets::pass::GetTopologicalOrder(source.get_expr()->get_node())); - - const auto buffer_siblings = buffer_in->get_consumers(); - for (const auto& sibling : buffer_siblings) { - if (const auto loop_end = ov::as_type_ptr(sibling.get_expr()->get_node())) { - e_start = std::min(e_start, static_cast(ov::snippets::pass::GetTopologicalOrder(loop_end->get_loop_begin()))); - } + } + e_start = e_finish; + + const auto& buffer_ins = buffer_expr->get_input_port_connectors(); + for (const auto& buffer_in : buffer_ins) { + const auto& source = buffer_in->get_source(); + e_start = static_cast(ov::snippets::pass::GetTopologicalOrder(source.get_expr()->get_node())); + + const auto buffer_siblings = buffer_in->get_consumers(); + for (const auto& sibling : buffer_siblings) { + if (const auto loop_end = ov::as_type_ptr(sibling.get_expr()->get_node())) { + e_start = std::min(e_start, static_cast(ov::snippets::pass::GetTopologicalOrder(loop_end->get_loop_begin()))); } } - OPENVINO_ASSERT(e_start <= e_finish, "Incorrect life time of buffer!"); + } + OPENVINO_ASSERT(e_start <= e_finish, "Incorrect life time of buffer!"); - auto buffer_size = static_cast(buffer->get_byte_size()); - box_size = std::max(buffer_size, box_size); + auto buffer_size = static_cast(buffer->get_byte_size()); + box.size = std::max(buffer_size, box.size); - box.start = std::min(e_start, box.start); - box.finish = std::max(e_finish, 
box.finish); - } + box.start = std::min(e_start, box.start); + box.finish = std::max(e_finish, box.finish); + } + + std::vector boxes(map_boxes.size()); + for (const auto& p : map_boxes) { + const auto& buffer_id = static_cast(p.first); + OPENVINO_ASSERT(buffer_id < boxes.size(), "Incorrect Buffer Cluster ID"); + boxes[buffer_id] = p.second; // We use data alignment to put data in the line cache - box.size = utils::div_up(box_size, m_alignment); - boxes.push_back(box); + boxes.at(buffer_id).size = utils::div_up(boxes.at(buffer_id).size, m_alignment); } + return boxes; } - -bool SolveBufferMemory::run(LinearIR& linear_ir) { - OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::SolveBufferMemory"); - - const auto boxes = init_boxes(m_clusters); +void SolveBufferMemory::solve_static_buffer_memory(const LinearIR::container& static_buffer_expressions) { + const auto boxes = init_boxes(static_buffer_expressions); ov::MemorySolver memSolver(boxes); - m_buffer_scratchpad_size = static_cast(memSolver.solve()) * m_alignment; // alignment in byte + m_static_buffer_scratchpad_size = static_cast(memSolver.solve()) * m_alignment; // alignment in byte // Set offsets for Buffers - for (const auto& box : boxes) { - for (const auto& buffer : m_clusters[box.id]) { - const auto offset = static_cast(memSolver.get_offset(static_cast(box.id))); - AllocateBuffers::set_buffer_offset(buffer, offset * m_alignment); // alignment in byte + for (const auto& buffer_expr : static_buffer_expressions) { + const auto& buffer = ov::as_type_ptr(buffer_expr->get_node()); + OPENVINO_ASSERT(buffer, "Buffer clusters expects Buffer nodes"); + auto cluster_id = static_cast(buffer->get_cluster_id()); + + const auto offset = static_cast(memSolver.get_offset(static_cast(boxes[cluster_id].id))); + AllocateBuffers::set_buffer_offset(buffer_expr, offset * m_alignment); // alignment in byte + } +} + +void SolveBufferMemory::set_dynamic_buffer_offset(const LinearIR::container& 
dynamic_buffer_expressions) { + size_t offset = utils::get_dynamic_value(); + + // If there are not allocated memory for static buffers in LinearIR and there is only one cluster of dynamic buffer exprs, + // we can force offset = 0 + if (m_static_buffer_scratchpad_size == 0) { + std::set dynamic_clusters; + for (const auto& dynamic_buffer_expr : dynamic_buffer_expressions) { + const auto& buffer = ov::as_type_ptr(dynamic_buffer_expr->get_node()); + OPENVINO_ASSERT(buffer, "Buffer clusters expects Buffer nodes"); + dynamic_clusters.insert(buffer->get_cluster_id()); } + if (dynamic_clusters.size() == 1) + offset = 0; + } + + // Set offsets for Buffers + for (const auto& buffer_expr : dynamic_buffer_expressions) { + const auto& buffer = ov::as_type_ptr(buffer_expr->get_node()); + OPENVINO_ASSERT(buffer, "Buffer clusters expects Buffer nodes"); + + AllocateBuffers::set_buffer_offset(buffer_expr, offset); } - return m_buffer_scratchpad_size > 0; +} + +bool SolveBufferMemory::run(LinearIR& linear_ir) { + OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::SolveBufferMemory"); + + LinearIR::container static_buffer_exprs, dynamic_buffer_exprs; + std::tie(static_buffer_exprs, dynamic_buffer_exprs) = extract_static_and_dynamic_buffers(linear_ir.get_buffer_ops()); + + if (!static_buffer_exprs.empty()) + solve_static_buffer_memory(static_buffer_exprs); + + if (!dynamic_buffer_exprs.empty()) + set_dynamic_buffer_offset(dynamic_buffer_exprs); + + return !static_buffer_exprs.empty() && !dynamic_buffer_exprs.empty(); } } // namespace pass diff --git a/src/common/snippets/src/lowered/pass/split_loops.cpp b/src/common/snippets/src/lowered/pass/split_loops.cpp index 163980a21e5f72..365bcd250b4556 100644 --- a/src/common/snippets/src/lowered/pass/split_loops.cpp +++ b/src/common/snippets/src/lowered/pass/split_loops.cpp @@ -70,6 +70,7 @@ bool SplitLoops::run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, if (FuseLoops::can_be_fused(upper_loop, 
lower_loop) && can_be_split(loop_to_split, loop_to_fuse)) { loop_was_split = true; loop_to_split->set_work_amount(loop_to_fuse->get_increment()); + loop_to_split->set_work_amount_const(true); const auto& loop_to_split_id = split_parent ? parent_loop_id : loop_id; const auto loop_bounds = LoopManager::get_loop_bounds(linear_ir, loop_to_split_id, diff --git a/src/common/snippets/src/lowered/pass/update_loop_info.cpp b/src/common/snippets/src/lowered/pass/update_loop_info.cpp deleted file mode 100644 index 3112701a737dc0..00000000000000 --- a/src/common/snippets/src/lowered/pass/update_loop_info.cpp +++ /dev/null @@ -1,92 +0,0 @@ -// Copyright (C) 2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "snippets/lowered/pass/update_loop_info.hpp" - -#include "snippets/lowered/pass/init_loops.hpp" -#include "snippets/lowered/pass/insert_specific_iterations.hpp" -#include "snippets/lowered/loop_manager.hpp" -#include "snippets/itt.hpp" - -namespace ov { -namespace snippets { -namespace lowered { -namespace pass { - -void UpdateLoopInfo::init_data_ptr_shifts(const UnifiedLoopInfoPtr& unified_loop_info, std::vector& ptr_increments, - std::vector& finalization_offsets) { - const auto count = unified_loop_info->get_input_count() + unified_loop_info->get_output_count(); - ptr_increments.resize(count); - finalization_offsets.resize(count); - - size_t idx = 0; - unified_loop_info->iterate_through_descs( - [&ptr_increments, &finalization_offsets, &idx](const UnifiedLoopInfo::LoopPortDesc& desc) { - ptr_increments[idx] = desc.ptr_increment; - finalization_offsets[idx] = desc.finalization_offset; - ++idx; - }); -} - -bool UpdateLoopInfo::run(LinearIR& linear_ir) { - OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::UpdateLoopInfo") - - // Initialized UnifiedLoopInfo - struct CurrentUnifiedLoopInfo { - UnifiedLoopInfoPtr updated_unified_loop_info = nullptr; - size_t current_work_amount = 0; - std::vector ptr_increments; - std::vector 
finalization_offsets; - }; - std::unordered_map initializated_info_map; - - const auto& loop_map = linear_ir.get_loop_manager()->get_map(); - for (const auto& p : loop_map) { - const auto& expanded_loop_info = ov::as_type_ptr(p.second); - OPENVINO_ASSERT(expanded_loop_info, "UpdateLoopInfo expects ExpandedLoopInfo in LoopManager"); - - // First visiting of unified (whole) loop - const auto& current_unified_loop_info = expanded_loop_info->get_unified_loop_info(); - if (initializated_info_map.count(current_unified_loop_info) == 0) { - auto& current_info = initializated_info_map[current_unified_loop_info]; - // make a copy to avoid original loop info corruption - current_info.updated_unified_loop_info = std::make_shared(*current_unified_loop_info); - InitLoops::init_loop_info(current_info.updated_unified_loop_info, true); - - current_info.current_work_amount = current_info.updated_unified_loop_info->get_work_amount(); - init_data_ptr_shifts(current_info.updated_unified_loop_info, current_info.ptr_increments, current_info.finalization_offsets); - } - - auto& initializated_info = initializated_info_map.at(current_unified_loop_info); - auto& current_work_amount = initializated_info.current_work_amount; - const auto& updated_unified_loop_info = initializated_info.updated_unified_loop_info; - const auto& ptr_increments = initializated_info.ptr_increments; - const auto& finalization_offsets = initializated_info.finalization_offsets; - - const auto& decomposed_loop_type = expanded_loop_info->get_type(); - - // If the specific iteration is not needed, we skip loop evaluation - set zero as work amount is enough - if (!InsertSpecificIterations::is_decomposed_loop_needed(updated_unified_loop_info, decomposed_loop_type, current_work_amount)) { - expanded_loop_info->set_work_amount(0); - continue; - } - - expanded_loop_info->set_work_amount( - InsertSpecificIterations::get_decomposed_loop_work_amount(updated_unified_loop_info, decomposed_loop_type, current_work_amount)); - // 
Update remaining Loop work amount - current_work_amount -= expanded_loop_info->get_work_amount(); - - expanded_loop_info->update_ptr_increments(ptr_increments); - if (current_work_amount > 0) { - expanded_loop_info->update_finalization_offsets(std::vector(finalization_offsets.size(), 0)); - } else { - expanded_loop_info->update_finalization_offsets(finalization_offsets); - } - } - return true; -} -} // namespace pass -} // namespace lowered -} // namespace snippets -} // namespace ov diff --git a/src/common/snippets/src/lowered/pass/validate.cpp b/src/common/snippets/src/lowered/pass/validate.cpp index dc9dbdea76b5c8..c0d605e0dfa1c4 100644 --- a/src/common/snippets/src/lowered/pass/validate.cpp +++ b/src/common/snippets/src/lowered/pass/validate.cpp @@ -5,6 +5,7 @@ #include "snippets/lowered/pass/validate.hpp" #include "snippets/lowered/loop_manager.hpp" +#include "snippets/snippets_isa.hpp" #include "snippets/utils.hpp" #include "snippets/itt.hpp" diff --git a/src/common/snippets/src/lowered/pass/validate_expanded_loops.cpp b/src/common/snippets/src/lowered/pass/validate_expanded_loops.cpp index 1653d9da993f6d..9c62907e569670 100644 --- a/src/common/snippets/src/lowered/pass/validate_expanded_loops.cpp +++ b/src/common/snippets/src/lowered/pass/validate_expanded_loops.cpp @@ -5,6 +5,7 @@ #include "snippets/lowered/pass/validate_expanded_loops.hpp" #include "snippets/lowered/loop_manager.hpp" +#include "snippets/op/loop.hpp" #include "snippets/utils.hpp" #include "snippets/itt.hpp" @@ -17,15 +18,6 @@ namespace pass { OPENVINO_ASSERT((cond), "Failed to validate ExpandedLoops: ", __VA_ARGS__) namespace { -template -void dynamic_safe_add(T& lhs, const T& rhs) { - if (utils::is_dynamic_value(lhs) || utils::is_dynamic_value(rhs)) { - lhs = utils::get_dynamic_value(); - return; - } - lhs += rhs; -} - bool is_inner_splitted_tail(const ExpressionPtr& loop_expr, const LoopManagerPtr& loop_manager) { const auto loop_end = ov::as_type_ptr(loop_expr->get_node()); 
INFORMATIVE_ASSERT(loop_end, "expects LoopEnd"); @@ -81,7 +73,7 @@ void ValidateExpandedLoops::validate_loop_information(const LinearIR& linear_ir) total_finalization_offsets.resize(num_ports, 0); } - dynamic_safe_add(current_work_amount, expanded_loop_info->get_work_amount()); + utils::dynamic_safe_add(current_work_amount, expanded_loop_info->get_work_amount()); INFORMATIVE_ASSERT(current_unified_loop_info->get_ptr_increments() == expanded_loop_info->get_ptr_increments(), "incompatible pointer increments with UnifiedLoopInfo"); @@ -89,7 +81,7 @@ void ValidateExpandedLoops::validate_loop_information(const LinearIR& linear_ir) INFORMATIVE_ASSERT(finalization_offsets.size() == total_finalization_offsets.size(), "incompatible finalization offset count"); for (size_t i = 0; i < num_ports; ++i) - dynamic_safe_add(total_finalization_offsets[i], finalization_offsets[i]); + utils::dynamic_safe_add(total_finalization_offsets[i], finalization_offsets[i]); } } diff --git a/src/common/snippets/src/op/buffer.cpp b/src/common/snippets/src/op/buffer.cpp index 424063d68d5a59..dc455300522ba1 100644 --- a/src/common/snippets/src/op/buffer.cpp +++ b/src/common/snippets/src/op/buffer.cpp @@ -13,75 +13,67 @@ namespace ov { namespace snippets { namespace op { -Buffer::Buffer(const OutputVector& arguments, const ov::Shape& shape, size_t id, ov::element::Type element_type) - : Op(arguments), m_shape(shape), m_id(id), m_element_type(std::move(element_type)), m_offset(0) { +Buffer::Buffer(const OutputVector& arguments, size_t allocation_size, size_t reg_group, size_t cluster_id) + : Op(arguments), m_allocation_size(allocation_size), m_reg_group(reg_group), m_cluster_id(cluster_id), m_offset(0) { constructor_validate_and_infer_types(); } bool Buffer::visit_attributes(AttributeVisitor& visitor) { INTERNAL_OP_SCOPE(Buffer_visit_attributes); - visitor.on_attribute("allocation_shape", m_shape); + auto element_type = get_element_type(); + auto allocation_size = 
utils::value2str(m_allocation_size); + visitor.on_attribute("allocation_size", allocation_size); visitor.on_attribute("offset", m_offset); - visitor.on_attribute("id", m_id); - visitor.on_attribute("element_type", m_element_type); + visitor.on_attribute("reg_group", m_reg_group); + visitor.on_attribute("cluster_id", m_cluster_id); + visitor.on_attribute("element_type", element_type); return true; } -size_t Buffer::get_byte_size() const { - const auto shape = get_allocation_shape(); - return ov::shape_size(shape) * m_element_type.size(); +bool Buffer::is_defined() const { + return !utils::is_dynamic_value(m_allocation_size); } -IntermediateMemoryBuffer::IntermediateMemoryBuffer(const ov::Output& arg, const ov::Shape& shape, size_t id) - : Buffer({arg}, shape, id) { - constructor_validate_and_infer_types(); +size_t Buffer::get_byte_size() const { + if (is_defined()) + return m_allocation_size * get_element_type().size(); + return utils::get_dynamic_value(); } -IntermediateMemoryBuffer::IntermediateMemoryBuffer(const ov::Output& arg, int32_t allocation_rank, size_t id) - : Buffer({arg}, compute_shape_from_allocation_rank(arg, allocation_rank), id) { +IntermediateMemoryBuffer::IntermediateMemoryBuffer(const ov::Output& arg, size_t allocation_size, size_t reg_group, size_t cluster_id) + : Buffer({arg}, allocation_size, reg_group, cluster_id) { constructor_validate_and_infer_types(); } -ov::Shape IntermediateMemoryBuffer::compute_shape_from_allocation_rank(const ov::Output& arg, int32_t allocation_rank) { - const auto& pshape = arg.get_partial_shape(); - OPENVINO_ASSERT(pshape.is_static(), "Buffer supports only static input shape"); - const auto shape = pshape.get_shape(); - const auto normalize_rank = utils::normalize_rank(static_cast(allocation_rank), shape.size()); - const auto offset = static_cast(shape.size()) - normalize_rank; - return ov::Shape{shape.begin() + offset, shape.end()}; -} - void IntermediateMemoryBuffer::validate_and_infer_types() { 
INTERNAL_OP_SCOPE(Buffer_validate_and_infer_types); ov::PartialShape output_shape; - m_element_type = get_input_element_type(0); - output_shape = get_input_partial_shape(0); - set_output_type(0, m_element_type, output_shape); + set_output_type(0, get_input_element_type(0), get_input_partial_shape(0)); } std::shared_ptr IntermediateMemoryBuffer::clone_with_new_inputs(const OutputVector& new_args) const { INTERNAL_OP_SCOPE(Buffer_clone_with_new_inputs); check_new_args_count(this, new_args); - auto new_buffer = std::make_shared(new_args.at(0), m_shape, m_id); + auto new_buffer = std::make_shared(new_args.at(0), m_allocation_size, m_reg_group, m_cluster_id); new_buffer->set_offset(m_offset); return new_buffer; } -NewMemoryBuffer::NewMemoryBuffer(const ov::Shape& shape, size_t id, ov::element::Type element_type) - : Buffer({}, shape, id, element_type) { +NewMemoryBuffer::NewMemoryBuffer(const ov::Shape& shape, size_t reg_group, size_t cluster_id, ov::element::Type element_type) + : Buffer({}, ov::shape_size(shape), reg_group, cluster_id), m_output_shape(shape), m_element_type(element_type) { constructor_validate_and_infer_types(); } void NewMemoryBuffer::validate_and_infer_types() { INTERNAL_OP_SCOPE(Buffer_validate_and_infer_types); OPENVINO_ASSERT(get_input_size() == 0, "Buffer with new allocated memory mustn't have arguments!"); - set_output_type(0, m_element_type, m_shape); + set_output_type(0, m_element_type, m_output_shape); } std::shared_ptr NewMemoryBuffer::clone_with_new_inputs(const OutputVector& new_args) const { INTERNAL_OP_SCOPE(Buffer_clone_with_new_inputs); check_new_args_count(this, new_args); - auto new_buffer = std::make_shared(m_shape, m_id, m_element_type); + auto new_buffer = std::make_shared(m_output_shape, m_reg_group, m_cluster_id, m_element_type); new_buffer->set_offset(m_offset); return new_buffer; } diff --git a/src/common/snippets/src/op/loop.cpp b/src/common/snippets/src/op/loop.cpp index 39c93a304a74d0..26b1dcb008244d 100644 --- 
a/src/common/snippets/src/op/loop.cpp +++ b/src/common/snippets/src/op/loop.cpp @@ -83,11 +83,14 @@ void LoopEnd::validate_and_infer_types() { bool LoopEnd::visit_attributes(AttributeVisitor &visitor) { std::vector int_incremented(m_is_incremented.cbegin(), m_is_incremented.cend()); + auto work_amount = utils::value2str(m_work_amount); + auto ptr_increments = ov::PartialShape(m_ptr_increments); + auto final_offsets = ov::PartialShape(m_finalization_offsets); visitor.on_attribute("is_incremented", int_incremented); - visitor.on_attribute("ptr_incr", m_ptr_increments); - visitor.on_attribute("fin_offset", m_finalization_offsets); + visitor.on_attribute("ptr_incr", ptr_increments); + visitor.on_attribute("fin_offset", final_offsets); visitor.on_attribute("data_sizes", m_element_type_sizes); - visitor.on_attribute("work_amount", m_work_amount); + visitor.on_attribute("work_amount", work_amount); visitor.on_attribute("increment", m_work_amount_increment); visitor.on_attribute("input_num", m_input_num); visitor.on_attribute("output_num", m_output_num); diff --git a/src/common/snippets/src/op/memory_access.cpp b/src/common/snippets/src/op/memory_access.cpp index f3fa1f90c206ae..352e355ef75df5 100644 --- a/src/common/snippets/src/op/memory_access.cpp +++ b/src/common/snippets/src/op/memory_access.cpp @@ -2,8 +2,9 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "snippets/itt.hpp" #include "snippets/op/memory_access.hpp" +#include "snippets/utils.hpp" +#include "snippets/itt.hpp" namespace ov { namespace snippets { @@ -49,20 +50,33 @@ bool MemoryAccess::is_full_memory_access_op(const std::shared_ptr& op) } bool MemoryAccess::visit_attributes(AttributeVisitor& visitor) { + bool is_dynamic = false; for (const auto& p : m_input_ports) { auto idx = p.first; auto port = p.second; - visitor.on_attribute("count_in_" + std::to_string(idx), port.count); - visitor.on_attribute("offset_in_" + std::to_string(idx), port.offset); - visitor.on_attribute("stride_in_" + 
std::to_string(idx), port.stride); + auto count = utils::value2str(port.count); + auto offset = utils::value2str(port.offset); + auto stride = utils::value2str(port.stride); + visitor.on_attribute("count_in_" + std::to_string(idx), count); + visitor.on_attribute("offset_in_" + std::to_string(idx), offset); + visitor.on_attribute("stride_in_" + std::to_string(idx), stride); + is_dynamic |= utils::is_dynamic_value(port.count) || utils::is_dynamic_value(port.offset) || utils::is_dynamic_value(port.stride); } for (const auto& p : m_output_ports) { auto idx = p.first; auto port = p.second; - visitor.on_attribute("count_out_" + std::to_string(idx), port.count); - visitor.on_attribute("offset_out_" + std::to_string(idx), port.offset); - visitor.on_attribute("stride_out_" + std::to_string(idx), port.stride); + auto count = utils::value2str(port.count); + auto offset = utils::value2str(port.offset); + auto stride = utils::value2str(port.stride); + visitor.on_attribute("count_out_" + std::to_string(idx), count); + visitor.on_attribute("offset_out_" + std::to_string(idx), offset); + visitor.on_attribute("stride_out_" + std::to_string(idx), stride); + is_dynamic |= utils::is_dynamic_value(port.count) || utils::is_dynamic_value(port.offset) || utils::is_dynamic_value(port.stride); } + + std::string dynamic_status = is_dynamic ? 
"DYNAMIC" : "STATIC"; + visitor.on_attribute("dynamic_status", dynamic_status); + return true; } diff --git a/src/common/snippets/src/op/subgraph.cpp b/src/common/snippets/src/op/subgraph.cpp index 8d888d5a75e7c2..ab793c722d1e3e 100644 --- a/src/common/snippets/src/op/subgraph.cpp +++ b/src/common/snippets/src/op/subgraph.cpp @@ -454,7 +454,7 @@ void Subgraph::control_flow_transformations(size_t min_parallel_work_amount, siz pipeline.register_pass(); pipeline.register_pass(); pipeline.register_pass(); - pipeline.register_pass(static_cast(loop_depth)); + pipeline.register_pass(); pipeline.register_pass(vector_size); pipeline.register_pass(); pipeline.register_pass(); diff --git a/src/common/snippets/src/runtime_configurator.cpp b/src/common/snippets/src/runtime_configurator.cpp index 60ea8dd1a35e02..06e7ebaf74944e 100644 --- a/src/common/snippets/src/runtime_configurator.cpp +++ b/src/common/snippets/src/runtime_configurator.cpp @@ -4,17 +4,34 @@ #include "snippets/runtime_configurator.hpp" +#include "snippets/lowered/pass/init_loops.hpp" +#include "snippets/lowered/pass/insert_specific_iterations.hpp" +#include "snippets/lowered/pass/compute_buffer_allocation_size.hpp" +#include "snippets/snippets_isa.hpp" #include "snippets/utils.hpp" -#include "snippets/lowered/pass/update_loop_info.hpp" namespace ov { namespace snippets { +namespace { +void init_data_ptr_shifts(const lowered::UnifiedLoopInfoPtr& unified_loop_info, std::vector& ptr_increments, + std::vector& finalization_offsets) { + const auto count = unified_loop_info->get_input_count() + unified_loop_info->get_output_count(); + ptr_increments.resize(count); + finalization_offsets.resize(count); + + size_t idx = 0; + unified_loop_info->iterate_through_descs( + [&ptr_increments, &finalization_offsets, &idx](const lowered::UnifiedLoopInfo::LoopPortDesc& desc) { + ptr_increments[idx] = desc.ptr_increment; + finalization_offsets[idx] = desc.finalization_offset; + ++idx; + }); +} +} // namespace + 
RuntimeConfigurator::RuntimeConfigurator(std::shared_ptr c) : m_config(std::move(c)) { OPENVINO_ASSERT(m_config, "Runtime config is nullptr!"); - - // Init LinearIR StateUpdater: some passes to update LoopInfo, BufferInfo etc - m_state_updater.register_pass(); } const std::shared_ptr& RuntimeConfigurator::get_updated_config(const std::shared_ptr& linear_ir) { @@ -26,21 +43,10 @@ const std::shared_ptr& RuntimeConfigurator::get_updated_config(co return m_config; } -void RuntimeConfigurator::update(const std::shared_ptr& linear_ir) { - if (linear_ir->is_dynamic()) { - m_state_updater.run(*linear_ir); - } - - m_config->master_shape = linear_ir->get_master_shape(); - m_config->buffer_scratchpad_size = linear_ir->get_buffer_scratchpad_size(); - - update_data_offsets(); - update_latest_shapes(); -} - void RuntimeConfigurator::initialization(const std::shared_ptr& linear_ir) { init_data_info(linear_ir); init_tensor_rank(linear_ir); + init_buffer_info(linear_ir); OPENVINO_ASSERT(m_io_num > 0, "LinearIR must have parameters and results"); m_latest_shapes.resize(m_io_num); @@ -48,6 +54,18 @@ void RuntimeConfigurator::initialization(const std::shared_ptrtile_rank = linear_ir->get_config().m_loop_depth; } +void RuntimeConfigurator::update(const std::shared_ptr& linear_ir) { + if (linear_ir->is_dynamic()) { + update_loop_info(linear_ir); + update_buffer_scratchpad_size(linear_ir); + } + + m_config->master_shape = linear_ir->get_master_shape(); + + update_data_offsets(); + update_latest_shapes(); +} + void RuntimeConfigurator::init_tensor_rank(const std::shared_ptr& linear_ir) const { m_config->tensor_rank = linear_ir->get_master_shape().size(); } @@ -94,6 +112,122 @@ void RuntimeConfigurator::init_data_info(const std::shared_ptr& linear_ir) { + std::set cluster_ids; + std::map> dynamic_buffer_clusters, static_buffer_clusters; + + const auto& buffer_expressions = linear_ir->get_buffer_ops(); + for (const auto& buffer_expr : buffer_expressions) { + const auto buffer = 
ov::as_type_ptr(buffer_expr->get_node()); + OPENVINO_ASSERT(buffer, "Expected Buffer ops in Buffer expressions of LinearIR"); + + auto& clusters = buffer->is_defined() ? static_buffer_clusters : dynamic_buffer_clusters; + clusters[buffer->get_cluster_id()].insert(buffer_expr); + cluster_ids.insert(buffer->get_cluster_id()); + } + + OPENVINO_ASSERT(cluster_ids.size() == dynamic_buffer_clusters.size() + static_buffer_clusters.size(), "Incorrect count of Buffer clusters"); + OPENVINO_ASSERT(cluster_ids.empty() || (*cluster_ids.cbegin() == 0 && *cluster_ids.crbegin() == (cluster_ids.size() - 1)), + "Incorrect indetifiers of Buffer clusters"); + + m_config->buffer_scratchpad_size = linear_ir->get_static_buffer_scratchpad_size(); + m_config->buffer_cluster_offsets.resize(cluster_ids.size(), utils::get_dynamic_value()); + + for (const auto& p : static_buffer_clusters) { + const auto& cluster_id = p.first; + const auto& cluster = p.second; + OPENVINO_ASSERT(dynamic_buffer_clusters.count(cluster_id) == 0, "Buffers from the same cluster must be only static or dynamic"); + + OPENVINO_ASSERT(cluster.size() > 0, "Incorrect size of buffer cluster"); + size_t cluster_offset = ov::as_type_ptr((*cluster.cbegin())->get_node())->get_offset(); + for (const auto& buffer_expr : cluster) { + OPENVINO_ASSERT(cluster_offset == ov::as_type_ptr(buffer_expr->get_node())->get_offset(), + "Static Buffers from the same cluster must have the same offset!"); + } + + m_config->buffer_cluster_offsets[cluster_id] = cluster_offset; + } + + m_dynamic_buffer_clusters = std::move(dynamic_buffer_clusters); +} + +void RuntimeConfigurator::update_loop_info(const std::shared_ptr& linear_ir) const { + // Initialized UnifiedLoopInfo + struct CurrentUnifiedLoopInfo { + size_t current_work_amount = 0; + std::vector ptr_increments; + std::vector finalization_offsets; + }; + std::unordered_map initializated_info_map; + + const auto& loop_map = linear_ir->get_loop_manager()->get_map(); + for (const auto& p : 
loop_map) { + const auto& expanded_loop_info = ov::as_type_ptr(p.second); + OPENVINO_ASSERT(expanded_loop_info, "UpdateLoopInfo expects ExpandedLoopInfo in LoopManager"); + + // First visiting of unified (whole) loop + const auto& current_unified_loop_info = expanded_loop_info->get_unified_loop_info(); + if (initializated_info_map.count(current_unified_loop_info) == 0) { + auto& current_info = initializated_info_map[current_unified_loop_info]; + lowered::pass::InitLoops::init_loop_info(current_unified_loop_info, true); + + current_info.current_work_amount = current_unified_loop_info->get_work_amount(); + init_data_ptr_shifts(current_unified_loop_info, current_info.ptr_increments, current_info.finalization_offsets); + } + + auto& initializated_info = initializated_info_map.at(current_unified_loop_info); + auto& current_work_amount = initializated_info.current_work_amount; + const auto& ptr_increments = initializated_info.ptr_increments; + const auto& finalization_offsets = initializated_info.finalization_offsets; + + const auto& decomposed_loop_type = expanded_loop_info->get_type(); + + // If the specific iteration is not needed, we skip loop evaluation - set zero as work amount is enough + if (!lowered::pass::InsertSpecificIterations::is_decomposed_loop_needed(current_unified_loop_info, decomposed_loop_type, current_work_amount)) { + expanded_loop_info->set_work_amount(0); + continue; + } + + expanded_loop_info->set_work_amount( + lowered::pass::InsertSpecificIterations::get_decomposed_loop_work_amount(current_unified_loop_info, decomposed_loop_type, current_work_amount)); + // Update remaining Loop work amount + current_work_amount -= expanded_loop_info->get_work_amount(); + + expanded_loop_info->update_ptr_increments(ptr_increments); + if (current_work_amount > 0) { + expanded_loop_info->update_finalization_offsets(std::vector(finalization_offsets.size(), 0)); + } else { + expanded_loop_info->update_finalization_offsets(finalization_offsets); + } + } +} + +void 
RuntimeConfigurator::update_buffer_scratchpad_size(const std::shared_ptr& linear_ir) const { + const auto& loop_manager = linear_ir->get_loop_manager(); + m_config->buffer_scratchpad_size = linear_ir->get_static_buffer_scratchpad_size(); + + for (const auto& p : m_dynamic_buffer_clusters) { + const auto& cluster_id = p.first; + const auto& cluster = p.second; + + auto& cluster_offset = m_config->buffer_cluster_offsets[cluster_id]; + cluster_offset = utils::get_dynamic_value(); + + size_t additional_size = 0; + for (const auto& buffer_expr : cluster) { + const auto& allocation_size = lowered::pass::ComputeBufferAllocationSize::get_allocation_size(loop_manager, buffer_expr, m_config->tile_rank); + additional_size = std::max(allocation_size * buffer_expr->get_node()->get_element_type().size(), additional_size); + } + + cluster_offset = m_config->buffer_scratchpad_size; + OPENVINO_ASSERT(!utils::is_dynamic_value(cluster_offset), "Offset of the cluster must be defined!"); + OPENVINO_ASSERT(!utils::is_dynamic_value(additional_size), "Buffer scratchpad size must be defined!"); + m_config->buffer_scratchpad_size += additional_size; + } + + OPENVINO_ASSERT(!utils::is_dynamic_value(m_config->buffer_scratchpad_size), "Buffer scratchpad size must be defined!"); +} + void RuntimeConfigurator::update_data_offsets() const { for (size_t i = 0; i < m_io_num; ++i) { // offsets represent distance between consecutive elements of corresponding dimension. 
diff --git a/src/common/snippets/src/shape_inference/shape_infer_instances.cpp b/src/common/snippets/src/shape_inference/shape_infer_instances.cpp index 49cc1a379c8b18..d56c28acf66a28 100644 --- a/src/common/snippets/src/shape_inference/shape_infer_instances.cpp +++ b/src/common/snippets/src/shape_inference/shape_infer_instances.cpp @@ -12,16 +12,6 @@ using Result = IShapeInferSnippets::Result; * Merge SRC to DST with broadcasting rules defined by the Autobroadcast specifier */ bool broadcast_merge_into(VectorDims& dst, const VectorDims& src, const ov::op::AutoBroadcastSpec& autob) { - auto broadcast_merge_dim = [](size_t& dst, const size_t& d1, const size_t& d2) { - if (d1 == d2 || d1 == 1 || utils::is_dynamic_value(d1)) { - dst = d2; - } else if (d2 == 1 || utils::is_dynamic_value(d2)) { - dst = d1; - } else { - return false; - } - return true; - }; // Ranks are both static. const auto dst_rank = static_cast(dst.size()); const auto src_rank = static_cast(src.size()); @@ -35,7 +25,7 @@ bool broadcast_merge_into(VectorDims& dst, const VectorDims& src, const ov::op:: for (int64_t i = 0; i < new_rank; i++) { auto dsti = i < (new_rank - dst_rank) ? 1 : dst[i - (new_rank - dst_rank)]; auto srci = i < (new_rank - src_rank) ? 
1 : src[i - (new_rank - src_rank)]; - success &= broadcast_merge_dim(dims[i], dsti, srci); + success &= utils::broadcast_merge_dim(dims[i], dsti, srci); } dst = std::move(dims); return success; @@ -55,7 +45,7 @@ bool broadcast_merge_into(VectorDims& dst, const VectorDims& src, const ov::op:: if (src[i] > dst[axis + i]) return false; } - success &= broadcast_merge_dim(dst[axis + i], dst[axis + i], src[i]); + success &= utils::broadcast_merge_dim(dst[axis + i], dst[axis + i], src[i]); } return success; } diff --git a/src/common/snippets/src/utils.cpp b/src/common/snippets/src/utils.cpp index 9d67248efc079f..a7f00bbfebcb9c 100644 --- a/src/common/snippets/src/utils.cpp +++ b/src/common/snippets/src/utils.cpp @@ -102,14 +102,15 @@ auto get_non_scalar_constant_count_for_fq(const std::shared_ptr BufferAllocationParams; class BufferAllocationTest : public testing::TestWithParam { @@ -38,7 +39,8 @@ class BufferAllocationTest : public testing::TestWithParam GetModel() const override; }; -class MHABufferAllocationTest : public BufferAllocationTest { -protected: - std::shared_ptr GetModel() const override; - - static void MarkBrgemm(const std::shared_ptr& node, const std::vector& subtensor); -}; - } // namespace snippets } // namespace test } // namespace ov diff --git a/src/common/snippets/tests/src/lowered/pass/buffer_allocation.cpp b/src/common/snippets/tests/src/lowered/pass/buffer_allocation.cpp index 26f63454318d37..e56a31a8e92a4c 100644 --- a/src/common/snippets/tests/src/lowered/pass/buffer_allocation.cpp +++ b/src/common/snippets/tests/src/lowered/pass/buffer_allocation.cpp @@ -26,20 +26,21 @@ namespace snippets { std::string BufferAllocationTest::getTestCaseName(testing::TestParamInfo obj) { bool is_optimized, with_split_loops; - size_t expected_size, expected_count; + size_t expected_size, expected_reg_group_count, expected_cluster_count; - std::tie(is_optimized, with_split_loops, expected_size, expected_count) = obj.param; + std::tie(is_optimized, 
with_split_loops, expected_size, expected_reg_group_count, expected_cluster_count) = obj.param; std::ostringstream result; result << "Opt=" << ov::test::utils::bool2str(is_optimized) << "_"; result << "Split=" << ov::test::utils::bool2str(with_split_loops) << "_"; result << "ExpBufferSize=" << expected_size << "_"; - result << "ExpBufferNum=" << expected_count; + result << "ExpBufferRegGroupCount=" << expected_reg_group_count << "_"; + result << "ExpBufferClustersCount=" << expected_reg_group_count << "_"; return result.str(); } void BufferAllocationTest::SetUp() { - std::tie(m_is_buffer_optimized, m_with_split_loops, m_expected_size, m_expected_count) = this->GetParam(); + std::tie(m_is_buffer_optimized, m_with_split_loops, m_expected_size, m_expected_reg_group_count, m_expected_cluster_count) = this->GetParam(); const auto body = GetModel(); m_linear_ir = ov::snippets::lowered::LinearIR(body, std::make_shared()); @@ -71,7 +72,7 @@ void BufferAllocationTest::ApplyTransformations(const std::shared_ptr(m_vector_size); pipeline.register_pass(); pipeline.register_pass(); - pipeline.register_pass(2); + pipeline.register_pass(); pipeline.register_pass(m_vector_size); pipeline.register_pass(); pipeline.register_pass(); @@ -80,14 +81,16 @@ void BufferAllocationTest::ApplyTransformations(const std::shared_ptr gprs; + std::set reg_groups, clusters; for (const auto& expr : m_linear_ir) { if (const auto buffer = ov::as_type_ptr(expr->get_node())) { - gprs.insert(buffer->get_id()); + reg_groups.insert(buffer->get_reg_group()); + clusters.insert(buffer->get_cluster_id()); } } - EXPECT_EQ(gprs.size(), m_expected_count); - EXPECT_EQ(m_linear_ir.get_buffer_scratchpad_size(), m_expected_size); + EXPECT_EQ(reg_groups.size(), m_expected_reg_group_count); + EXPECT_EQ(clusters.size(), m_expected_cluster_count); + EXPECT_EQ(m_linear_ir.get_static_buffer_scratchpad_size(), m_expected_size); } std::shared_ptr EltwiseBufferAllocationTest::GetModel() const { @@ -98,9 +101,9 @@ 
std::shared_ptr EltwiseBufferAllocationTest::GetModel() const { const auto parameter0 = std::make_shared(ov::element::f32, ov::PartialShape({1, 3, 100, 100})); const auto parameter1 = std::make_shared(ov::element::f32, ov::PartialShape({1, 3, 100, 100})); const auto add = std::make_shared(parameter0, parameter1); - const auto buffer0 = std::make_shared(add, static_cast(subtensor_buffer.size())); + const auto buffer0 = std::make_shared(add); const auto relu = std::make_shared(buffer0); - const auto buffer1 = std::make_shared(relu, static_cast(subtensor_buffer.size())); + const auto buffer1 = std::make_shared(relu); const auto exp = std::make_shared(buffer1); const auto body = std::make_shared(std::make_shared(exp), ov::ParameterVector{parameter0, parameter1}); @@ -113,65 +116,9 @@ std::shared_ptr EltwiseBufferAllocationTest::GetModel() const { return body; } -void MHABufferAllocationTest::MarkBrgemm(const std::shared_ptr& node, const std::vector& subtensor) { - const auto subtensor_full = std::vector{ov::snippets::lowered::PortDescriptor::ServiceDimensions::FULL_DIM, - ov::snippets::lowered::PortDescriptor::ServiceDimensions::FULL_DIM}; - ov::snippets::lowered::PortDescriptorUtils::set_port_descriptor_ptr( - node->input(0), std::make_shared(node->input(0), subtensor)); - ov::snippets::lowered::PortDescriptorUtils::set_port_descriptor_ptr( - node->input(1), std::make_shared(node->input(1), subtensor_full)); - ov::snippets::lowered::PortDescriptorUtils::set_port_descriptor_ptr( - node->output(0), std::make_shared(node->output(0), subtensor)); -} - -std::shared_ptr MHABufferAllocationTest::GetModel() const { - const auto subtensor_scalar = std::vector{1}; - const auto subtensor_eltwise = std::vector{1, m_vector_size}; - const auto subtensor_brgemm = std::vector{32, ov::snippets::lowered::PortDescriptor::ServiceDimensions::FULL_DIM}; - const auto subtensor_power = std::vector{1, ov::snippets::lowered::PortDescriptor::ServiceDimensions::FULL_DIM}; - - const auto 
parameter0 = std::make_shared(ov::element::f32, ov::PartialShape({1, 12, 128, 64})); - const auto parameter1 = std::make_shared(ov::element::f32, ov::PartialShape({1, 128, 12, 64})); - const auto parameter2 = std::make_shared(ov::element::f32, ov::PartialShape({1, 12, 128, 64})); - - const auto load_reshape = std::make_shared(parameter1, 1, 0, std::vector{0, 2, 3, 1}); - const auto store = std::make_shared(load_reshape); - const auto relu0 = std::make_shared(store); - const auto matmul0 = std::make_shared(parameter0, relu0); - const auto relu1 = std::make_shared(matmul0); - - // Decomposed Softmax - const auto reduce_max = std::make_shared(relu1, 3); - ov::snippets::op::ReduceBase::compute_and_set_reduce_subtensors(reduce_max); - const auto subtract = std::make_shared(relu1, reduce_max); - const auto exp = std::make_shared(subtract); - - const auto reduce_sum = std::make_shared(exp, 3); - ov::snippets::op::ReduceBase::compute_and_set_reduce_subtensors(reduce_sum); - const auto power = std::make_shared(reduce_sum, -1.f); - const auto multiply = std::make_shared(exp, power); - - const auto matmul1 = std::make_shared(multiply, parameter2); - const auto relu2 = std::make_shared(matmul1); - - const auto body = std::make_shared(std::make_shared(relu2), ov::ParameterVector{parameter0, parameter1, parameter2}); - - MarkOp(load_reshape, subtensor_scalar); - MarkOp(store, subtensor_scalar); - MarkOp(power, subtensor_power); - - MarkBrgemm(matmul0, subtensor_brgemm); - MarkBrgemm(matmul1, subtensor_brgemm); - - return body; -} - TEST_P(EltwiseBufferAllocationTest, BufferAllocation) { Validate(); } -TEST_P(MHABufferAllocationTest, BufferAllocation) { - Validate(); -} namespace BufferAllocationTest_Instances { @@ -179,8 +126,9 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_EltwiseNotOptimized, El ::testing::Combine( ::testing::Values(false), ::testing::Values(false), // in this test it doesn't make sense - ::testing::Values(80000), // Each Buffer has own allocated 
memory - ::testing::Values(2)), // Each Buffer has unique ID + ::testing::Values(80000), // Each Buffer has own allocated memory + ::testing::Values(2), // Each Buffer has unique reg group + ::testing::Values(2)), // Each Buffer has unique cluster ID BufferAllocationTest::getTestCaseName); INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_EltwiseOptimized, EltwiseBufferAllocationTest, @@ -188,39 +136,8 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_EltwiseOptimized, Eltwi ::testing::Values(true), ::testing::Values(false), // in this test it doesn't make sense ::testing::Values(40000), // Two Buffer reuse memory - ::testing::Values(1)), // Two Buffers reuse IDs - BufferAllocationTest::getTestCaseName); - -INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_MHANotOptimizedWSplit, MHABufferAllocationTest, - ::testing::Combine( - ::testing::Values(false), - ::testing::Values(true), - ::testing::Values(139264), // Each Buffer has own allocated memory - ::testing::Values(7)), // Each Buffer has unique ID - BufferAllocationTest::getTestCaseName); - -INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_MHAOptimizedWSplit, MHABufferAllocationTest, - ::testing::Combine( - ::testing::Values(true), - ::testing::Values(true), - ::testing::Values(57344), // (Buffer before brgemm) + (between brgemms) + (after brgemm) - ::testing::Values(2)), // (Buffer before brgemm0 and after brgemm1) + (between brgemms) - BufferAllocationTest::getTestCaseName); - -INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_MHANotOptimizedWOSplit, MHABufferAllocationTest, - ::testing::Combine( - ::testing::Values(false), - ::testing::Values(false), - ::testing::Values(360448), // Each Buffer has own allocated memory - ::testing::Values(7)), // Each Buffer has unique ID - BufferAllocationTest::getTestCaseName); - -INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_MHAOptimizedWOSplit, MHABufferAllocationTest, - ::testing::Combine( - ::testing::Values(true), - 
::testing::Values(false), - ::testing::Values(98304), // (between brgemms) + (Buffer before brgemm0 and after brgemm1) - ::testing::Values(2)), // (Buffer before brgemm0 and after brgemm1) + (between brgemms) + ::testing::Values(1), // Two Buffers reuse IDs + ::testing::Values(1)), // Two Buffers are from the same cluster BufferAllocationTest::getTestCaseName); } // namespace BufferAllocationTest_Instances diff --git a/src/common/snippets/tests/src/lowered/pass/insert_load_store.cpp b/src/common/snippets/tests/src/lowered/pass/insert_load_store.cpp index 4df2aa7c56033f..f799e5d38e1ab3 100644 --- a/src/common/snippets/tests/src/lowered/pass/insert_load_store.cpp +++ b/src/common/snippets/tests/src/lowered/pass/insert_load_store.cpp @@ -6,6 +6,7 @@ #include "openvino/opsets/opset10.hpp" #include "snippets/lowered/pass/insert_load_store.hpp" +#include "snippets/snippets_isa.hpp" namespace ov { namespace test { diff --git a/src/common/snippets/tests/src/lowered/pass/loop.cpp b/src/common/snippets/tests/src/lowered/pass/loop.cpp index 0169201e0aee60..c86be368a5ab1b 100644 --- a/src/common/snippets/tests/src/lowered/pass/loop.cpp +++ b/src/common/snippets/tests/src/lowered/pass/loop.cpp @@ -47,7 +47,7 @@ static void init_linear_ir(const std::vector& in_shapes, Linea const auto outer_wa = std::max(*(in_shape0.rbegin() + 1), *(in_shape1.rbegin() + 1)); const auto outer_inc = blocked_wa; loop_manager->mark_loop(expr_it, std::next(expr_it), inner_wa, inner_inc, 0, loop_input_ports, loop_output_ports); - loop_manager->mark_loop(expr_it, std::next(expr_it), blocked_wa, blocked_inc, 1, loop_input_ports, loop_output_ports); + loop_manager->mark_loop(expr_it, std::next(expr_it), blocked_wa, blocked_inc, 1, loop_input_ports, loop_output_ports, true, true); const auto loop_id = loop_manager->mark_loop(expr_it, std::next(expr_it), outer_wa, outer_inc, 1, loop_input_ports, loop_output_ports); const auto& outer_loop_info = loop_manager->get_loop_info(loop_id); const auto
outer_tail_size = outer_wa % outer_inc; diff --git a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_kernel_emitter.cpp b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_kernel_emitter.cpp index 71e6b3294e1773..cd36de7847bdbe 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_kernel_emitter.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_kernel_emitter.cpp @@ -48,7 +48,7 @@ jit_kernel_emitter::jit_kernel_emitter(jit_generator* h, cpu_isa_t isa, const ov std::set unique_buffers; for (const auto& expr : *body) { if (const auto buffer = ov::as_type_ptr(expr->get_node())) { - const auto buffer_id = buffer->get_id(); + const auto buffer_id = buffer->get_cluster_id(); if (unique_buffers.count(buffer_id) == 0) { mem_access_exprs.push_back(expr); unique_buffers.insert(buffer_id); diff --git a/src/plugins/intel_cpu/src/emitters/snippets/jit_snippets_call_args.hpp b/src/plugins/intel_cpu/src/emitters/snippets/jit_snippets_call_args.hpp index 2a34ca9fc50e00..f04720466a4631 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/jit_snippets_call_args.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/jit_snippets_call_args.hpp @@ -46,6 +46,7 @@ struct jit_snippets_call_args { // for all non-static data members. 
So we can keep them public or friend all control-flow emitters loop_args_t* loop_args = nullptr; amx_tile_config_t amx_tile_config; + size_t memory_access_offsets[SNIPPETS_MAX_DATA_PTR_COUNT] = {}; }; struct jit_snippets_call_args::loop_args_t { diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp index a1fde3bf28f3bf..e444d65bab774b 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp @@ -61,8 +61,7 @@ namespace ov { static bool is_load_emitter(const intel_cpu::jit_emitter *emitter) { bool ret = false; if (dynamic_cast(emitter) || - dynamic_cast(emitter) || - dynamic_cast(emitter)) { + dynamic_cast(emitter)) { return true; } return ret; @@ -70,8 +69,7 @@ static bool is_load_emitter(const intel_cpu::jit_emitter *emitter) { static bool is_store_emitter(const intel_cpu::jit_emitter *emitter) { bool ret = false; - if (dynamic_cast(emitter) || - dynamic_cast(emitter)) { + if (dynamic_cast(emitter)) { return true; } return ret; @@ -171,12 +169,12 @@ intel_cpu::CPUTargetMachine::CPUTargetMachine(dnnl::impl::cpu::x64::cpu_isa_t ho jitters[snippets::op::Load::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_load_memory_emitter); jitters[snippets::op::LoadReshape::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_load_memory_emitter); jitters[snippets::op::BroadcastLoad::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_load_broadcast_emitter); - jitters[intel_cpu::LoadConvertSaturation::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_load_convert_emitter); - jitters[intel_cpu::LoadConvertTruncation::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_load_convert_emitter); + jitters[intel_cpu::LoadConvertSaturation::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_load_memory_emitter); + 
jitters[intel_cpu::LoadConvertTruncation::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_load_memory_emitter); jitters[snippets::op::Store::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_store_memory_emitter); - jitters[intel_cpu::StoreConvertSaturation::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_store_convert_emitter); - jitters[intel_cpu::StoreConvertTruncation::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_store_convert_emitter); + jitters[intel_cpu::StoreConvertSaturation::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_store_memory_emitter); + jitters[intel_cpu::StoreConvertTruncation::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_store_memory_emitter); jitters[snippets::op::Scalar::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_scalar_emitter); jitters[snippets::op::BroadcastMove::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_broadcast_move_emitter); diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_kernel_emitter.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_kernel_emitter.cpp index 42f8d61c669dd5..fe1864ab8467c2 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_kernel_emitter.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_kernel_emitter.cpp @@ -44,10 +44,10 @@ jit_kernel_emitter::jit_kernel_emitter(jit_generator* h, cpu_isa_t isa, const ov std::set unique_buffers; for (const auto& expr : *body) { if (const auto buffer = ov::as_type_ptr(expr->get_node())) { - const auto buffer_id = buffer->get_id(); - if (unique_buffers.count(buffer_id) == 0) { + const auto buffer_reg_group = buffer->get_reg_group(); + if (unique_buffers.count(buffer_reg_group) == 0) { mem_access_exprs.push_back(expr); - unique_buffers.insert(buffer_id); + unique_buffers.insert(buffer_reg_group); } } else { if (std::find(parameters.cbegin(), parameters.cend(), expr) == parameters.cend() 
&& diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_memory_emitters.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_memory_emitters.cpp index 21ba08422a1665..df888b63b3d601 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_memory_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_memory_emitters.cpp @@ -4,8 +4,10 @@ #include "jit_memory_emitters.hpp" +#include "emitters/snippets/jit_snippets_call_args.hpp" #include "transformations/snippets/x64/op/load_convert.hpp" #include "transformations/snippets/x64/op/store_convert.hpp" +#include "snippets/op/buffer.hpp" using namespace Xbyak; @@ -19,143 +21,122 @@ using jit_generator = dnnl::impl::cpu::x64::jit_generator; using cpu_isa_t = dnnl::impl::cpu::x64::cpu_isa_t; using ExpressionPtr = ov::snippets::lowered::ExpressionPtr; -jit_memory_emitter::jit_memory_emitter(jit_generator* h, cpu_isa_t isa, const ExpressionPtr& expr) : jit_emitter(h, isa) { +jit_memory_emitter::jit_memory_emitter(jit_generator* h, cpu_isa_t isa, const ExpressionPtr& expr, emitter_in_out_map in_out_type) + : jit_emitter(h, isa) { + in_out_type_ = in_out_type; + const auto n = expr->get_node(); src_prc = n->get_input_element_type(0); dst_prc = n->get_output_element_type(0); -} -jit_load_memory_emitter::jit_load_memory_emitter(jit_generator* h, cpu_isa_t isa, const ExpressionPtr& expr) - : jit_memory_emitter(h, isa, expr) { - if (src_prc != dst_prc) - OV_CPU_JIT_EMITTER_THROW("supports only equal input and output types but gets: ", - src_prc.get_type_name(), - " and ", - dst_prc.get_type_name()); + const auto& memory_access = std::dynamic_pointer_cast(expr->get_node()); + if (in_out_type_ == emitter_in_out_map::gpr_to_vec) { + OV_CPU_JIT_EMITTER_ASSERT(memory_access->is_memory_access_input_port(0), "must be input port - memory access"); + count = memory_access->get_input_count(); + compiled_byte_offset = memory_access->get_input_offset(); + runtime_args_offset = 
get_parent_buffer_cluster_id(expr); + } else if (in_out_type_ == emitter_in_out_map::vec_to_gpr) { + OV_CPU_JIT_EMITTER_ASSERT(memory_access->is_memory_access_output_port(0), "must be output port - memory access"); + count = memory_access->get_output_count(); + compiled_byte_offset = memory_access->get_output_offset(); + runtime_args_offset = get_consumer_buffer_cluster_id(expr); + } else { + OV_CPU_JIT_EMITTER_THROW("unsupported in_out_type"); + } - const auto load = std::dynamic_pointer_cast(expr->get_node()); - count = load->get_count(); - byte_offset = load->get_offset(); - in_out_type_ = emitter_in_out_map::gpr_to_vec; - load_emitter.reset(new jit_load_emitter(h, isa, src_prc, dst_prc, count)); + if (ov::snippets::utils::is_dynamic_value(compiled_byte_offset)) { + is_offset_runtime = true; + // Compiled byte offset is zero to manually `add` runtime offset before operation and `sub` after to reset pointer in the register + compiled_byte_offset = 0; + OPENVINO_ASSERT(runtime_args_offset != SIZE_MAX, "Incorrect buffer offset in call_args"); + } } -void jit_load_memory_emitter::emit_impl(const std::vector& in, - const std::vector& out) const { - if (host_isa_ == dnnl::impl::cpu::x64::sse41) { - emit_isa(in, out); - } else if (host_isa_ == dnnl::impl::cpu::x64::avx2) { - emit_isa(in, out); - } else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) { - emit_isa(in, out); - } else { - OV_CPU_JIT_EMITTER_THROW("Unsupported ISA ", host_isa_); +size_t jit_memory_emitter::aux_gprs_count() const { + // for runtime arguments + return is_offset_runtime ? 
1 : 0; +} + +size_t jit_memory_emitter::get_parent_buffer_cluster_id(const ov::snippets::lowered::ExpressionPtr& expr) { + OPENVINO_ASSERT(expr->get_input_port_connectors().size() == 1, "MemoryAccess must have one parent"); + const auto& parent_expr = expr->get_input_port_connector(0)->get_source().get_expr(); + if (const auto buffer = ov::as_type_ptr(parent_expr->get_node())) { + return buffer->get_cluster_id(); } + return SIZE_MAX; } -template -void jit_load_memory_emitter::emit_isa(const std::vector &in, const std::vector &out) const { - if (!load_emitter) - OV_CPU_JIT_EMITTER_THROW("Load CPU emitter isn't initialized!"); - load_emitter->emit_code({in[0], byte_offset}, {out[0]}, aux_vec_idxs, aux_gpr_idxs); +size_t jit_memory_emitter::get_consumer_buffer_cluster_id(const ov::snippets::lowered::ExpressionPtr& expr) { + OPENVINO_ASSERT(expr->get_output_port_connectors().size() == 1, "MemoryAccess must have one consumer"); + const auto& consumers = expr->get_output_port_connector(0)->get_consumers(); + for (const auto& consumer : consumers) + if (const auto buffer = ov::as_type_ptr(consumer.get_expr()->get_node())) + return buffer->get_cluster_id(); + return SIZE_MAX; } -void jit_load_memory_emitter::emit_data() const { - load_emitter->emit_data(); +std::vector jit_memory_emitter::get_available_aux_gprs() const { + if (aux_gpr_idxs.empty()) + return aux_gpr_idxs; + return std::vector(aux_gpr_idxs.cbegin() + static_cast(is_offset_runtime), aux_gpr_idxs.cend()); } -jit_load_broadcast_emitter::jit_load_broadcast_emitter(jit_generator* h, cpu_isa_t isa, const ExpressionPtr& expr) - : jit_memory_emitter(h, isa, expr) { - if (src_prc != dst_prc) - OV_CPU_JIT_EMITTER_THROW("supports only equal input and output types but gets: ", - src_prc.get_type_name(), - " and ", - dst_prc.get_type_name()); +void jit_memory_emitter::emit_code(const std::vector &in_idxs, const std::vector &out_idxs, + const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs) const { + 
emitter_preamble(in_idxs, out_idxs, pool_vec_idxs, pool_gpr_idxs); - const auto broadcast_load = std::dynamic_pointer_cast(expr->get_node()); - byte_offset = broadcast_load->get_offset(); - in_out_type_ = emitter_in_out_map::gpr_to_vec; -} + Reg64 reg_runtime_params = abi_param1; // defined by jit_kernel_emitter + Reg64 aux_gpr = is_offset_runtime ? Reg64(static_cast(aux_gpr_idxs[0])) : Reg64(); -void jit_load_broadcast_emitter::emit_impl(const std::vector& in, - const std::vector& out) const { - if (host_isa_ == dnnl::impl::cpu::x64::sse41) { - emit_isa(in, out); - } else if (host_isa_ == dnnl::impl::cpu::x64::avx2) { - emit_isa(in, out); - } else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) { - emit_isa(in, out); + Reg64 data_reg; + if (in_out_type_ == emitter_in_out_map::gpr_to_vec) { + data_reg = Reg64(in_idxs[0]); + } else if (in_out_type_ == emitter_in_out_map::vec_to_gpr) { + data_reg = Reg64(out_idxs[0]); } else { - OV_CPU_JIT_EMITTER_THROW("Unsupported ISA ", host_isa_); + OV_CPU_JIT_EMITTER_THROW("unsupported in_out_type"); } -} -template -void jit_load_broadcast_emitter::emit_isa(const std::vector &in, const std::vector &out) const { - using Vmm = typename dnnl::impl::utils::conditional3::type; - Reg64 in_reg(in[0]); - Vmm vmm_dst = Vmm(out[0]); + if (is_offset_runtime) { + h->mov(aux_gpr, h->ptr[reg_runtime_params + GET_OFF(memory_access_offsets) + runtime_args_offset * sizeof(size_t)]); + h->add(data_reg, aux_gpr); + } - // In doesn't really matter if we broadcast or `movss` for vector tails so keep only one version for `BroadcastLoad`, - // key point here is not to add post-increment, it might be fixed by some other approach in future - switch (src_prc.size()) { - case 4: h->uni_vbroadcastss(vmm_dst, h->ptr[in_reg + byte_offset]); break; - case 2: h->vpbroadcastw(vmm_dst, h->ptr[in_reg + byte_offset]); break; - case 1: h->vpbroadcastb(vmm_dst, h->ptr[in_reg + byte_offset]); break; - default: OV_CPU_JIT_EMITTER_THROW("Unsupported data type"); + 
emit_impl(in_idxs, out_idxs); + + if (is_offset_runtime) { + h->sub(data_reg, aux_gpr); } -} -jit_load_convert_emitter::jit_load_convert_emitter(jit_generator* h, cpu_isa_t isa, const ExpressionPtr& expr) - : jit_memory_emitter(h, isa, expr) { - const auto load = ov::as_type_ptr(expr->get_node()); - count = load->get_count(); - byte_offset = load->get_offset(); - in_out_type_ = emitter_in_out_map::gpr_to_vec; - load_emitter.reset(new jit_load_emitter(h, isa, src_prc, dst_prc, count)); + emitter_postamble(); } -void jit_load_convert_emitter::emit_impl(const std::vector& in, - const std::vector& out) const { - if (host_isa_ == dnnl::impl::cpu::x64::sse41) { - emit_isa(in, out); - } else if (host_isa_ == dnnl::impl::cpu::x64::avx2) { - emit_isa(in, out); - } else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) { - emit_isa(in, out); - } else { - OV_CPU_JIT_EMITTER_THROW("Unsupported ISA ", host_isa_); - } +jit_load_memory_emitter::jit_load_memory_emitter(jit_generator* h, cpu_isa_t isa, const ExpressionPtr& expr) + : jit_memory_emitter(h, isa, expr, emitter_in_out_map::gpr_to_vec) { + OV_CPU_JIT_EMITTER_ASSERT(ov::is_type(expr->get_node()), "expects Load node"); + load_emitter.reset(new jit_load_emitter(h, isa, src_prc, dst_prc, count)); } -template -void jit_load_convert_emitter::emit_isa(const std::vector &in, const std::vector &out) const { - if (!load_emitter) - OV_CPU_JIT_EMITTER_THROW("Load CPU emitter isn't initialized!"); - load_emitter->emit_code({in[0], byte_offset}, {out[0]}, aux_vec_idxs, aux_gpr_idxs); +void jit_load_memory_emitter::emit_impl(const std::vector& in, const std::vector& out) const { + OV_CPU_JIT_EMITTER_ASSERT(load_emitter, "Load CPU emitter isn't initialized!"); + load_emitter->emit_code({in[0], compiled_byte_offset}, {out[0]}, aux_vec_idxs, get_available_aux_gprs()); } -void jit_load_convert_emitter::emit_data() const { +void jit_load_memory_emitter::emit_data() const { load_emitter->emit_data(); } 
-jit_store_memory_emitter::jit_store_memory_emitter(jit_generator* h, cpu_isa_t isa, const ExpressionPtr& expr) : jit_memory_emitter(h, isa, expr) { +jit_load_broadcast_emitter::jit_load_broadcast_emitter(jit_generator* h, cpu_isa_t isa, const ExpressionPtr& expr) + : jit_memory_emitter(h, isa, expr, emitter_in_out_map::gpr_to_vec) { + OV_CPU_JIT_EMITTER_ASSERT(ov::is_type(expr->get_node()), "expects BroadcastLoad node"); if (src_prc != dst_prc) OV_CPU_JIT_EMITTER_THROW("supports only equal input and output types but gets: ", src_prc.get_type_name(), " and ", dst_prc.get_type_name()); - - const auto store = ov::as_type_ptr(expr->get_node()); - count = store->get_count(); - byte_offset = store->get_offset(); - in_out_type_ = emitter_in_out_map::vec_to_gpr; - store_emitter.reset(new jit_store_emitter(h, isa, src_prc, dst_prc, count)); } -void jit_store_memory_emitter::emit_impl(const std::vector& in, - const std::vector& out) const { +void jit_load_broadcast_emitter::emit_impl(const std::vector& in, const std::vector& out) const { if (host_isa_ == dnnl::impl::cpu::x64::sse41) { emit_isa(in, out); } else if (host_isa_ == dnnl::impl::cpu::x64::avx2) { @@ -168,51 +149,41 @@ void jit_store_memory_emitter::emit_impl(const std::vector& in, } template -void jit_store_memory_emitter::emit_isa(const std::vector &in, const std::vector &out) const { - if (!store_emitter) - OV_CPU_JIT_EMITTER_THROW("Store CPU emitter isn't initialized!"); - store_emitter->emit_code({in[0], byte_offset}, {out[0]}, aux_vec_idxs, aux_gpr_idxs); -} +void jit_load_broadcast_emitter::emit_isa(const std::vector &in, const std::vector &out) const { + using Vmm = typename dnnl::impl::utils::conditional3::type; + Reg64 in_reg(in[0]); + Vmm vmm_dst = Vmm(out[0]); -void jit_store_memory_emitter::emit_data() const { - store_emitter->emit_data(); + // In doesn't really matter if we broadcast or `movss` for vector tails so keep only one version for `BroadcastLoad`, + // key point here is not to add 
post-increment, it might be fixed by some other approach in future + switch (src_prc.size()) { + case 4: h->uni_vbroadcastss(vmm_dst, h->ptr[in_reg + compiled_byte_offset]); break; + case 2: h->vpbroadcastw(vmm_dst, h->ptr[in_reg + compiled_byte_offset]); break; + case 1: h->vpbroadcastb(vmm_dst, h->ptr[in_reg + compiled_byte_offset]); break; + default: OV_CPU_JIT_EMITTER_THROW("Unsupported data type"); + } } -jit_store_convert_emitter::jit_store_convert_emitter(jit_generator* h, cpu_isa_t isa, const ExpressionPtr& expr) - : jit_memory_emitter(h, isa, expr) { - const auto store = ov::as_type_ptr(expr->get_node()); - count = store->get_count(); - byte_offset = store->get_offset(); - in_out_type_ = emitter_in_out_map::vec_to_gpr; - +jit_store_memory_emitter::jit_store_memory_emitter(jit_generator* h, cpu_isa_t isa, const ExpressionPtr& expr) + : jit_memory_emitter(h, isa, expr, emitter_in_out_map::vec_to_gpr) { if (ov::is_type(expr->get_node())) { store_emitter.reset(new jit_store_emitter(h, isa, src_prc, dst_prc, count, arithmetic_mode::truncation)); } else if (ov::is_type(expr->get_node())) { store_emitter.reset(new jit_store_emitter(h, isa, src_prc, dst_prc, count, arithmetic_mode::saturation)); - } -} - -void jit_store_convert_emitter::emit_impl(const std::vector& in, - const std::vector& out) const { - if (host_isa_ == dnnl::impl::cpu::x64::sse41) { - emit_isa(in, out); - } else if (host_isa_ == dnnl::impl::cpu::x64::avx2) { - emit_isa(in, out); - } else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) { - emit_isa(in, out); + } else if (ov::is_type(expr->get_node())) { + store_emitter.reset(new jit_store_emitter(h, isa, src_prc, dst_prc, count)); } else { - OV_CPU_JIT_EMITTER_THROW("Unsupported ISA ", host_isa_); + OV_CPU_JIT_EMITTER_THROW("expects Store node"); } } -template -void jit_store_convert_emitter::emit_isa(const std::vector &in, const std::vector &out) const { - if (!store_emitter) - OV_CPU_JIT_EMITTER_THROW("Store CPU emitter isn't 
initialized!"); - store_emitter->emit_code({in[0], byte_offset}, {out[0]}, aux_vec_idxs, aux_gpr_idxs); +void jit_store_memory_emitter::emit_impl(const std::vector& in, const std::vector& out) const { + OV_CPU_JIT_EMITTER_ASSERT(store_emitter, "Store CPU emitter isn't initialized!"); + store_emitter->emit_code({in[0], compiled_byte_offset}, {out[0]}, aux_vec_idxs, get_available_aux_gprs()); } -void jit_store_convert_emitter::emit_data() const { +void jit_store_memory_emitter::emit_data() const { store_emitter->emit_data(); } diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_memory_emitters.hpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_memory_emitters.hpp index 50276d9d9e2f1b..50315ec298a2d4 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_memory_emitters.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_memory_emitters.hpp @@ -14,14 +14,27 @@ namespace intel_cpu { class jit_memory_emitter : public jit_emitter { public: jit_memory_emitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, - const ov::snippets::lowered::ExpressionPtr& expr); + const ov::snippets::lowered::ExpressionPtr& expr, emitter_in_out_map in_out_type); + + void emit_code(const std::vector &in_idxs, const std::vector &out_idxs, + const std::vector &pool_vec_idxs = {}, const std::vector &pool_gpr_idxs = {}) const override; protected: + static size_t get_parent_buffer_cluster_id(const ov::snippets::lowered::ExpressionPtr& expr); + static size_t get_consumer_buffer_cluster_id(const ov::snippets::lowered::ExpressionPtr& expr); + + size_t aux_gprs_count() const override; + + std::vector get_available_aux_gprs() const; + ov::element::Type src_prc; ov::element::Type dst_prc; size_t count = 0; - size_t byte_offset = 0; + size_t compiled_byte_offset = 0; + size_t runtime_args_offset = 0; + bool is_offset_runtime = false; + #ifdef SNIPPETS_DEBUG_CAPS friend std::string init_info_jit_memory_emitter(const jit_memory_emitter 
*emitter); #endif @@ -37,8 +50,6 @@ class jit_load_memory_emitter : public jit_memory_emitter { private: void emit_impl(const std::vector& in, const std::vector& out) const override; - template - void emit_isa(const std::vector &in, const std::vector &out) const; void emit_data() const override; private: @@ -59,24 +70,6 @@ class jit_load_broadcast_emitter : public jit_memory_emitter { void emit_isa(const std::vector &in, const std::vector &out) const; }; -class jit_load_convert_emitter : public jit_memory_emitter { -public: - jit_load_convert_emitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, - const ov::snippets::lowered::ExpressionPtr& expr); - - size_t get_inputs_num() const override {return 0;} - -private: - void emit_impl(const std::vector& in, const std::vector& out) const override; - - template - void emit_isa(const std::vector &in, const std::vector &out) const; - void emit_data() const override; - -private: - std::unique_ptr load_emitter = nullptr; -}; - class jit_store_memory_emitter : public jit_memory_emitter { public: jit_store_memory_emitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, @@ -87,26 +80,6 @@ class jit_store_memory_emitter : public jit_memory_emitter { private: void emit_impl(const std::vector& in, const std::vector& out) const override; - template - void emit_isa(const std::vector &in, const std::vector &out) const; - void emit_data() const override; - -private: - std::unique_ptr store_emitter = nullptr; -}; - -class jit_store_convert_emitter : public jit_memory_emitter { -public: - jit_store_convert_emitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, - const ov::snippets::lowered::ExpressionPtr& expr); - - size_t get_inputs_num() const override {return 1;} - -private: - void emit_impl(const std::vector& in, const std::vector& out) const override; - - template - void emit_isa(const std::vector &in, const std::vector &out) const; void 
emit_data() const override; private: diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/verbose.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/verbose.cpp index 6bc410b1b042ee..d9c87dbf8b3ae3 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/verbose.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/verbose.cpp @@ -56,7 +56,7 @@ std::string init_info_jit_memory_emitter(const jit_memory_emitter *emitter) { ss << " src_precision:" << emitter->src_prc << " dst_precision:" << emitter->dst_prc << " load/store_element_number:" << emitter->count - << " byte_offset:" << emitter->byte_offset; + << " byte_offset:" << emitter->compiled_byte_offset; return ss.str(); } @@ -76,14 +76,6 @@ static std::string init_info_jit_load_broadcast_emitter(const jit_load_broadcast return ss.str(); } -static std::string init_info_jit_load_convert_emitter(const jit_load_convert_emitter *emitter) { - std::stringstream ss; - std::string memory_emitter_info = init_info_jit_memory_emitter(emitter); - ss << "Emitter_type_name:jit_load_convert_emitter" - << memory_emitter_info; - return ss.str(); -} - static std::string init_info_jit_store_memory_emitter(const jit_store_memory_emitter *emitter) { std::stringstream ss; std::string memory_emitter_info = init_info_jit_memory_emitter(emitter); @@ -92,14 +84,6 @@ static std::string init_info_jit_store_memory_emitter(const jit_store_memory_emi return ss.str(); } -static std::string init_info_jit_store_convert_emitter(const jit_store_convert_emitter *emitter) { - std::stringstream ss; - std::string memory_emitter_info = init_info_jit_memory_emitter(emitter); - ss << "Emitter_type_name:jit_store_convert_emitter" - << memory_emitter_info; - return ss.str(); -} - std::string init_info_jit_brgemm_emitter(const jit_brgemm_emitter *emitter) { std::stringstream ss; ss << "Emitter_type_name:jit_brgemm_emitter" @@ -190,12 +174,8 @@ void jit_emitter_info_t::init(const jit_emitter *emitter) { str_ = 
init_info_jit_load_memory_emitter(e_type); } else if (auto e_type = dynamic_cast(emitter)) { str_ = init_info_jit_load_broadcast_emitter(e_type); - } else if (auto e_type = dynamic_cast(emitter)) { - str_ = init_info_jit_load_convert_emitter(e_type); } else if (auto e_type = dynamic_cast(emitter)) { str_ = init_info_jit_store_memory_emitter(e_type); - } else if (auto e_type = dynamic_cast(emitter)) { - str_ = init_info_jit_store_convert_emitter(e_type); } else if (auto e_type = dynamic_cast(emitter)) { str_ = init_info_jit_brgemm_emitter(e_type); } else if (auto e_type = dynamic_cast(emitter)) { diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.cpp b/src/plugins/intel_cpu/src/nodes/subgraph.cpp index 01a7e2eedb967e..5949e2a755d782 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.cpp +++ b/src/plugins/intel_cpu/src/nodes/subgraph.cpp @@ -151,6 +151,7 @@ class SubgraphDynamicSpecializedExecutor : public Subgraph::SubgraphExecutor { inline void init_call_args(jit_snippets_call_args& call_args) { call_args.register_loops(loop_args); + std::copy(buffer_offsets.cbegin(), buffer_offsets.cend(), call_args.memory_access_offsets); if (m_buffer_scratchpad_size > 0) call_args.buffer_scratchpad_ptr = @@ -191,10 +192,12 @@ class SubgraphDynamicSpecializedExecutor : public Subgraph::SubgraphExecutor { void init_runtime_params(const std::shared_ptr& snippet_config) override { SubgraphExecutor::init_runtime_params(snippet_config); + buffer_offsets = snippet_config->buffer_cluster_offsets; data_offsets = snippet_config->io_data_offsets; loop_args = snippet_config->loop_args; } + std::vector buffer_offsets = {}; std::vector> data_offsets = {}; std::vector loop_args = {}; }; @@ -846,6 +849,7 @@ void Subgraph::SubgraphExecutor::init_runtime_params(const std::shared_ptrbuffer_scratchpad_size; + OPENVINO_ASSERT(!ov::snippets::utils::is_dynamic_value(m_buffer_scratchpad_size), "Undefined buffer scratchpad size!"); m_buffer_scratchpad.resize(m_buffer_scratchpad_size * 
parallel_get_max_threads(), 0); init_parallel_domain(snippet_config, m_parallel_exec_domain); diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_cpu.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_cpu.cpp index 9c3cf3dca21ab6..d0265b8606d286 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_cpu.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_cpu.cpp @@ -5,6 +5,7 @@ #include "brgemm_cpu.hpp" #include "snippets/itt.hpp" #include "snippets/utils.hpp" +#include "snippets/snippets_isa.hpp" #include "snippets/lowered/port_descriptor.hpp" #include "utils/general_utils.h" #include "snippets/utils.hpp" diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/brgemm_to_brgemm_cpu.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/brgemm_to_brgemm_cpu.cpp index bf8f635bd2fe1b..80fde9c733ba18 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/brgemm_to_brgemm_cpu.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/brgemm_to_brgemm_cpu.cpp @@ -8,6 +8,7 @@ #include "snippets/utils.hpp" #include "snippets/op/brgemm.hpp" +#include "snippets/op/buffer.hpp" #include "transformations/snippets/x64/op/brgemm_copy_b.hpp" #include "transformations/snippets/x64/op/brgemm_cpu.hpp" #include "transformations/tpp/x64/op/modifiers.hpp" diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/set_brgemm_copy_b_buffers_shape.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/set_brgemm_copy_b_buffers_shape.cpp index 31b135a77da3e9..7e4ec11d8bd532 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/set_brgemm_copy_b_buffers_shape.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/set_brgemm_copy_b_buffers_shape.cpp @@ -29,10 +29,10 @@ bool ov::intel_cpu::pass::SetBrgemmCopyBBuffersShape::run(snippets::lowered::Lin if (auto 
copy_b = ov::as_type_ptr(expr->get_node())) { const auto buffer = get_buffer_from_output(expr, 0); const auto buffer_shape = copy_b->get_repacking_buffer_shape(); - buffer->set_allocation_shape(buffer_shape); + buffer->set_allocation_size(ov::shape_size(buffer_shape)); if (copy_b->is_with_compensations()) { const auto compensations_buffer = get_buffer_from_output(expr, 1); - compensations_buffer->set_allocation_shape(copy_b->get_compensations_buffer_shape()); + compensations_buffer->set_allocation_size(ov::shape_size(copy_b->get_compensations_buffer_shape())); } modified = true; } diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp index 85e8c2e10615b7..6ccd9ec5c7c484 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp @@ -306,9 +306,8 @@ std::vector disabledTestPatterns() { R"(.*FQLayerDQBias.smoke_CompareWithRefs.*)", R"(.*smoke_matmulBrgemmInt8/MatmulBrgemmInt8Test.CompareWithRefs.*MatMul.*InputType=i8_OutputType=i8.*)", R"(.*smoke_Snippets_MHAWOTransposeOnInputs_4D/MHAWOTransposeOnInputs.CompareWithRefImpl.*)", - // Issue: 123274 (Dynamic Softmax aren't supported) - R"(smoke_Snippets_(Softmax|AddSoftmax|Reduce).*\[.*\?.*\].*)", - R"(smoke_Snippets_BroadcastSelect_Dynamic.*)" + // Issue: 142448 + R"(smoke_Snippets_BroadcastSelect_Dynamic.*)", // Issue: 141705 R"(.*smoke_LoopForCommon/LoopLayerCPUTest.CompareWithRefs/.*trip_count=5_exec_cond=1_netType=i8.*)", R"(.*smoke_LoopForCommon/LoopLayerCPUTest.CompareWithRefs/Input0_IS=\[\?.1.\?\]_TS=\(10.1.10\)_\(1.1.1\)_\(1.1.1\)_\(5.1.3\)_Input1_IS=\[\?.\?.\?\]_TS=.*_Input2_IS=\[\?.1.\?\]_.*_types=0_0_1_trip_count_type=.*_trip_count=(1|5)_exec_cond=1_netType=i8.*)", diff --git 
a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/softmax.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/softmax.cpp index 11a959b0a70f47..eba3e0db1f08ce 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/softmax.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/softmax.cpp @@ -39,7 +39,10 @@ const std::vector inputShape = { {{}, {{1, 3, 128, 20}}}, // DS {{-1, -1}, {{1, 16}, {1, 32}, {1, 1}, {1, 9}, {1, 17}, {1, 19}, {1, 49}, {1, 50}, {5, 16}, {1, 16}, {1, 9}}}, - {{-1, -1, -1, -1}, {{1, 3, 128, 128}, {1, 3, 128, 129}, {1, 3, 128, 130}, {1, 3, 128, 1}, {1, 3, 128, 16}, {1, 3, 128, 1}}} + {{-1, -1, -1, -1}, {{1, 3, 128, 128}, {1, 3, 128, 129}, {1, 3, 128, 130}, {1, 3, 128, 1}, {1, 3, 128, 16}, {1, 3, 128, 1}}}, + {{-1, -1, -1, 128}, {{1, 3, 128, 128}, {1, 3, 128, 128}, {1, 3, 64, 128}, {1, 3, 32, 128}, {1, 3, 64, 128}, {1, 3, 32, 128}}}, + {{-1, -1, -1, 130}, {{1, 3, 8, 130}, {1, 3, 18, 130}, {1, 3, 8, 130}, {1, 3, 32, 130}, {1, 3, 18, 130}, {1, 3, 32, 130}}}, + {{-1, -1, 128, -1}, {{1, 3, 128, 128}, {1, 3, 128, 129}, {1, 3, 128, 130}, {1, 3, 128, 1}, {1, 3, 128, 16}, {1, 3, 128, 1}}}, }; INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Softmax, Softmax, @@ -59,8 +62,18 @@ const std::vector> inputShapesPair = { {{{}, {{1, 5, 16, 35}}}, {{}, {{1, 5, 1, 35}}}}, {{{}, {{1, 5, 1, 35}}}, {{}, {{1, 5, 1, 35}}}}, // DS - {{{-1, -1, -1, -1}, {{1, 5, 16, 35}, {1, 5, 16, 1}, {1, 5, 16, 35}}}, {{-1, -1, -1, -1}, {{1, 5, 16, 35}, {1, 5, 16, 35}, {1, 5, 16, 35}}}}, - {{{-1, {1, 8}, {1, 16}, {1, 16}}, {{1, 3, 1, 8}, {1, 8, 16, 16}, {1, 3, 1, 8}}}, {{-1, {1, 8}, -1, {1, 8}}, {{1, 3, 2, 8}, {2, 1, 1, 1}, {1, 3, 2, 8}}}} + {{{-1, -1, -1, -1}, {{1, 5, 16, 35}, {1, 5, 16, 32}, {1, 5, 16, 35}}}, + {{-1, -1, -1, -1}, {{1, 5, 16, 35}, {1, 5, 16, 32}, {1, 5, 16, 35}}}}, + {{{-1, {1, 8}, {1, 16}, {1, 16}}, {{1, 3, 1, 8}, {1, 8, 16, 16}, {1, 3, 1, 8}}}, + {{-1, {1, 8}, -1, {1, 16}}, 
{{1, 3, 2, 8}, {2, 1, 1, 16}, {1, 3, 2, 8}}}}, + {{{-1, -1, -1, 128}, {{1, 5, 32, 128}, {1, 5, 16, 128}, {1, 5, 32, 128}}}, + {{-1, -1, -1, 128}, {{1, 5, 32, 128}, {1, 5, 16, 128}, {1, 5, 1, 128}}}}, + {{{-1, -1, -1, 130}, {{1, 5, 16, 130}, {1, 5, 32, 130}, {1, 5, 32, 130}}}, + {{-1, -1, -1, 130}, {{1, 1, 1, 130}, {1, 1, 1, 130}, {1, 5, 32, 130}}}}, + {{{-1, -1, 32, -1}, {{1, 5, 32, 35}, {1, 5, 32, 32}, {1, 5, 32, 35}, {1, 5, 32, 35}}}, + {{-1, -1, -1, -1}, {{1, 5, 32, 35}, {1, 5, 32, 32}, {1, 5, 32, 35}, {1, 5, 32, 35}}}}, + {{{-1, -1, 5, -1}, {{1, 1, 5, 35}, {1, 3, 5, 32}, {1, 5, 5, 18}, {1, 2, 5, 35}}}, + {{-1, -1, 5, -1}, {{1, 1, 5, 35}, {1, 1, 5, 32}, {1, 5, 5, 18}, {1, 2, 5, 35}}}}, }; INSTANTIATE_TEST_SUITE_P(smoke_Snippets_AddSoftmax, AddSoftmax, diff --git a/src/plugins/intel_cpu/tests/unit/snippets_transformations/x64/lowered/buffer_allocation.cpp b/src/plugins/intel_cpu/tests/unit/snippets_transformations/x64/lowered/buffer_allocation.cpp index c05d4fc712d05b..8e188b0dbf332c 100644 --- a/src/plugins/intel_cpu/tests/unit/snippets_transformations/x64/lowered/buffer_allocation.cpp +++ b/src/plugins/intel_cpu/tests/unit/snippets_transformations/x64/lowered/buffer_allocation.cpp @@ -38,7 +38,8 @@ typedef std::tuple< bool, // Optimized pipeline bool, // With SplitLoops opt size_t, // Expected Buffer size in bytes - size_t // Expected unique Buffer IDs count + size_t, // Expected unique Buffer reg group count + size_t // Expected unique Buffer cluster count > BufferAllocationCPUParams; class BufferAllocationCPUTest : public testing::TestWithParam { @@ -46,19 +47,20 @@ class BufferAllocationCPUTest : public testing::TestWithParam obj) { bool is_optimized, with_split_loops; - size_t expected_size, expected_count; - std::tie(is_optimized, with_split_loops, expected_size, expected_count) = obj.param; + size_t expected_size, expected_reg_group_count, expected_cluster_count; + std::tie(is_optimized, with_split_loops, expected_size, expected_reg_group_count, 
expected_cluster_count) = obj.param; std::ostringstream result; result << "Opt=" << ov::test::utils::bool2str(is_optimized) << "_"; result << "Split=" << ov::test::utils::bool2str(with_split_loops) << "_"; result << "ExpBufferSize=" << expected_size << "_"; - result << "ExpBufferNum=" << expected_count; + result << "ExpBufferRegGroupCount=" << expected_reg_group_count << "_"; + result << "ExpBufferClustersCount=" << expected_reg_group_count << "_"; return result.str(); } protected: void SetUp() override { - std::tie(m_is_buffer_optimized, m_with_split_loops, m_expected_size, m_expected_count) = this->GetParam(); + std::tie(m_is_buffer_optimized, m_with_split_loops, m_expected_size, m_expected_reg_group_count, m_expected_cluster_count) = this->GetParam(); const auto body = GetModel(); m_linear_ir = ov::snippets::lowered::LinearIR(body, std::make_shared()); @@ -82,7 +84,7 @@ class BufferAllocationCPUTest : public testing::TestWithParam(m_vector_size); pipeline.register_pass(); pipeline.register_pass(); - pipeline.register_pass(2); + pipeline.register_pass(); pipeline.register_pass(m_vector_size); pipeline.register_pass(); pipeline.register_pass(); @@ -92,14 +94,16 @@ class BufferAllocationCPUTest : public testing::TestWithParam gprs; + std::set reg_groups, clusters; for (const auto& expr : m_linear_ir) { if (const auto buffer = ov::as_type_ptr(expr->get_node())) { - gprs.insert(buffer->get_id()); + reg_groups.insert(buffer->get_reg_group()); + clusters.insert(buffer->get_cluster_id()); } } - EXPECT_EQ(gprs.size(), m_expected_count); - EXPECT_EQ(m_linear_ir.get_buffer_scratchpad_size(), m_expected_size); + EXPECT_EQ(reg_groups.size(), m_expected_reg_group_count); + EXPECT_EQ(clusters.size(), m_expected_cluster_count); + EXPECT_EQ(m_linear_ir.get_static_buffer_scratchpad_size(), m_expected_size); } virtual std::shared_ptr GetModel() const = 0; @@ -116,7 +120,8 @@ class BufferAllocationCPUTest : public testing::TestWithParam GetModel() const override { + const size_t 
m_blk = 32; + const size_t k_blk = 16; + const size_t n_blk = 64; + const auto subtensor_scalar = std::vector{1}; + const auto subtensor_power = std::vector{1, ov::snippets::lowered::PortDescriptor::ServiceDimensions::FULL_DIM}; + const auto subtensor_full = std::vector(2, ov::snippets::lowered::PortDescriptor::ServiceDimensions::FULL_DIM); + + const auto parameter0 = std::make_shared(ov::element::f32, ov::PartialShape({1, 12, 128, 64})); + const auto parameter1 = std::make_shared(ov::element::f32, ov::PartialShape({1, 128, 12, 64})); + const auto parameter2 = std::make_shared(ov::element::f32, ov::PartialShape({1, 12, 128, 64})); + + const auto order = std::vector{0, 2, 3, 1}; + const auto load_reshape = std::make_shared(parameter1, 1, 0, order); + const auto store = std::make_shared(load_reshape); + const auto relu0 = std::make_shared(store); + const auto brgemm_cpu0 = std::make_shared(parameter0, relu0, ov::intel_cpu::BrgemmCPU::Type::Floating); + brgemm_cpu0->set_m_block_size(m_blk); + brgemm_cpu0->set_k_block_size(k_blk); + brgemm_cpu0->set_n_block_size(n_blk); + + const auto relu1 = std::make_shared(brgemm_cpu0); + + // Decomposed Softmax + const auto reduce_max = std::make_shared(relu1, 3); + ov::snippets::op::ReduceBase::compute_and_set_reduce_subtensors(reduce_max); + const auto subtract = std::make_shared(relu1, reduce_max); + const auto exp = std::make_shared(subtract); + + const auto reduce_sum = std::make_shared(exp, 3); + ov::snippets::op::ReduceBase::compute_and_set_reduce_subtensors(reduce_sum); + const auto power = std::make_shared(reduce_sum, -1.f); + const auto multiply = std::make_shared(exp, power); + + const auto brgemm_cpu1 = std::make_shared(multiply, parameter2, ov::intel_cpu::BrgemmCPU::Type::Floating); + brgemm_cpu1->set_m_block_size(m_blk); + brgemm_cpu1->set_k_block_size(k_blk); + brgemm_cpu1->set_n_block_size(n_blk); + + const auto relu2 = std::make_shared(brgemm_cpu1); + + const auto body = std::make_shared(std::make_shared(relu2), 
ov::ParameterVector{parameter0, parameter1, parameter2}); + + MarkOp(load_reshape, subtensor_scalar); + MarkOp(store, subtensor_scalar); + MarkOp(power, subtensor_power); + + MarkOp(brgemm_cpu0, subtensor_full); + MarkOp(brgemm_cpu1, subtensor_full); + + ov::snippets::lowered::PortDescriptorUtils::get_port_descriptor_ptr(load_reshape->input(0))->set_layout(order); + + return body; + } +}; + class MHABF16AMXBufferAllocationTest : public BufferAllocationCPUTest { protected: std::shared_ptr GetModel() const override { @@ -139,7 +202,8 @@ class MHABF16AMXBufferAllocationTest : public BufferAllocationCPUTest { const auto parameter1 = std::make_shared(ov::element::bf16, ov::PartialShape({1, 128, 12, 64})); const auto parameter2 = std::make_shared(ov::element::bf16, ov::PartialShape({1, 12, 128, 64})); - const auto load_reshape = std::make_shared(parameter1, 1, 0, std::vector{0, 2, 3, 1}); + const auto order = std::vector{0, 2, 3, 1}; + const auto load_reshape = std::make_shared(parameter1, 1, 0, order); const auto store = std::make_shared(load_reshape); const auto convert0 = std::make_shared(store, ov::element::f32); const auto relu0 = std::make_shared(convert0); @@ -197,47 +261,92 @@ class MHABF16AMXBufferAllocationTest : public BufferAllocationCPUTest { MarkOp(scratch0, subtensor_full); MarkOp(scratch1, subtensor_full); + ov::snippets::lowered::PortDescriptorUtils::get_port_descriptor_ptr(load_reshape->input(0))->set_layout(order); + return body; } }; -TEST_P(MHABF16AMXBufferAllocationTest, BufferAllocationCPU) { +TEST_P(MHAFP32BufferAllocationTest, BufferAllocationCPU) { Validate(); } +TEST_P(MHABF16AMXBufferAllocationTest, BufferAllocationCPU) { + Validate(); +} namespace BufferAllocationCPUTest_Instances { +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_MHANotOptimizedWSplit, MHAFP32BufferAllocationTest, + ::testing::Combine( + ::testing::Values(false), + ::testing::Values(true), + ::testing::Values(75264), // Each Buffer has own allocated memory + 
::testing::Values(7), // Each Buffer has unique ID + ::testing::Values(7)), // Each Buffer has unique cluster ID + BufferAllocationCPUTest::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_MHAOptimizedWSplit, MHAFP32BufferAllocationTest, + ::testing::Combine( + ::testing::Values(true), + ::testing::Values(true), + ::testing::Values(57344), // (Buffer before brgemm) + (between brgemms) + (after brgemm) + ::testing::Values(2), // (Buffer before brgemm0 and after brgemm1) + (between brgemms) + ::testing::Values(3)), // (Buffer before brgemm0) + (between brgemms) + (after brgemm1) + BufferAllocationCPUTest::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_MHANotOptimizedWOSplit, MHAFP32BufferAllocationTest, + ::testing::Combine( + ::testing::Values(false), + ::testing::Values(false), + ::testing::Values(198144), // Each Buffer has own allocated memory + ::testing::Values(7), // Each Buffer has unique ID + ::testing::Values(7)), // Each Buffer has unique cluster ID + BufferAllocationCPUTest::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_MHAOptimizedWOSplit, MHAFP32BufferAllocationTest, + ::testing::Combine( + ::testing::Values(true), + ::testing::Values(false), + ::testing::Values(98304), // (between brgemms) + (Buffer before brgemm0 and after brgemm1) + ::testing::Values(2), // (Buffer before brgemm0 and after brgemm1) + (between brgemms) + ::testing::Values(3)), // (Buffer before brgemm0) + (between brgemms) + (after brgemm1) + BufferAllocationCPUTest::getTestCaseName); + INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_MHABF16AMXNotOptimizedWSplit, MHABF16AMXBufferAllocationTest, ::testing::Combine( ::testing::Values(false), ::testing::Values(true), - ::testing::Values(167936), + ::testing::Values(120064), + ::testing::Values(11), ::testing::Values(11)), BufferAllocationCPUTest::getTestCaseName); 
-INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_MHAOptimizedWSplit, MHABF16AMXBufferAllocationTest, +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_MHABF16AMXOptimizedWSplit, MHABF16AMXBufferAllocationTest, ::testing::Combine( ::testing::Values(true), ::testing::Values(true), ::testing::Values(73728), - ::testing::Values(3)), + ::testing::Values(3), + ::testing::Values(8)), BufferAllocationCPUTest::getTestCaseName); -INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_MHANotOptimizedWOSplit, MHABF16AMXBufferAllocationTest, +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_MHABF16AMXNotOptimizedWOSplit, MHABF16AMXBufferAllocationTest, ::testing::Combine( ::testing::Values(false), ::testing::Values(false), - ::testing::Values(364544), + ::testing::Values(218368), + ::testing::Values(11), ::testing::Values(11)), BufferAllocationCPUTest::getTestCaseName); -INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_MHAOptimizedWOSplit, MHABF16AMXBufferAllocationTest, +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_MHABF16AMXOptimizedWOSplit, MHABF16AMXBufferAllocationTest, ::testing::Combine( ::testing::Values(true), ::testing::Values(false), ::testing::Values(116736), - ::testing::Values(3)), + ::testing::Values(3), + ::testing::Values(8)), BufferAllocationCPUTest::getTestCaseName); } // namespace BufferAllocationCPUTest_Instances