Fix attention mask setting and remove debug logs
eshiryae committed Oct 30, 2024
1 parent: 3097473 · commit: e2557b2
Showing 2 changed files with 20 additions and 62 deletions.
@@ -11,7 +11,7 @@ int main(int argc, char* argv[]) try {

std::filesystem::path models_path = argv[1];
std::string wav_file_path = argv[2];
std::string device = "CPU"; // GPU can be used as well
std::string device = "CPU"; // GPU, NPU can be used as well

ov::genai::WhisperPipeline pipeline(models_path, device);

80 changes: 19 additions & 61 deletions src/cpp/src/whisper_pipeline_static.cpp
@@ -69,7 +69,6 @@ ov::Tensor make_tensor_slice(ov::Tensor tensor, size_t dim, size_t start_pos, si

void set_cross_attn_key_value(ov::InferRequest& source, ov::InferRequest& dest) {
// NB: Source outputs:
// for optimum-cli
// present.0.encoder.key
// present.0.encoder.value

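A hedged sketch of the copy this helper is expected to perform, inferred only from the tensor names in the comment above; the destination names ("past_key_values.*.encoder.*") and the iteration over the source request's compiled-model outputs are assumptions, not taken from this diff.

```cpp
#include <openvino/openvino.hpp>
#include <string>

// Sketch: forward the cross-attention KV cache produced by the first decoder
// run into the decoder_with_past request, e.g. "present.0.encoder.key" ->
// "past_key_values.0.encoder.key" (destination naming is an assumption).
void copy_cross_attn_kv_sketch(ov::InferRequest& source, ov::InferRequest& dest) {
    for (const auto& output : source.get_compiled_model().outputs()) {
        const std::string name = output.get_any_name();
        if (name.find("encoder") == std::string::npos) {
            continue;  // only cross-attention ("encoder") tensors are shared
        }
        const std::string dest_name = "past_key_values" + name.substr(std::string("present").size());
        dest.set_tensor(dest_name, source.get_tensor(name));
    }
}
```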
@@ -89,7 +88,6 @@ void set_cross_attn_key_value(ov::InferRequest& source, ov::InferRequest& dest)

void update_past_key_value(ov::InferRequest& source, ov::InferRequest& dest, const size_t kv_pos = 0u) {
// NB: Source outputs:
// for optimum-cli
// present.0.decoder.key
// present.0.decoder.value

@@ -136,9 +134,6 @@ void set_decoder_input_ids_attention_mask(ov::InferRequest& decoder,
auto attention_mask_data = attention_mask_tensor.data<ov::float16>();
std::fill_n(attention_mask_data, init_ids.size(), 1u);
std::fill(attention_mask_data + init_ids.size(), attention_mask_data + attention_mask_tensor.get_size(), 0u);

//decoder.get_tensor("attention_mask").data<ov::float16>()[input_ids.size() - 1] = 0u;
// ^ Need to used attention_mask size here!
}
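For illustration, a standalone snippet reproducing the fill pattern above with plain floats, an assumed mask length of 8, and three init tokens (Whisper prompts typically begin with start-of-transcript, language, and task tokens); the real code operates on an ov::float16 tensor.

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
    // Ones for the init/prompt tokens, zeros for the not-yet-generated tail.
    std::vector<float> attention_mask(8, 0.0f);
    const std::size_t init_ids_size = 3;  // assumed number of init tokens
    std::fill_n(attention_mask.begin(), init_ids_size, 1.0f);
    std::fill(attention_mask.begin() + init_ids_size, attention_mask.end(), 0.0f);
    for (float v : attention_mask) {
        std::printf("%.0f ", v);  // prints: 1 1 1 0 0 0 0 0
    }
    std::printf("\n");
    return 0;
}
```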

int64_t decode(ov::Tensor& encoder_hidden_state,
@@ -176,11 +171,9 @@ int64_t decode_with_past(ov::InferRequest& decoder_with_past,
const std::vector<int64_t>& generated_tokens) {
// FIXME: Avoid this cast to i32. Why it's not i64 precision in model?
decoder_with_past.get_tensor("input_ids").data<int32_t>()[0] = static_cast<int32_t>(input_id);
// FIXME: Avoid this cast to i32. Why it's not i64 precision in model?
//decoder_with_past.get_tensor("position_ids").data<int32_t>()[0] = static_cast<int32_t>(position_id);
decoder_with_past.get_tensor("cache_position").data<int64_t>()[0] = position_id; // for optimum-cli
decoder_with_past.get_tensor("cache_position").data<int64_t>()[0] = position_id;
// FIXME: Is "attention_mask" supposed to be f16?
decoder_with_past.get_tensor("attention_mask").data<ov::float16>()[position_id - 1] = 1u;
decoder_with_past.get_tensor("attention_mask").data<ov::float16>()[position_id - 1] = 0u;

decoder_with_past.infer();

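A minimal sketch of one decoder_with_past step mirroring the tensor writes above; the "logits" output name, its f32 element type, and the greedy argmax are assumptions and are not part of this diff.

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <iterator>
#include <openvino/openvino.hpp>

int64_t decode_one_step_sketch(ov::InferRequest& decoder_with_past, int64_t input_id, int64_t position_id) {
    decoder_with_past.get_tensor("input_ids").data<int32_t>()[0] = static_cast<int32_t>(input_id);
    decoder_with_past.get_tensor("cache_position").data<int64_t>()[0] = position_id;
    // Inverted mask convention (per this commit): writing 0 opens the slot of
    // the previously generated token for attention.
    decoder_with_past.get_tensor("attention_mask").data<ov::float16>()[position_id - 1] = 0u;

    decoder_with_past.infer();

    // Greedy pick of the next token (assumed output name and precision).
    ov::Tensor logits = decoder_with_past.get_tensor("logits");
    const float* data = logits.data<float>();
    const std::size_t vocab_size = logits.get_shape().back();
    return std::distance(data, std::max_element(data, data + vocab_size));
}
```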
@@ -202,21 +195,19 @@ void zero_past_key_values(ov::InferRequest& request) {
past_key_value_decoder_name.find("past_key_values") == std::string::npos) {
continue;
}
fill_tensor<ov::float16>(request.get_tensor(past_key_value_decoder_name), 0); // for optimum-cli
fill_tensor<ov::float16>(request.get_tensor(past_key_value_decoder_name), 0);
}
}

void prepare_decoder_with_past(ov::InferRequest& decoder_with_past, ov::InferRequest& decoder) {
// NB: Prepare attetion mask to be in a format [1, 1, 1, 0, 0, 0, 0, ..., 1]
// NB: Prepare attetion mask to be in a format [0, 0, 0, 1, 1, 1, 1, ..., 0, 1]
// Mask should be inverted for decoder_with_past
auto attention_mask = decoder_with_past.get_tensor("attention_mask");
auto* attention_mask_ptr = attention_mask.data<ov::float16>();
std::fill(attention_mask_ptr, attention_mask_ptr + 3u, 1);
//std::fill(attention_mask_ptr + 3u, attention_mask_ptr + attention_mask.get_size() - 1, 0);
//attention_mask_ptr[attention_mask.get_size() - 1] = 1;
// NB: for optimum-cli models attention_mask should be [1, 1, 1, 0, 0, 0, 0, ..., 1, 0], size = size+1 :FIXME
std::fill(attention_mask_ptr + 3u, attention_mask_ptr + attention_mask.get_size() - 2, 0);
attention_mask_ptr[attention_mask.get_size() - 2] = 1;
attention_mask_ptr[attention_mask.get_size() - 1] = 0;
std::fill(attention_mask_ptr, attention_mask_ptr + 3u, 0);
std::fill(attention_mask_ptr + 3u, attention_mask_ptr + attention_mask.get_size() - 2, 1);
attention_mask_ptr[attention_mask.get_size() - 2] = 0;
attention_mask_ptr[attention_mask.get_size() - 1] = 1;
// NB: Zero past_key_values.*.decoder.value tensors
zero_past_key_values(decoder_with_past);
// NB: Copy KV-caches from decoder
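A standalone illustration of the inverted mask laid out above (plain floats, assumed length 10): 0 marks a position that may be attended to, 1 marks a masked one. The first three slots appear to cover the tokens already handled by the initial decoder run, and the last two cells reproduce the special tail of the [1, kvcache_size + 1] mask.

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
    std::vector<float> mask(10);
    std::fill(mask.begin(), mask.begin() + 3, 0.0f);    // already-decoded prompt slots
    std::fill(mask.begin() + 3, mask.end() - 2, 1.0f);  // still-masked future slots
    mask[mask.size() - 2] = 0.0f;                       // tail cells mirrored from the code above
    mask[mask.size() - 1] = 1.0f;
    for (float v : mask) {
        std::printf("%.0f ", v);  // prints: 0 0 0 1 1 1 1 1 0 1
    }
    std::printf("\n");
    return 0;
}
```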
@@ -407,16 +398,14 @@ void add_attention_mask_input(std::shared_ptr<ov::Model> model) {
}

void reshape_to_static(std::shared_ptr<ov::Model> model, const uint32_t input_size, const uint32_t kvcache_size) {
//std::cout << "[DEBUG] Reshaping decoder_with_past_model ..." << std::endl;

std::map<std::string, ov::PartialShape> new_shapes;
for (auto input : model->inputs()) {
const auto& input_name = input.get_any_name();
ov::PartialShape new_shape;
if (input_name.find("input_ids") != std::string::npos) {
new_shape = ov::PartialShape({1, input_size});
} else if (input_name.find("attention_mask") != std::string::npos) {
new_shape = ov::PartialShape({1, kvcache_size + 1}); // Artefact in attention_mask
new_shape = ov::PartialShape({1, kvcache_size + 1});
} else if (input_name.find("position_ids") != std::string::npos) {
new_shape = ov::PartialShape({1, input_size});
} else if (input_name.find("cache_position") != std::string::npos) {
@@ -425,17 +414,14 @@
const auto& partial_shape = input.get_partial_shape();
new_shape = partial_shape;
new_shape[0] = 1; // batch_dim
new_shape[1] = 1500; // FIXME: where to get this? is it got from encoder output{'last_hidden_state'}
new_shape[1] = 1500; // FIXME: is it got from encoder output{'last_hidden_state'}
} else if (input_name.find("past_key_values") != std::string::npos) {
const auto& partial_shape = input.get_partial_shape();
new_shape = partial_shape;
new_shape[0] = 1; // Use batch dim here
new_shape[2] = input_name.find(".decoder") != std::string::npos
? kvcache_size - input_size
: 1500; // kv_size for decoder, 1500 for encoder : is it got from encoder
// output{'last_hidden_state'}

// ^ use kv_dim here
? kvcache_size - input_size // kv_size for decoder
: 1500; // for encoder
}
new_shapes.emplace(input_name, new_shape);
}
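A hedged summary of the static shapes this function pins, written out for the decoder_with_past call in the constructor below (input_size = 1, kvcache_size = max_sequence_length = 448). The head count, head size, and hidden size are illustrative placeholders, and the cache_position branch is omitted because it is truncated in this diff.

```cpp
#include <map>
#include <string>
#include <openvino/openvino.hpp>

// Expected shapes after reshape_to_static(decoder_with_past_model, 1, 448);
// 8 heads with head size 64 (hidden size 512) are assumed for illustration.
const std::map<std::string, ov::PartialShape> expected_static_shapes_sketch = {
    {"input_ids", {1, 1}},
    {"attention_mask", {1, 448 + 1}},
    {"encoder_hidden_states", {1, 1500, 512}},
    {"past_key_values.0.decoder.key", {1, 8, 448 - 1, 64}},
    {"past_key_values.0.decoder.value", {1, 8, 448 - 1, 64}},
    {"past_key_values.0.encoder.key", {1, 8, 1500, 64}},
    {"past_key_values.0.encoder.value", {1, 8, 1500, 64}},
};
```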
@@ -496,8 +482,7 @@ void preprocess_decoder(std::shared_ptr<ov::Model> model) {
}

for (auto tensor : model->outputs()) {
//preprocessor.output(tensor.get_any_name()).tensor().set_element_type(ov::element::Type_t::f16);
if (tensor.get_any_name().find("present") != std::string::npos) { // "present" for models from arch team
if (tensor.get_any_name().find("present") != std::string::npos) {
preprocessor.output(tensor.get_any_name()).tensor().set_element_type(ov::element::Type_t::f16);
preprocessor.output(tensor.get_any_name()).postprocess().convert_element_type();

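A generic sketch of the ov::preprocess::PrePostProcessor pattern used above for the "present" outputs: declare the output tensor as f16 and let a post-processing step insert the conversion. The final build() call is assumed to happen elsewhere in preprocess_decoder() and is not visible in this diff.

```cpp
#include <memory>
#include <string>
#include <openvino/core/preprocess/pre_post_process.hpp>
#include <openvino/openvino.hpp>

std::shared_ptr<ov::Model> expose_output_as_f16_sketch(const std::shared_ptr<ov::Model>& model,
                                                       const std::string& output_name) {
    ov::preprocess::PrePostProcessor preprocessor(model);
    preprocessor.output(output_name).tensor().set_element_type(ov::element::f16);
    preprocessor.output(output_name).postprocess().convert_element_type();
    return preprocessor.build();
}
```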
@@ -550,49 +535,22 @@ WhisperPipeline::StaticWhisperPipeline::StaticWhisperPipeline(const std::filesys
OPENVINO_THROW("StaticWhisperPipeline expects decoder model has \"attention_mask\" input!");
}

// TODO: There must be model reshape to eliminate dynamism!
size_t max_sequence_length = 128;
size_t max_sequence_length = 448;

reshape_to_static_encoder(encoder_model);
reshape_to_static(decoder_model, 4, 4); // What is 4 here??
reshape_to_static(decoder_model, 4, 4);
reshape_to_static(decoder_with_past_model, 1, max_sequence_length);

// Replace KV-tensors for the entire cache to tensors only for new token
decoder_with_past_model = redirect_new_kv_to_output(decoder_with_past_model);

ov::AnyMap config_encoder = {
{"NPU_COMPILATION_MODE_PARAMS", "compute-layers-with-higher-precision=Sqrt,Power,ReduceMean,Add"},
{"NPU_USE_NPUW", "YES"},
{"NPUW_ONLINE_PIPELINE", "NONE"},
//{"NPUW_FOLD", "YES"},
//{"NPUW_DCOFF_TYPE", "f16"},
//{"NPUW_DCOFF_SCALE", "YES"},
{"NPUW_DEVICES", "CPU"}};

ov::AnyMap config = {
{"NPU_COMPILATION_MODE_PARAMS", "compute-layers-with-higher-precision=Sqrt,Power,ReduceMean,Add"},
{"NPU_USE_NPUW", "YES"},
//{"NPUW_FOLD", "YES"},
//{"NPUW_DCOFF_TYPE", "f16"},
//{"NPUW_DCOFF_SCALE", "YES"},
{"NPUW_DEVICES", "CPU"}};

preprocess_encoder(encoder_model);
preprocess_decoder(decoder_model);
preprocess_decoder(decoder_with_past_model);

std::cout << "[DEBUG] All model modifications are done, saving models..." << std::endl;
ov::save_model(encoder_model, models_path / "0_openvino_encoder_model_attn.xml");
ov::save_model(decoder_model, models_path / "0_openvino_decoder_model_attn.xml");
ov::save_model(decoder_with_past_model, models_path / "0_openvino_decoder_with_past_model_attn.xml");

m_models.encoder = core.compile_model(encoder_model, "NPU", config_encoder).create_infer_request();
std::cout << "[DEBUG] Compile encoder model - DONE" << std::endl;
m_models.decoder = core.compile_model(decoder_model, "NPU", config_encoder).create_infer_request();
std::cout << "[DEBUG] Compile decoder model - DONE" << std::endl;
m_models.decoder_with_past =
core.compile_model(decoder_with_past_model, "NPU", config_encoder).create_infer_request();
std::cout << "[DEBUG] Compile decoder with past model - DONE" << std::endl;
m_models.encoder = core.compile_model(encoder_model, "NPU").create_infer_request();
m_models.decoder = core.compile_model(decoder_model, "NPU").create_infer_request();
m_models.decoder_with_past = core.compile_model(decoder_with_past_model, "NPU").create_infer_request();

// If eos_token_id was not provided, take value
if (m_generation_config.eos_token_id == -1) {
