Fix attention mask setting and remove debug logs
eshiryae committed Oct 30, 2024
1 parent: 3097473 · commit: e2557b2
Showing 2 changed files with 20 additions and 62 deletions.
@@ -11,7 +11,7 @@ int main(int argc, char* argv[]) try {

std::filesystem::path models_path = argv[1];
std::string wav_file_path = argv[2];
std::string device = "CPU"; // GPU can be used as well
std::string device = "CPU"; // GPU, NPU can be used as well

ov::genai::WhisperPipeline pipeline(models_path, device);

80 changes: 19 additions & 61 deletions src/cpp/src/whisper_pipeline_static.cpp
@@ -69,7 +69,6 @@ ov::Tensor make_tensor_slice(ov::Tensor tensor, size_t dim, size_t start_pos, si

void set_cross_attn_key_value(ov::InferRequest& source, ov::InferRequest& dest) {
// NB: Source outputs:
// for optimum-cli
// present.0.encoder.key
// present.0.encoder.value

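A hedged sketch of the copy this helper is expected to perform, inferred only from the tensor names in the comment above; the destination names ("past_key_values.*.encoder.*") and the iteration over the source request's compiled-model outputs are assumptions, not taken from this diff.

```cpp
#include <openvino/openvino.hpp>
#include <string>

// Sketch: forward the cross-attention KV cache produced by the first decoder
// run into the decoder_with_past request, e.g. "present.0.encoder.key" ->
// "past_key_values.0.encoder.key" (destination naming is an assumption).
void copy_cross_attn_kv_sketch(ov::InferRequest& source, ov::InferRequest& dest) {
    for (const auto& output : source.get_compiled_model().outputs()) {
        const std::string name = output.get_any_name();
        if (name.find("encoder") == std::string::npos) {
            continue;  // only cross-attention ("encoder") tensors are shared
        }
        const std::string dest_name = "past_key_values" + name.substr(std::string("present").size());
        dest.set_tensor(dest_name, source.get_tensor(name));
    }
}
```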
@@ -89,7 +88,6 @@ void set_cross_attn_key_value(ov::InferRequest& source, ov::InferRequest& dest)

void update_past_key_value(ov::InferRequest& source, ov::InferRequest& dest, const size_t kv_pos = 0u) {
// NB: Source outputs:
// for optimum-cli
// present.0.decoder.key
// present.0.decoder.value

@@ -136,9 +134,6 @@ void set_decoder_input_ids_attention_mask(ov::InferRequest& decoder,
auto attention_mask_data = attention_mask_tensor.data<ov::float16>();
std::fill_n(attention_mask_data, init_ids.size(), 1u);
std::fill(attention_mask_data + init_ids.size(), attention_mask_data + attention_mask_tensor.get_size(), 0u);

//decoder.get_tensor("attention_mask").data<ov::float16>()[input_ids.size() - 1] = 0u;
// ^ Need to used attention_mask size here!
}
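For illustration, a standalone snippet reproducing the fill pattern above with plain floats, an assumed mask length of 8, and three init tokens (Whisper prompts typically begin with start-of-transcript, language, and task tokens); the real code operates on an ov::float16 tensor.

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
    // Ones for the init/prompt tokens, zeros for the not-yet-generated tail.
    std::vector<float> attention_mask(8, 0.0f);
    const std::size_t init_ids_size = 3;  // assumed number of init tokens
    std::fill_n(attention_mask.begin(), init_ids_size, 1.0f);
    std::fill(attention_mask.begin() + init_ids_size, attention_mask.end(), 0.0f);
    for (float v : attention_mask) {
        std::printf("%.0f ", v);  // prints: 1 1 1 0 0 0 0 0
    }
    std::printf("\n");
    return 0;
}
```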

int64_t decode(ov::Tensor& encoder_hidden_state,
@@ -176,11 +171,9 @@ int64_t decode_with_past(ov::InferRequest& decoder_with_past,
const std::vector<int64_t>& generated_tokens) {
// FIXME: Avoid this cast to i32. Why it's not i64 precision in model?
decoder_with_past.get_tensor("input_ids").data<int32_t>()[0] = static_cast<int32_t>(input_id);
// FIXME: Avoid this cast to i32. Why it's not i64 precision in model?
//decoder_with_past.get_tensor("position_ids").data<int32_t>()[0] = static_cast<int32_t>(position_id);
decoder_with_past.get_tensor("cache_position").data<int64_t>()[0] = position_id; // for optimum-cli
decoder_with_past.get_tensor("cache_position").data<int64_t>()[0] = position_id;
// FIXME: Is "attention_mask" supposed to be f16?
decoder_with_past.get_tensor("attention_mask").data<ov::float16>()[position_id - 1] = 1u;
decoder_with_past.get_tensor("attention_mask").data<ov::float16>()[position_id - 1] = 0u;

decoder_with_past.infer();

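A minimal sketch of one decoder_with_past step mirroring the tensor writes above; the "logits" output name, its f32 element type, and the greedy argmax are assumptions and are not part of this diff.

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <iterator>
#include <openvino/openvino.hpp>

int64_t decode_one_step_sketch(ov::InferRequest& decoder_with_past, int64_t input_id, int64_t position_id) {
    decoder_with_past.get_tensor("input_ids").data<int32_t>()[0] = static_cast<int32_t>(input_id);
    decoder_with_past.get_tensor("cache_position").data<int64_t>()[0] = position_id;
    // Inverted mask convention (per this commit): writing 0 opens the slot of
    // the previously generated token for attention.
    decoder_with_past.get_tensor("attention_mask").data<ov::float16>()[position_id - 1] = 0u;

    decoder_with_past.infer();

    // Greedy pick of the next token (assumed output name and precision).
    ov::Tensor logits = decoder_with_past.get_tensor("logits");
    const float* data = logits.data<float>();
    const std::size_t vocab_size = logits.get_shape().back();
    return std::distance(data, std::max_element(data, data + vocab_size));
}
```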
@@ -202,21 +195,19 @@ void zero_past_key_values(ov::InferRequest& request) {
past_key_value_decoder_name.find("past_key_values") == std::string::npos) {
continue;
}
fill_tensor<ov::float16>(request.get_tensor(past_key_value_decoder_name), 0); // for optimum-cli
fill_tensor<ov::float16>(request.get_tensor(past_key_value_decoder_name), 0);
}
}

void prepare_decoder_with_past(ov::InferRequest& decoder_with_past, ov::InferRequest& decoder) {
// NB: Prepare attetion mask to be in a format [1, 1, 1, 0, 0, 0, 0, ..., 1]
// NB: Prepare attetion mask to be in a format [0, 0, 0, 1, 1, 1, 1, ..., 0, 1]
// Mask should be inverted for decoder_with_past
auto attention_mask = decoder_with_past.get_tensor("attention_mask");
auto* attention_mask_ptr = attention_mask.data<ov::float16>();
std::fill(attention_mask_ptr, attention_mask_ptr + 3u, 1);
//std::fill(attention_mask_ptr + 3u, attention_mask_ptr + attention_mask.get_size() - 1, 0);
//attention_mask_ptr[attention_mask.get_size() - 1] = 1;
// NB: for optimum-cli models attention_mask should be [1, 1, 1, 0, 0, 0, 0, ..., 1, 0], size = size+1 :FIXME
std::fill(attention_mask_ptr + 3u, attention_mask_ptr + attention_mask.get_size() - 2, 0);
attention_mask_ptr[attention_mask.get_size() - 2] = 1;
attention_mask_ptr[attention_mask.get_size() - 1] = 0;
std::fill(attention_mask_ptr, attention_mask_ptr + 3u, 0);
std::fill(attention_mask_ptr + 3u, attention_mask_ptr + attention_mask.get_size() - 2, 1);
attention_mask_ptr[attention_mask.get_size() - 2] = 0;
attention_mask_ptr[attention_mask.get_size() - 1] = 1;
// NB: Zero past_key_values.*.decoder.value tensors
zero_past_key_values(decoder_with_past);
// NB: Copy KV-caches from decoder
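A standalone illustration of the inverted mask laid out above (plain floats, assumed length 10): 0 marks a position that may be attended to, 1 marks a masked one. The first three slots appear to cover the tokens already handled by the initial decoder run, and the last two cells reproduce the special tail of the [1, kvcache_size + 1] mask.

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
    std::vector<float> mask(10);
    std::fill(mask.begin(), mask.begin() + 3, 0.0f);    // already-decoded prompt slots
    std::fill(mask.begin() + 3, mask.end() - 2, 1.0f);  // still-masked future slots
    mask[mask.size() - 2] = 0.0f;                       // tail cells mirrored from the code above
    mask[mask.size() - 1] = 1.0f;
    for (float v : mask) {
        std::printf("%.0f ", v);  // prints: 0 0 0 1 1 1 1 1 0 1
    }
    std::printf("\n");
    return 0;
}
```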
@@ -407,16 +398,14 @@ void add_attention_mask_input(std::shared_ptr<ov::Model> model) {
}

void reshape_to_static(std::shared_ptr<ov::Model> model, const uint32_t input_size, const uint32_t kvcache_size) {
//std::cout << "[DEBUG] Reshaping decoder_with_past_model ..." << std::endl;

std::map<std::string, ov::PartialShape> new_shapes;
for (auto input : model->inputs()) {
const auto& input_name = input.get_any_name();
ov::PartialShape new_shape;
if (input_name.find("input_ids") != std::string::npos) {
new_shape = ov::PartialShape({1, input_size});
} else if (input_name.find("attention_mask") != std::string::npos) {
new_shape = ov::PartialShape({1, kvcache_size + 1}); // Artefact in attention_mask
new_shape = ov::PartialShape({1, kvcache_size + 1});
} else if (input_name.find("position_ids") != std::string::npos) {
new_shape = ov::PartialShape({1, input_size});
} else if (input_name.find("cache_position") != std::string::npos) {
@@ -425,17 +414,14 @@
const auto& partial_shape = input.get_partial_shape();
new_shape = partial_shape;
new_shape[0] = 1; // batch_dim
new_shape[1] = 1500; // FIXME: where to get this? is it got from encoder output{'last_hidden_state'}
new_shape[1] = 1500; // FIXME: is it got from encoder output{'last_hidden_state'}
} else if (input_name.find("past_key_values") != std::string::npos) {
const auto& partial_shape = input.get_partial_shape();
new_shape = partial_shape;
new_shape[0] = 1; // Use batch dim here
new_shape[2] = input_name.find(".decoder") != std::string::npos
? kvcache_size - input_size
: 1500; // kv_size for decoder, 1500 for encoder : is it got from encoder
// output{'last_hidden_state'}

// ^ use kv_dim here
? kvcache_size - input_size // kv_size for decoder
: 1500; // for encoder
}
new_shapes.emplace(input_name, new_shape);
}
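A hedged summary of the static shapes this function pins, written out for the decoder_with_past call in the constructor below (input_size = 1, kvcache_size = max_sequence_length = 448). The head count, head size, and hidden size are illustrative placeholders, and the cache_position branch is omitted because it is truncated in this diff.

```cpp
#include <map>
#include <string>
#include <openvino/openvino.hpp>

// Expected shapes after reshape_to_static(decoder_with_past_model, 1, 448);
// 8 heads with head size 64 (hidden size 512) are assumed for illustration.
const std::map<std::string, ov::PartialShape> expected_static_shapes_sketch = {
    {"input_ids", {1, 1}},
    {"attention_mask", {1, 448 + 1}},
    {"encoder_hidden_states", {1, 1500, 512}},
    {"past_key_values.0.decoder.key", {1, 8, 448 - 1, 64}},
    {"past_key_values.0.decoder.value", {1, 8, 448 - 1, 64}},
    {"past_key_values.0.encoder.key", {1, 8, 1500, 64}},
    {"past_key_values.0.encoder.value", {1, 8, 1500, 64}},
};
```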
@@ -496,8 +482,7 @@ void preprocess_decoder(std::shared_ptr<ov::Model> model) {
}

for (auto tensor : model->outputs()) {
//preprocessor.output(tensor.get_any_name()).tensor().set_element_type(ov::element::Type_t::f16);
if (tensor.get_any_name().find("present") != std::string::npos) { // "present" for models from arch team
if (tensor.get_any_name().find("present") != std::string::npos) {
preprocessor.output(tensor.get_any_name()).tensor().set_element_type(ov::element::Type_t::f16);
preprocessor.output(tensor.get_any_name()).postprocess().convert_element_type();

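A generic sketch of the ov::preprocess::PrePostProcessor pattern used above for the "present" outputs: declare the output tensor as f16 and let a post-processing step insert the conversion. The final build() call is assumed to happen elsewhere in preprocess_decoder() and is not visible in this diff.

```cpp
#include <memory>
#include <string>
#include <openvino/core/preprocess/pre_post_process.hpp>
#include <openvino/openvino.hpp>

std::shared_ptr<ov::Model> expose_output_as_f16_sketch(const std::shared_ptr<ov::Model>& model,
                                                       const std::string& output_name) {
    ov::preprocess::PrePostProcessor preprocessor(model);
    preprocessor.output(output_name).tensor().set_element_type(ov::element::f16);
    preprocessor.output(output_name).postprocess().convert_element_type();
    return preprocessor.build();
}
```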
@@ -550,49 +535,22 @@ WhisperPipeline::StaticWhisperPipeline::StaticWhisperPipeline(const std::filesys
OPENVINO_THROW("StaticWhisperPipeline expects decoder model has \"attention_mask\" input!");
}

// TODO: There must be model reshape to eliminate dynamism!
size_t max_sequence_length = 128;
size_t max_sequence_length = 448;

reshape_to_static_encoder(encoder_model);
reshape_to_static(decoder_model, 4, 4); // What is 4 here??
reshape_to_static(decoder_model, 4, 4);
reshape_to_static(decoder_with_past_model, 1, max_sequence_length);

// Replace KV-tensors for the entire cache to tensors only for new token
decoder_with_past_model = redirect_new_kv_to_output(decoder_with_past_model);

ov::AnyMap config_encoder = {
{"NPU_COMPILATION_MODE_PARAMS", "compute-layers-with-higher-precision=Sqrt,Power,ReduceMean,Add"},
{"NPU_USE_NPUW", "YES"},
{"NPUW_ONLINE_PIPELINE", "NONE"},
//{"NPUW_FOLD", "YES"},
//{"NPUW_DCOFF_TYPE", "f16"},
//{"NPUW_DCOFF_SCALE", "YES"},
{"NPUW_DEVICES", "CPU"}};

ov::AnyMap config = {
{"NPU_COMPILATION_MODE_PARAMS", "compute-layers-with-higher-precision=Sqrt,Power,ReduceMean,Add"},
{"NPU_USE_NPUW", "YES"},
//{"NPUW_FOLD", "YES"},
//{"NPUW_DCOFF_TYPE", "f16"},
//{"NPUW_DCOFF_SCALE", "YES"},
{"NPUW_DEVICES", "CPU"}};

preprocess_encoder(encoder_model);
preprocess_decoder(decoder_model);
preprocess_decoder(decoder_with_past_model);

std::cout << "[DEBUG] All model modifications are done, saving models..." << std::endl;
ov::save_model(encoder_model, models_path / "0_openvino_encoder_model_attn.xml");
ov::save_model(decoder_model, models_path / "0_openvino_decoder_model_attn.xml");
ov::save_model(decoder_with_past_model, models_path / "0_openvino_decoder_with_past_model_attn.xml");

m_models.encoder = core.compile_model(encoder_model, "NPU", config_encoder).create_infer_request();
std::cout << "[DEBUG] Compile encoder model - DONE" << std::endl;
m_models.decoder = core.compile_model(decoder_model, "NPU", config_encoder).create_infer_request();
std::cout << "[DEBUG] Compile decoder model - DONE" << std::endl;
m_models.decoder_with_past =
core.compile_model(decoder_with_past_model, "NPU", config_encoder).create_infer_request();
std::cout << "[DEBUG] Compile decoder with past model - DONE" << std::endl;
m_models.encoder = core.compile_model(encoder_model, "NPU").create_infer_request();
m_models.decoder = core.compile_model(decoder_model, "NPU").create_infer_request();
m_models.decoder_with_past = core.compile_model(decoder_with_past_model, "NPU").create_infer_request();

// If eos_token_id was not provided, take value
if (m_generation_config.eos_token_id == -1) {
