[NPU] Qwen2 groupwise performance optimization (intel-analytics#12299)

* qwen2 gw performance opt
* remove debug
ch1y0q · Oct 30, 2024 · 0763268 · 0763268
1 parent 41b8064
commit 0763268
Showing 1 changed file with 4 additions and 1 deletion.
diff --git a/python/llm/src/ipex_llm/transformers/npu_models/qwen2_mp.py b/python/llm/src/ipex_llm/transformers/npu_models/qwen2_mp.py
@@ -229,7 +229,10 @@ def __init__(
             new_value_states = self.convert_to_fp16(curr_key_values[i][1])
 
         print(f"{mode} start compiling")
-        self.compile()
+        if group_size != 0 and (mode == "prefill" or num_layers == 2):
+            self.compile(npu_dpu_groups=6)
+        else:
+            self.compile()
         print(f"{mode} end compiling")
 
     def build_decoder(