Commit e0f3be7

update
plusbang committed Jun 28, 2024
1 parent 542a23c commit e0f3be7
Showing 3 changed files with 6 additions and 11 deletions.
@@ -29,3 +29,7 @@ NUM_GPUS=2 # number of used GPU
 # To run chatglm3-6b
 CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
   generate.py --repo-id-or-model-path 'THUDM/chatglm3-6b' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
+
+# # To run glm-4-9b-chat
+# CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
+#   generate.py --repo-id-or-model-path 'THUDM/glm-4-9b-chat' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
11 changes: 1 addition & 10 deletions python/llm/src/ipex_llm/transformers/models/chatglm4.py
@@ -131,13 +131,9 @@ def chatglm4_attention_forward(
     self, hidden_states, attention_mask, rotary_pos_emb, kv_cache=None, use_cache=True
 ):
     # hidden_states: [b, sq, h]
-    # torch.set_printoptions(edgeitems=hidden_states.numel())
-    # print(f'On my rank {torch.distributed.get_rank()}, INPUT IS {hidden_states}, AND {attention_mask}, AND {rotary_pos_emb}, AND {kv_cache}')
-    # print(f'ON ATTENTION FORWARD, INPUT IS {hidden_states}, AND {attention_mask}, AND {rotary_pos_emb}, AND {kv_cache}')
     bsz, q_len, _ = hidden_states.size()

     # past_key_value: [bsz, n_kv_head, seq_len, head_dim]
-    # print(f'On my rank {torch.distributed.get_rank()}, {kv_cache}')
     past_key_value = None if kv_cache is None else (kv_cache[0],
                                                     kv_cache[1])

@@ -233,12 +229,8 @@ def chatglm4_attention_forward(
         attn_weights = attn_weights + attention_mask
     attn_weights = torch.nn.functional.softmax(attn_weights, dim=-1,
                                                dtype=torch.float32).to(value_states.dtype)
-    # print(f'On my rank {torch.distributed.get_rank()}, attn_weights IS {attn_weights}, AND value_states IS {value_states}')
-    # print(f'ON ATTENTION FORWARD, attn_weights IS {attn_weights}, AND value_states IS {value_states}')
     attn_output = torch.matmul(attn_weights, value_states)

-    # print(f'On my rank {torch.distributed.get_rank()}, ATTEN OUTPUT IS {attn_output}')
-    # print(f'ON ATTENTION FORWARD, ATTEN OUTPUT IS {attn_output}')
     # context_layer's shape: [bsz, n_head, seq_len, head_dim] -> [seq_len, bsz, n_head * head_dim]
     attn_output = attn_output.transpose(1, 2).contiguous().view(bsz, q_len, n_head * head_dim)
     output = self.dense(attn_output)
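For readers of this hunk, here is a minimal, runnable PyTorch sketch of the attention-output path that the surviving lines implement: additive mask, float32 softmax over the scores, weighted sum of the values, reshape, and output projection. The tensor shapes and the standalone dense projection are illustrative assumptions, not the actual ipex_llm module attributes.

import torch

bsz, n_head, q_len, head_dim = 1, 2, 4, 8
attn_weights = torch.randn(bsz, n_head, q_len, q_len)      # raw attention scores
attention_mask = torch.zeros(bsz, 1, q_len, q_len)          # additive mask (0 = keep)
value_states = torch.randn(bsz, n_head, q_len, head_dim)
dense = torch.nn.Linear(n_head * head_dim, n_head * head_dim)

attn_weights = attn_weights + attention_mask
attn_weights = torch.nn.functional.softmax(attn_weights, dim=-1,
                                           dtype=torch.float32).to(value_states.dtype)
attn_output = torch.matmul(attn_weights, value_states)      # [bsz, n_head, q_len, head_dim]
attn_output = attn_output.transpose(1, 2).contiguous().view(bsz, q_len, n_head * head_dim)
output = dense(attn_output)                                  # [bsz, q_len, n_head * head_dim]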
@@ -276,7 +268,7 @@ def chatglm4_encoder_forward(
                 use_reentrant=False
             )
         else:
-            # print(f'On my rank {torch.distributed.get_rank()}, index is {index}')
+            # if kv_caches[index] is not None:
             layer_ret = layer(
                 hidden_states,
                 attention_mask,
@@ -285,7 +277,6 @@ def chatglm4_encoder_forward(
                 use_cache=use_cache
             )
             hidden_states, kv_cache = layer_ret
-            # print(f'ON my rank {torch.distributed.get_rank()}, kv_cache is {kv_cache}')
         if use_cache:
             # token by token decoding, use tuple format
             if kv_caches[0] is not None:
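For orientation, a small self-contained sketch of the per-layer loop pattern these encoder hunks edit around: each layer returns (hidden_states, kv_cache), the hidden states are threaded through the stack, and the per-layer caches are collected when use_cache is set. ToyLayer and encoder_forward are assumed stand-in names, not the ipex_llm GLM classes.

import torch
import torch.nn as nn

class ToyLayer(nn.Module):
    def __init__(self, hidden):
        super().__init__()
        self.proj = nn.Linear(hidden, hidden)

    def forward(self, hidden_states, attention_mask, rotary_pos_emb,
                kv_cache=None, use_cache=True):
        hidden_states = self.proj(hidden_states)
        new_kv_cache = (hidden_states, hidden_states) if use_cache else None
        return hidden_states, new_kv_cache

def encoder_forward(layers, hidden_states, attention_mask=None, rotary_pos_emb=None,
                    kv_caches=None, use_cache=True):
    kv_caches = kv_caches or [None] * len(layers)
    presents = []
    for index, layer in enumerate(layers):
        # run the layer, thread the hidden states through, and collect its kv cache
        hidden_states, kv_cache = layer(hidden_states, attention_mask, rotary_pos_emb,
                                        kv_cache=kv_caches[index], use_cache=use_cache)
        if use_cache:
            presents.append(kv_cache)
    return hidden_states, presents

layers = [ToyLayer(8) for _ in range(2)]
out, caches = encoder_forward(layers, torch.randn(1, 4, 8))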
2 changes: 1 addition & 1 deletion python/llm/src/ipex_llm/transformers/pipeline_parallel.py
@@ -126,7 +126,7 @@ def pipeline_parallel(model, pipeline_parallel_stages):
         if i < layer_start or i >= layer_end:
             model._modules['transformer'].encoder.layers[i] = Dummy_GLMBlock()
         else:
-            model._modules['transformer'].encoder.layers[i].self_attention.num_layers = \
+            model._modules['transformer'].encoder.layers[i].self_attention.layer_number = \
                 i - layer_start

     if local_rank != 0:
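The attribute renamed above is assigned while the model is split across pipeline-parallel stages. Below is a minimal sketch, under assumed names, of that partitioning idea: each rank keeps only its slice of the transformer layers, swaps the rest for no-op dummies, and re-numbers the kept layers from zero (the layer_number = i - layer_start assignment). The slicing arithmetic, DummyBlock, ToyAttention, and ToyBlock are illustrative stand-ins, not the ipex_llm implementation.

import torch.nn as nn

class ToyAttention(nn.Module):
    def __init__(self):
        super().__init__()
        self.layer_number = 0          # re-assigned per pipeline stage below

class ToyBlock(nn.Module):
    def __init__(self):
        super().__init__()
        self.self_attention = ToyAttention()

class DummyBlock(nn.Module):
    """Placeholder for layers owned by other pipeline ranks."""
    def forward(self, hidden_states, *args, **kwargs):
        return hidden_states, None

def partition_layers(layers, local_rank, pipeline_parallel_stages):
    num_layers = len(layers)
    slice_size = (num_layers + pipeline_parallel_stages - 1) // pipeline_parallel_stages
    layer_start = slice_size * local_rank
    layer_end = min(layer_start + slice_size, num_layers)
    for i in range(num_layers):
        if i < layer_start or i >= layer_end:
            layers[i] = DummyBlock()   # not this rank's layer
        else:
            # index the kept layer relative to the start of this stage
            layers[i].self_attention.layer_number = i - layer_start
    return layers

# example: 8 layers split across 2 stages; rank 1 keeps layers 4..7, renumbered 0..3
blocks = nn.ModuleList([ToyBlock() for _ in range(8)])
partition_layers(blocks, local_rank=1, pipeline_parallel_stages=2)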
