Commit e0f3be7

update
plusbang committed Jun 28, 2024
1 parent 542a23c commit e0f3be7
Showing 3 changed files with 6 additions and 11 deletions.
@@ -29,3 +29,7 @@ NUM_GPUS=2 # number of used GPU
 # To run chatglm3-6b
 CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
   generate.py --repo-id-or-model-path 'THUDM/chatglm3-6b' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
+
+# # To run glm-4-9b-chat
+# CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
+#   generate.py --repo-id-or-model-path 'THUDM/glm-4-9b-chat' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
11 changes: 1 addition & 10 deletions python/llm/src/ipex_llm/transformers/models/chatglm4.py
@@ -131,13 +131,9 @@ def chatglm4_attention_forward(
     self, hidden_states, attention_mask, rotary_pos_emb, kv_cache=None, use_cache=True
 ):
     # hidden_states: [b, sq, h]
-    # torch.set_printoptions(edgeitems=hidden_states.numel())
-    # print(f'On my rank {torch.distributed.get_rank()}, INPUT IS {hidden_states}, AND {attention_mask}, AND {rotary_pos_emb}, AND {kv_cache}')
-    # print(f'ON ATTENTION FORWARD, INPUT IS {hidden_states}, AND {attention_mask}, AND {rotary_pos_emb}, AND {kv_cache}')
     bsz, q_len, _ = hidden_states.size()

     # past_key_value: [bsz, n_kv_head, seq_len, head_dim]
-    # print(f'On my rank {torch.distributed.get_rank()}, {kv_cache}')
     past_key_value = None if kv_cache is None else (kv_cache[0],
                                                     kv_cache[1])

@@ -233,12 +229,8 @@ def chatglm4_attention_forward(
         attn_weights = attn_weights + attention_mask
     attn_weights = torch.nn.functional.softmax(attn_weights, dim=-1,
                                                dtype=torch.float32).to(value_states.dtype)
-    # print(f'On my rank {torch.distributed.get_rank()}, attn_weights IS {attn_weights}, AND value_states IS {value_states}')
-    # print(f'ON ATTENTION FORWARD, attn_weights IS {attn_weights}, AND value_states IS {value_states}')
     attn_output = torch.matmul(attn_weights, value_states)

-    # print(f'On my rank {torch.distributed.get_rank()}, ATTEN OUTPUT IS {attn_output}')
-    # print(f'ON ATTENTION FORWARD, ATTEN OUTPUT IS {attn_output}')
     # context_layer's shape: [bsz, n_head, seq_len, head_dim] -> [seq_len, bsz, n_head * head_dim]
     attn_output = attn_output.transpose(1, 2).contiguous().view(bsz, q_len, n_head * head_dim)
     output = self.dense(attn_output)
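For readers of this hunk, here is a minimal, runnable PyTorch sketch of the attention-output path that the surviving lines implement: additive mask, float32 softmax over the scores, weighted sum of the values, reshape, and output projection. The tensor shapes and the standalone dense projection are illustrative assumptions, not the actual ipex_llm module attributes.

import torch

bsz, n_head, q_len, head_dim = 1, 2, 4, 8
attn_weights = torch.randn(bsz, n_head, q_len, q_len)      # raw attention scores
attention_mask = torch.zeros(bsz, 1, q_len, q_len)          # additive mask (0 = keep)
value_states = torch.randn(bsz, n_head, q_len, head_dim)
dense = torch.nn.Linear(n_head * head_dim, n_head * head_dim)

attn_weights = attn_weights + attention_mask
attn_weights = torch.nn.functional.softmax(attn_weights, dim=-1,
                                           dtype=torch.float32).to(value_states.dtype)
attn_output = torch.matmul(attn_weights, value_states)      # [bsz, n_head, q_len, head_dim]
attn_output = attn_output.transpose(1, 2).contiguous().view(bsz, q_len, n_head * head_dim)
output = dense(attn_output)                                  # [bsz, q_len, n_head * head_dim]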
@@ -276,7 +268,7 @@ def chatglm4_encoder_forward(
                 use_reentrant=False
             )
         else:
-            # print(f'On my rank {torch.distributed.get_rank()}, index is {index}')
+            # if kv_caches[index] is not None:
             layer_ret = layer(
                 hidden_states,
                 attention_mask,
@@ -285,7 +277,6 @@ def chatglm4_encoder_forward(
                 use_cache=use_cache
             )
             hidden_states, kv_cache = layer_ret
-            # print(f'ON my rank {torch.distributed.get_rank()}, kv_cache is {kv_cache}')
         if use_cache:
             # token by token decoding, use tuple format
             if kv_caches[0] is not None:
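For orientation, a small self-contained sketch of the per-layer loop pattern these encoder hunks edit around: each layer returns (hidden_states, kv_cache), the hidden states are threaded through the stack, and the per-layer caches are collected when use_cache is set. ToyLayer and encoder_forward are assumed stand-in names, not the ipex_llm GLM classes.

import torch
import torch.nn as nn

class ToyLayer(nn.Module):
    def __init__(self, hidden):
        super().__init__()
        self.proj = nn.Linear(hidden, hidden)

    def forward(self, hidden_states, attention_mask, rotary_pos_emb,
                kv_cache=None, use_cache=True):
        hidden_states = self.proj(hidden_states)
        new_kv_cache = (hidden_states, hidden_states) if use_cache else None
        return hidden_states, new_kv_cache

def encoder_forward(layers, hidden_states, attention_mask=None, rotary_pos_emb=None,
                    kv_caches=None, use_cache=True):
    kv_caches = kv_caches or [None] * len(layers)
    presents = []
    for index, layer in enumerate(layers):
        # run the layer, thread the hidden states through, and collect its kv cache
        hidden_states, kv_cache = layer(hidden_states, attention_mask, rotary_pos_emb,
                                        kv_cache=kv_caches[index], use_cache=use_cache)
        if use_cache:
            presents.append(kv_cache)
    return hidden_states, presents

layers = [ToyLayer(8) for _ in range(2)]
out, caches = encoder_forward(layers, torch.randn(1, 4, 8))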
2 changes: 1 addition & 1 deletion python/llm/src/ipex_llm/transformers/pipeline_parallel.py
@@ -126,7 +126,7 @@ def pipeline_parallel(model, pipeline_parallel_stages):
         if i < layer_start or i >= layer_end:
             model._modules['transformer'].encoder.layers[i] = Dummy_GLMBlock()
         else:
-            model._modules['transformer'].encoder.layers[i].self_attention.num_layers = \
+            model._modules['transformer'].encoder.layers[i].self_attention.layer_number = \
                 i - layer_start

     if local_rank != 0:
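The attribute renamed above is assigned while the model is split across pipeline-parallel stages. Below is a minimal sketch, under assumed names, of that partitioning idea: each rank keeps only its slice of the transformer layers, swaps the rest for no-op dummies, and re-numbers the kept layers from zero (the layer_number = i - layer_start assignment). The slicing arithmetic, DummyBlock, ToyAttention, and ToyBlock are illustrative stand-ins, not the ipex_llm implementation.

import torch.nn as nn

class ToyAttention(nn.Module):
    def __init__(self):
        super().__init__()
        self.layer_number = 0          # re-assigned per pipeline stage below

class ToyBlock(nn.Module):
    def __init__(self):
        super().__init__()
        self.self_attention = ToyAttention()

class DummyBlock(nn.Module):
    """Placeholder for layers owned by other pipeline ranks."""
    def forward(self, hidden_states, *args, **kwargs):
        return hidden_states, None

def partition_layers(layers, local_rank, pipeline_parallel_stages):
    num_layers = len(layers)
    slice_size = (num_layers + pipeline_parallel_stages - 1) // pipeline_parallel_stages
    layer_start = slice_size * local_rank
    layer_end = min(layer_start + slice_size, num_layers)
    for i in range(num_layers):
        if i < layer_start or i >= layer_end:
            layers[i] = DummyBlock()   # not this rank's layer
        else:
            # index the kept layer relative to the start of this stage
            layers[i].self_attention.layer_number = i - layer_start
    return layers

# example: 8 layers split across 2 stages; rank 1 keeps layers 4..7, renumbered 0..3
blocks = nn.ModuleList([ToyBlock() for _ in range(8)])
partition_layers(blocks, local_rank=1, pipeline_parallel_stages=2)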
