diff --git a/petagraph/convert_weights.py b/petagraph/convert_weights.py
new file mode 100644
index 00000000..7663399a
--- /dev/null
+++ b/petagraph/convert_weights.py
@@ -0,0 +1,141 @@
+import json
+from pathlib import Path
+from typing import Optional
+
+import nanotron
+import torch
+from nanotron.config import LlamaConfig as NanotronLlamaConfig
+from nanotron.models.llama import LlamaForTraining
+from nanotron.trainer import mark_tied_parameters
+
+
+def get_weight_mapping(config: NanotronLlamaConfig, nt_to_hf: bool = True) -> dict[str, str]:
+    """Returns the nanotron to huggingface parameter mapping if `nt_to_hf`, otherwise the
+    huggingface to nanotron mapping."""
+
+    hf_to_nt_map = {}
+    hf_to_nt_map["lm_head.weight"] = "model.lm_head.pp_block.weight"
+    hf_to_nt_map["model.embed_tokens.weight"] = "model.token_position_embeddings.pp_block.token_embedding.weight"
+    hf_to_nt_map["model.norm.weight"] = "model.final_layer_norm.pp_block.weight"
+    hf_to_nt_map["model.embed_tokens.weight"] = "model.token_position_embeddings.pp_block.token_embedding.weight"
+
+    for i in range(config.num_hidden_layers):
+        hf_prefix = f"model.layers.{i}"
+        nt_prefix = f"model.decoder.{i}.pp_block"
+        hf_to_nt_map[f"{hf_prefix}.self_attn.q_proj.weight"] = f"{nt_prefix}.attn.qkv_proj.weight"
+        hf_to_nt_map[f"{hf_prefix}.self_attn.k_proj.weight"] = f"{nt_prefix}.attn.qkv_proj.weight"
+        hf_to_nt_map[f"{hf_prefix}.self_attn.v_proj.weight"] = f"{nt_prefix}.attn.qkv_proj.weight"
+        hf_to_nt_map[f"{hf_prefix}.self_attn.o_proj.weight"] = f"{nt_prefix}.attn.o_proj.weight"
+        hf_to_nt_map[f"{hf_prefix}.mlp.gate_proj.weight"] = f"{nt_prefix}.mlp.gate_up_proj.weight"
+        hf_to_nt_map[f"{hf_prefix}.mlp.gate_proj.bias"] = f"{nt_prefix}.mlp.gate_up_proj.bias"
+        hf_to_nt_map[f"{hf_prefix}.mlp.up_proj.weight"] = f"{nt_prefix}.mlp.gate_up_proj.weight"
+        hf_to_nt_map[f"{hf_prefix}.mlp.up_proj.bias"] = f"{nt_prefix}.mlp.gate_up_proj.bias"
+        hf_to_nt_map[f"{hf_prefix}.mlp.down_proj.weight"] = f"{nt_prefix}.mlp.down_proj.weight"
+        hf_to_nt_map[f"{hf_prefix}.mlp.down_proj.bias"] = f"{nt_prefix}.mlp.down_proj.bias"
+        hf_to_nt_map[f"{hf_prefix}.input_layernorm.weight"] = f"{nt_prefix}.input_layernorm.weight"
+        hf_to_nt_map[f"{hf_prefix}.post_attention_layernorm.weight"] = f"{nt_prefix}.post_attention_layernorm.weight"
+
+    if nt_to_hf:
+        nt_to_hf_map = {}
+        for hf, nt in hf_to_nt_map.items():
+            # Because the qkv and gate_up projections are separated in the
+            # huggingface format, when we return nanotron to huggingface
+            # we will need to return a list of parameters instead (e.g.
+            # the `qkv_proj` will point to a list `[q_proj, k_proj, v_proj]`).
+            if nt in nt_to_hf_map and isinstance(nt_to_hf_map[nt], list):
+                nt_to_hf_map[nt].append(hf)
+            elif nt in nt_to_hf_map:
+                nt_to_hf_map[nt] = [nt_to_hf_map[nt], hf]
+            else:
+                nt_to_hf_map[nt] = hf
+        return nt_to_hf_map
+    return hf_to_nt_map
+
+
+def get_config_mapping(nt_to_hf: bool = True) -> dict[str, str]:
+    """Returns either the nanotron to huggingface (if `nt_to_hf`)
+    configuration mapping, or the huggingface to nanotron."""
+
+    hf_to_nt_map = {
+        "bos_token_id": "bos_token_id",
+        "eos_token_id": "eos_token_id",
+        "hidden_act": "hidden_act",
+        "hidden_size": "hidden_size",
+        "initializer_range": "initializer_range",
+        "intermediate_size": "intermediate_size",
+        "max_position_embeddings": "max_position_embeddings",
+        "num_attention_heads": "num_attention_heads",
+        "num_hidden_layers": "num_hidden_layers",
+        "num_key_value_heads": "num_key_value_heads",
+        "pad_token_id": "pad_token_id",
+        "pretraining_tp": "pretraining_tp",
+        "rms_norm_eps": "rms_norm_eps",
+        "rope_scaling": "rope_scaling",
+        "rope_theta": "rope_theta",
+        "tie_word_embeddings": "tie_word_embeddings",
+        "use_cache": "use_cache",
+        "vocab_size": "vocab_size",
+    }
+    if nt_to_hf:
+        return {nt: hf for hf, nt in hf_to_nt_map.items()}
+    return hf_to_nt_map
+
+
+def make_parallel_config(
+    dp: int = 1,
+    pp: int = 1,
+    tp: int = 1,
+):
+    parallel_config = nanotron.config.ParallelismArgs(
+        dp=dp,
+        pp=pp,
+        tp=tp,
+        pp_engine=nanotron.config.AllForwardAllBackwardPipelineEngine(),
+        tp_mode=nanotron.config.TensorParallelLinearMode.ALL_REDUCE,
+        tp_linear_async_communication=False,
+    )
+    return parallel_config
+
+
+def load_nanotron_model(
+    model_config: Optional[NanotronLlamaConfig] = None,
+    device: torch.device = torch.device("cuda"),
+    dtype: torch.dtype = torch.bfloat16,
+    checkpoint_path: Optional[Path] = None,
+) -> LlamaForTraining:
+    """
+    Creates and returns a nanotron model.
+    If `model_config` is None, then `checkpoint_path` must be set, in which case
+    the configuration will be loaded from such path.
+    If `checkpoint_path` is None, then `model_config` must be set, in which case
+    the model created will have random weights.
+    """
+
+    if model_config is None:
+        assert checkpoint_path is not None
+        with open(checkpoint_path / "model_config.json") as f:
+            model_config = NanotronLlamaConfig(**json.load(f))
+    parallel_config = make_parallel_config()
+    parallel_context = nanotron.parallel.ParallelContext(
+        data_parallel_size=parallel_config.dp,
+        pipeline_parallel_size=parallel_config.pp,
+        tensor_parallel_size=parallel_config.tp,
+    )
+    nanotron_model = nanotron.models.build_model(
+        model_builder=lambda: LlamaForTraining(
+            config=model_config,
+            parallel_context=parallel_context,
+            parallel_config=parallel_config,
+            random_states=None,
+        ),
+        parallel_context=parallel_context,
+        dtype=dtype,
+        device=device,
+    )
+    mark_tied_parameters(model=nanotron_model, parallel_context=parallel_context)
+    # Load checkpoint directly in memory and then only keep the state dictionary
+    if checkpoint_path is not None:
+        nanotron.serialize.load_weights(
+            model=nanotron_model, parallel_context=parallel_context, root_folder=checkpoint_path
+        )
+    return nanotron_model
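For reference, a minimal usage sketch (not part of the diff) of the two mapping helpers above; the checkpoint directory is hypothetical and only needs to contain the same model_config.json that load_nanotron_model reads:

    import json
    from pathlib import Path

    from nanotron.config import LlamaConfig as NanotronLlamaConfig

    from petagraph.convert_weights import get_config_mapping, get_weight_mapping

    checkpoint_path = Path("checkpoints/llama-nanotron")  # hypothetical location

    # Build the nanotron config the same way load_nanotron_model does.
    with open(checkpoint_path / "model_config.json") as f:
        model_config = NanotronLlamaConfig(**json.load(f))

    # nanotron -> huggingface: fused projections map to lists of HF parameter names.
    nt_to_hf = get_weight_mapping(model_config, nt_to_hf=True)
    print(nt_to_hf["model.decoder.0.pp_block.attn.qkv_proj.weight"])
    # ['model.layers.0.self_attn.q_proj.weight',
    #  'model.layers.0.self_attn.k_proj.weight',
    #  'model.layers.0.self_attn.v_proj.weight']

    # Configuration field names are shared one-to-one between the two formats.
    print(get_config_mapping(nt_to_hf=True)["rms_norm_eps"])  # -> "rms_norm_eps"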