From c867668c1789013c1cb8e767de5ef166a76f8bf5 Mon Sep 17 00:00:00 2001 From: Valko Milev Date: Thu, 12 Jan 2023 21:19:05 +0200 Subject: [PATCH 1/5] added bert checkpoint converter --- tools/checkpoint_conversion/convert_bert.py | 1319 +++++++++++++++++++ 1 file changed, 1319 insertions(+) create mode 100644 tools/checkpoint_conversion/convert_bert.py diff --git a/tools/checkpoint_conversion/convert_bert.py b/tools/checkpoint_conversion/convert_bert.py new file mode 100644 index 0000000000..8b119e7317 --- /dev/null +++ b/tools/checkpoint_conversion/convert_bert.py @@ -0,0 +1,1319 @@ +import os +import shutil + +import numpy as np +import tensorflow as tf +import torch +import transformers +from absl import app +from absl import flags +from tensorflow import keras +import sys +import keras_nlp + +FLAGS = flags.FLAGS +PRESET_MAP = { + "bert_tiny_uncased_en": "bert_tiny_uncased_en" +} + +flags.DEFINE_string( + "preset", None, f'Must be one of {",".join(PRESET_MAP.keys())}' +) +MODEL_SUFFIX = "uncased" +# MODEL_SPEC_STR = "L-24_H-1024_A-16" +MODEL_SPEC_STR = "L-2_H-128_A-2" + + +def download_model(preset, TOKEN_TYPE, hf_model_name): + print("-> Download original weights in " + preset) + zip_path = f"""https://storage.googleapis.com/tf_model_garden/nlp/bert/v3/{MODEL_SUFFIX}_L-12_H-768_A-12.tar.gz""" + path_file = keras.utils.get_file( + f"""/home/x000ff4/open_source/486/keras-nlp/tools/checkpoint_conversion/{preset}.tar.gz""", + zip_path, + extract=False, + archive_format="tar", + ) + print('path_file', path_file) + extract_dir = f"./content/{preset}/tmp/temp_dir/raw/" + vocab_path = os.path.join(extract_dir, "vocab.txt") + checkpoint_path = os.path.join(extract_dir, "bert_model.ckpt") + config_path = os.path.join(extract_dir, "bert_config.json") + + os.system(f"tar -C ./content/{preset} -xvf {path_file}") + + vars = tf.train.list_variables(checkpoint_path) + weights = {} + for name, shape in vars: + weight = tf.train.load_variable(checkpoint_path, name) + weights[name] = weight + print(name) + model = keras_nlp.models.BertBackbone.from_preset("bert_tiny_en_uncased", + load_weights=True) # keras_nlp.models.BertBase(vocabulary_size=VOCAB_SIZE) + model.summary() + return vocab_path, checkpoint_path, config_path, weights, model + + +def convert_checkpoints(preset, weights, model): + print("\n-> Convert original weights to KerasNLP format.") + + # Transformer layers. 
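+    # The branches below copy the downloaded checkpoint variables into the
+    # KerasNLP BertBackbone. Two key layouts appear across the presets handled
+    # here (both taken from the assignments that follow, not from any external
+    # spec):
+    #   * TF Model Garden object-based checkpoints, e.g.
+    #     "encoder/layer_with_weights-0/embeddings/.ATTRIBUTES/VARIABLE_VALUE"
+    #   * original BERT checkpoints, e.g.
+    #     "bert/encoder/layer_0/attention/self/query/kernel", whose fused
+    #     attention kernels get reshaped to (EMBEDDING_SIZE, NUM_ATTN_HEADS, -1)
+    #     before being assigned to the per-head dense layers.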
+ if preset == 'bert_base_cased': + model.get_layer("token_embedding").embeddings.assign( + weights[ + "encoder/layer_with_weights-0/embeddings/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer("position_embedding").position_embeddings.assign( + weights[ + "encoder/layer_with_weights-1/embeddings/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer("segment_embedding").embeddings.assign( + weights[ + "encoder/layer_with_weights-2/embeddings/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer("embeddings_layer_norm").gamma.assign( + weights["encoder/layer_with_weights-3/gamma/.ATTRIBUTES/VARIABLE_VALUE"] + ) + model.get_layer("embeddings_layer_norm").beta.assign( + weights["encoder/layer_with_weights-3/beta/.ATTRIBUTES/VARIABLE_VALUE"] + ) + + for i in range(model.num_layers): + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._key_dense.kernel.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_attention_layer/_key_dense/kernel/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._key_dense.bias.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_attention_layer/_key_dense/bias/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._query_dense.kernel.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_attention_layer/_query_dense/kernel/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._query_dense.bias.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_attention_layer/_query_dense/bias/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._value_dense.kernel.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_attention_layer/_value_dense/kernel/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._value_dense.bias.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_attention_layer/_value_dense/bias/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._output_dense.kernel.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_attention_layer/_output_dense/kernel/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._output_dense.bias.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_attention_layer/_output_dense/bias/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layernorm.gamma.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_attention_layer_norm/gamma/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layernorm.beta.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_attention_layer_norm/beta/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._feedforward_intermediate_dense.kernel.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_intermediate_dense/kernel/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._feedforward_intermediate_dense.bias.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_intermediate_dense/bias/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._feedforward_output_dense.kernel.assign( + weights[ + f"encoder/layer_with_weights-{i + 
4}/_output_dense/kernel/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._feedforward_output_dense.bias.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_output_dense/bias/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._feedforward_layernorm.gamma.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_output_layer_norm/gamma/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._feedforward_layernorm.beta.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_output_layer_norm/beta/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + + model.get_layer("pooled_dense").kernel.assign( + weights["encoder/layer_with_weights-16/kernel/.ATTRIBUTES/VARIABLE_VALUE"] + ) + model.get_layer("pooled_dense").bias.assign( + weights["encoder/layer_with_weights-16/bias/.ATTRIBUTES/VARIABLE_VALUE"] + ) + elif preset == 'bert_base_multi_cased': + model.get_layer("token_embedding").embeddings.assign( + weights[ + "encoder/layer_with_weights-0/embeddings/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer("position_embedding").position_embeddings.assign( + weights[ + "encoder/layer_with_weights-1/embeddings/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer("segment_embedding").embeddings.assign( + weights[ + "encoder/layer_with_weights-2/embeddings/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer("embeddings_layer_norm").gamma.assign( + weights["encoder/layer_with_weights-3/gamma/.ATTRIBUTES/VARIABLE_VALUE"] + ) + model.get_layer("embeddings_layer_norm").beta.assign( + weights["encoder/layer_with_weights-3/beta/.ATTRIBUTES/VARIABLE_VALUE"] + ) + + for i in range(model.num_layers): + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._key_dense.kernel.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_attention_layer/_key_dense/kernel/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._key_dense.bias.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_attention_layer/_key_dense/bias/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._query_dense.kernel.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_attention_layer/_query_dense/kernel/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._query_dense.bias.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_attention_layer/_query_dense/bias/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._value_dense.kernel.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_attention_layer/_value_dense/kernel/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._value_dense.bias.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_attention_layer/_value_dense/bias/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._output_dense.kernel.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_attention_layer/_output_dense/kernel/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._output_dense.bias.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_attention_layer/_output_dense/bias/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + 
)._self_attention_layernorm.gamma.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_attention_layer_norm/gamma/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layernorm.beta.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_attention_layer_norm/beta/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._feedforward_intermediate_dense.kernel.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_intermediate_dense/kernel/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._feedforward_intermediate_dense.bias.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_intermediate_dense/bias/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._feedforward_output_dense.kernel.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_output_dense/kernel/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._feedforward_output_dense.bias.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_output_dense/bias/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._feedforward_layernorm.gamma.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_output_layer_norm/gamma/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._feedforward_layernorm.beta.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_output_layer_norm/beta/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + + model.get_layer("pooled_dense").kernel.assign( + weights[ + f"encoder/layer_with_weights-{model.num_layers + 4}/kernel/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer("pooled_dense").bias.assign( + weights[ + f"encoder/layer_with_weights-{model.num_layers + 4}/bias/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + pass + elif preset == 'bert_base_uncased': + model.get_layer("token_embedding").embeddings.assign( + weights[ + "encoder/layer_with_weights-0/embeddings/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer("position_embedding").position_embeddings.assign( + weights[ + "encoder/layer_with_weights-1/embeddings/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer("segment_embedding").embeddings.assign( + weights[ + "encoder/layer_with_weights-2/embeddings/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer("embeddings_layer_norm").gamma.assign( + weights["encoder/layer_with_weights-3/gamma/.ATTRIBUTES/VARIABLE_VALUE"] + ) + model.get_layer("embeddings_layer_norm").beta.assign( + weights["encoder/layer_with_weights-3/beta/.ATTRIBUTES/VARIABLE_VALUE"] + ) + + for i in range(model.num_layers): + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._key_dense.kernel.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_attention_layer/_key_dense/kernel/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._key_dense.bias.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_attention_layer/_key_dense/bias/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._query_dense.kernel.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_attention_layer/_query_dense/kernel/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._query_dense.bias.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_attention_layer/_query_dense/bias/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + 
model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._value_dense.kernel.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_attention_layer/_value_dense/kernel/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._value_dense.bias.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_attention_layer/_value_dense/bias/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._output_dense.kernel.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_attention_output_dense/kernel/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._output_dense.bias.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_attention_output_dense/bias/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layernorm.gamma.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_attention_layer_norm/gamma/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layernorm.beta.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_attention_layer_norm/beta/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._feedforward_intermediate_dense.kernel.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_intermediate_dense/kernel/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._feedforward_intermediate_dense.bias.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_intermediate_dense/bias/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._feedforward_output_dense.kernel.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_output_dense/kernel/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._feedforward_output_dense.bias.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_output_dense/bias/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._feedforward_layernorm.gamma.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_output_layer_norm/gamma/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._feedforward_layernorm.beta.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_output_layer_norm/beta/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + + model.get_layer("pooled_dense").kernel.assign( + weights["next_sentence..pooler_dense/kernel/.ATTRIBUTES/VARIABLE_VALUE"] + ) + model.get_layer("pooled_dense").bias.assign( + weights["next_sentence..pooler_dense/bias/.ATTRIBUTES/VARIABLE_VALUE"] + ) + pass + elif preset == 'bert_base_zh': + model.get_layer("token_embedding").embeddings.assign( + weights[ + "encoder/layer_with_weights-0/embeddings/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer("position_embedding").position_embeddings.assign( + weights[ + "encoder/layer_with_weights-1/embeddings/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer("segment_embedding").embeddings.assign( + weights[ + "encoder/layer_with_weights-2/embeddings/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer("embeddings_layer_norm").gamma.assign( + weights["encoder/layer_with_weights-3/gamma/.ATTRIBUTES/VARIABLE_VALUE"] + ) + model.get_layer("embeddings_layer_norm").beta.assign( + weights["encoder/layer_with_weights-3/beta/.ATTRIBUTES/VARIABLE_VALUE"] + ) + + for i in range(model.num_layers): + model.get_layer( + 
f"transformer_layer_{i}" + )._self_attention_layer._key_dense.kernel.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_attention_layer/_key_dense/kernel/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._key_dense.bias.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_attention_layer/_key_dense/bias/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._query_dense.kernel.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_attention_layer/_query_dense/kernel/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._query_dense.bias.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_attention_layer/_query_dense/bias/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._value_dense.kernel.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_attention_layer/_value_dense/kernel/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._value_dense.bias.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_attention_layer/_value_dense/bias/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._output_dense.kernel.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_attention_layer/_output_dense/kernel/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._output_dense.bias.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_attention_layer/_output_dense/bias/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layernorm.gamma.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_attention_layer_norm/gamma/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layernorm.beta.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_attention_layer_norm/beta/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._feedforward_intermediate_dense.kernel.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_intermediate_dense/kernel/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._feedforward_intermediate_dense.bias.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_intermediate_dense/bias/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._feedforward_output_dense.kernel.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_output_dense/kernel/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._feedforward_output_dense.bias.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_output_dense/bias/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._feedforward_layernorm.gamma.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_output_layer_norm/gamma/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._feedforward_layernorm.beta.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_output_layer_norm/beta/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + + model.get_layer("pooled_dense").kernel.assign( + weights[ + f"encoder/layer_with_weights-{model.num_layers + 4}/kernel/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + 
model.get_layer("pooled_dense").bias.assign( + weights[ + f"encoder/layer_with_weights-{model.num_layers + 4}/bias/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + pass + elif preset == 'bert_large_cased_en': + model.get_layer("token_embedding").embeddings.assign( + weights["bert/embeddings/word_embeddings"] + ) + model.get_layer("position_embedding").position_embeddings.assign( + weights["bert/embeddings/position_embeddings"] + ) + model.get_layer("segment_embedding").embeddings.assign( + weights["bert/embeddings/token_type_embeddings"] + ) + model.get_layer("embeddings_layer_norm").gamma.assign( + weights["bert/embeddings/LayerNorm/gamma"] + ) + model.get_layer("embeddings_layer_norm").beta.assign( + weights["bert/embeddings/LayerNorm/beta"] + ) + + for i in range(model.num_layers): + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._key_dense.kernel.assign( + weights[f"bert/encoder/layer_{i}/attention/self/key/kernel"].reshape( + (EMBEDDING_SIZE, NUM_ATTN_HEADS, -1) + ) + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._key_dense.bias.assign( + weights[f"bert/encoder/layer_{i}/attention/self/key/bias"].reshape( + (NUM_ATTN_HEADS, -1) + ) + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._query_dense.kernel.assign( + weights[f"bert/encoder/layer_{i}/attention/self/query/kernel"].reshape( + (EMBEDDING_SIZE, NUM_ATTN_HEADS, -1) + ) + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._query_dense.bias.assign( + weights[f"bert/encoder/layer_{i}/attention/self/query/bias"].reshape( + (NUM_ATTN_HEADS, -1) + ) + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._value_dense.kernel.assign( + weights[f"bert/encoder/layer_{i}/attention/self/value/kernel"].reshape( + (EMBEDDING_SIZE, NUM_ATTN_HEADS, -1) + ) + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._value_dense.bias.assign( + weights[f"bert/encoder/layer_{i}/attention/self/value/bias"].reshape( + (NUM_ATTN_HEADS, -1) + ) + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._output_dense.kernel.assign( + weights[ + f"bert/encoder/layer_{i}/attention/output/dense/kernel" + ].reshape((NUM_ATTN_HEADS, -1, EMBEDDING_SIZE)) + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._output_dense.bias.assign( + weights[f"bert/encoder/layer_{i}/attention/output/dense/bias"] + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layernorm.gamma.assign( + weights[f"bert/encoder/layer_{i}/attention/output/LayerNorm/gamma"] + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layernorm.beta.assign( + weights[f"bert/encoder/layer_{i}/attention/output/LayerNorm/beta"] + ) + model.get_layer( + f"transformer_layer_{i}" + )._feedforward_intermediate_dense.kernel.assign( + weights[f"bert/encoder/layer_{i}/intermediate/dense/kernel"] + ) + model.get_layer( + f"transformer_layer_{i}" + )._feedforward_intermediate_dense.bias.assign( + weights[f"bert/encoder/layer_{i}/intermediate/dense/bias"] + ) + model.get_layer( + f"transformer_layer_{i}" + )._feedforward_output_dense.kernel.assign( + weights[f"bert/encoder/layer_{i}/output/dense/kernel"] + ) + model.get_layer( + f"transformer_layer_{i}" + )._feedforward_output_dense.bias.assign( + weights[f"bert/encoder/layer_{i}/output/dense/bias"] + ) + model.get_layer( + f"transformer_layer_{i}" + )._feedforward_layernorm.gamma.assign( + weights[f"bert/encoder/layer_{i}/output/LayerNorm/gamma"] + ) + model.get_layer( 
+ f"transformer_layer_{i}" + )._feedforward_layernorm.beta.assign( + weights[f"bert/encoder/layer_{i}/output/LayerNorm/beta"] + ) + + model.get_layer("pooled_dense").kernel.assign( + weights["bert/pooler/dense/kernel"] + ) + model.get_layer("pooled_dense").bias.assign(weights["bert/pooler/dense/bias"]) + pass + elif preset == 'bert_large_uncased_en': + model.get_layer("token_embedding").embeddings.assign( + weights["bert/embeddings/word_embeddings"] + ) + model.get_layer("position_embedding").position_embeddings.assign( + weights["bert/embeddings/position_embeddings"] + ) + model.get_layer("segment_embedding").embeddings.assign( + weights["bert/embeddings/token_type_embeddings"] + ) + model.get_layer("embeddings_layer_norm").gamma.assign( + weights["bert/embeddings/LayerNorm/gamma"] + ) + model.get_layer("embeddings_layer_norm").beta.assign( + weights["bert/embeddings/LayerNorm/beta"] + ) + + for i in range(model.num_layers): + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._key_dense.kernel.assign( + weights[f"bert/encoder/layer_{i}/attention/self/key/kernel"].reshape( + (EMBEDDING_SIZE, NUM_ATTN_HEADS, -1) + ) + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._key_dense.bias.assign( + weights[f"bert/encoder/layer_{i}/attention/self/key/bias"].reshape( + (NUM_ATTN_HEADS, -1) + ) + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._query_dense.kernel.assign( + weights[f"bert/encoder/layer_{i}/attention/self/query/kernel"].reshape( + (EMBEDDING_SIZE, NUM_ATTN_HEADS, -1) + ) + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._query_dense.bias.assign( + weights[f"bert/encoder/layer_{i}/attention/self/query/bias"].reshape( + (NUM_ATTN_HEADS, -1) + ) + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._value_dense.kernel.assign( + weights[f"bert/encoder/layer_{i}/attention/self/value/kernel"].reshape( + (EMBEDDING_SIZE, NUM_ATTN_HEADS, -1) + ) + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._value_dense.bias.assign( + weights[f"bert/encoder/layer_{i}/attention/self/value/bias"].reshape( + (NUM_ATTN_HEADS, -1) + ) + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._output_dense.kernel.assign( + weights[ + f"bert/encoder/layer_{i}/attention/output/dense/kernel" + ].reshape((NUM_ATTN_HEADS, -1, EMBEDDING_SIZE)) + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._output_dense.bias.assign( + weights[f"bert/encoder/layer_{i}/attention/output/dense/bias"] + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layernorm.gamma.assign( + weights[f"bert/encoder/layer_{i}/attention/output/LayerNorm/gamma"] + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layernorm.beta.assign( + weights[f"bert/encoder/layer_{i}/attention/output/LayerNorm/beta"] + ) + model.get_layer( + f"transformer_layer_{i}" + )._feedforward_intermediate_dense.kernel.assign( + weights[f"bert/encoder/layer_{i}/intermediate/dense/kernel"] + ) + model.get_layer( + f"transformer_layer_{i}" + )._feedforward_intermediate_dense.bias.assign( + weights[f"bert/encoder/layer_{i}/intermediate/dense/bias"] + ) + model.get_layer( + f"transformer_layer_{i}" + )._feedforward_output_dense.kernel.assign( + weights[f"bert/encoder/layer_{i}/output/dense/kernel"] + ) + model.get_layer( + f"transformer_layer_{i}" + )._feedforward_output_dense.bias.assign( + weights[f"bert/encoder/layer_{i}/output/dense/bias"] + ) + 
model.get_layer( + f"transformer_layer_{i}" + )._feedforward_layernorm.gamma.assign( + weights[f"bert/encoder/layer_{i}/output/LayerNorm/gamma"] + ) + model.get_layer( + f"transformer_layer_{i}" + )._feedforward_layernorm.beta.assign( + weights[f"bert/encoder/layer_{i}/output/LayerNorm/beta"] + ) + + model.get_layer("pooled_dense").kernel.assign( + weights["bert/pooler/dense/kernel"] + ) + model.get_layer("pooled_dense").bias.assign(weights["bert/pooler/dense/bias"]) + pass + elif preset == 'bert_medium_uncased_en': + model.get_layer("token_embedding").embeddings.assign( + weights["bert/embeddings/word_embeddings"] + ) + model.get_layer("position_embedding").position_embeddings.assign( + weights["bert/embeddings/position_embeddings"] + ) + model.get_layer("segment_embedding").embeddings.assign( + weights["bert/embeddings/token_type_embeddings"] + ) + model.get_layer("embeddings_layer_norm").gamma.assign( + weights["bert/embeddings/LayerNorm/gamma"] + ) + model.get_layer("embeddings_layer_norm").beta.assign( + weights["bert/embeddings/LayerNorm/beta"] + ) + + for i in range(model.num_layers): + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._key_dense.kernel.assign( + weights[f"bert/encoder/layer_{i}/attention/self/key/kernel"].reshape( + (EMBEDDING_SIZE, NUM_ATTN_HEADS, -1) + ) + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._key_dense.bias.assign( + weights[f"bert/encoder/layer_{i}/attention/self/key/bias"].reshape( + (NUM_ATTN_HEADS, -1) + ) + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._query_dense.kernel.assign( + weights[f"bert/encoder/layer_{i}/attention/self/query/kernel"].reshape( + (EMBEDDING_SIZE, NUM_ATTN_HEADS, -1) + ) + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._query_dense.bias.assign( + weights[f"bert/encoder/layer_{i}/attention/self/query/bias"].reshape( + (NUM_ATTN_HEADS, -1) + ) + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._value_dense.kernel.assign( + weights[f"bert/encoder/layer_{i}/attention/self/value/kernel"].reshape( + (EMBEDDING_SIZE, NUM_ATTN_HEADS, -1) + ) + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._value_dense.bias.assign( + weights[f"bert/encoder/layer_{i}/attention/self/value/bias"].reshape( + (NUM_ATTN_HEADS, -1) + ) + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._output_dense.kernel.assign( + weights[ + f"bert/encoder/layer_{i}/attention/output/dense/kernel" + ].reshape((NUM_ATTN_HEADS, -1, EMBEDDING_SIZE)) + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._output_dense.bias.assign( + weights[f"bert/encoder/layer_{i}/attention/output/dense/bias"] + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layernorm.gamma.assign( + weights[f"bert/encoder/layer_{i}/attention/output/LayerNorm/gamma"] + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layernorm.beta.assign( + weights[f"bert/encoder/layer_{i}/attention/output/LayerNorm/beta"] + ) + model.get_layer( + f"transformer_layer_{i}" + )._feedforward_intermediate_dense.kernel.assign( + weights[f"bert/encoder/layer_{i}/intermediate/dense/kernel"] + ) + model.get_layer( + f"transformer_layer_{i}" + )._feedforward_intermediate_dense.bias.assign( + weights[f"bert/encoder/layer_{i}/intermediate/dense/bias"] + ) + model.get_layer( + f"transformer_layer_{i}" + )._feedforward_output_dense.kernel.assign( + 
weights[f"bert/encoder/layer_{i}/output/dense/kernel"] + ) + model.get_layer( + f"transformer_layer_{i}" + )._feedforward_output_dense.bias.assign( + weights[f"bert/encoder/layer_{i}/output/dense/bias"] + ) + model.get_layer( + f"transformer_layer_{i}" + )._feedforward_layernorm.gamma.assign( + weights[f"bert/encoder/layer_{i}/output/LayerNorm/gamma"] + ) + model.get_layer( + f"transformer_layer_{i}" + )._feedforward_layernorm.beta.assign( + weights[f"bert/encoder/layer_{i}/output/LayerNorm/beta"] + ) + + model.get_layer("pooled_dense").kernel.assign( + weights["bert/pooler/dense/kernel"] + ) + model.get_layer("pooled_dense").bias.assign(weights["bert/pooler/dense/bias"]) + pass + elif preset == 'bert_small_uncased_en': + model.get_layer("token_embedding").embeddings.assign( + weights["bert/embeddings/word_embeddings"] + ) + model.get_layer("position_embedding").position_embeddings.assign( + weights["bert/embeddings/position_embeddings"] + ) + model.get_layer("segment_embedding").embeddings.assign( + weights["bert/embeddings/token_type_embeddings"] + ) + model.get_layer("embeddings_layer_norm").gamma.assign( + weights["bert/embeddings/LayerNorm/gamma"] + ) + model.get_layer("embeddings_layer_norm").beta.assign( + weights["bert/embeddings/LayerNorm/beta"] + ) + + for i in range(model.num_layers): + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._key_dense.kernel.assign( + weights[f"bert/encoder/layer_{i}/attention/self/key/kernel"].reshape( + (EMBEDDING_SIZE, NUM_ATTN_HEADS, -1) + ) + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._key_dense.bias.assign( + weights[f"bert/encoder/layer_{i}/attention/self/key/bias"].reshape( + (NUM_ATTN_HEADS, -1) + ) + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._query_dense.kernel.assign( + weights[f"bert/encoder/layer_{i}/attention/self/query/kernel"].reshape( + (EMBEDDING_SIZE, NUM_ATTN_HEADS, -1) + ) + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._query_dense.bias.assign( + weights[f"bert/encoder/layer_{i}/attention/self/query/bias"].reshape( + (NUM_ATTN_HEADS, -1) + ) + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._value_dense.kernel.assign( + weights[f"bert/encoder/layer_{i}/attention/self/value/kernel"].reshape( + (EMBEDDING_SIZE, NUM_ATTN_HEADS, -1) + ) + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._value_dense.bias.assign( + weights[f"bert/encoder/layer_{i}/attention/self/value/bias"].reshape( + (NUM_ATTN_HEADS, -1) + ) + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._output_dense.kernel.assign( + weights[ + f"bert/encoder/layer_{i}/attention/output/dense/kernel" + ].reshape((NUM_ATTN_HEADS, -1, EMBEDDING_SIZE)) + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._output_dense.bias.assign( + weights[f"bert/encoder/layer_{i}/attention/output/dense/bias"] + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layernorm.gamma.assign( + weights[f"bert/encoder/layer_{i}/attention/output/LayerNorm/gamma"] + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layernorm.beta.assign( + weights[f"bert/encoder/layer_{i}/attention/output/LayerNorm/beta"] + ) + model.get_layer( + f"transformer_layer_{i}" + )._feedforward_intermediate_dense.kernel.assign( + weights[f"bert/encoder/layer_{i}/intermediate/dense/kernel"] + ) + model.get_layer( + f"transformer_layer_{i}" + )._feedforward_intermediate_dense.bias.assign( + 
weights[f"bert/encoder/layer_{i}/intermediate/dense/bias"] + ) + model.get_layer( + f"transformer_layer_{i}" + )._feedforward_output_dense.kernel.assign( + weights[f"bert/encoder/layer_{i}/output/dense/kernel"] + ) + model.get_layer( + f"transformer_layer_{i}" + )._feedforward_output_dense.bias.assign( + weights[f"bert/encoder/layer_{i}/output/dense/bias"] + ) + model.get_layer( + f"transformer_layer_{i}" + )._feedforward_layernorm.gamma.assign( + weights[f"bert/encoder/layer_{i}/output/LayerNorm/gamma"] + ) + model.get_layer( + f"transformer_layer_{i}" + )._feedforward_layernorm.beta.assign( + weights[f"bert/encoder/layer_{i}/output/LayerNorm/beta"] + ) + + model.get_layer("pooled_dense").kernel.assign( + weights["bert/pooler/dense/kernel"] + ) + model.get_layer("pooled_dense").bias.assign(weights["bert/pooler/dense/bias"]) + pass + elif preset == 'bert_tiny_uncased_en': + model.get_layer("token_embedding").embeddings.assign( + weights["bert/embeddings/word_embeddings"] + ) + model.get_layer("position_embedding").position_embeddings.assign( + weights["bert/embeddings/position_embeddings"] + ) + model.get_layer("segment_embedding").embeddings.assign( + weights["bert/embeddings/token_type_embeddings"] + ) + model.get_layer("embeddings_layer_norm").gamma.assign( + weights["bert/embeddings/LayerNorm/gamma"] + ) + model.get_layer("embeddings_layer_norm").beta.assign( + weights["bert/embeddings/LayerNorm/beta"] + ) + + for i in range(model.num_layers): + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._key_dense.kernel.assign( + weights[f"bert/encoder/layer_{i}/attention/self/key/kernel"].reshape( + (EMBEDDING_SIZE, NUM_ATTN_HEADS, -1) + ) + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._key_dense.bias.assign( + weights[f"bert/encoder/layer_{i}/attention/self/key/bias"].reshape( + (NUM_ATTN_HEADS, -1) + ) + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._query_dense.kernel.assign( + weights[f"bert/encoder/layer_{i}/attention/self/query/kernel"].reshape( + (EMBEDDING_SIZE, NUM_ATTN_HEADS, -1) + ) + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._query_dense.bias.assign( + weights[f"bert/encoder/layer_{i}/attention/self/query/bias"].reshape( + (NUM_ATTN_HEADS, -1) + ) + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._value_dense.kernel.assign( + weights[f"bert/encoder/layer_{i}/attention/self/value/kernel"].reshape( + (EMBEDDING_SIZE, NUM_ATTN_HEADS, -1) + ) + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._value_dense.bias.assign( + weights[f"bert/encoder/layer_{i}/attention/self/value/bias"].reshape( + (NUM_ATTN_HEADS, -1) + ) + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._output_dense.kernel.assign( + weights[ + f"bert/encoder/layer_{i}/attention/output/dense/kernel" + ].reshape((NUM_ATTN_HEADS, -1, EMBEDDING_SIZE)) + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._output_dense.bias.assign( + weights[f"bert/encoder/layer_{i}/attention/output/dense/bias"] + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layernorm.gamma.assign( + weights[f"bert/encoder/layer_{i}/attention/output/LayerNorm/gamma"] + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layernorm.beta.assign( + weights[f"bert/encoder/layer_{i}/attention/output/LayerNorm/beta"] + ) + model.get_layer( + f"transformer_layer_{i}" + )._feedforward_intermediate_dense.kernel.assign( + 
weights[f"bert/encoder/layer_{i}/intermediate/dense/kernel"] + ) + model.get_layer( + f"transformer_layer_{i}" + )._feedforward_intermediate_dense.bias.assign( + weights[f"bert/encoder/layer_{i}/intermediate/dense/bias"] + ) + model.get_layer( + f"transformer_layer_{i}" + )._feedforward_output_dense.kernel.assign( + weights[f"bert/encoder/layer_{i}/output/dense/kernel"] + ) + model.get_layer( + f"transformer_layer_{i}" + )._feedforward_output_dense.bias.assign( + weights[f"bert/encoder/layer_{i}/output/dense/bias"] + ) + model.get_layer( + f"transformer_layer_{i}" + )._feedforward_layernorm.gamma.assign( + weights[f"bert/encoder/layer_{i}/output/LayerNorm/gamma"] + ) + model.get_layer( + f"transformer_layer_{i}" + )._feedforward_layernorm.beta.assign( + weights[f"bert/encoder/layer_{i}/output/LayerNorm/beta"] + ) + + model.get_layer("pooled_dense").kernel.assign( + weights["bert/pooler/dense/kernel"] + ) + model.get_layer("pooled_dense").bias.assign(weights["bert/pooler/dense/bias"]) + pass + # Save the model. + print(f"\n-> Save KerasNLP model weights to `{preset}.h5`.") + model.save_weights(f"{preset}.h5") + + return model + + +def define_preprocessor(vocab_path, checkpoint_path, config_path, model): + def preprocess(x): + tokenizer = keras_nlp.tokenizers.WordPieceTokenizer( + vocabulary=vocab_path, lowercase=False + ) + packer = keras_nlp.layers.MultiSegmentPacker( + sequence_length=model.max_sequence_length, + start_value=tokenizer.token_to_id("[CLS]"), + end_value=tokenizer.token_to_id("[SEP]"), + ) + return packer(tokenizer(x)) + + token_ids, segment_ids = preprocess(["The झटपट brown लोमड़ी."]) + encoder_config = tfm.nlp.encoders.EncoderConfig( + type="bert", + bert=json.load(tf.io.gfile.GFile(config_path)), + ) + mg_model = tfm.nlp.encoders.build_encoder(encoder_config) + checkpoint = tf.train.Checkpoint(encoder=mg_model) + checkpoint.read(checkpoint_path).assert_consumed() + return mg_model, token_ids, segment_ids + + +def check_output( + preset, + keras_nlp_model, + mg_model, + token_ids, + segment_ids +): + keras_nlp_output = keras_nlp_model( + { + "token_ids": token_ids, + "segment_ids": segment_ids, + "padding_mask": token_ids != 0, + } + )["pooled_output"] + + mg_output = mg_model( + { + "input_word_ids": token_ids, + "input_type_ids": segment_ids, + "padding_mask": token_ids != 0, + } + )["pooled_output"] + tf.reduce_mean(keras_nlp_output - mg_output) + model.save_weights(f"""{preset}.h5""") + return keras_nlp_output + + +def main(_): + assert ( + FLAGS.preset in PRESET_MAP.keys() + ), f'Invalid preset {FLAGS.preset}. 
Must be one of {",".join(PRESET_MAP.keys())}' + size = PRESET_MAP[FLAGS.preset][0] + hf_model_name = PRESET_MAP[FLAGS.preset][1] + + vocab_path, checkpoint_path, config_path, weights, model = download_model(FLAGS.preset, size, hf_model_name) + + keras_nlp_model = convert_checkpoints(FLAGS.preset, weights, model) + + print("\n-> Load HF model.") + hf_model = transformers.AutoModel.from_pretrained(hf_model_name) + hf_model.eval() + + mg_model, token_ids, segment_ids = define_preprocessor( + vocab_path, checkpoint_path, config_path, model + ) + + check_output( + FLAGS.preset, + keras_nlp_model, + mg_model, + token_ids, + segment_ids + ) + + +if __name__ == "__main__": + app.run(main) From 8c97c6807762eb31ea44816dc498c896de149e43 Mon Sep 17 00:00:00 2001 From: Valko Milev Date: Tue, 31 Jan 2023 23:29:39 +0200 Subject: [PATCH 2/5] refactored the if statements --- tools/checkpoint_conversion/convert_bert.py | 959 +++----------------- 1 file changed, 113 insertions(+), 846 deletions(-) diff --git a/tools/checkpoint_conversion/convert_bert.py b/tools/checkpoint_conversion/convert_bert.py index 8b119e7317..ba2bbd480b 100644 --- a/tools/checkpoint_conversion/convert_bert.py +++ b/tools/checkpoint_conversion/convert_bert.py @@ -1,27 +1,40 @@ import os -import shutil - -import numpy as np import tensorflow as tf -import torch import transformers from absl import app from absl import flags from tensorflow import keras -import sys +import tensorflow_models as tfm import keras_nlp +import json FLAGS = flags.FLAGS + PRESET_MAP = { - "bert_tiny_uncased_en": "bert_tiny_uncased_en" + "bert_tiny_uncased_en": ("roberta.base", "bert_tiny_uncased_en") } flags.DEFINE_string( "preset", None, f'Must be one of {",".join(PRESET_MAP.keys())}' ) +VOCAB_SIZE = 30522 +NUM_LAYERS = 24 +NUM_ATTN_HEADS = 16 +EMBEDDING_SIZE = 1024 + MODEL_SUFFIX = "uncased" # MODEL_SPEC_STR = "L-24_H-1024_A-16" MODEL_SPEC_STR = "L-2_H-128_A-2" +model_source = { + 'bert_tiny_en_uncased': 'https://github.com/keras-team/keras-nlp/blob/master/tools/checkpoint_conversion/bert_tiny_uncased_en.ipynb', + 'bert_small_en_uncased': 'https://github.com/keras-team/keras-nlp/blob/master/tools/checkpoint_conversion/bert_small_uncased_en.ipynb', + 'bert_medium_en_uncased': 'https://github.com/keras-team/keras-nlp/blob/master/tools/checkpoint_conversion/bert_medium_uncased_en.ipynb', + 'bert_base_en_uncased': 'https://github.com/keras-team/keras-nlp/blob/master/tools/checkpoint_conversion/bert_base_uncased.ipynb', + 'bert_base_en': 'https://github.com/keras-team/keras-nlp/blob/master/tools/checkpoint_conversion/bert_base_cased.ipynb', + 'bert_base_zh': 'https://github.com/keras-team/keras-nlp/blob/master/tools/checkpoint_conversion/bert_base_zh.ipynb', + 'bert_base_multi': 'https://github.com/keras-team/keras-nlp/blob/master/tools/checkpoint_conversion/bert_base_multi_cased.ipynb', + 'bert_large_en_uncased': 'https://github.com/keras-team/keras-nlp/blob/master/tools/checkpoint_conversion/bert_large_uncased_en.ipynb', + 'bert_large_en': 'https://github.com/keras-team/keras-nlp/blob/master/tools/checkpoint_conversion/bert_large_cased_en.ipynb'} def download_model(preset, TOKEN_TYPE, hf_model_name): @@ -57,7 +70,7 @@ def convert_checkpoints(preset, weights, model): print("\n-> Convert original weights to KerasNLP format.") # Transformer layers. 
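+    # The per-preset branches are grouped by checkpoint key layout rather than
+    # listed one preset at a time (groupings mirror the assignments below):
+    #   * 'bert_base_en_uncased' / 'bert_base_en': Model Garden object paths such
+    #     as "encoder/layer_with_weights-{n}/.../.ATTRIBUTES/VARIABLE_VALUE", with
+    #     the uncased variant using the "_attention_output_dense" and
+    #     "next_sentence..pooler_dense" keys.
+    #   * 'bert_base_zh' / 'bert_base_multi': the same object paths, differing
+    #     only in which key holds the pooled_dense weights.
+    #   * tiny/small/medium/large presets: original "bert/..." variable names,
+    #     with attention kernels reshaped via (EMBEDDING_SIZE, NUM_ATTN_HEADS, -1).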
- if preset == 'bert_base_cased': + if preset in ['bert_base_en_uncased', 'bert_base_en']: model.get_layer("token_embedding").embeddings.assign( weights[ "encoder/layer_with_weights-0/embeddings/.ATTRIBUTES/VARIABLE_VALUE" @@ -123,20 +136,36 @@ def convert_checkpoints(preset, weights, model): f"encoder/layer_with_weights-{i + 4}/_attention_layer/_value_dense/bias/.ATTRIBUTES/VARIABLE_VALUE" ] ) - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layer._output_dense.kernel.assign( - weights[ - f"encoder/layer_with_weights-{i + 4}/_attention_layer/_output_dense/kernel/.ATTRIBUTES/VARIABLE_VALUE" - ] - ) - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layer._output_dense.bias.assign( - weights[ - f"encoder/layer_with_weights-{i + 4}/_attention_layer/_output_dense/bias/.ATTRIBUTES/VARIABLE_VALUE" - ] - ) + if preset == 'bert_base_en_uncased': + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._output_dense.kernel.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_attention_output_dense/kernel/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._output_dense.bias.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_attention_output_dense/bias/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + else: + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._output_dense.kernel.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_attention_layer/_output_dense/kernel/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._output_dense.bias.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_attention_layer/_output_dense/bias/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) model.get_layer( f"transformer_layer_{i}" )._self_attention_layernorm.gamma.assign( @@ -193,14 +222,22 @@ def convert_checkpoints(preset, weights, model): f"encoder/layer_with_weights-{i + 4}/_output_layer_norm/beta/.ATTRIBUTES/VARIABLE_VALUE" ] ) - - model.get_layer("pooled_dense").kernel.assign( - weights["encoder/layer_with_weights-16/kernel/.ATTRIBUTES/VARIABLE_VALUE"] - ) - model.get_layer("pooled_dense").bias.assign( - weights["encoder/layer_with_weights-16/bias/.ATTRIBUTES/VARIABLE_VALUE"] - ) - elif preset == 'bert_base_multi_cased': + if preset == 'bert_base_en_uncased': + model.get_layer("pooled_dense").kernel.assign( + weights["next_sentence..pooler_dense/kernel/.ATTRIBUTES/VARIABLE_VALUE"] + ) + model.get_layer("pooled_dense").bias.assign( + weights["next_sentence..pooler_dense/bias/.ATTRIBUTES/VARIABLE_VALUE"] + ) + else: + model.get_layer("pooled_dense").kernel.assign( + weights["encoder/layer_with_weights-16/kernel/.ATTRIBUTES/VARIABLE_VALUE"] + ) + model.get_layer("pooled_dense").bias.assign( + weights["encoder/layer_with_weights-16/bias/.ATTRIBUTES/VARIABLE_VALUE"] + ) + pass + elif preset in ['bert_base_zh', 'bert_base_multi']: model.get_layer("token_embedding").embeddings.assign( weights[ "encoder/layer_with_weights-0/embeddings/.ATTRIBUTES/VARIABLE_VALUE" @@ -336,770 +373,132 @@ def convert_checkpoints(preset, weights, model): f"encoder/layer_with_weights-{i + 4}/_output_layer_norm/beta/.ATTRIBUTES/VARIABLE_VALUE" ] ) - - model.get_layer("pooled_dense").kernel.assign( - weights[ - f"encoder/layer_with_weights-{model.num_layers + 4}/kernel/.ATTRIBUTES/VARIABLE_VALUE" - ] - ) - model.get_layer("pooled_dense").bias.assign( - weights[ - f"encoder/layer_with_weights-{model.num_layers + 4}/bias/.ATTRIBUTES/VARIABLE_VALUE" - ] - ) - pass - 
elif preset == 'bert_base_uncased': - model.get_layer("token_embedding").embeddings.assign( - weights[ - "encoder/layer_with_weights-0/embeddings/.ATTRIBUTES/VARIABLE_VALUE" - ] - ) - model.get_layer("position_embedding").position_embeddings.assign( - weights[ - "encoder/layer_with_weights-1/embeddings/.ATTRIBUTES/VARIABLE_VALUE" - ] - ) - model.get_layer("segment_embedding").embeddings.assign( - weights[ - "encoder/layer_with_weights-2/embeddings/.ATTRIBUTES/VARIABLE_VALUE" - ] - ) - model.get_layer("embeddings_layer_norm").gamma.assign( - weights["encoder/layer_with_weights-3/gamma/.ATTRIBUTES/VARIABLE_VALUE"] - ) - model.get_layer("embeddings_layer_norm").beta.assign( - weights["encoder/layer_with_weights-3/beta/.ATTRIBUTES/VARIABLE_VALUE"] - ) - - for i in range(model.num_layers): - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layer._key_dense.kernel.assign( - weights[ - f"encoder/layer_with_weights-{i + 4}/_attention_layer/_key_dense/kernel/.ATTRIBUTES/VARIABLE_VALUE" - ] - ) - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layer._key_dense.bias.assign( - weights[ - f"encoder/layer_with_weights-{i + 4}/_attention_layer/_key_dense/bias/.ATTRIBUTES/VARIABLE_VALUE" - ] - ) - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layer._query_dense.kernel.assign( - weights[ - f"encoder/layer_with_weights-{i + 4}/_attention_layer/_query_dense/kernel/.ATTRIBUTES/VARIABLE_VALUE" - ] - ) - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layer._query_dense.bias.assign( - weights[ - f"encoder/layer_with_weights-{i + 4}/_attention_layer/_query_dense/bias/.ATTRIBUTES/VARIABLE_VALUE" - ] - ) - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layer._value_dense.kernel.assign( - weights[ - f"encoder/layer_with_weights-{i + 4}/_attention_layer/_value_dense/kernel/.ATTRIBUTES/VARIABLE_VALUE" - ] - ) - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layer._value_dense.bias.assign( - weights[ - f"encoder/layer_with_weights-{i + 4}/_attention_layer/_value_dense/bias/.ATTRIBUTES/VARIABLE_VALUE" - ] - ) - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layer._output_dense.kernel.assign( - weights[ - f"encoder/layer_with_weights-{i + 4}/_attention_output_dense/kernel/.ATTRIBUTES/VARIABLE_VALUE" - ] - ) - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layer._output_dense.bias.assign( - weights[ - f"encoder/layer_with_weights-{i + 4}/_attention_output_dense/bias/.ATTRIBUTES/VARIABLE_VALUE" - ] - ) - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layernorm.gamma.assign( - weights[ - f"encoder/layer_with_weights-{i + 4}/_attention_layer_norm/gamma/.ATTRIBUTES/VARIABLE_VALUE" - ] - ) - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layernorm.beta.assign( - weights[ - f"encoder/layer_with_weights-{i + 4}/_attention_layer_norm/beta/.ATTRIBUTES/VARIABLE_VALUE" - ] - ) - model.get_layer( - f"transformer_layer_{i}" - )._feedforward_intermediate_dense.kernel.assign( - weights[ - f"encoder/layer_with_weights-{i + 4}/_intermediate_dense/kernel/.ATTRIBUTES/VARIABLE_VALUE" - ] - ) - model.get_layer( - f"transformer_layer_{i}" - )._feedforward_intermediate_dense.bias.assign( - weights[ - f"encoder/layer_with_weights-{i + 4}/_intermediate_dense/bias/.ATTRIBUTES/VARIABLE_VALUE" - ] - ) - model.get_layer( - f"transformer_layer_{i}" - )._feedforward_output_dense.kernel.assign( - weights[ - f"encoder/layer_with_weights-{i + 
4}/_output_dense/kernel/.ATTRIBUTES/VARIABLE_VALUE" - ] + if preset == 'bert_base_zh': + model.get_layer("pooled_dense").kernel.assign( + weights["encoder/layer_with_weights-16/kernel/.ATTRIBUTES/VARIABLE_VALUE"] ) - model.get_layer( - f"transformer_layer_{i}" - )._feedforward_output_dense.bias.assign( - weights[ - f"encoder/layer_with_weights-{i + 4}/_output_dense/bias/.ATTRIBUTES/VARIABLE_VALUE" - ] + model.get_layer("pooled_dense").bias.assign( + weights["encoder/layer_with_weights-16/bias/.ATTRIBUTES/VARIABLE_VALUE"] ) - model.get_layer( - f"transformer_layer_{i}" - )._feedforward_layernorm.gamma.assign( + else: + model.get_layer("pooled_dense").kernel.assign( weights[ - f"encoder/layer_with_weights-{i + 4}/_output_layer_norm/gamma/.ATTRIBUTES/VARIABLE_VALUE" + f"encoder/layer_with_weights-{model.num_layers + 4}/kernel/.ATTRIBUTES/VARIABLE_VALUE" ] ) - model.get_layer( - f"transformer_layer_{i}" - )._feedforward_layernorm.beta.assign( + model.get_layer("pooled_dense").bias.assign( weights[ - f"encoder/layer_with_weights-{i + 4}/_output_layer_norm/beta/.ATTRIBUTES/VARIABLE_VALUE" + f"encoder/layer_with_weights-{model.num_layers + 4}/bias/.ATTRIBUTES/VARIABLE_VALUE" ] ) - - model.get_layer("pooled_dense").kernel.assign( - weights["next_sentence..pooler_dense/kernel/.ATTRIBUTES/VARIABLE_VALUE"] - ) - model.get_layer("pooled_dense").bias.assign( - weights["next_sentence..pooler_dense/bias/.ATTRIBUTES/VARIABLE_VALUE"] - ) pass - elif preset == 'bert_base_zh': + elif preset in ['bert_large_en_uncased', 'bert_large_en', 'bert_medium_en_uncased', 'bert_small_en_uncased', + 'bert_tiny_en_uncased']: model.get_layer("token_embedding").embeddings.assign( - weights[ - "encoder/layer_with_weights-0/embeddings/.ATTRIBUTES/VARIABLE_VALUE" - ] + weights["bert/embeddings/word_embeddings"] ) model.get_layer("position_embedding").position_embeddings.assign( - weights[ - "encoder/layer_with_weights-1/embeddings/.ATTRIBUTES/VARIABLE_VALUE" - ] + weights["bert/embeddings/position_embeddings"] ) model.get_layer("segment_embedding").embeddings.assign( - weights[ - "encoder/layer_with_weights-2/embeddings/.ATTRIBUTES/VARIABLE_VALUE" - ] + weights["bert/embeddings/token_type_embeddings"] ) model.get_layer("embeddings_layer_norm").gamma.assign( - weights["encoder/layer_with_weights-3/gamma/.ATTRIBUTES/VARIABLE_VALUE"] + weights["bert/embeddings/LayerNorm/gamma"] ) model.get_layer("embeddings_layer_norm").beta.assign( - weights["encoder/layer_with_weights-3/beta/.ATTRIBUTES/VARIABLE_VALUE"] + weights["bert/embeddings/LayerNorm/beta"] ) for i in range(model.num_layers): model.get_layer( f"transformer_layer_{i}" )._self_attention_layer._key_dense.kernel.assign( - weights[ - f"encoder/layer_with_weights-{i + 4}/_attention_layer/_key_dense/kernel/.ATTRIBUTES/VARIABLE_VALUE" - ] + weights[f"bert/encoder/layer_{i}/attention/self/key/kernel"].reshape( + (EMBEDDING_SIZE, NUM_ATTN_HEADS, -1) + ) ) model.get_layer( f"transformer_layer_{i}" )._self_attention_layer._key_dense.bias.assign( - weights[ - f"encoder/layer_with_weights-{i + 4}/_attention_layer/_key_dense/bias/.ATTRIBUTES/VARIABLE_VALUE" - ] + weights[f"bert/encoder/layer_{i}/attention/self/key/bias"].reshape( + (NUM_ATTN_HEADS, -1) + ) ) model.get_layer( f"transformer_layer_{i}" )._self_attention_layer._query_dense.kernel.assign( - weights[ - f"encoder/layer_with_weights-{i + 4}/_attention_layer/_query_dense/kernel/.ATTRIBUTES/VARIABLE_VALUE" - ] + weights[f"bert/encoder/layer_{i}/attention/self/query/kernel"].reshape( + (EMBEDDING_SIZE, NUM_ATTN_HEADS, -1) + ) ) 
model.get_layer( f"transformer_layer_{i}" )._self_attention_layer._query_dense.bias.assign( - weights[ - f"encoder/layer_with_weights-{i + 4}/_attention_layer/_query_dense/bias/.ATTRIBUTES/VARIABLE_VALUE" - ] + weights[f"bert/encoder/layer_{i}/attention/self/query/bias"].reshape( + (NUM_ATTN_HEADS, -1) + ) ) model.get_layer( f"transformer_layer_{i}" )._self_attention_layer._value_dense.kernel.assign( - weights[ - f"encoder/layer_with_weights-{i + 4}/_attention_layer/_value_dense/kernel/.ATTRIBUTES/VARIABLE_VALUE" - ] + weights[f"bert/encoder/layer_{i}/attention/self/value/kernel"].reshape( + (EMBEDDING_SIZE, NUM_ATTN_HEADS, -1) + ) ) model.get_layer( f"transformer_layer_{i}" )._self_attention_layer._value_dense.bias.assign( - weights[ - f"encoder/layer_with_weights-{i + 4}/_attention_layer/_value_dense/bias/.ATTRIBUTES/VARIABLE_VALUE" - ] + weights[f"bert/encoder/layer_{i}/attention/self/value/bias"].reshape( + (NUM_ATTN_HEADS, -1) + ) ) model.get_layer( f"transformer_layer_{i}" )._self_attention_layer._output_dense.kernel.assign( weights[ - f"encoder/layer_with_weights-{i + 4}/_attention_layer/_output_dense/kernel/.ATTRIBUTES/VARIABLE_VALUE" - ] + f"bert/encoder/layer_{i}/attention/output/dense/kernel" + ].reshape((NUM_ATTN_HEADS, -1, EMBEDDING_SIZE)) ) model.get_layer( f"transformer_layer_{i}" )._self_attention_layer._output_dense.bias.assign( - weights[ - f"encoder/layer_with_weights-{i + 4}/_attention_layer/_output_dense/bias/.ATTRIBUTES/VARIABLE_VALUE" - ] + weights[f"bert/encoder/layer_{i}/attention/output/dense/bias"] ) model.get_layer( f"transformer_layer_{i}" )._self_attention_layernorm.gamma.assign( - weights[ - f"encoder/layer_with_weights-{i + 4}/_attention_layer_norm/gamma/.ATTRIBUTES/VARIABLE_VALUE" - ] + weights[f"bert/encoder/layer_{i}/attention/output/LayerNorm/gamma"] ) model.get_layer( f"transformer_layer_{i}" )._self_attention_layernorm.beta.assign( - weights[ - f"encoder/layer_with_weights-{i + 4}/_attention_layer_norm/beta/.ATTRIBUTES/VARIABLE_VALUE" - ] + weights[f"bert/encoder/layer_{i}/attention/output/LayerNorm/beta"] ) model.get_layer( f"transformer_layer_{i}" )._feedforward_intermediate_dense.kernel.assign( - weights[ - f"encoder/layer_with_weights-{i + 4}/_intermediate_dense/kernel/.ATTRIBUTES/VARIABLE_VALUE" - ] + weights[f"bert/encoder/layer_{i}/intermediate/dense/kernel"] ) model.get_layer( f"transformer_layer_{i}" )._feedforward_intermediate_dense.bias.assign( - weights[ - f"encoder/layer_with_weights-{i + 4}/_intermediate_dense/bias/.ATTRIBUTES/VARIABLE_VALUE" - ] + weights[f"bert/encoder/layer_{i}/intermediate/dense/bias"] ) model.get_layer( f"transformer_layer_{i}" )._feedforward_output_dense.kernel.assign( - weights[ - f"encoder/layer_with_weights-{i + 4}/_output_dense/kernel/.ATTRIBUTES/VARIABLE_VALUE" - ] + weights[f"bert/encoder/layer_{i}/output/dense/kernel"] ) model.get_layer( f"transformer_layer_{i}" )._feedforward_output_dense.bias.assign( - weights[ - f"encoder/layer_with_weights-{i + 4}/_output_dense/bias/.ATTRIBUTES/VARIABLE_VALUE" - ] + weights[f"bert/encoder/layer_{i}/output/dense/bias"] ) model.get_layer( f"transformer_layer_{i}" )._feedforward_layernorm.gamma.assign( - weights[ - f"encoder/layer_with_weights-{i + 4}/_output_layer_norm/gamma/.ATTRIBUTES/VARIABLE_VALUE" - ] - ) - model.get_layer( - f"transformer_layer_{i}" - )._feedforward_layernorm.beta.assign( - weights[ - f"encoder/layer_with_weights-{i + 4}/_output_layer_norm/beta/.ATTRIBUTES/VARIABLE_VALUE" - ] - ) - - model.get_layer("pooled_dense").kernel.assign( - weights[ - 
f"encoder/layer_with_weights-{model.num_layers + 4}/kernel/.ATTRIBUTES/VARIABLE_VALUE" - ] - ) - model.get_layer("pooled_dense").bias.assign( - weights[ - f"encoder/layer_with_weights-{model.num_layers + 4}/bias/.ATTRIBUTES/VARIABLE_VALUE" - ] - ) - pass - elif preset == 'bert_large_cased_en': - model.get_layer("token_embedding").embeddings.assign( - weights["bert/embeddings/word_embeddings"] - ) - model.get_layer("position_embedding").position_embeddings.assign( - weights["bert/embeddings/position_embeddings"] - ) - model.get_layer("segment_embedding").embeddings.assign( - weights["bert/embeddings/token_type_embeddings"] - ) - model.get_layer("embeddings_layer_norm").gamma.assign( - weights["bert/embeddings/LayerNorm/gamma"] - ) - model.get_layer("embeddings_layer_norm").beta.assign( - weights["bert/embeddings/LayerNorm/beta"] - ) - - for i in range(model.num_layers): - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layer._key_dense.kernel.assign( - weights[f"bert/encoder/layer_{i}/attention/self/key/kernel"].reshape( - (EMBEDDING_SIZE, NUM_ATTN_HEADS, -1) - ) - ) - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layer._key_dense.bias.assign( - weights[f"bert/encoder/layer_{i}/attention/self/key/bias"].reshape( - (NUM_ATTN_HEADS, -1) - ) - ) - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layer._query_dense.kernel.assign( - weights[f"bert/encoder/layer_{i}/attention/self/query/kernel"].reshape( - (EMBEDDING_SIZE, NUM_ATTN_HEADS, -1) - ) - ) - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layer._query_dense.bias.assign( - weights[f"bert/encoder/layer_{i}/attention/self/query/bias"].reshape( - (NUM_ATTN_HEADS, -1) - ) - ) - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layer._value_dense.kernel.assign( - weights[f"bert/encoder/layer_{i}/attention/self/value/kernel"].reshape( - (EMBEDDING_SIZE, NUM_ATTN_HEADS, -1) - ) - ) - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layer._value_dense.bias.assign( - weights[f"bert/encoder/layer_{i}/attention/self/value/bias"].reshape( - (NUM_ATTN_HEADS, -1) - ) - ) - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layer._output_dense.kernel.assign( - weights[ - f"bert/encoder/layer_{i}/attention/output/dense/kernel" - ].reshape((NUM_ATTN_HEADS, -1, EMBEDDING_SIZE)) - ) - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layer._output_dense.bias.assign( - weights[f"bert/encoder/layer_{i}/attention/output/dense/bias"] - ) - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layernorm.gamma.assign( - weights[f"bert/encoder/layer_{i}/attention/output/LayerNorm/gamma"] - ) - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layernorm.beta.assign( - weights[f"bert/encoder/layer_{i}/attention/output/LayerNorm/beta"] - ) - model.get_layer( - f"transformer_layer_{i}" - )._feedforward_intermediate_dense.kernel.assign( - weights[f"bert/encoder/layer_{i}/intermediate/dense/kernel"] - ) - model.get_layer( - f"transformer_layer_{i}" - )._feedforward_intermediate_dense.bias.assign( - weights[f"bert/encoder/layer_{i}/intermediate/dense/bias"] - ) - model.get_layer( - f"transformer_layer_{i}" - )._feedforward_output_dense.kernel.assign( - weights[f"bert/encoder/layer_{i}/output/dense/kernel"] - ) - model.get_layer( - f"transformer_layer_{i}" - )._feedforward_output_dense.bias.assign( - weights[f"bert/encoder/layer_{i}/output/dense/bias"] - ) - model.get_layer( - f"transformer_layer_{i}" - 
)._feedforward_layernorm.gamma.assign( - weights[f"bert/encoder/layer_{i}/output/LayerNorm/gamma"] - ) - model.get_layer( - f"transformer_layer_{i}" - )._feedforward_layernorm.beta.assign( - weights[f"bert/encoder/layer_{i}/output/LayerNorm/beta"] - ) - - model.get_layer("pooled_dense").kernel.assign( - weights["bert/pooler/dense/kernel"] - ) - model.get_layer("pooled_dense").bias.assign(weights["bert/pooler/dense/bias"]) - pass - elif preset == 'bert_large_uncased_en': - model.get_layer("token_embedding").embeddings.assign( - weights["bert/embeddings/word_embeddings"] - ) - model.get_layer("position_embedding").position_embeddings.assign( - weights["bert/embeddings/position_embeddings"] - ) - model.get_layer("segment_embedding").embeddings.assign( - weights["bert/embeddings/token_type_embeddings"] - ) - model.get_layer("embeddings_layer_norm").gamma.assign( - weights["bert/embeddings/LayerNorm/gamma"] - ) - model.get_layer("embeddings_layer_norm").beta.assign( - weights["bert/embeddings/LayerNorm/beta"] - ) - - for i in range(model.num_layers): - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layer._key_dense.kernel.assign( - weights[f"bert/encoder/layer_{i}/attention/self/key/kernel"].reshape( - (EMBEDDING_SIZE, NUM_ATTN_HEADS, -1) - ) - ) - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layer._key_dense.bias.assign( - weights[f"bert/encoder/layer_{i}/attention/self/key/bias"].reshape( - (NUM_ATTN_HEADS, -1) - ) - ) - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layer._query_dense.kernel.assign( - weights[f"bert/encoder/layer_{i}/attention/self/query/kernel"].reshape( - (EMBEDDING_SIZE, NUM_ATTN_HEADS, -1) - ) - ) - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layer._query_dense.bias.assign( - weights[f"bert/encoder/layer_{i}/attention/self/query/bias"].reshape( - (NUM_ATTN_HEADS, -1) - ) - ) - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layer._value_dense.kernel.assign( - weights[f"bert/encoder/layer_{i}/attention/self/value/kernel"].reshape( - (EMBEDDING_SIZE, NUM_ATTN_HEADS, -1) - ) - ) - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layer._value_dense.bias.assign( - weights[f"bert/encoder/layer_{i}/attention/self/value/bias"].reshape( - (NUM_ATTN_HEADS, -1) - ) - ) - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layer._output_dense.kernel.assign( - weights[ - f"bert/encoder/layer_{i}/attention/output/dense/kernel" - ].reshape((NUM_ATTN_HEADS, -1, EMBEDDING_SIZE)) - ) - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layer._output_dense.bias.assign( - weights[f"bert/encoder/layer_{i}/attention/output/dense/bias"] - ) - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layernorm.gamma.assign( - weights[f"bert/encoder/layer_{i}/attention/output/LayerNorm/gamma"] - ) - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layernorm.beta.assign( - weights[f"bert/encoder/layer_{i}/attention/output/LayerNorm/beta"] - ) - model.get_layer( - f"transformer_layer_{i}" - )._feedforward_intermediate_dense.kernel.assign( - weights[f"bert/encoder/layer_{i}/intermediate/dense/kernel"] - ) - model.get_layer( - f"transformer_layer_{i}" - )._feedforward_intermediate_dense.bias.assign( - weights[f"bert/encoder/layer_{i}/intermediate/dense/bias"] - ) - model.get_layer( - f"transformer_layer_{i}" - )._feedforward_output_dense.kernel.assign( - weights[f"bert/encoder/layer_{i}/output/dense/kernel"] - ) - model.get_layer( - 
f"transformer_layer_{i}" - )._feedforward_output_dense.bias.assign( - weights[f"bert/encoder/layer_{i}/output/dense/bias"] - ) - model.get_layer( - f"transformer_layer_{i}" - )._feedforward_layernorm.gamma.assign( - weights[f"bert/encoder/layer_{i}/output/LayerNorm/gamma"] - ) - model.get_layer( - f"transformer_layer_{i}" - )._feedforward_layernorm.beta.assign( - weights[f"bert/encoder/layer_{i}/output/LayerNorm/beta"] - ) - - model.get_layer("pooled_dense").kernel.assign( - weights["bert/pooler/dense/kernel"] - ) - model.get_layer("pooled_dense").bias.assign(weights["bert/pooler/dense/bias"]) - pass - elif preset == 'bert_medium_uncased_en': - model.get_layer("token_embedding").embeddings.assign( - weights["bert/embeddings/word_embeddings"] - ) - model.get_layer("position_embedding").position_embeddings.assign( - weights["bert/embeddings/position_embeddings"] - ) - model.get_layer("segment_embedding").embeddings.assign( - weights["bert/embeddings/token_type_embeddings"] - ) - model.get_layer("embeddings_layer_norm").gamma.assign( - weights["bert/embeddings/LayerNorm/gamma"] - ) - model.get_layer("embeddings_layer_norm").beta.assign( - weights["bert/embeddings/LayerNorm/beta"] - ) - - for i in range(model.num_layers): - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layer._key_dense.kernel.assign( - weights[f"bert/encoder/layer_{i}/attention/self/key/kernel"].reshape( - (EMBEDDING_SIZE, NUM_ATTN_HEADS, -1) - ) - ) - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layer._key_dense.bias.assign( - weights[f"bert/encoder/layer_{i}/attention/self/key/bias"].reshape( - (NUM_ATTN_HEADS, -1) - ) - ) - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layer._query_dense.kernel.assign( - weights[f"bert/encoder/layer_{i}/attention/self/query/kernel"].reshape( - (EMBEDDING_SIZE, NUM_ATTN_HEADS, -1) - ) - ) - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layer._query_dense.bias.assign( - weights[f"bert/encoder/layer_{i}/attention/self/query/bias"].reshape( - (NUM_ATTN_HEADS, -1) - ) - ) - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layer._value_dense.kernel.assign( - weights[f"bert/encoder/layer_{i}/attention/self/value/kernel"].reshape( - (EMBEDDING_SIZE, NUM_ATTN_HEADS, -1) - ) - ) - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layer._value_dense.bias.assign( - weights[f"bert/encoder/layer_{i}/attention/self/value/bias"].reshape( - (NUM_ATTN_HEADS, -1) - ) - ) - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layer._output_dense.kernel.assign( - weights[ - f"bert/encoder/layer_{i}/attention/output/dense/kernel" - ].reshape((NUM_ATTN_HEADS, -1, EMBEDDING_SIZE)) - ) - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layer._output_dense.bias.assign( - weights[f"bert/encoder/layer_{i}/attention/output/dense/bias"] - ) - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layernorm.gamma.assign( - weights[f"bert/encoder/layer_{i}/attention/output/LayerNorm/gamma"] - ) - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layernorm.beta.assign( - weights[f"bert/encoder/layer_{i}/attention/output/LayerNorm/beta"] - ) - model.get_layer( - f"transformer_layer_{i}" - )._feedforward_intermediate_dense.kernel.assign( - weights[f"bert/encoder/layer_{i}/intermediate/dense/kernel"] - ) - model.get_layer( - f"transformer_layer_{i}" - )._feedforward_intermediate_dense.bias.assign( - weights[f"bert/encoder/layer_{i}/intermediate/dense/bias"] - ) - 
model.get_layer( - f"transformer_layer_{i}" - )._feedforward_output_dense.kernel.assign( - weights[f"bert/encoder/layer_{i}/output/dense/kernel"] - ) - model.get_layer( - f"transformer_layer_{i}" - )._feedforward_output_dense.bias.assign( - weights[f"bert/encoder/layer_{i}/output/dense/bias"] - ) - model.get_layer( - f"transformer_layer_{i}" - )._feedforward_layernorm.gamma.assign( - weights[f"bert/encoder/layer_{i}/output/LayerNorm/gamma"] - ) - model.get_layer( - f"transformer_layer_{i}" - )._feedforward_layernorm.beta.assign( - weights[f"bert/encoder/layer_{i}/output/LayerNorm/beta"] - ) - - model.get_layer("pooled_dense").kernel.assign( - weights["bert/pooler/dense/kernel"] - ) - model.get_layer("pooled_dense").bias.assign(weights["bert/pooler/dense/bias"]) - pass - elif preset == 'bert_small_uncased_en': - model.get_layer("token_embedding").embeddings.assign( - weights["bert/embeddings/word_embeddings"] - ) - model.get_layer("position_embedding").position_embeddings.assign( - weights["bert/embeddings/position_embeddings"] - ) - model.get_layer("segment_embedding").embeddings.assign( - weights["bert/embeddings/token_type_embeddings"] - ) - model.get_layer("embeddings_layer_norm").gamma.assign( - weights["bert/embeddings/LayerNorm/gamma"] - ) - model.get_layer("embeddings_layer_norm").beta.assign( - weights["bert/embeddings/LayerNorm/beta"] - ) - - for i in range(model.num_layers): - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layer._key_dense.kernel.assign( - weights[f"bert/encoder/layer_{i}/attention/self/key/kernel"].reshape( - (EMBEDDING_SIZE, NUM_ATTN_HEADS, -1) - ) - ) - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layer._key_dense.bias.assign( - weights[f"bert/encoder/layer_{i}/attention/self/key/bias"].reshape( - (NUM_ATTN_HEADS, -1) - ) - ) - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layer._query_dense.kernel.assign( - weights[f"bert/encoder/layer_{i}/attention/self/query/kernel"].reshape( - (EMBEDDING_SIZE, NUM_ATTN_HEADS, -1) - ) - ) - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layer._query_dense.bias.assign( - weights[f"bert/encoder/layer_{i}/attention/self/query/bias"].reshape( - (NUM_ATTN_HEADS, -1) - ) - ) - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layer._value_dense.kernel.assign( - weights[f"bert/encoder/layer_{i}/attention/self/value/kernel"].reshape( - (EMBEDDING_SIZE, NUM_ATTN_HEADS, -1) - ) - ) - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layer._value_dense.bias.assign( - weights[f"bert/encoder/layer_{i}/attention/self/value/bias"].reshape( - (NUM_ATTN_HEADS, -1) - ) - ) - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layer._output_dense.kernel.assign( - weights[ - f"bert/encoder/layer_{i}/attention/output/dense/kernel" - ].reshape((NUM_ATTN_HEADS, -1, EMBEDDING_SIZE)) - ) - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layer._output_dense.bias.assign( - weights[f"bert/encoder/layer_{i}/attention/output/dense/bias"] - ) - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layernorm.gamma.assign( - weights[f"bert/encoder/layer_{i}/attention/output/LayerNorm/gamma"] - ) - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layernorm.beta.assign( - weights[f"bert/encoder/layer_{i}/attention/output/LayerNorm/beta"] - ) - model.get_layer( - f"transformer_layer_{i}" - )._feedforward_intermediate_dense.kernel.assign( - weights[f"bert/encoder/layer_{i}/intermediate/dense/kernel"] - ) - 
model.get_layer( - f"transformer_layer_{i}" - )._feedforward_intermediate_dense.bias.assign( - weights[f"bert/encoder/layer_{i}/intermediate/dense/bias"] - ) - model.get_layer( - f"transformer_layer_{i}" - )._feedforward_output_dense.kernel.assign( - weights[f"bert/encoder/layer_{i}/output/dense/kernel"] - ) - model.get_layer( - f"transformer_layer_{i}" - )._feedforward_output_dense.bias.assign( - weights[f"bert/encoder/layer_{i}/output/dense/bias"] - ) - model.get_layer( - f"transformer_layer_{i}" - )._feedforward_layernorm.gamma.assign( - weights[f"bert/encoder/layer_{i}/output/LayerNorm/gamma"] + weights[f"bert/encoder/layer_{i}/output/LayerNorm/gamma"] ) model.get_layer( f"transformer_layer_{i}" @@ -1112,124 +511,7 @@ def convert_checkpoints(preset, weights, model): ) model.get_layer("pooled_dense").bias.assign(weights["bert/pooler/dense/bias"]) pass - elif preset == 'bert_tiny_uncased_en': - model.get_layer("token_embedding").embeddings.assign( - weights["bert/embeddings/word_embeddings"] - ) - model.get_layer("position_embedding").position_embeddings.assign( - weights["bert/embeddings/position_embeddings"] - ) - model.get_layer("segment_embedding").embeddings.assign( - weights["bert/embeddings/token_type_embeddings"] - ) - model.get_layer("embeddings_layer_norm").gamma.assign( - weights["bert/embeddings/LayerNorm/gamma"] - ) - model.get_layer("embeddings_layer_norm").beta.assign( - weights["bert/embeddings/LayerNorm/beta"] - ) - - for i in range(model.num_layers): - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layer._key_dense.kernel.assign( - weights[f"bert/encoder/layer_{i}/attention/self/key/kernel"].reshape( - (EMBEDDING_SIZE, NUM_ATTN_HEADS, -1) - ) - ) - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layer._key_dense.bias.assign( - weights[f"bert/encoder/layer_{i}/attention/self/key/bias"].reshape( - (NUM_ATTN_HEADS, -1) - ) - ) - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layer._query_dense.kernel.assign( - weights[f"bert/encoder/layer_{i}/attention/self/query/kernel"].reshape( - (EMBEDDING_SIZE, NUM_ATTN_HEADS, -1) - ) - ) - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layer._query_dense.bias.assign( - weights[f"bert/encoder/layer_{i}/attention/self/query/bias"].reshape( - (NUM_ATTN_HEADS, -1) - ) - ) - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layer._value_dense.kernel.assign( - weights[f"bert/encoder/layer_{i}/attention/self/value/kernel"].reshape( - (EMBEDDING_SIZE, NUM_ATTN_HEADS, -1) - ) - ) - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layer._value_dense.bias.assign( - weights[f"bert/encoder/layer_{i}/attention/self/value/bias"].reshape( - (NUM_ATTN_HEADS, -1) - ) - ) - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layer._output_dense.kernel.assign( - weights[ - f"bert/encoder/layer_{i}/attention/output/dense/kernel" - ].reshape((NUM_ATTN_HEADS, -1, EMBEDDING_SIZE)) - ) - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layer._output_dense.bias.assign( - weights[f"bert/encoder/layer_{i}/attention/output/dense/bias"] - ) - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layernorm.gamma.assign( - weights[f"bert/encoder/layer_{i}/attention/output/LayerNorm/gamma"] - ) - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layernorm.beta.assign( - weights[f"bert/encoder/layer_{i}/attention/output/LayerNorm/beta"] - ) - model.get_layer( - f"transformer_layer_{i}" - 
)._feedforward_intermediate_dense.kernel.assign( - weights[f"bert/encoder/layer_{i}/intermediate/dense/kernel"] - ) - model.get_layer( - f"transformer_layer_{i}" - )._feedforward_intermediate_dense.bias.assign( - weights[f"bert/encoder/layer_{i}/intermediate/dense/bias"] - ) - model.get_layer( - f"transformer_layer_{i}" - )._feedforward_output_dense.kernel.assign( - weights[f"bert/encoder/layer_{i}/output/dense/kernel"] - ) - model.get_layer( - f"transformer_layer_{i}" - )._feedforward_output_dense.bias.assign( - weights[f"bert/encoder/layer_{i}/output/dense/bias"] - ) - model.get_layer( - f"transformer_layer_{i}" - )._feedforward_layernorm.gamma.assign( - weights[f"bert/encoder/layer_{i}/output/LayerNorm/gamma"] - ) - model.get_layer( - f"transformer_layer_{i}" - )._feedforward_layernorm.beta.assign( - weights[f"bert/encoder/layer_{i}/output/LayerNorm/beta"] - ) - model.get_layer("pooled_dense").kernel.assign( - weights["bert/pooler/dense/kernel"] - ) - model.get_layer("pooled_dense").bias.assign(weights["bert/pooler/dense/bias"]) - pass # Save the model. print(f"\n-> Save KerasNLP model weights to `{preset}.h5`.") model.save_weights(f"{preset}.h5") @@ -1283,7 +565,7 @@ def check_output( } )["pooled_output"] tf.reduce_mean(keras_nlp_output - mg_output) - model.save_weights(f"""{preset}.h5""") + keras_nlp_model.save_weights(f"""{preset}.h5""") return keras_nlp_output @@ -1298,22 +580,7 @@ def main(_): keras_nlp_model = convert_checkpoints(FLAGS.preset, weights, model) - print("\n-> Load HF model.") - hf_model = transformers.AutoModel.from_pretrained(hf_model_name) - hf_model.eval() - - mg_model, token_ids, segment_ids = define_preprocessor( - vocab_path, checkpoint_path, config_path, model - ) - - check_output( - FLAGS.preset, - keras_nlp_model, - mg_model, - token_ids, - segment_ids - ) +if __name__ == "__main__": -if __name__ == "__main__": app.run(main) From 42246d7f29668f71e91b86413a930a60a2b5a099 Mon Sep 17 00:00:00 2001 From: Valko Milev Date: Wed, 1 Mar 2023 23:05:56 +0200 Subject: [PATCH 3/5] fixes guided by the comments --- .../convert_bert_checkpoints.py | 631 ++++++++++++++++++ 1 file changed, 631 insertions(+) create mode 100644 tools/checkpoint_conversion/convert_bert_checkpoints.py diff --git a/tools/checkpoint_conversion/convert_bert_checkpoints.py b/tools/checkpoint_conversion/convert_bert_checkpoints.py new file mode 100644 index 0000000000..4986df724a --- /dev/null +++ b/tools/checkpoint_conversion/convert_bert_checkpoints.py @@ -0,0 +1,631 @@ +import os +import tensorflow as tf +import transformers +from absl import app +from absl import flags +from tensorflow import keras +import tensorflow_models as tfm +import keras_nlp +import json + +FLAGS = flags.FLAGS + +PRESET_MAP = { + "bert_base_cased": {'base':"roberta.base", + 'base_model':"bert_base_cased", + 'TOKEN_TYPE': "cased", + 'MODEL_TYPE': "bert_base", + 'VOCAB_SIZE': 28996}, + "bert_base_multi_cased": {'base':"roberta.base", + 'base_model':"bert_base_multi_cased", + 'MODEL_TYPE': "bert_base", + 'MODEL_SUFFIX': "multi_cased", + 'VOCAB_SIZE': 119547}, + "bert_base_uncased": {'base':"roberta.base", + 'base_model':"bert_base_uncased", + 'MODEL_TYPE': "bert_base", + 'TOKEN_TYPE': "uncased", + 'VOCAB_SIZE': 30522}, + "bert_base_zh": {'base':"roberta.base", + 'base_model': "bert_base_zh", + 'MODEL_TYPE': "bert_base", + 'MODEL_SUFFIX': "chinese", + 'VOCAB_SIZE': 21128}, + "bert_large_cased_en": {'base':"roberta.base", + 'base_model':"bert_large_cased_en", + 'MODEL_TYPE': "bert_large", + 'MODEL_SUFFIX': "cased", + 
'MODEL_SPEC_STR': "L-24_H-1024_A-16", + 'VOCAB_SIZE': 28996, + 'NUM_LAYERS': 24, + 'NUM_ATTN_HEADS': 16, + 'EMBEDDING_SIZE': 1024}, + "bert_large_uncased_en": {'base':"roberta.base", + 'base_model':"bert_large_uncased_en", + 'MODEL_TYPE': "bert_large", + 'MODEL_SUFFIX': "uncased", + 'MODEL_SPEC_STR': "L-24_H-1024_A-16", + 'VOCAB_SIZE': 30522, + 'NUM_LAYERS': 24, + 'NUM_ATTN_HEADS': 16, + 'EMBEDDING_SIZE': 1024 + }, + "bert_medium_uncased_en": {'base':"roberta.base", + 'base_model':"bert_medium_uncased_en", + 'MODEL_TYPE' : "bert_medium", + 'MODEL_SUFFIX' : "uncased", + 'MODEL_SPEC_STR' : "L-8_H-512_A-8", + 'VOCAB_SIZE' : 30522, + 'NUM_LAYERS' : 8, + 'NUM_ATTN_HEADS' : 8, + 'EMBEDDING_SIZE' : 512 + }, + "bert_small_uncased_en":{'base':"roberta.base", + 'base_model':"bert_small_uncased_en", + 'MODEL_TYPE' : "bert_small", + 'MODEL_SUFFIX' : "uncased", + 'MODEL_SPEC_STR' : "L-4_H-512_A-8", + 'VOCAB_SIZE' : 30522, + 'NUM_LAYERS' : 4, + 'NUM_ATTN_HEADS' : 8, + 'EMBEDDING_SIZE' : 512}, + "bert_tiny_uncased_en": {'base':"roberta.base", + 'base_model':"bert_tiny_uncased_en", + 'MODEL_TYPE' : "bert_tiny", + 'MODEL_SUFFIX' : "uncased", + 'MODEL_SPEC_STR' : "L-2_H-128_A-2", + 'VOCAB_SIZE' : 30522, + 'NUM_LAYERS' : 2, + 'NUM_ATTN_HEADS' : 2, + 'EMBEDDING_SIZE' : 128} +} + +flags.DEFINE_string( + "preset", None, f'Must be one of {",".join(PRESET_MAP.keys())}' +) + + +def download_model(preset): + print("-> Download original weights in " + preset) + zip_path = f"""https://storage.googleapis.com/tf_model_garden/nlp/bert/v3/{MODEL_SUFFIX}_L-12_H-768_A-12.tar.gz""" + path_file = keras.utils.get_file( + f"""{preset}.tar.gz""", + zip_path, + extract=False, + archive_format="tar", + ) + print('path_file', path_file) + extract_dir = f"./content/{preset}/tmp/temp_dir/raw/" + vocab_path = os.path.join(extract_dir, "vocab.txt") + checkpoint_path = os.path.join(extract_dir, "bert_model.ckpt") + config_path = os.path.join(extract_dir, "bert_config.json") + + os.system(f"tar -C ./content/{preset} -xvf {path_file}") + + + return vocab_path, checkpoint_path, config_path + + +def convert_checkpoints(preset,checkpoint_path,config_dict): + print("\n-> Convert original weights to KerasNLP format.") + + # Transformer layers. 
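+    # Read every variable out of the original TF checkpoint into a dict keyed
+    # by name, then copy each tensor into the matching layer of a KerasNLP
+    # BertBackbone, branching on the preset's checkpoint naming scheme below.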
+ vars = tf.train.list_variables(checkpoint_path) + weights = {} + for name, shape in vars: + weight = tf.train.load_variable(checkpoint_path, name) + weights[name] = weight + print(name) + model = keras_nlp.models.BertBackbone.from_preset("bert_tiny_en_uncased", + load_weights=True) # keras_nlp.models.BertBase(vocabulary_size=VOCAB_SIZE) + model.summary() + if preset in ['bert_base_en_uncased', 'bert_base_en']: + model.get_layer("token_embedding").embeddings.assign( + weights[ + "encoder/layer_with_weights-0/embeddings/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer("position_embedding").position_embeddings.assign( + weights[ + "encoder/layer_with_weights-1/embeddings/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer("segment_embedding").embeddings.assign( + weights[ + "encoder/layer_with_weights-2/embeddings/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer("embeddings_layer_norm").gamma.assign( + weights["encoder/layer_with_weights-3/gamma/.ATTRIBUTES/VARIABLE_VALUE"] + ) + model.get_layer("embeddings_layer_norm").beta.assign( + weights["encoder/layer_with_weights-3/beta/.ATTRIBUTES/VARIABLE_VALUE"] + ) + + for i in range(model.num_layers): + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._key_dense.kernel.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_attention_layer/_key_dense/kernel/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._key_dense.bias.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_attention_layer/_key_dense/bias/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._query_dense.kernel.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_attention_layer/_query_dense/kernel/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._query_dense.bias.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_attention_layer/_query_dense/bias/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._value_dense.kernel.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_attention_layer/_value_dense/kernel/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._value_dense.bias.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_attention_layer/_value_dense/bias/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + if preset == 'bert_base_en_uncased': + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._output_dense.kernel.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_attention_output_dense/kernel/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._output_dense.bias.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_attention_output_dense/bias/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + else: + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._output_dense.kernel.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_attention_layer/_output_dense/kernel/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._output_dense.bias.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_attention_layer/_output_dense/bias/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layernorm.gamma.assign( + weights[ + f"encoder/layer_with_weights-{i 
+ 4}/_attention_layer_norm/gamma/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layernorm.beta.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_attention_layer_norm/beta/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._feedforward_intermediate_dense.kernel.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_intermediate_dense/kernel/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._feedforward_intermediate_dense.bias.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_intermediate_dense/bias/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._feedforward_output_dense.kernel.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_output_dense/kernel/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._feedforward_output_dense.bias.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_output_dense/bias/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._feedforward_layernorm.gamma.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_output_layer_norm/gamma/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._feedforward_layernorm.beta.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_output_layer_norm/beta/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + if preset == 'bert_base_en_uncased': + model.get_layer("pooled_dense").kernel.assign( + weights["next_sentence..pooler_dense/kernel/.ATTRIBUTES/VARIABLE_VALUE"] + ) + model.get_layer("pooled_dense").bias.assign( + weights["next_sentence..pooler_dense/bias/.ATTRIBUTES/VARIABLE_VALUE"] + ) + else: + model.get_layer("pooled_dense").kernel.assign( + weights["encoder/layer_with_weights-16/kernel/.ATTRIBUTES/VARIABLE_VALUE"] + ) + model.get_layer("pooled_dense").bias.assign( + weights["encoder/layer_with_weights-16/bias/.ATTRIBUTES/VARIABLE_VALUE"] + ) + pass + elif preset in ['bert_base_zh', 'bert_base_multi']: + model.get_layer("token_embedding").embeddings.assign( + weights[ + "encoder/layer_with_weights-0/embeddings/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer("position_embedding").position_embeddings.assign( + weights[ + "encoder/layer_with_weights-1/embeddings/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer("segment_embedding").embeddings.assign( + weights[ + "encoder/layer_with_weights-2/embeddings/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer("embeddings_layer_norm").gamma.assign( + weights["encoder/layer_with_weights-3/gamma/.ATTRIBUTES/VARIABLE_VALUE"] + ) + model.get_layer("embeddings_layer_norm").beta.assign( + weights["encoder/layer_with_weights-3/beta/.ATTRIBUTES/VARIABLE_VALUE"] + ) + + for i in range(model.num_layers): + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._key_dense.kernel.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_attention_layer/_key_dense/kernel/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._key_dense.bias.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_attention_layer/_key_dense/bias/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._query_dense.kernel.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_attention_layer/_query_dense/kernel/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + 
)._self_attention_layer._query_dense.bias.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_attention_layer/_query_dense/bias/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._value_dense.kernel.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_attention_layer/_value_dense/kernel/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._value_dense.bias.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_attention_layer/_value_dense/bias/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._output_dense.kernel.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_attention_layer/_output_dense/kernel/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._output_dense.bias.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_attention_layer/_output_dense/bias/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layernorm.gamma.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_attention_layer_norm/gamma/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layernorm.beta.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_attention_layer_norm/beta/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._feedforward_intermediate_dense.kernel.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_intermediate_dense/kernel/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._feedforward_intermediate_dense.bias.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_intermediate_dense/bias/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._feedforward_output_dense.kernel.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_output_dense/kernel/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._feedforward_output_dense.bias.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_output_dense/bias/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._feedforward_layernorm.gamma.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_output_layer_norm/gamma/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer( + f"transformer_layer_{i}" + )._feedforward_layernorm.beta.assign( + weights[ + f"encoder/layer_with_weights-{i + 4}/_output_layer_norm/beta/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + if preset == 'bert_base_zh': + model.get_layer("pooled_dense").kernel.assign( + weights["encoder/layer_with_weights-16/kernel/.ATTRIBUTES/VARIABLE_VALUE"] + ) + model.get_layer("pooled_dense").bias.assign( + weights["encoder/layer_with_weights-16/bias/.ATTRIBUTES/VARIABLE_VALUE"] + ) + else: + model.get_layer("pooled_dense").kernel.assign( + weights[ + f"encoder/layer_with_weights-{model.num_layers + 4}/kernel/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + model.get_layer("pooled_dense").bias.assign( + weights[ + f"encoder/layer_with_weights-{model.num_layers + 4}/bias/.ATTRIBUTES/VARIABLE_VALUE" + ] + ) + pass + elif preset in ['bert_large_en_uncased', 'bert_large_en', 'bert_medium_en_uncased', 'bert_small_en_uncased', + 'bert_tiny_en_uncased']: + model.get_layer("token_embedding").embeddings.assign( + weights["bert/embeddings/word_embeddings"] + ) + 
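+        # These presets ship checkpoints with flat bert/... variable names
+        # (original BERT layout) rather than the object-based
+        # layer_with_weights-N paths handled in the branches above.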
model.get_layer("position_embedding").position_embeddings.assign( + weights["bert/embeddings/position_embeddings"] + ) + model.get_layer("segment_embedding").embeddings.assign( + weights["bert/embeddings/token_type_embeddings"] + ) + model.get_layer("embeddings_layer_norm").gamma.assign( + weights["bert/embeddings/LayerNorm/gamma"] + ) + model.get_layer("embeddings_layer_norm").beta.assign( + weights["bert/embeddings/LayerNorm/beta"] + ) + + for i in range(model.num_layers): + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._key_dense.kernel.assign( + weights[f"bert/encoder/layer_{i}/attention/self/key/kernel"].reshape( + (config_dict['EMBEDDING_SIZE'], config_dict['NUM_ATTN_HEADS'], -1) + ) + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._key_dense.bias.assign( + weights[f"bert/encoder/layer_{i}/attention/self/key/bias"].reshape( + (config_dict['NUM_ATTN_HEADS'], -1) + ) + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._query_dense.kernel.assign( + weights[f"bert/encoder/layer_{i}/attention/self/query/kernel"].reshape( + (config_dict['EMBEDDING_SIZE'], config_dict['NUM_ATTN_HEADS'], -1) + ) + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._query_dense.bias.assign( + weights[f"bert/encoder/layer_{i}/attention/self/query/bias"].reshape( + (config_dict['NUM_ATTN_HEADS'], -1) + ) + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._value_dense.kernel.assign( + weights[f"bert/encoder/layer_{i}/attention/self/value/kernel"].reshape( + (config_dict['EMBEDDING_SIZE'], config_dict['NUM_ATTN_HEADS'], -1) + ) + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._value_dense.bias.assign( + weights[f"bert/encoder/layer_{i}/attention/self/value/bias"].reshape( + (config_dict['NUM_ATTN_HEADS'], -1) + ) + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._output_dense.kernel.assign( + weights[ + f"bert/encoder/layer_{i}/attention/output/dense/kernel" + ].reshape((config_dict['NUM_ATTN_HEADS'], -1, config_dict['EMBEDDING_SIZE'])) + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._output_dense.bias.assign( + weights[f"bert/encoder/layer_{i}/attention/output/dense/bias"] + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layernorm.gamma.assign( + weights[f"bert/encoder/layer_{i}/attention/output/LayerNorm/gamma"] + ) + model.get_layer( + f"transformer_layer_{i}" + )._self_attention_layernorm.beta.assign( + weights[f"bert/encoder/layer_{i}/attention/output/LayerNorm/beta"] + ) + model.get_layer( + f"transformer_layer_{i}" + )._feedforward_intermediate_dense.kernel.assign( + weights[f"bert/encoder/layer_{i}/intermediate/dense/kernel"] + ) + model.get_layer( + f"transformer_layer_{i}" + )._feedforward_intermediate_dense.bias.assign( + weights[f"bert/encoder/layer_{i}/intermediate/dense/bias"] + ) + model.get_layer( + f"transformer_layer_{i}" + )._feedforward_output_dense.kernel.assign( + weights[f"bert/encoder/layer_{i}/output/dense/kernel"] + ) + model.get_layer( + f"transformer_layer_{i}" + )._feedforward_output_dense.bias.assign( + weights[f"bert/encoder/layer_{i}/output/dense/bias"] + ) + model.get_layer( + f"transformer_layer_{i}" + )._feedforward_layernorm.gamma.assign( + weights[f"bert/encoder/layer_{i}/output/LayerNorm/gamma"] + ) + model.get_layer( + f"transformer_layer_{i}" + )._feedforward_layernorm.beta.assign( + weights[f"bert/encoder/layer_{i}/output/LayerNorm/beta"] + ) + + 
model.get_layer("pooled_dense").kernel.assign( + weights["bert/pooler/dense/kernel"] + ) + model.get_layer("pooled_dense").bias.assign(weights["bert/pooler/dense/bias"]) + pass + + # Save the model. + print(f"\n-> Save KerasNLP model weights to `{preset}.h5`.") + model.save_weights(f"{preset}.h5") + + return model + + +def define_preprocessor(vocab_path, checkpoint_path, config_path, model): + def preprocess(x): + tokenizer = keras_nlp.tokenizers.WordPieceTokenizer( + vocabulary=vocab_path, lowercase=False + ) + + return keras_nlp.models.BertPreprocessor(tokenizer(x)) + + token_ids, segment_ids = preprocess(["The झटपट brown लोमड़ी."]) + encoder_config = tfm.nlp.encoders.EncoderConfig( + type="bert", + bert=json.load(tf.io.gfile.GFile(config_path)), + ) + mg_model = tfm.nlp.encoders.build_encoder(encoder_config) + checkpoint = tf.train.Checkpoint(encoder=mg_model) + checkpoint.read(checkpoint_path).assert_consumed() + return mg_model, token_ids, segment_ids + + +def check_output( + preset, + keras_nlp_model, + mg_model, + token_ids, + segment_ids +): + keras_nlp_output = keras_nlp_model( + { + "token_ids": token_ids, + "segment_ids": segment_ids, + "padding_mask": token_ids != 0, + } + )["pooled_output"] + + mg_output = mg_model( + { + "input_word_ids": token_ids, + "input_type_ids": segment_ids, + "padding_mask": token_ids != 0, + } + )["pooled_output"] + tf.reduce_mean(keras_nlp_output - mg_output) + keras_nlp_model.save_weights(f"""{preset}.h5""") + return keras_nlp_output + + +def main(_): + assert ( + FLAGS.preset in PRESET_MAP.keys() + ), f'Invalid preset {FLAGS.preset}. Must be one of {",".join(PRESET_MAP.keys())}' + + + vocab_path, checkpoint_path, config_path, weights, model = download_model(FLAGS.preset) + + keras_nlp_model = convert_checkpoints(FLAGS.preset,checkpoint_path,PRESET_MAP[FLAGS.preset]) + +if __name__ == "__main__": + + + app.run(main) + From e8064c738ed7635181ca648b572b42bcbe080e63 Mon Sep 17 00:00:00 2001 From: Valko Milev Date: Wed, 1 Mar 2023 23:08:12 +0200 Subject: [PATCH 4/5] removed convert_bert.py --- tools/checkpoint_conversion/convert_bert.py | 586 -------------------- 1 file changed, 586 deletions(-) delete mode 100644 tools/checkpoint_conversion/convert_bert.py diff --git a/tools/checkpoint_conversion/convert_bert.py b/tools/checkpoint_conversion/convert_bert.py deleted file mode 100644 index ba2bbd480b..0000000000 --- a/tools/checkpoint_conversion/convert_bert.py +++ /dev/null @@ -1,586 +0,0 @@ -import os -import tensorflow as tf -import transformers -from absl import app -from absl import flags -from tensorflow import keras -import tensorflow_models as tfm -import keras_nlp -import json - -FLAGS = flags.FLAGS - -PRESET_MAP = { - "bert_tiny_uncased_en": ("roberta.base", "bert_tiny_uncased_en") -} - -flags.DEFINE_string( - "preset", None, f'Must be one of {",".join(PRESET_MAP.keys())}' -) -VOCAB_SIZE = 30522 -NUM_LAYERS = 24 -NUM_ATTN_HEADS = 16 -EMBEDDING_SIZE = 1024 - -MODEL_SUFFIX = "uncased" -# MODEL_SPEC_STR = "L-24_H-1024_A-16" -MODEL_SPEC_STR = "L-2_H-128_A-2" -model_source = { - 'bert_tiny_en_uncased': 'https://github.com/keras-team/keras-nlp/blob/master/tools/checkpoint_conversion/bert_tiny_uncased_en.ipynb', - 'bert_small_en_uncased': 'https://github.com/keras-team/keras-nlp/blob/master/tools/checkpoint_conversion/bert_small_uncased_en.ipynb', - 'bert_medium_en_uncased': 'https://github.com/keras-team/keras-nlp/blob/master/tools/checkpoint_conversion/bert_medium_uncased_en.ipynb', - 'bert_base_en_uncased': 
'https://github.com/keras-team/keras-nlp/blob/master/tools/checkpoint_conversion/bert_base_uncased.ipynb', - 'bert_base_en': 'https://github.com/keras-team/keras-nlp/blob/master/tools/checkpoint_conversion/bert_base_cased.ipynb', - 'bert_base_zh': 'https://github.com/keras-team/keras-nlp/blob/master/tools/checkpoint_conversion/bert_base_zh.ipynb', - 'bert_base_multi': 'https://github.com/keras-team/keras-nlp/blob/master/tools/checkpoint_conversion/bert_base_multi_cased.ipynb', - 'bert_large_en_uncased': 'https://github.com/keras-team/keras-nlp/blob/master/tools/checkpoint_conversion/bert_large_uncased_en.ipynb', - 'bert_large_en': 'https://github.com/keras-team/keras-nlp/blob/master/tools/checkpoint_conversion/bert_large_cased_en.ipynb'} - - -def download_model(preset, TOKEN_TYPE, hf_model_name): - print("-> Download original weights in " + preset) - zip_path = f"""https://storage.googleapis.com/tf_model_garden/nlp/bert/v3/{MODEL_SUFFIX}_L-12_H-768_A-12.tar.gz""" - path_file = keras.utils.get_file( - f"""/home/x000ff4/open_source/486/keras-nlp/tools/checkpoint_conversion/{preset}.tar.gz""", - zip_path, - extract=False, - archive_format="tar", - ) - print('path_file', path_file) - extract_dir = f"./content/{preset}/tmp/temp_dir/raw/" - vocab_path = os.path.join(extract_dir, "vocab.txt") - checkpoint_path = os.path.join(extract_dir, "bert_model.ckpt") - config_path = os.path.join(extract_dir, "bert_config.json") - - os.system(f"tar -C ./content/{preset} -xvf {path_file}") - - vars = tf.train.list_variables(checkpoint_path) - weights = {} - for name, shape in vars: - weight = tf.train.load_variable(checkpoint_path, name) - weights[name] = weight - print(name) - model = keras_nlp.models.BertBackbone.from_preset("bert_tiny_en_uncased", - load_weights=True) # keras_nlp.models.BertBase(vocabulary_size=VOCAB_SIZE) - model.summary() - return vocab_path, checkpoint_path, config_path, weights, model - - -def convert_checkpoints(preset, weights, model): - print("\n-> Convert original weights to KerasNLP format.") - - # Transformer layers. 
- if preset in ['bert_base_en_uncased', 'bert_base_en']: - model.get_layer("token_embedding").embeddings.assign( - weights[ - "encoder/layer_with_weights-0/embeddings/.ATTRIBUTES/VARIABLE_VALUE" - ] - ) - model.get_layer("position_embedding").position_embeddings.assign( - weights[ - "encoder/layer_with_weights-1/embeddings/.ATTRIBUTES/VARIABLE_VALUE" - ] - ) - model.get_layer("segment_embedding").embeddings.assign( - weights[ - "encoder/layer_with_weights-2/embeddings/.ATTRIBUTES/VARIABLE_VALUE" - ] - ) - model.get_layer("embeddings_layer_norm").gamma.assign( - weights["encoder/layer_with_weights-3/gamma/.ATTRIBUTES/VARIABLE_VALUE"] - ) - model.get_layer("embeddings_layer_norm").beta.assign( - weights["encoder/layer_with_weights-3/beta/.ATTRIBUTES/VARIABLE_VALUE"] - ) - - for i in range(model.num_layers): - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layer._key_dense.kernel.assign( - weights[ - f"encoder/layer_with_weights-{i + 4}/_attention_layer/_key_dense/kernel/.ATTRIBUTES/VARIABLE_VALUE" - ] - ) - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layer._key_dense.bias.assign( - weights[ - f"encoder/layer_with_weights-{i + 4}/_attention_layer/_key_dense/bias/.ATTRIBUTES/VARIABLE_VALUE" - ] - ) - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layer._query_dense.kernel.assign( - weights[ - f"encoder/layer_with_weights-{i + 4}/_attention_layer/_query_dense/kernel/.ATTRIBUTES/VARIABLE_VALUE" - ] - ) - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layer._query_dense.bias.assign( - weights[ - f"encoder/layer_with_weights-{i + 4}/_attention_layer/_query_dense/bias/.ATTRIBUTES/VARIABLE_VALUE" - ] - ) - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layer._value_dense.kernel.assign( - weights[ - f"encoder/layer_with_weights-{i + 4}/_attention_layer/_value_dense/kernel/.ATTRIBUTES/VARIABLE_VALUE" - ] - ) - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layer._value_dense.bias.assign( - weights[ - f"encoder/layer_with_weights-{i + 4}/_attention_layer/_value_dense/bias/.ATTRIBUTES/VARIABLE_VALUE" - ] - ) - if preset == 'bert_base_en_uncased': - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layer._output_dense.kernel.assign( - weights[ - f"encoder/layer_with_weights-{i + 4}/_attention_output_dense/kernel/.ATTRIBUTES/VARIABLE_VALUE" - ] - ) - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layer._output_dense.bias.assign( - weights[ - f"encoder/layer_with_weights-{i + 4}/_attention_output_dense/bias/.ATTRIBUTES/VARIABLE_VALUE" - ] - ) - else: - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layer._output_dense.kernel.assign( - weights[ - f"encoder/layer_with_weights-{i + 4}/_attention_layer/_output_dense/kernel/.ATTRIBUTES/VARIABLE_VALUE" - ] - ) - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layer._output_dense.bias.assign( - weights[ - f"encoder/layer_with_weights-{i + 4}/_attention_layer/_output_dense/bias/.ATTRIBUTES/VARIABLE_VALUE" - ] - ) - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layernorm.gamma.assign( - weights[ - f"encoder/layer_with_weights-{i + 4}/_attention_layer_norm/gamma/.ATTRIBUTES/VARIABLE_VALUE" - ] - ) - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layernorm.beta.assign( - weights[ - f"encoder/layer_with_weights-{i + 4}/_attention_layer_norm/beta/.ATTRIBUTES/VARIABLE_VALUE" - ] - ) - model.get_layer( - f"transformer_layer_{i}" - 
)._feedforward_intermediate_dense.kernel.assign( - weights[ - f"encoder/layer_with_weights-{i + 4}/_intermediate_dense/kernel/.ATTRIBUTES/VARIABLE_VALUE" - ] - ) - model.get_layer( - f"transformer_layer_{i}" - )._feedforward_intermediate_dense.bias.assign( - weights[ - f"encoder/layer_with_weights-{i + 4}/_intermediate_dense/bias/.ATTRIBUTES/VARIABLE_VALUE" - ] - ) - model.get_layer( - f"transformer_layer_{i}" - )._feedforward_output_dense.kernel.assign( - weights[ - f"encoder/layer_with_weights-{i + 4}/_output_dense/kernel/.ATTRIBUTES/VARIABLE_VALUE" - ] - ) - model.get_layer( - f"transformer_layer_{i}" - )._feedforward_output_dense.bias.assign( - weights[ - f"encoder/layer_with_weights-{i + 4}/_output_dense/bias/.ATTRIBUTES/VARIABLE_VALUE" - ] - ) - model.get_layer( - f"transformer_layer_{i}" - )._feedforward_layernorm.gamma.assign( - weights[ - f"encoder/layer_with_weights-{i + 4}/_output_layer_norm/gamma/.ATTRIBUTES/VARIABLE_VALUE" - ] - ) - model.get_layer( - f"transformer_layer_{i}" - )._feedforward_layernorm.beta.assign( - weights[ - f"encoder/layer_with_weights-{i + 4}/_output_layer_norm/beta/.ATTRIBUTES/VARIABLE_VALUE" - ] - ) - if preset == 'bert_base_en_uncased': - model.get_layer("pooled_dense").kernel.assign( - weights["next_sentence..pooler_dense/kernel/.ATTRIBUTES/VARIABLE_VALUE"] - ) - model.get_layer("pooled_dense").bias.assign( - weights["next_sentence..pooler_dense/bias/.ATTRIBUTES/VARIABLE_VALUE"] - ) - else: - model.get_layer("pooled_dense").kernel.assign( - weights["encoder/layer_with_weights-16/kernel/.ATTRIBUTES/VARIABLE_VALUE"] - ) - model.get_layer("pooled_dense").bias.assign( - weights["encoder/layer_with_weights-16/bias/.ATTRIBUTES/VARIABLE_VALUE"] - ) - pass - elif preset in ['bert_base_zh', 'bert_base_multi']: - model.get_layer("token_embedding").embeddings.assign( - weights[ - "encoder/layer_with_weights-0/embeddings/.ATTRIBUTES/VARIABLE_VALUE" - ] - ) - model.get_layer("position_embedding").position_embeddings.assign( - weights[ - "encoder/layer_with_weights-1/embeddings/.ATTRIBUTES/VARIABLE_VALUE" - ] - ) - model.get_layer("segment_embedding").embeddings.assign( - weights[ - "encoder/layer_with_weights-2/embeddings/.ATTRIBUTES/VARIABLE_VALUE" - ] - ) - model.get_layer("embeddings_layer_norm").gamma.assign( - weights["encoder/layer_with_weights-3/gamma/.ATTRIBUTES/VARIABLE_VALUE"] - ) - model.get_layer("embeddings_layer_norm").beta.assign( - weights["encoder/layer_with_weights-3/beta/.ATTRIBUTES/VARIABLE_VALUE"] - ) - - for i in range(model.num_layers): - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layer._key_dense.kernel.assign( - weights[ - f"encoder/layer_with_weights-{i + 4}/_attention_layer/_key_dense/kernel/.ATTRIBUTES/VARIABLE_VALUE" - ] - ) - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layer._key_dense.bias.assign( - weights[ - f"encoder/layer_with_weights-{i + 4}/_attention_layer/_key_dense/bias/.ATTRIBUTES/VARIABLE_VALUE" - ] - ) - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layer._query_dense.kernel.assign( - weights[ - f"encoder/layer_with_weights-{i + 4}/_attention_layer/_query_dense/kernel/.ATTRIBUTES/VARIABLE_VALUE" - ] - ) - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layer._query_dense.bias.assign( - weights[ - f"encoder/layer_with_weights-{i + 4}/_attention_layer/_query_dense/bias/.ATTRIBUTES/VARIABLE_VALUE" - ] - ) - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layer._value_dense.kernel.assign( - weights[ - f"encoder/layer_with_weights-{i + 
4}/_attention_layer/_value_dense/kernel/.ATTRIBUTES/VARIABLE_VALUE" - ] - ) - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layer._value_dense.bias.assign( - weights[ - f"encoder/layer_with_weights-{i + 4}/_attention_layer/_value_dense/bias/.ATTRIBUTES/VARIABLE_VALUE" - ] - ) - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layer._output_dense.kernel.assign( - weights[ - f"encoder/layer_with_weights-{i + 4}/_attention_layer/_output_dense/kernel/.ATTRIBUTES/VARIABLE_VALUE" - ] - ) - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layer._output_dense.bias.assign( - weights[ - f"encoder/layer_with_weights-{i + 4}/_attention_layer/_output_dense/bias/.ATTRIBUTES/VARIABLE_VALUE" - ] - ) - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layernorm.gamma.assign( - weights[ - f"encoder/layer_with_weights-{i + 4}/_attention_layer_norm/gamma/.ATTRIBUTES/VARIABLE_VALUE" - ] - ) - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layernorm.beta.assign( - weights[ - f"encoder/layer_with_weights-{i + 4}/_attention_layer_norm/beta/.ATTRIBUTES/VARIABLE_VALUE" - ] - ) - model.get_layer( - f"transformer_layer_{i}" - )._feedforward_intermediate_dense.kernel.assign( - weights[ - f"encoder/layer_with_weights-{i + 4}/_intermediate_dense/kernel/.ATTRIBUTES/VARIABLE_VALUE" - ] - ) - model.get_layer( - f"transformer_layer_{i}" - )._feedforward_intermediate_dense.bias.assign( - weights[ - f"encoder/layer_with_weights-{i + 4}/_intermediate_dense/bias/.ATTRIBUTES/VARIABLE_VALUE" - ] - ) - model.get_layer( - f"transformer_layer_{i}" - )._feedforward_output_dense.kernel.assign( - weights[ - f"encoder/layer_with_weights-{i + 4}/_output_dense/kernel/.ATTRIBUTES/VARIABLE_VALUE" - ] - ) - model.get_layer( - f"transformer_layer_{i}" - )._feedforward_output_dense.bias.assign( - weights[ - f"encoder/layer_with_weights-{i + 4}/_output_dense/bias/.ATTRIBUTES/VARIABLE_VALUE" - ] - ) - model.get_layer( - f"transformer_layer_{i}" - )._feedforward_layernorm.gamma.assign( - weights[ - f"encoder/layer_with_weights-{i + 4}/_output_layer_norm/gamma/.ATTRIBUTES/VARIABLE_VALUE" - ] - ) - model.get_layer( - f"transformer_layer_{i}" - )._feedforward_layernorm.beta.assign( - weights[ - f"encoder/layer_with_weights-{i + 4}/_output_layer_norm/beta/.ATTRIBUTES/VARIABLE_VALUE" - ] - ) - if preset == 'bert_base_zh': - model.get_layer("pooled_dense").kernel.assign( - weights["encoder/layer_with_weights-16/kernel/.ATTRIBUTES/VARIABLE_VALUE"] - ) - model.get_layer("pooled_dense").bias.assign( - weights["encoder/layer_with_weights-16/bias/.ATTRIBUTES/VARIABLE_VALUE"] - ) - else: - model.get_layer("pooled_dense").kernel.assign( - weights[ - f"encoder/layer_with_weights-{model.num_layers + 4}/kernel/.ATTRIBUTES/VARIABLE_VALUE" - ] - ) - model.get_layer("pooled_dense").bias.assign( - weights[ - f"encoder/layer_with_weights-{model.num_layers + 4}/bias/.ATTRIBUTES/VARIABLE_VALUE" - ] - ) - pass - elif preset in ['bert_large_en_uncased', 'bert_large_en', 'bert_medium_en_uncased', 'bert_small_en_uncased', - 'bert_tiny_en_uncased']: - model.get_layer("token_embedding").embeddings.assign( - weights["bert/embeddings/word_embeddings"] - ) - model.get_layer("position_embedding").position_embeddings.assign( - weights["bert/embeddings/position_embeddings"] - ) - model.get_layer("segment_embedding").embeddings.assign( - weights["bert/embeddings/token_type_embeddings"] - ) - model.get_layer("embeddings_layer_norm").gamma.assign( - weights["bert/embeddings/LayerNorm/gamma"] - ) - 
model.get_layer("embeddings_layer_norm").beta.assign( - weights["bert/embeddings/LayerNorm/beta"] - ) - - for i in range(model.num_layers): - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layer._key_dense.kernel.assign( - weights[f"bert/encoder/layer_{i}/attention/self/key/kernel"].reshape( - (EMBEDDING_SIZE, NUM_ATTN_HEADS, -1) - ) - ) - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layer._key_dense.bias.assign( - weights[f"bert/encoder/layer_{i}/attention/self/key/bias"].reshape( - (NUM_ATTN_HEADS, -1) - ) - ) - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layer._query_dense.kernel.assign( - weights[f"bert/encoder/layer_{i}/attention/self/query/kernel"].reshape( - (EMBEDDING_SIZE, NUM_ATTN_HEADS, -1) - ) - ) - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layer._query_dense.bias.assign( - weights[f"bert/encoder/layer_{i}/attention/self/query/bias"].reshape( - (NUM_ATTN_HEADS, -1) - ) - ) - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layer._value_dense.kernel.assign( - weights[f"bert/encoder/layer_{i}/attention/self/value/kernel"].reshape( - (EMBEDDING_SIZE, NUM_ATTN_HEADS, -1) - ) - ) - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layer._value_dense.bias.assign( - weights[f"bert/encoder/layer_{i}/attention/self/value/bias"].reshape( - (NUM_ATTN_HEADS, -1) - ) - ) - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layer._output_dense.kernel.assign( - weights[ - f"bert/encoder/layer_{i}/attention/output/dense/kernel" - ].reshape((NUM_ATTN_HEADS, -1, EMBEDDING_SIZE)) - ) - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layer._output_dense.bias.assign( - weights[f"bert/encoder/layer_{i}/attention/output/dense/bias"] - ) - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layernorm.gamma.assign( - weights[f"bert/encoder/layer_{i}/attention/output/LayerNorm/gamma"] - ) - model.get_layer( - f"transformer_layer_{i}" - )._self_attention_layernorm.beta.assign( - weights[f"bert/encoder/layer_{i}/attention/output/LayerNorm/beta"] - ) - model.get_layer( - f"transformer_layer_{i}" - )._feedforward_intermediate_dense.kernel.assign( - weights[f"bert/encoder/layer_{i}/intermediate/dense/kernel"] - ) - model.get_layer( - f"transformer_layer_{i}" - )._feedforward_intermediate_dense.bias.assign( - weights[f"bert/encoder/layer_{i}/intermediate/dense/bias"] - ) - model.get_layer( - f"transformer_layer_{i}" - )._feedforward_output_dense.kernel.assign( - weights[f"bert/encoder/layer_{i}/output/dense/kernel"] - ) - model.get_layer( - f"transformer_layer_{i}" - )._feedforward_output_dense.bias.assign( - weights[f"bert/encoder/layer_{i}/output/dense/bias"] - ) - model.get_layer( - f"transformer_layer_{i}" - )._feedforward_layernorm.gamma.assign( - weights[f"bert/encoder/layer_{i}/output/LayerNorm/gamma"] - ) - model.get_layer( - f"transformer_layer_{i}" - )._feedforward_layernorm.beta.assign( - weights[f"bert/encoder/layer_{i}/output/LayerNorm/beta"] - ) - - model.get_layer("pooled_dense").kernel.assign( - weights["bert/pooler/dense/kernel"] - ) - model.get_layer("pooled_dense").bias.assign(weights["bert/pooler/dense/bias"]) - pass - - # Save the model. 
- print(f"\n-> Save KerasNLP model weights to `{preset}.h5`.") - model.save_weights(f"{preset}.h5") - - return model - - -def define_preprocessor(vocab_path, checkpoint_path, config_path, model): - def preprocess(x): - tokenizer = keras_nlp.tokenizers.WordPieceTokenizer( - vocabulary=vocab_path, lowercase=False - ) - packer = keras_nlp.layers.MultiSegmentPacker( - sequence_length=model.max_sequence_length, - start_value=tokenizer.token_to_id("[CLS]"), - end_value=tokenizer.token_to_id("[SEP]"), - ) - return packer(tokenizer(x)) - - token_ids, segment_ids = preprocess(["The झटपट brown लोमड़ी."]) - encoder_config = tfm.nlp.encoders.EncoderConfig( - type="bert", - bert=json.load(tf.io.gfile.GFile(config_path)), - ) - mg_model = tfm.nlp.encoders.build_encoder(encoder_config) - checkpoint = tf.train.Checkpoint(encoder=mg_model) - checkpoint.read(checkpoint_path).assert_consumed() - return mg_model, token_ids, segment_ids - - -def check_output( - preset, - keras_nlp_model, - mg_model, - token_ids, - segment_ids -): - keras_nlp_output = keras_nlp_model( - { - "token_ids": token_ids, - "segment_ids": segment_ids, - "padding_mask": token_ids != 0, - } - )["pooled_output"] - - mg_output = mg_model( - { - "input_word_ids": token_ids, - "input_type_ids": segment_ids, - "padding_mask": token_ids != 0, - } - )["pooled_output"] - tf.reduce_mean(keras_nlp_output - mg_output) - keras_nlp_model.save_weights(f"""{preset}.h5""") - return keras_nlp_output - - -def main(_): - assert ( - FLAGS.preset in PRESET_MAP.keys() - ), f'Invalid preset {FLAGS.preset}. Must be one of {",".join(PRESET_MAP.keys())}' - size = PRESET_MAP[FLAGS.preset][0] - hf_model_name = PRESET_MAP[FLAGS.preset][1] - - vocab_path, checkpoint_path, config_path, weights, model = download_model(FLAGS.preset, size, hf_model_name) - - keras_nlp_model = convert_checkpoints(FLAGS.preset, weights, model) - -if __name__ == "__main__": - - - app.run(main) From 16e131ff46288463b67a5ca5ef4bfd2820336626 Mon Sep 17 00:00:00 2001 From: Valko Milev Date: Tue, 11 Apr 2023 22:12:52 +0300 Subject: [PATCH 5/5] beta 1.0 --- .../convert_bert_checkpoints.py | 256 ++++++++++-------- 1 file changed, 147 insertions(+), 109 deletions(-) diff --git a/tools/checkpoint_conversion/convert_bert_checkpoints.py b/tools/checkpoint_conversion/convert_bert_checkpoints.py index 4986df724a..86041e56bd 100644 --- a/tools/checkpoint_conversion/convert_bert_checkpoints.py +++ b/tools/checkpoint_conversion/convert_bert_checkpoints.py @@ -7,77 +7,84 @@ import tensorflow_models as tfm import keras_nlp import json +import shutil +import numpy as np + +from tools.checkpoint_conversion.checkpoint_conversion_utils import ( + get_md5_checksum, +) FLAGS = flags.FLAGS PRESET_MAP = { - "bert_base_cased": {'base':"roberta.base", - 'base_model':"bert_base_cased", - 'TOKEN_TYPE': "cased", - 'MODEL_TYPE': "bert_base", - 'VOCAB_SIZE': 28996}, - "bert_base_multi_cased": {'base':"roberta.base", - 'base_model':"bert_base_multi_cased", - 'MODEL_TYPE': "bert_base", - 'MODEL_SUFFIX': "multi_cased", - 'VOCAB_SIZE': 119547}, - "bert_base_uncased": {'base':"roberta.base", - 'base_model':"bert_base_uncased", - 'MODEL_TYPE': "bert_base", - 'TOKEN_TYPE': "uncased", - 'VOCAB_SIZE': 30522}, - "bert_base_zh": {'base':"roberta.base", - 'base_model': "bert_base_zh", - 'MODEL_TYPE': "bert_base", - 'MODEL_SUFFIX': "chinese", - 'VOCAB_SIZE': 21128}, - "bert_large_cased_en": {'base':"roberta.base", - 'base_model':"bert_large_cased_en", - 'MODEL_TYPE': "bert_large", - 'MODEL_SUFFIX': "cased", - 'MODEL_SPEC_STR': 
"L-24_H-1024_A-16", - 'VOCAB_SIZE': 28996, - 'NUM_LAYERS': 24, - 'NUM_ATTN_HEADS': 16, - 'EMBEDDING_SIZE': 1024}, - "bert_large_uncased_en": {'base':"roberta.base", - 'base_model':"bert_large_uncased_en", - 'MODEL_TYPE': "bert_large", - 'MODEL_SUFFIX': "uncased", - 'MODEL_SPEC_STR': "L-24_H-1024_A-16", - 'VOCAB_SIZE': 30522, - 'NUM_LAYERS': 24, - 'NUM_ATTN_HEADS': 16, - 'EMBEDDING_SIZE': 1024 - }, - "bert_medium_uncased_en": {'base':"roberta.base", - 'base_model':"bert_medium_uncased_en", - 'MODEL_TYPE' : "bert_medium", - 'MODEL_SUFFIX' : "uncased", - 'MODEL_SPEC_STR' : "L-8_H-512_A-8", - 'VOCAB_SIZE' : 30522, - 'NUM_LAYERS' : 8, - 'NUM_ATTN_HEADS' : 8, - 'EMBEDDING_SIZE' : 512 + "bert_base_cased": {'base': "roberta.base", + 'base_model': "bert-base-cased", + 'TOKEN_TYPE': "cased", + 'MODEL_SUFFIX': "cased", + 'MODEL_SPEC_STR': "2018_10_18/cased_L-12_H-768_A-12", + 'MODEL_SPEC_STR_DIR': "cased_L-12_H-768_A-12/", + 'MODEL_TYPE': "bert_base_en_cased", + 'VOCAB_SIZE': 28996}, + "bert_base_multi_cased": {'base': "roberta.base", + 'base_model': "bert-base-multilingual-cased", + 'MODEL_TYPE': "bert_base_multi_cased", + 'MODEL_SUFFIX': "multi_cased", + 'MODEL_SPEC_STR': "2018_11_23/multi_cased_L-12_H-768_A-12", + 'MODEL_SPEC_STR_DIR': "multi_cased_L-12_H-768_A-12/", + 'VOCAB_SIZE': 119547}, + + "bert_large_cased_en": {'base': "roberta.base", + 'base_model': "bert-large-cased", + 'MODEL_TYPE': "bert_large_en_cased", + 'MODEL_SUFFIX': "cased", + 'MODEL_SPEC_STR': "2018_10_18/cased_L-12_H-768_A-12", + 'MODEL_SPEC_STR_DIR': "cased_L-12_H-768_A-12/", + 'VOCAB_SIZE': 28996, + 'NUM_LAYERS': 24, + 'NUM_ATTN_HEADS': 16, + 'EMBEDDING_SIZE': 1024}, + "bert_large_uncased_en": {'base': "roberta.base", + 'base_model': "bert-large-uncased", + 'MODEL_TYPE': "bert_large_en_uncased", + 'MODEL_SUFFIX': "uncased", + 'MODEL_SPEC_STR': "2020_02_20/uncased_L-12_H-768_A-12", + 'MODEL_SPEC_STR_DIR': "uncased_L-12_H-768_A-12/", + 'VOCAB_SIZE': 30522, + 'NUM_LAYERS': 24, + 'NUM_ATTN_HEADS': 16, + 'EMBEDDING_SIZE': 768 + }, + "bert_medium_uncased_en": {'base': "roberta.base", + 'base_model': "bert-base-uncased", + 'MODEL_TYPE': "bert_medium_en_uncased", + 'MODEL_SUFFIX': "uncased", + 'MODEL_SPEC_STR': "2020_02_20/uncased_L-8_H-512_A-8", + 'MODEL_SPEC_STR_DIR': "", + 'VOCAB_SIZE': 30522, + 'NUM_LAYERS': 8, + 'NUM_ATTN_HEADS': 8, + 'EMBEDDING_SIZE': 512 }, - "bert_small_uncased_en":{'base':"roberta.base", - 'base_model':"bert_small_uncased_en", - 'MODEL_TYPE' : "bert_small", - 'MODEL_SUFFIX' : "uncased", - 'MODEL_SPEC_STR' : "L-4_H-512_A-8", - 'VOCAB_SIZE' : 30522, - 'NUM_LAYERS' : 4, - 'NUM_ATTN_HEADS' : 8, - 'EMBEDDING_SIZE' : 512}, - "bert_tiny_uncased_en": {'base':"roberta.base", - 'base_model':"bert_tiny_uncased_en", - 'MODEL_TYPE' : "bert_tiny", - 'MODEL_SUFFIX' : "uncased", - 'MODEL_SPEC_STR' : "L-2_H-128_A-2", - 'VOCAB_SIZE' : 30522, - 'NUM_LAYERS' : 2, - 'NUM_ATTN_HEADS' : 2, - 'EMBEDDING_SIZE' : 128} + "bert_small_uncased_en": {'base': "roberta.base", + 'base_model': "bert-base-uncased", + 'MODEL_TYPE': "bert_small_en_uncased", + 'MODEL_SUFFIX': "uncased", + 'MODEL_SPEC_STR': "2020_02_20/uncased_L-4_H-512_A-8", + 'MODEL_SPEC_STR_DIR': "", + 'VOCAB_SIZE': 30522, + 'NUM_LAYERS': 4, + 'NUM_ATTN_HEADS': 8, + 'EMBEDDING_SIZE': 768}, + "bert_base_uncased": {'base': "roberta.base", + 'base_model': "bert-base-uncased", + 'MODEL_TYPE': "bert_tiny_en_uncased", + 'MODEL_SUFFIX': "uncased", + 'MODEL_SPEC_STR': "2018_10_18/uncased_L-12_H-768_A-12", + 'MODEL_SPEC_STR_DIR': "uncased_L-12_H-768_A-12", + 'VOCAB_SIZE': 30522, + 
+                          'NUM_LAYERS': 12,
+                          'NUM_ATTN_HEADS': 12,
+                          'EMBEDDING_SIZE': 768}
 }
 
 flags.DEFINE_string(
@@ -87,26 +94,23 @@
 
 def download_model(preset):
     print("-> Download original weights in " + preset)
-    zip_path = f"""https://storage.googleapis.com/tf_model_garden/nlp/bert/v3/{MODEL_SUFFIX}_L-12_H-768_A-12.tar.gz"""
+    zip_path = f"""https://storage.googleapis.com/bert_models/{PRESET_MAP[preset]['MODEL_SPEC_STR']}.zip"""
     path_file = keras.utils.get_file(
-        f"""{preset}.tar.gz""",
+        f"""{preset}.zip""",
         zip_path,
         extract=False,
         archive_format="tar",
     )
-    print('path_file', path_file)
-    extract_dir = f"./content/{preset}/tmp/temp_dir/raw/"
+    extract_dir = f"./content/{preset}/{PRESET_MAP[preset]['MODEL_SPEC_STR_DIR']}"
     vocab_path = os.path.join(extract_dir, "vocab.txt")
     checkpoint_path = os.path.join(extract_dir, "bert_model.ckpt")
    config_path = os.path.join(extract_dir, "bert_config.json")
-
-    os.system(f"tar -C ./content/{preset} -xvf {path_file}")
-
+    os.system(f"unzip -o -d ./content/{preset} {path_file}")
     return vocab_path, checkpoint_path, config_path
 
 
-def convert_checkpoints(preset,checkpoint_path,config_dict):
+def convert_checkpoints(preset, checkpoint_path, config_dict):
     print("\n-> Convert original weights to KerasNLP format.")
 
     # Transformer layers.
@@ -115,11 +119,10 @@ def convert_checkpoints(preset,checkpoint_path,config_dict):
     for name, shape in vars:
         weight = tf.train.load_variable(checkpoint_path, name)
         weights[name] = weight
-        print(name)
-    model = keras_nlp.models.BertBackbone.from_preset("bert_tiny_en_uncased",
-                                                      load_weights=True)  # keras_nlp.models.BertBase(vocabulary_size=VOCAB_SIZE)
-    model.summary()
-    if preset in ['bert_base_en_uncased', 'bert_base_en']:
+
+    model = keras_nlp.models.BertBackbone.from_preset(config_dict['MODEL_TYPE'],
+                                                      load_weights=True)
+    if preset in ['bert_base_en']:
         model.get_layer("token_embedding").embeddings.assign(
             weights[
                 "encoder/layer_with_weights-0/embeddings/.ATTRIBUTES/VARIABLE_VALUE"
@@ -185,7 +188,7 @@ def convert_checkpoints(preset,checkpoint_path,config_dict):
                     f"encoder/layer_with_weights-{i + 4}/_attention_layer/_value_dense/bias/.ATTRIBUTES/VARIABLE_VALUE"
                 ]
             )
-            if preset == 'bert_base_en_uncased':
+            if preset == 'bert_base_uncased':
                 model.get_layer(
                     f"transformer_layer_{i}"
                 )._self_attention_layer._output_dense.kernel.assign(
@@ -271,7 +274,7 @@ def convert_checkpoints(preset,checkpoint_path,config_dict):
                    f"encoder/layer_with_weights-{i + 4}/_output_layer_norm/beta/.ATTRIBUTES/VARIABLE_VALUE"
                ]
            )
-        if preset == 'bert_base_en_uncased':
+        if preset == 'bert_base_uncased':
            model.get_layer("pooled_dense").kernel.assign(
                weights["next_sentence..pooler_dense/kernel/.ATTRIBUTES/VARIABLE_VALUE"]
            )
@@ -562,8 +565,8 @@ def convert_checkpoints(preset,checkpoint_path,config_dict):
        pass
 
    # Save the model.
- print(f"\n-> Save KerasNLP model weights to `{preset}.h5`.") - model.save_weights(f"{preset}.h5") + print(f"\n-> Save KerasNLP model weights to `{config_dict['base_model']}.h5`.") + model.save_weights(f"{config_dict['base_model']}.h5") return model @@ -588,30 +591,56 @@ def preprocess(x): def check_output( - preset, + keras_nlp_preprocessor, keras_nlp_model, - mg_model, - token_ids, - segment_ids + hf_tokenizer, + hf_model, ): - keras_nlp_output = keras_nlp_model( - { - "token_ids": token_ids, - "segment_ids": segment_ids, - "padding_mask": token_ids != 0, - } - )["pooled_output"] - - mg_output = mg_model( - { - "input_word_ids": token_ids, - "input_type_ids": segment_ids, - "padding_mask": token_ids != 0, - } - )["pooled_output"] - tf.reduce_mean(keras_nlp_output - mg_output) - keras_nlp_model.save_weights(f"""{preset}.h5""") - return keras_nlp_output + print("\n-> Check the outputs.") + sample_text = ["cricket is awesome, easily the best sport in the world!"] + + # KerasNLP + keras_nlp_inputs = keras_nlp_preprocessor(tf.constant(sample_text)) + keras_nlp_output = keras_nlp_model.predict(keras_nlp_inputs)[ + "sequence_output" + ] + + # HF + hf_inputs = hf_tokenizer( + sample_text, padding="max_length", return_tensors="pt" + ) + hf_output = hf_model(**hf_inputs).last_hidden_state + + print("KerasNLP output:", keras_nlp_output[0, 0, :10]) + print("HF output:", hf_output[0, 0, :10]) + print('keras_nlp_output', keras_nlp_output.shape) + print('hf_output', hf_output.detach().numpy().shape) + print("Difference:", np.mean(keras_nlp_output - hf_output.detach().numpy())) + + +def extract_vocab(hf_tokenizer, vocab_path): + spm_path = os.path.join('models/' + FLAGS.preset, "spiece.model") + print(f"\n-> Save KerasNLP SPM vocabulary file to `{spm_path}`.") + + os.makedirs('models/' + FLAGS.preset, exist_ok=True) + shutil.copyfile( + transformers.utils.hub.get_file_from_repo("bert-base-uncased", "tokenizer_config.json" + + ), + spm_path, + ) + keras_nlp_tokenizer = keras_nlp.models.BertTokenizer( + + vocabulary=vocab_path + ) + keras_nlp_preprocessor = keras_nlp.models.BertPreprocessor( + keras_nlp_tokenizer + ) + + print("-> Print MD5 checksum of the vocab files.") + print(f"`{spm_path}` md5sum: ", get_md5_checksum(spm_path)) + + return keras_nlp_preprocessor def main(_): @@ -619,13 +648,22 @@ def main(_): FLAGS.preset in PRESET_MAP.keys() ), f'Invalid preset {FLAGS.preset}. Must be one of {",".join(PRESET_MAP.keys())}' + vocab_path, checkpoint_path, config_path = download_model(FLAGS.preset) - vocab_path, checkpoint_path, config_path, weights, model = download_model(FLAGS.preset) + keras_nlp_model = convert_checkpoints(FLAGS.preset, checkpoint_path, PRESET_MAP[FLAGS.preset]) - keras_nlp_model = convert_checkpoints(FLAGS.preset,checkpoint_path,PRESET_MAP[FLAGS.preset]) + hf_model = transformers.AutoModel.from_pretrained(PRESET_MAP[FLAGS.preset]['base_model']) + hf_model.eval() -if __name__ == "__main__": + hf_tokenizer = transformers.AutoTokenizer.from_pretrained(PRESET_MAP[FLAGS.preset]['base_model']) + keras_nlp_preprocessor = extract_vocab(hf_tokenizer, vocab_path) + check_output( + keras_nlp_preprocessor, + keras_nlp_model, + hf_tokenizer, + hf_model, + ) +if __name__ == "__main__": app.run(main) -