From ca0b9db4b1fcfaac4df1e9a2db4ce96906033972 Mon Sep 17 00:00:00 2001
From: Guillaume Klein
Date: Mon, 20 Jun 2022 11:10:24 +0200
Subject: [PATCH] Do not mark relative position as optional when they are
 required (#844)

---
 python/ctranslate2/converters/opennmt_py.py  | 41 ++++++--------------
 python/ctranslate2/converters/opennmt_tf.py  | 35 +++++------------
 python/ctranslate2/specs/attention_spec.py   | 17 ++++----
 python/ctranslate2/specs/transformer_spec.py | 26 ++++++++++---
 4 files changed, 52 insertions(+), 67 deletions(-)

diff --git a/python/ctranslate2/converters/opennmt_py.py b/python/ctranslate2/converters/opennmt_py.py
index eea80b9b1..044d943e3 100644
--- a/python/ctranslate2/converters/opennmt_py.py
+++ b/python/ctranslate2/converters/opennmt_py.py
@@ -170,27 +170,19 @@ def _load(self):
 
 
 def set_transformer_spec(spec, variables):
-    set_transformer_encoder(
-        spec.encoder, variables, relative=spec.with_relative_position
-    )
-    set_transformer_decoder(
-        spec.decoder, variables, relative=spec.with_relative_position
-    )
+    set_transformer_encoder(spec.encoder, variables)
+    set_transformer_decoder(spec.decoder, variables)
 
 
-def set_transformer_encoder(spec, variables, relative=False):
-    set_input_layers(spec, variables, "encoder", relative=relative)
+def set_transformer_encoder(spec, variables):
+    set_input_layers(spec, variables, "encoder")
     set_layer_norm(spec.layer_norm, variables, "encoder.layer_norm")
     for i, layer in enumerate(spec.layer):
-        set_transformer_encoder_layer(
-            layer, variables, "encoder.transformer.%d" % i, relative=relative
-        )
+        set_transformer_encoder_layer(layer, variables, "encoder.transformer.%d" % i)
 
 
-def set_transformer_decoder(
-    spec, variables, relative=False, with_encoder_attention=True
-):
-    set_input_layers(spec, variables, "decoder", relative=relative)
+def set_transformer_decoder(spec, variables, with_encoder_attention=True):
+    set_input_layers(spec, variables, "decoder")
     set_linear(spec.projection, variables, "generator")
     set_layer_norm(spec.layer_norm, variables, "decoder.layer_norm")
     for i, layer in enumerate(spec.layer):
@@ -198,12 +190,11 @@ def set_transformer_decoder(
             layer,
             variables,
             "decoder.transformer_layers.%d" % i,
-            relative=relative,
             with_encoder_attention=with_encoder_attention,
         )
 
 
-def set_input_layers(spec, variables, scope, relative=False):
+def set_input_layers(spec, variables, scope):
     try:
         set_position_encodings(
             spec.position_encodings,
             variables,
             "%s.embeddings.make_embedding.pe" % scope,
         )
     except KeyError:
-        if not relative:
-            raise
         # See https://github.com/OpenNMT/OpenNMT-py/issues/1722
         spec.scale_embeddings = False
 
@@ -228,28 +217,24 @@ def set_input_layers(spec, variables, scope):
     )
 
 
-def set_transformer_encoder_layer(spec, variables, scope, relative=False):
+def set_transformer_encoder_layer(spec, variables, scope):
     set_ffn(spec.ffn, variables, "%s.feed_forward" % scope)
     set_multi_head_attention(
         spec.self_attention,
         variables,
         "%s.self_attn" % scope,
         self_attention=True,
-        relative=relative,
     )
     set_layer_norm(spec.self_attention.layer_norm, variables, "%s.layer_norm" % scope)
 
 
-def set_transformer_decoder_layer(
-    spec, variables, scope, relative=False, with_encoder_attention=True
-):
+def set_transformer_decoder_layer(spec, variables, scope, with_encoder_attention=True):
     set_ffn(spec.ffn, variables, "%s.feed_forward" % scope)
     set_multi_head_attention(
         spec.self_attention,
         variables,
         "%s.self_attn" % scope,
         self_attention=True,
-        relative=relative,
     )
     set_layer_norm(spec.self_attention.layer_norm, variables, "%s.layer_norm_1" % scope)
     if with_encoder_attention:
@@ -263,9 +248,7 @@ def set_ffn(spec, variables, scope):
     set_linear(spec.linear_1, variables, "%s.w_2" % scope)
 
 
-def set_multi_head_attention(
-    spec, variables, scope, self_attention=False, relative=False
-):
+def set_multi_head_attention(spec, variables, scope, self_attention=False):
     if self_attention:
         split_layers = [common_spec.LinearSpec() for _ in range(3)]
         set_linear(split_layers[0], variables, "%s.linear_query" % scope)
@@ -279,7 +262,7 @@ def set_multi_head_attention(
         set_linear(split_layers[1], variables, "%s.linear_values" % scope)
         utils.fuse_linear(spec.linear[1], split_layers)
     set_linear(spec.linear[-1], variables, "%s.final_linear" % scope)
-    if relative:
+    if spec.relative_position_keys is None:
         spec.relative_position_keys = _get_variable(
             variables, "%s.relative_positions_embeddings.weight" % scope
         )
diff --git a/python/ctranslate2/converters/opennmt_tf.py b/python/ctranslate2/converters/opennmt_tf.py
index 61eabc30c..396904d04 100644
--- a/python/ctranslate2/converters/opennmt_tf.py
+++ b/python/ctranslate2/converters/opennmt_tf.py
@@ -498,30 +498,23 @@ def set_transformer_spec_v2(spec, variables):
         "model/examples_inputter/features_inputter",
         version=2,
     )
-    set_transformer_encoder_v2(
-        spec.encoder, variables, "model/encoder", relative=spec.with_relative_position
-    )
+    set_transformer_encoder_v2(spec.encoder, variables, "model/encoder")
     set_transformer_decoder_v2(
         spec.decoder,
         variables,
         "model/decoder",
         target_embedding_name,
-        relative=spec.with_relative_position,
     )
 
 
-def set_transformer_encoder_v2(spec, variables, scope, relative=False):
+def set_transformer_encoder_v2(spec, variables, scope):
     if spec.layer_norm != OPTIONAL:
         set_layer_norm(spec.layer_norm, variables, "%s/layer_norm" % scope)
     for i, layer in enumerate(spec.layer):
-        set_transformer_encoder_layer_v2(
-            layer, variables, "%s/layers/%d" % (scope, i), relative=relative
-        )
+        set_transformer_encoder_layer_v2(layer, variables, "%s/layers/%d" % (scope, i))
 
 
-def set_transformer_decoder_v2(
-    spec, variables, scope, target_embedding_name, relative=False
-):
+def set_transformer_decoder_v2(spec, variables, scope, target_embedding_name):
     try:
         set_linear(
             spec.projection,
@@ -542,34 +535,28 @@ def set_transformer_decoder_v2(
     if spec.layer_norm != OPTIONAL:
         set_layer_norm(spec.layer_norm, variables, "%s/layer_norm" % scope)
     for i, layer in enumerate(spec.layer):
-        set_transformer_decoder_layer_v2(
-            layer, variables, "%s/layers/%d" % (scope, i), relative=relative
-        )
+        set_transformer_decoder_layer_v2(layer, variables, "%s/layers/%d" % (scope, i))
 
 
-def set_transformer_encoder_layer_v2(spec, variables, scope, relative=False):
+def set_transformer_encoder_layer_v2(spec, variables, scope):
     set_ffn_v2(spec.ffn, variables, "%s/ffn" % scope)
     set_multi_head_attention_v2(
         spec.self_attention,
         variables,
         "%s/self_attention" % scope,
         self_attention=True,
-        relative=relative,
     )
 
 
-def set_transformer_decoder_layer_v2(spec, variables, scope, relative=False):
+def set_transformer_decoder_layer_v2(spec, variables, scope):
     set_ffn_v2(spec.ffn, variables, "%s/ffn" % scope)
     set_multi_head_attention_v2(
         spec.self_attention,
         variables,
         "%s/self_attention" % scope,
         self_attention=True,
-        relative=relative,
-    )
-    set_multi_head_attention_v2(
-        spec.attention, variables, "%s/attention/0" % scope, relative=relative
     )
+    set_multi_head_attention_v2(spec.attention, variables, "%s/attention/0" % scope)
 
 
 def set_ffn_v2(spec, variables, scope):
@@ -581,9 +568,7 @@ def set_ffn_v2(spec, variables, scope):
     set_linear(spec.linear_1, variables, "%s/layer/outer" % scope)
 
 
-def set_multi_head_attention_v2(
-    spec, variables, scope, self_attention=False, relative=False
-):
+def set_multi_head_attention_v2(spec, variables, scope, self_attention=False):
     try:
         set_layer_norm(spec.layer_norm, variables, "%s/input_layer_norm" % scope)
     except KeyError:
@@ -594,7 +579,7 @@ def set_multi_head_attention_v2(
         set_linear(split_layers[1], variables, "%s/layer/linear_keys" % scope)
         set_linear(split_layers[2], variables, "%s/layer/linear_values" % scope)
         utils.fuse_linear(spec.linear[0], split_layers)
-    if relative:
+    if spec.relative_position_keys is None:
         spec.relative_position_keys = variables[
             "%s/layer/relative_position_keys" % scope
         ]
diff --git a/python/ctranslate2/specs/attention_spec.py b/python/ctranslate2/specs/attention_spec.py
index c36602eb7..20c230131 100644
--- a/python/ctranslate2/specs/attention_spec.py
+++ b/python/ctranslate2/specs/attention_spec.py
@@ -2,12 +2,15 @@
 
 
 class MultiHeadAttentionSpec(model_spec.LayerSpec):
-    def __init__(self, self_attention=False):
+    def __init__(self, self_attention=False, relative_position=False):
         self.layer_norm = common_spec.LayerNormSpec()
-        if self_attention:
-            num_projections = 2
+        self.linear = [
+            common_spec.LinearSpec() for _ in range(2 if self_attention else 3)
+        ]
+
+        if relative_position:
+            self.relative_position_keys = None
+            self.relative_position_values = None
         else:
-            num_projections = 3
-        self.linear = [common_spec.LinearSpec() for _ in range(num_projections)]
-        self.relative_position_keys = model_spec.OPTIONAL
-        self.relative_position_values = model_spec.OPTIONAL
+            self.relative_position_keys = model_spec.OPTIONAL
+            self.relative_position_values = model_spec.OPTIONAL
diff --git a/python/ctranslate2/specs/transformer_spec.py b/python/ctranslate2/specs/transformer_spec.py
index 3b1c72a47..50a99464c 100644
--- a/python/ctranslate2/specs/transformer_spec.py
+++ b/python/ctranslate2/specs/transformer_spec.py
@@ -60,11 +60,13 @@ def __init__(
             pre_norm=pre_norm,
             num_source_embeddings=num_source_embeddings,
             layernorm_embedding=layernorm_embedding,
+            relative_position=with_relative_position,
         )
         self.decoder = TransformerDecoderSpec(
             num_decoder_layers,
             pre_norm=pre_norm,
             layernorm_embedding=layernorm_embedding,
+            relative_position=with_relative_position,
         )
         super().__init__(
             source_embeddings_specs=self.encoder.embeddings,
@@ -137,6 +139,7 @@ def __init__(
         pre_norm=True,
         num_source_embeddings=1,
         layernorm_embedding=False,
+        relative_position=False,
     ):
         self.embeddings = [
             common_spec.EmbeddingsSpec() for _ in range(num_source_embeddings)
         ]
@@ -149,7 +152,10 @@ def __init__(
         self.layernorm_embedding = (
             common_spec.LayerNormSpec() if layernorm_embedding else model_spec.OPTIONAL
         )
-        self.layer = [TransformerEncoderLayerSpec() for _ in range(num_layers)]
+        self.layer = [
+            TransformerEncoderLayerSpec(relative_position=relative_position)
+            for _ in range(num_layers)
+        ]
 
 
 class TransformerDecoderSpec(model_spec.LayerSpec):
@@ -161,6 +167,7 @@ def __init__(
         with_encoder_attention=True,
         no_final_norm=False,
         project_in_out=False,
+        relative_position=False,
     ):
         self.embeddings = common_spec.EmbeddingsSpec()
         self.scale_embeddings = True
@@ -175,7 +182,10 @@ def __init__(
         )
         self.projection = common_spec.LinearSpec()
         self.layer = [
-            TransformerDecoderLayerSpec(with_encoder_attention=with_encoder_attention)
+            TransformerDecoderLayerSpec(
+                with_encoder_attention=with_encoder_attention,
+                relative_position=relative_position,
+            )
             for _ in range(num_layers)
         ]
         self.start_from_zero_embedding = False
@@ -189,14 +199,18 @@ def __init__(
 
 
 class TransformerEncoderLayerSpec(model_spec.LayerSpec):
-    def __init__(self):
-        self.self_attention = attention_spec.MultiHeadAttentionSpec(self_attention=True)
+    def __init__(self, relative_position=False):
+        self.self_attention = attention_spec.MultiHeadAttentionSpec(
+            self_attention=True, relative_position=relative_position
+        )
         self.ffn = FeedForwardSpec()
 
 
 class TransformerDecoderLayerSpec(model_spec.LayerSpec):
-    def __init__(self, with_encoder_attention=True):
-        self.self_attention = attention_spec.MultiHeadAttentionSpec(self_attention=True)
+    def __init__(self, with_encoder_attention=True, relative_position=False):
+        self.self_attention = attention_spec.MultiHeadAttentionSpec(
+            self_attention=True, relative_position=relative_position
+        )
         self.attention = (
             attention_spec.MultiHeadAttentionSpec() if with_encoder_attention
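
Note (not part of the patch): below is a minimal sketch of the contract this change encodes, using only the modules shown in the diff above (ctranslate2.specs.attention_spec and ctranslate2.specs.model_spec). With relative_position=True the relative position weights start out as None, i.e. required and left for a converter to fill in, which is what the new "if spec.relative_position_keys is None" checks in the converters rely on; with the default relative_position=False they remain OPTIONAL and are skipped.

    # Illustration only; assumes CTranslate2 is installed with the spec modules shown above.
    from ctranslate2.specs import attention_spec, model_spec

    # Relative position enabled: the weights are required and start as None
    # until a converter assigns them.
    spec = attention_spec.MultiHeadAttentionSpec(
        self_attention=True, relative_position=True
    )
    assert spec.relative_position_keys is None
    assert spec.relative_position_values is None

    # Relative position disabled (default): the weights stay optional and are
    # simply not expected in the checkpoint.
    spec = attention_spec.MultiHeadAttentionSpec(self_attention=True)
    assert spec.relative_position_keys == model_spec.OPTIONAL
    assert spec.relative_position_values == model_spec.OPTIONAL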