Do not mark relative position as optional when they are required (Ope…
guillaumekln authored Jun 20, 2022
1 parent 5ee3717 commit ca0b9db
Showing 4 changed files with 52 additions and 67 deletions.
41 changes: 12 additions & 29 deletions python/ctranslate2/converters/opennmt_py.py
@@ -170,49 +170,38 @@ def _load(self):
 
 
 def set_transformer_spec(spec, variables):
-    set_transformer_encoder(
-        spec.encoder, variables, relative=spec.with_relative_position
-    )
-    set_transformer_decoder(
-        spec.decoder, variables, relative=spec.with_relative_position
-    )
+    set_transformer_encoder(spec.encoder, variables)
+    set_transformer_decoder(spec.decoder, variables)
 
 
-def set_transformer_encoder(spec, variables, relative=False):
-    set_input_layers(spec, variables, "encoder", relative=relative)
+def set_transformer_encoder(spec, variables):
+    set_input_layers(spec, variables, "encoder")
     set_layer_norm(spec.layer_norm, variables, "encoder.layer_norm")
     for i, layer in enumerate(spec.layer):
-        set_transformer_encoder_layer(
-            layer, variables, "encoder.transformer.%d" % i, relative=relative
-        )
+        set_transformer_encoder_layer(layer, variables, "encoder.transformer.%d" % i)
 
 
-def set_transformer_decoder(
-    spec, variables, relative=False, with_encoder_attention=True
-):
-    set_input_layers(spec, variables, "decoder", relative=relative)
+def set_transformer_decoder(spec, variables, with_encoder_attention=True):
+    set_input_layers(spec, variables, "decoder")
     set_linear(spec.projection, variables, "generator")
     set_layer_norm(spec.layer_norm, variables, "decoder.layer_norm")
     for i, layer in enumerate(spec.layer):
         set_transformer_decoder_layer(
             layer,
             variables,
             "decoder.transformer_layers.%d" % i,
-            relative=relative,
             with_encoder_attention=with_encoder_attention,
         )
 
 
-def set_input_layers(spec, variables, scope, relative=False):
+def set_input_layers(spec, variables, scope):
     try:
         set_position_encodings(
             spec.position_encodings,
             variables,
             "%s.embeddings.make_embedding.pe" % scope,
         )
     except KeyError:
-        if not relative:
-            raise
         # See https://github.com/OpenNMT/OpenNMT-py/issues/1722
         spec.scale_embeddings = False
 
@@ -228,28 +217,24 @@ def set_input_layers(spec, variables, scope, relative=False):
     )
 
 
-def set_transformer_encoder_layer(spec, variables, scope, relative=False):
+def set_transformer_encoder_layer(spec, variables, scope):
     set_ffn(spec.ffn, variables, "%s.feed_forward" % scope)
     set_multi_head_attention(
         spec.self_attention,
         variables,
         "%s.self_attn" % scope,
         self_attention=True,
-        relative=relative,
     )
     set_layer_norm(spec.self_attention.layer_norm, variables, "%s.layer_norm" % scope)
 
 
-def set_transformer_decoder_layer(
-    spec, variables, scope, relative=False, with_encoder_attention=True
-):
+def set_transformer_decoder_layer(spec, variables, scope, with_encoder_attention=True):
     set_ffn(spec.ffn, variables, "%s.feed_forward" % scope)
     set_multi_head_attention(
         spec.self_attention,
         variables,
         "%s.self_attn" % scope,
         self_attention=True,
-        relative=relative,
     )
     set_layer_norm(spec.self_attention.layer_norm, variables, "%s.layer_norm_1" % scope)
     if with_encoder_attention:
@@ -263,9 +248,7 @@ def set_ffn(spec, variables, scope):
     set_linear(spec.linear_1, variables, "%s.w_2" % scope)
 
 
-def set_multi_head_attention(
-    spec, variables, scope, self_attention=False, relative=False
-):
+def set_multi_head_attention(spec, variables, scope, self_attention=False):
     if self_attention:
         split_layers = [common_spec.LinearSpec() for _ in range(3)]
         set_linear(split_layers[0], variables, "%s.linear_query" % scope)
@@ -279,7 +262,7 @@ def set_multi_head_attention(
         set_linear(split_layers[1], variables, "%s.linear_values" % scope)
         utils.fuse_linear(spec.linear[1], split_layers)
     set_linear(spec.linear[-1], variables, "%s.final_linear" % scope)
-    if relative:
+    if spec.relative_position_keys is None:
         spec.relative_position_keys = _get_variable(
             variables, "%s.relative_positions_embeddings.weight" % scope
         )
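In both converters the change boils down to one idea: the spec itself now records whether relative position embeddings are expected, so a `relative` flag no longer has to be threaded through every `set_*` helper, and the loader simply fills the attribute whenever the spec marks it as required. A minimal, self-contained sketch of that pattern follows; the `AttentionSpec` class, `OPTIONAL` sentinel, and weight names below are illustrative stand-ins, not the actual CTranslate2 API.

```python
# Sketch only: None marks a required weight, a sentinel marks an optional one.
OPTIONAL = object()


class AttentionSpec:
    def __init__(self, relative_position=False):
        # Required when relative positions are used; optional otherwise.
        self.relative_position_keys = None if relative_position else OPTIONAL


def load_attention(spec, variables, scope):
    if spec.relative_position_keys is None:
        # A missing checkpoint entry now raises KeyError instead of being skipped.
        spec.relative_position_keys = variables[
            "%s.relative_positions_embeddings.weight" % scope
        ]


weights = {"self_attn.relative_positions_embeddings.weight": [[0.1, 0.2]]}
load_attention(AttentionSpec(relative_position=True), weights, "self_attn")
```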
35 changes: 10 additions & 25 deletions python/ctranslate2/converters/opennmt_tf.py
@@ -498,30 +498,23 @@ def set_transformer_spec_v2(spec, variables):
         "model/examples_inputter/features_inputter",
         version=2,
     )
-    set_transformer_encoder_v2(
-        spec.encoder, variables, "model/encoder", relative=spec.with_relative_position
-    )
+    set_transformer_encoder_v2(spec.encoder, variables, "model/encoder")
     set_transformer_decoder_v2(
         spec.decoder,
         variables,
         "model/decoder",
         target_embedding_name,
-        relative=spec.with_relative_position,
     )
 
 
-def set_transformer_encoder_v2(spec, variables, scope, relative=False):
+def set_transformer_encoder_v2(spec, variables, scope):
     if spec.layer_norm != OPTIONAL:
         set_layer_norm(spec.layer_norm, variables, "%s/layer_norm" % scope)
     for i, layer in enumerate(spec.layer):
-        set_transformer_encoder_layer_v2(
-            layer, variables, "%s/layers/%d" % (scope, i), relative=relative
-        )
+        set_transformer_encoder_layer_v2(layer, variables, "%s/layers/%d" % (scope, i))
 
 
-def set_transformer_decoder_v2(
-    spec, variables, scope, target_embedding_name, relative=False
-):
+def set_transformer_decoder_v2(spec, variables, scope, target_embedding_name):
     try:
         set_linear(
             spec.projection,
@@ -542,34 +535,28 @@ def set_transformer_decoder_v2(
     if spec.layer_norm != OPTIONAL:
         set_layer_norm(spec.layer_norm, variables, "%s/layer_norm" % scope)
     for i, layer in enumerate(spec.layer):
-        set_transformer_decoder_layer_v2(
-            layer, variables, "%s/layers/%d" % (scope, i), relative=relative
-        )
+        set_transformer_decoder_layer_v2(layer, variables, "%s/layers/%d" % (scope, i))
 
 
-def set_transformer_encoder_layer_v2(spec, variables, scope, relative=False):
+def set_transformer_encoder_layer_v2(spec, variables, scope):
     set_ffn_v2(spec.ffn, variables, "%s/ffn" % scope)
     set_multi_head_attention_v2(
         spec.self_attention,
         variables,
         "%s/self_attention" % scope,
         self_attention=True,
-        relative=relative,
     )
 
 
-def set_transformer_decoder_layer_v2(spec, variables, scope, relative=False):
+def set_transformer_decoder_layer_v2(spec, variables, scope):
     set_ffn_v2(spec.ffn, variables, "%s/ffn" % scope)
     set_multi_head_attention_v2(
         spec.self_attention,
         variables,
         "%s/self_attention" % scope,
         self_attention=True,
-        relative=relative,
     )
-    set_multi_head_attention_v2(
-        spec.attention, variables, "%s/attention/0" % scope, relative=relative
-    )
+    set_multi_head_attention_v2(spec.attention, variables, "%s/attention/0" % scope)
 
 
 def set_ffn_v2(spec, variables, scope):
@@ -581,9 +568,7 @@ def set_ffn_v2(spec, variables, scope):
     set_linear(spec.linear_1, variables, "%s/layer/outer" % scope)
 
 
-def set_multi_head_attention_v2(
-    spec, variables, scope, self_attention=False, relative=False
-):
+def set_multi_head_attention_v2(spec, variables, scope, self_attention=False):
     try:
         set_layer_norm(spec.layer_norm, variables, "%s/input_layer_norm" % scope)
     except KeyError:
@@ -594,7 +579,7 @@ def set_multi_head_attention_v2(
         set_linear(split_layers[1], variables, "%s/layer/linear_keys" % scope)
         set_linear(split_layers[2], variables, "%s/layer/linear_values" % scope)
         utils.fuse_linear(spec.linear[0], split_layers)
-        if relative:
+        if spec.relative_position_keys is None:
             spec.relative_position_keys = variables[
                 "%s/layer/relative_position_keys" % scope
             ]
17 changes: 10 additions & 7 deletions python/ctranslate2/specs/attention_spec.py
@@ -2,12 +2,15 @@
 
 
 class MultiHeadAttentionSpec(model_spec.LayerSpec):
-    def __init__(self, self_attention=False):
+    def __init__(self, self_attention=False, relative_position=False):
         self.layer_norm = common_spec.LayerNormSpec()
-        if self_attention:
-            num_projections = 2
+        self.linear = [
+            common_spec.LinearSpec() for _ in range(2 if self_attention else 3)
+        ]
+
+        if relative_position:
+            self.relative_position_keys = None
+            self.relative_position_values = None
         else:
-            num_projections = 3
-        self.linear = [common_spec.LinearSpec() for _ in range(num_projections)]
-        self.relative_position_keys = model_spec.OPTIONAL
-        self.relative_position_values = model_spec.OPTIONAL
+            self.relative_position_keys = model_spec.OPTIONAL
+            self.relative_position_values = model_spec.OPTIONAL
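The None/OPTIONAL distinction is what the commit title is about: in CTranslate2's model specs, an attribute left as `None` is treated as a required variable that the converter must fill, while `model_spec.OPTIONAL` marks a weight that may legitimately be absent. The hypothetical `validate` walk below only illustrates that contract under those assumptions; it is not the real `model_spec` implementation.

```python
# Illustrative contract, not the real model_spec code.
OPTIONAL = object()


class LayerSpec:
    def validate(self):
        for name, value in list(vars(self).items()):
            if value is OPTIONAL:
                delattr(self, name)  # optional weight: silently dropped
            elif value is None:
                raise ValueError("required variable %s was never set" % name)


class MultiHeadAttentionSpec(LayerSpec):
    def __init__(self, relative_position=False):
        if relative_position:
            self.relative_position_keys = None  # the converter must fill this
        else:
            self.relative_position_keys = OPTIONAL


spec = MultiHeadAttentionSpec(relative_position=True)
try:
    spec.validate()
except ValueError as error:
    # Conversion now fails loudly instead of silently dropping a required weight.
    print(error)
```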
26 changes: 20 additions & 6 deletions python/ctranslate2/specs/transformer_spec.py
@@ -60,11 +60,13 @@ def __init__(
             pre_norm=pre_norm,
             num_source_embeddings=num_source_embeddings,
             layernorm_embedding=layernorm_embedding,
+            relative_position=with_relative_position,
         )
         self.decoder = TransformerDecoderSpec(
             num_decoder_layers,
             pre_norm=pre_norm,
             layernorm_embedding=layernorm_embedding,
+            relative_position=with_relative_position,
         )
         super().__init__(
             source_embeddings_specs=self.encoder.embeddings,
@@ -137,6 +139,7 @@ def __init__(
         pre_norm=True,
         num_source_embeddings=1,
         layernorm_embedding=False,
+        relative_position=False,
     ):
         self.embeddings = [
             common_spec.EmbeddingsSpec() for _ in range(num_source_embeddings)
@@ -149,7 +152,10 @@ def __init__(
         self.layernorm_embedding = (
             common_spec.LayerNormSpec() if layernorm_embedding else model_spec.OPTIONAL
         )
-        self.layer = [TransformerEncoderLayerSpec() for _ in range(num_layers)]
+        self.layer = [
+            TransformerEncoderLayerSpec(relative_position=relative_position)
+            for _ in range(num_layers)
+        ]
 
 
 class TransformerDecoderSpec(model_spec.LayerSpec):
@@ -161,6 +167,7 @@ def __init__(
         with_encoder_attention=True,
         no_final_norm=False,
         project_in_out=False,
+        relative_position=False,
     ):
         self.embeddings = common_spec.EmbeddingsSpec()
         self.scale_embeddings = True
@@ -175,7 +182,10 @@ def __init__(
         )
         self.projection = common_spec.LinearSpec()
         self.layer = [
-            TransformerDecoderLayerSpec(with_encoder_attention=with_encoder_attention)
+            TransformerDecoderLayerSpec(
+                with_encoder_attention=with_encoder_attention,
+                relative_position=relative_position,
+            )
             for _ in range(num_layers)
         ]
         self.start_from_zero_embedding = False
@@ -189,14 +199,18 @@ def __init__(
 
 
 class TransformerEncoderLayerSpec(model_spec.LayerSpec):
-    def __init__(self):
-        self.self_attention = attention_spec.MultiHeadAttentionSpec(self_attention=True)
+    def __init__(self, relative_position=False):
+        self.self_attention = attention_spec.MultiHeadAttentionSpec(
+            self_attention=True, relative_position=relative_position
+        )
         self.ffn = FeedForwardSpec()
 
 
 class TransformerDecoderLayerSpec(model_spec.LayerSpec):
-    def __init__(self, with_encoder_attention=True):
-        self.self_attention = attention_spec.MultiHeadAttentionSpec(self_attention=True)
+    def __init__(self, with_encoder_attention=True, relative_position=False):
+        self.self_attention = attention_spec.MultiHeadAttentionSpec(
+            self_attention=True, relative_position=relative_position
+        )
         self.attention = (
             attention_spec.MultiHeadAttentionSpec()
             if with_encoder_attention
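With these constructor changes, `with_relative_position` set on the top-level `TransformerSpec` now reaches every self-attention block in the spec tree. The simplified classes below mirror that propagation; the argument lists are trimmed to the relevant flag and are not the full CTranslate2 signatures.

```python
# Simplified mirror of the propagation: TransformerSpec -> encoder spec -> layer -> attention.
class MultiHeadAttentionSpec:
    def __init__(self, self_attention=False, relative_position=False):
        # None = required relative position weights, as in attention_spec.py above.
        self.relative_position_keys = None if relative_position else "OPTIONAL"


class TransformerEncoderLayerSpec:
    def __init__(self, relative_position=False):
        self.self_attention = MultiHeadAttentionSpec(
            self_attention=True, relative_position=relative_position
        )


class TransformerEncoderSpec:
    def __init__(self, num_layers, relative_position=False):
        self.layer = [
            TransformerEncoderLayerSpec(relative_position=relative_position)
            for _ in range(num_layers)
        ]


encoder = TransformerEncoderSpec(6, relative_position=True)
assert all(
    layer.self_attention.relative_position_keys is None for layer in encoder.layer
)
```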
