Commit 12ce0f0 (1 parent: bc3dcdc).
Showing 5 changed files with 354 additions and 0 deletions.
@@ -0,0 +1,5 @@
model_checkpoint_path: "model.ckpt-1003900"
all_model_checkpoint_paths: "model.ckpt-1000900"
all_model_checkpoint_paths: "model.ckpt-1001900"
all_model_checkpoint_paths: "model.ckpt-1002900"
all_model_checkpoint_paths: "model.ckpt-1003900"
Binary file not shown.
Binary file not shown.
Binary file not shown.
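The five-line text file above is a standard TensorFlow CheckpointState file, and the three binary files not shown are presumably the checkpoint data it points at. A minimal sketch of how a trainer resolves the newest checkpoint from such a file; the directory path is illustrative, borrowed from the init_checkpoint macro in the config below:

import tensorflow as tf

# Directory assumed to contain the `checkpoint` file shown above (path taken
# from init_checkpoint in the gin config below, purely for illustration).
ckpt_dir = "gs://t5-zahraa/models/docs.query"

state = tf.train.get_checkpoint_state(ckpt_dir)    # parses the text proto above
print(state.model_checkpoint_path)                 # newest: model.ckpt-1003900
print(list(state.all_model_checkpoint_paths))      # the four retained checkpoints

# Shortcut that returns only the newest checkpoint prefix.
print(tf.train.latest_checkpoint(ckpt_dir))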
@@ -0,0 +1,349 @@
import mesh_tensorflow.optimize
import mesh_tensorflow.transformer.dataset as mesh_tensorflow2
import mesh_tensorflow.transformer.learning_rate_schedules as mesh_tensorflow3
import mesh_tensorflow.transformer.t2t_vocabulary as mesh_tensorflow4
import mesh_tensorflow.transformer.transformer as mesh_tensorflow5
import mesh_tensorflow.transformer.transformer_layers as mesh_tensorflow6
import mesh_tensorflow.transformer.utils as mesh_tensorflow7
import t5.models.mesh_transformer

# Macros:
# ==============================================================================
d_ff = 3072
d_kv = 64
d_model = 768
dropout_rate = 0.1
init_checkpoint = 'gs://t5-zahraa/models/docs.query/model.ckpt-1002900'
MIXTURE_NAME = 'all_mix'
num_heads = 12
num_layers = 12
tokens_per_batch = 131072
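# Note: the values above are gin macros, referenced below as %name. The
# geometry (d_model = 768, d_ff = 3072, 12 layers, 12 heads, d_kv = 64) is
# that of a T5-Base-sized model; init_checkpoint resumes from step 1002900
# while run.train_steps below stops at 1003900, i.e. 1000 further steps.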

# Parameters for adafactor_decay_rate_pow:
# ==============================================================================
adafactor_decay_rate_pow.exponent = 0.8
adafactor_decay_rate_pow.offset = 0

# Parameters for AdafactorOptimizer:
# ==============================================================================
AdafactorOptimizer.beta1 = 0.0
AdafactorOptimizer.clipping_threshold = 1.0
AdafactorOptimizer.decay_rate = None
AdafactorOptimizer.epsilon1 = 1e-30
AdafactorOptimizer.epsilon2 = 0.001
AdafactorOptimizer.exclude_from_parameter_scale = None
AdafactorOptimizer.factored = True
AdafactorOptimizer.min_dim_size_to_factor = 128
AdafactorOptimizer.multiply_by_parameter_scale = True
AdafactorOptimizer.stacked_dim_names = None
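# Note: with decay_rate = None, the mesh_tensorflow Adafactor implementation
# presumably defers to adafactor_decay_rate_pow (exponent 0.8 above) for the
# second-moment decay schedule, which is why that function appears in this
# operative config.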

# Parameters for Bitransformer:
# ==============================================================================
Bitransformer.shared_embedding = True

# Parameters for constant_learning_rate:
# ==============================================================================
constant_learning_rate.learning_rate = 0.001
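# Note: run.learning_rate_schedule below points at constant_learning_rate, so
# training uses a flat learning rate of 0.001, the usual T5 fine-tuning value.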

# Parameters for decoder/DenseReluDense:
# ==============================================================================
decoder/DenseReluDense.activation = 'relu'
decoder/DenseReluDense.dropout_rate = %dropout_rate
decoder/DenseReluDense.hidden_size = %d_ff
decoder/DenseReluDense.use_bias = False

# Parameters for encoder/DenseReluDense:
# ==============================================================================
encoder/DenseReluDense.activation = 'relu'
encoder/DenseReluDense.dropout_rate = %dropout_rate
encoder/DenseReluDense.hidden_size = %d_ff
encoder/DenseReluDense.use_bias = False

# Parameters for enc_dec_attention:
# ==============================================================================
# None.

# Parameters for enc_dec_attention_bias:
# ==============================================================================
# None.

# Parameters for decoder/EncDecAttention:
# ==============================================================================
decoder/EncDecAttention.relative_attention_type = None

# Parameters for get_variable_dtype:
# ==============================================================================
get_variable_dtype.activation_dtype = 'bfloat16'

# Parameters for get_vocab_embedding_cls:
# ==============================================================================
# None.

# Parameters for get_vocabulary:
# ==============================================================================
get_vocabulary.mixture_or_task_name = %MIXTURE_NAME

# Parameters for init_checkpoint_variable_mapping:
# ==============================================================================
init_checkpoint_variable_mapping.mapping_fn = None

# Parameters for decoder/LayerStack:
# ==============================================================================
decoder/LayerStack.dropout_rate = %dropout_rate
decoder/LayerStack.norm_epsilon = 1e-06
decoder/LayerStack.recompute_grads = False
decoder/LayerStack.sublayers_final = \
    [@transformer.sublayer_rms_norm, @transformer.sublayer_dropout]
decoder/LayerStack.sublayers_initial = [@transformer.sublayer_dropout]
decoder/LayerStack.sublayers_per_layer = \
    [@transformer.sublayer_rms_norm,
     @transformer.sublayer_call_layer,
     @transformer.sublayer_dropout,
     @transformer.sublayer_residual]

# Parameters for encoder/LayerStack:
# ==============================================================================
encoder/LayerStack.dropout_rate = %dropout_rate
encoder/LayerStack.norm_epsilon = 1e-06
encoder/LayerStack.recompute_grads = False
encoder/LayerStack.sublayers_final = \
    [@transformer.sublayer_rms_norm, @transformer.sublayer_dropout]
encoder/LayerStack.sublayers_initial = [@transformer.sublayer_dropout]
encoder/LayerStack.sublayers_per_layer = \
    [@transformer.sublayer_rms_norm,
     @transformer.sublayer_call_layer,
     @transformer.sublayer_dropout,
     @transformer.sublayer_residual]
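# Note: both stacks follow the pre-norm residual pattern: each layer is
# wrapped as rms_norm -> layer -> dropout -> residual, with a final
# rms_norm + dropout after the last layer.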

# Parameters for make_bitransformer:
# ==============================================================================
make_bitransformer.decoder_name = 'decoder'
make_bitransformer.encoder_name = 'encoder'

# Parameters for decoder/make_layer_stack:
# ==============================================================================
decoder/make_layer_stack.block_scope = True
decoder/make_layer_stack.layers = \
    [@mesh_tensorflow.transformer.transformer_layers.SelfAttention,
     @mesh_tensorflow.transformer.transformer_layers.EncDecAttention,
     @mesh_tensorflow.transformer.transformer_layers.DenseReluDense]
decoder/make_layer_stack.num_layers = %num_layers

# Parameters for encoder/make_layer_stack:
# ==============================================================================
encoder/make_layer_stack.block_scope = True
encoder/make_layer_stack.layers = \
    [@mesh_tensorflow.transformer.transformer_layers.SelfAttention,
     @mesh_tensorflow.transformer.transformer_layers.DenseReluDense]
encoder/make_layer_stack.num_layers = %num_layers

# Parameters for pack_dataset:
# ==============================================================================
pack_dataset.use_custom_ops = False

# Parameters for pack_or_pad:
# ==============================================================================
pack_or_pad.ensure_eos = False
pack_or_pad.feature_keys = None
pack_or_pad.pack = True

# Parameters for packed_parallel_tsv_dataset:
# ==============================================================================
packed_parallel_tsv_dataset.batch_size = None
packed_parallel_tsv_dataset.max_encoded_len = 0

# Parameters for rewrite_stack_variables:
# ==============================================================================
rewrite_stack_variables.max_combined_variable_size = 536870912

# Parameters for run:
# ==============================================================================
run.autostack = True
run.batch_size = ('tokens_per_batch', %tokens_per_batch)
run.checkpoint_input_pipeline = False
run.dataset_split = 'train'
run.ensemble_inputs = None
run.eval_checkpoint_step = None
run.eval_dataset_fn = None
run.eval_dir_suffix = None
run.eval_summary_dir = None
run.export_checkpoint_step = None
run.export_path = ''
run.init_checkpoint = %init_checkpoint
run.iterations_per_loop = 100
run.keep_checkpoint_max = None
run.layout_rules = \
    'ensemble:ensemble,batch:batch,d_ff:model,heads:model,vocab:model,experts:batch'
run.learning_rate_schedule = @learning_rate_schedules.constant_learning_rate
run.mesh_devices = None
run.mesh_shape = @mesh_tensorflow.transformer.utils.tpu_mesh_shape()
run.mode = 'train'
run.model_type = 'bitransformer'
run.optimizer = @optimize.AdafactorOptimizer
run.output_eval_examples = True
run.perplexity_eval_steps = 10
run.predict_fn = None
run.save_checkpoints_steps = 2400
run.seen_data_init_step = 0
run.sequence_length = {'inputs': 2048, 'targets': 32}
run.skip_seen_data = False
run.total_run_steps = None
run.train_dataset_fn = @t5.models.mesh_transformer.tsv_dataset_fn
run.train_steps = 1003900
run.variable_filter = None
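# Note: batch_size is specified in tokens; 131072 tokens with 2048-token
# packed inputs corresponds to 64 sequences per batch. train_steps = 1003900
# is only 1000 steps past the 1002900-step init_checkpoint, so the
# model.ckpt-1003900 committed above is presumably the end-of-training save
# rather than one of the every-2400-steps saves.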

# Parameters for decoder/SelfAttention:
# ==============================================================================
decoder/SelfAttention.attention_func = None
decoder/SelfAttention.attention_kwargs = None
decoder/SelfAttention.combine_dims = True
decoder/SelfAttention.dropout_rate = %dropout_rate
decoder/SelfAttention.fold_scaling_into_initializer = True
decoder/SelfAttention.hyperprompt_hidden_dim = None
decoder/SelfAttention.hyperprompt_length_decoder = None
decoder/SelfAttention.hyperprompt_length_encoder = None
decoder/SelfAttention.hyperprompt_mtlshare = False
decoder/SelfAttention.hyperprompt_task_num = 8
decoder/SelfAttention.keep_query_heads_dims = False
decoder/SelfAttention.key_value_size = %d_kv
decoder/SelfAttention.num_heads = %num_heads
decoder/SelfAttention.num_memory_heads = 0
decoder/SelfAttention.relative_attention_num_buckets = 32
decoder/SelfAttention.relative_attention_type = 'bias_shared'
decoder/SelfAttention.shared_kv = False
decoder/SelfAttention.use_hyperprompt = False
decoder/SelfAttention.z_loss_coeff = None

# Parameters for encoder/SelfAttention:
# ==============================================================================
encoder/SelfAttention.attention_func = None
encoder/SelfAttention.attention_kwargs = None
encoder/SelfAttention.combine_dims = True
encoder/SelfAttention.dropout_rate = %dropout_rate
encoder/SelfAttention.fold_scaling_into_initializer = True
encoder/SelfAttention.hyperprompt_hidden_dim = None
encoder/SelfAttention.hyperprompt_length_decoder = None
encoder/SelfAttention.hyperprompt_length_encoder = None
encoder/SelfAttention.hyperprompt_mtlshare = False
encoder/SelfAttention.hyperprompt_task_num = 8
encoder/SelfAttention.keep_query_heads_dims = False
encoder/SelfAttention.key_value_size = %d_kv
encoder/SelfAttention.num_heads = %num_heads
encoder/SelfAttention.num_memory_heads = 0
encoder/SelfAttention.relative_attention_num_buckets = 32
encoder/SelfAttention.relative_attention_type = 'bias_shared'
encoder/SelfAttention.shared_kv = False
encoder/SelfAttention.use_hyperprompt = False
encoder/SelfAttention.z_loss_coeff = None

# Parameters for serialize_num_microbatches:
# ==============================================================================
serialize_num_microbatches.tokens_per_microbatch_per_replica = 8192
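# Note: relative_attention_type = 'bias_shared' is the T5-style learned
# relative-position bias shared across layers of each stack.
# tokens_per_microbatch_per_replica = 8192 splits the per-replica batch into
# microbatches (gradient accumulation) whenever it exceeds that many tokens.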

# Parameters for should_load_variable:
# ==============================================================================
should_load_variable.filter_fn = None

# Parameters for SimdMeshImpl:
# ==============================================================================
SimdMeshImpl.allreduce_in_bfloat16_max_group_size = 8

# Parameters for sublayer_call_layer:
# ==============================================================================
# None.

# Parameters for sublayer_dropout:
# ==============================================================================
# None.

# Parameters for sublayer_legacy_dropout:
# ==============================================================================
# None.

# Parameters for sublayer_legacy_final_rms_norm:
# ==============================================================================
# None.

# Parameters for sublayer_legacy_rms_norm:
# ==============================================================================
# None.

# Parameters for sublayer_mask_padding:
# ==============================================================================
# None.

# Parameters for sublayer_residual:
# ==============================================================================
# None.

# Parameters for sublayer_rms_norm:
# ==============================================================================
sublayer_rms_norm.epsilon = 1e-06

# Parameters for tpu_estimator_model_fn:
# ==============================================================================
tpu_estimator_model_fn.hierarchical_tiling_spec = None
tpu_estimator_model_fn.init_variable_filter = ''
tpu_estimator_model_fn.model_info_file = None
tpu_estimator_model_fn.outer_batch_size = 1
tpu_estimator_model_fn.tpu_summaries = False
tpu_estimator_model_fn.weight_decay_checkpoint = None

# Parameters for tpu_mesh_shape:
# ==============================================================================
tpu_mesh_shape.ensemble_parallelism = None
tpu_mesh_shape.model_parallelism = 1
tpu_mesh_shape.tpu_topology = 'v3-8'
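# Note: with tpu_topology = 'v3-8' and model_parallelism = 1, the resulting
# mesh should be purely data-parallel across the 8 TPU cores; the model
# dimension in the layout rules above is effectively unused.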

# Parameters for tsv_dataset_fn:
# ==============================================================================
tsv_dataset_fn.filename = 'gs://t5-zahraa/orcas/docs.query.document.train.tsv'
tsv_dataset_fn.shuffle_buffer_size = 10000
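# Note: tsv_dataset_fn reads tab-separated (input, target) pairs; together
# with sequence_length = {'inputs': 2048, 'targets': 32} above, this looks
# like a doc2query-style setup on ORCAS data (long documents in, short
# queries out), though the file itself is not part of this commit.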

# Parameters for unit_scaling_convention:
# ==============================================================================
unit_scaling_convention.value = False

# Parameters for decoder/Unitransformer:
# ==============================================================================
decoder/Unitransformer.d_model = %d_model
decoder/Unitransformer.ensemble = None
decoder/Unitransformer.input_full_attention = False
decoder/Unitransformer.label_smoothing = 0.0
decoder/Unitransformer.loss_denominator = 233472
decoder/Unitransformer.loss_fn = None
decoder/Unitransformer.loss_on_targets_only = False
decoder/Unitransformer.max_length = 512
decoder/Unitransformer.positional_embedding = False
decoder/Unitransformer.shared_embedding_and_softmax_weights = True
decoder/Unitransformer.sinusoid_positional_embedding = False
decoder/Unitransformer.token_dropout_rate = 0.0
decoder/Unitransformer.vocab_divisor = 128
decoder/Unitransformer.z_loss = 0.0001

# Parameters for encoder/Unitransformer:
# ==============================================================================
encoder/Unitransformer.d_model = %d_model
encoder/Unitransformer.ensemble = None
encoder/Unitransformer.input_full_attention = False
encoder/Unitransformer.label_smoothing = 0.0
encoder/Unitransformer.loss_denominator = None
encoder/Unitransformer.loss_fn = None
encoder/Unitransformer.loss_on_targets_only = False
encoder/Unitransformer.max_length = 512
encoder/Unitransformer.positional_embedding = False
encoder/Unitransformer.shared_embedding_and_softmax_weights = True
encoder/Unitransformer.sinusoid_positional_embedding = False
encoder/Unitransformer.token_dropout_rate = 0.0
encoder/Unitransformer.vocab_divisor = 128
encoder/Unitransformer.z_loss = 0.0001
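# Note: positional_embedding = False is expected here, since position
# information comes from the shared relative attention biases configured
# above; embedding and softmax weights are tied (and shared between encoder
# and decoder via Bitransformer.shared_embedding = True), and
# vocab_divisor = 128 pads the vocabulary size to a multiple of 128.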

# Parameters for VarianceScalingInitializer:
# ==============================================================================
VarianceScalingInitializer.distribution = 'normal'
VarianceScalingInitializer.mode = 'fan_in'
VarianceScalingInitializer.scale = 1.0

# Parameters for VocabEmbedding:
# ==============================================================================
VocabEmbedding.scale_variable_like_classifier_weights = False
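The file above is a gin "operative config": the flat dump of every binding that was active for this run, using %name macros and encoder/ and decoder/ scopes. A minimal, self-contained sketch of how those two mechanisms resolve, using only the gin-config package; the function name and the bound values are made up for illustration and are not part of this repository:

import gin


@gin.configurable
def dense_relu_dense(hidden_size=gin.REQUIRED, dropout_rate=0.0):
    # Stand-in for a configurable layer constructor; it just echoes its arguments.
    return hidden_size, dropout_rate


# Macros are plain `name = value` lines referenced as %name; the `encoder/`
# and `decoder/` prefixes are gin scopes, so the same function can receive
# different bindings depending on the active scope.
gin.parse_config("""
d_ff = 3072
dropout_rate = 0.1
encoder/dense_relu_dense.hidden_size = %d_ff
encoder/dense_relu_dense.dropout_rate = %dropout_rate
decoder/dense_relu_dense.hidden_size = 1024
""")

with gin.config_scope('encoder'):
    print(dense_relu_dense())   # (3072, 0.1)

with gin.config_scope('decoder'):
    print(dense_relu_dense())   # (1024, 0.0): dropout_rate keeps its default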