Commit

Add files via upload

ZahraTaherikhonakdar authored Nov 6, 2024
1 parent bc3dcdc commit 12ce0f0
Showing 5 changed files with 354 additions and 0 deletions.
5 changes: 5 additions & 0 deletions checkpoint
@@ -0,0 +1,5 @@
model_checkpoint_path: "model.ckpt-1003900"
all_model_checkpoint_paths: "model.ckpt-1000900"
all_model_checkpoint_paths: "model.ckpt-1001900"
all_model_checkpoint_paths: "model.ckpt-1002900"
all_model_checkpoint_paths: "model.ckpt-1003900"
Binary file added model.ckpt-1003900.data-00000-of-00002
Binary file not shown.
Binary file added model.ckpt-1003900.index
Binary file not shown.
Binary file added model.ckpt-1003900.meta
Binary file not shown.
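
The checkpoint text file above is TensorFlow's checkpoint-state index: it records the most recent checkpoint (model.ckpt-1003900) and the older checkpoints still on disk, while the binary files hold the actual payload (the .data-* shards store variable values, .index maps variable names to shard offsets, and .meta stores the serialized TF1 graph). A minimal sketch of resolving and inspecting the checkpoint with the standard TensorFlow API, assuming a local clone of this directory:

    import tensorflow as tf

    # Reads the checkpoint state file in the given directory and returns the
    # prefix of the newest checkpoint, here ".../model.ckpt-1003900".
    latest = tf.train.latest_checkpoint(".")  # path to the local clone (assumed)

    # Inspect the saved variables without restoring a model.
    reader = tf.train.load_checkpoint(latest)
    for name, shape in reader.get_variable_to_shape_map().items():
        print(name, shape)
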
349 changes: 349 additions & 0 deletions operative_config.gin
@@ -0,0 +1,349 @@
import mesh_tensorflow.optimize
import mesh_tensorflow.transformer.dataset as mesh_tensorflow2
import mesh_tensorflow.transformer.learning_rate_schedules as mesh_tensorflow3
import mesh_tensorflow.transformer.t2t_vocabulary as mesh_tensorflow4
import mesh_tensorflow.transformer.transformer as mesh_tensorflow5
import mesh_tensorflow.transformer.transformer_layers as mesh_tensorflow6
import mesh_tensorflow.transformer.utils as mesh_tensorflow7
import t5.models.mesh_transformer

# Macros:
# ==============================================================================
d_ff = 3072
d_kv = 64
d_model = 768
dropout_rate = 0.1
init_checkpoint = 'gs://t5-zahraa/models/docs.query/model.ckpt-1002900'
MIXTURE_NAME = 'all_mix'
num_heads = 12
num_layers = 12
tokens_per_batch = 131072
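# Note: the model dimensions above (d_model=768, d_ff=3072, d_kv=64,
# num_heads=12, num_layers=12) match the published T5-Base architecture;
# the mixture name, batch size, and init_checkpoint are specific to this
# fine-tuning run.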

# Parameters for adafactor_decay_rate_pow:
# ==============================================================================
adafactor_decay_rate_pow.exponent = 0.8
adafactor_decay_rate_pow.offset = 0
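# In mesh_tensorflow.optimize this yields a second-moment decay rate of
# approximately 1 - (step + 1)^(-0.8), so the averaging window for the
# second-moment estimate grows as training progresses.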

# Parameters for AdafactorOptimizer:
# ==============================================================================
AdafactorOptimizer.beta1 = 0.0
AdafactorOptimizer.clipping_threshold = 1.0
AdafactorOptimizer.decay_rate = None
AdafactorOptimizer.epsilon1 = 1e-30
AdafactorOptimizer.epsilon2 = 0.001
AdafactorOptimizer.exclude_from_parameter_scale = None
AdafactorOptimizer.factored = True
AdafactorOptimizer.min_dim_size_to_factor = 128
AdafactorOptimizer.multiply_by_parameter_scale = True
AdafactorOptimizer.stacked_dim_names = None

# Parameters for Bitransformer:
# ==============================================================================
Bitransformer.shared_embedding = True

# Parameters for constant_learning_rate:
# ==============================================================================
constant_learning_rate.learning_rate = 0.001

# Parameters for decoder/DenseReluDense:
# ==============================================================================
decoder/DenseReluDense.activation = 'relu'
decoder/DenseReluDense.dropout_rate = %dropout_rate
decoder/DenseReluDense.hidden_size = %d_ff
decoder/DenseReluDense.use_bias = False

# Parameters for encoder/DenseReluDense:
# ==============================================================================
encoder/DenseReluDense.activation = 'relu'
encoder/DenseReluDense.dropout_rate = %dropout_rate
encoder/DenseReluDense.hidden_size = %d_ff
encoder/DenseReluDense.use_bias = False

# Parameters for enc_dec_attention:
# ==============================================================================
# None.

# Parameters for enc_dec_attention_bias:
# ==============================================================================
# None.

# Parameters for decoder/EncDecAttention:
# ==============================================================================
decoder/EncDecAttention.relative_attention_type = None

# Parameters for get_variable_dtype:
# ==============================================================================
get_variable_dtype.activation_dtype = 'bfloat16'

# Parameters for get_vocab_embedding_cls:
# ==============================================================================
# None.

# Parameters for get_vocabulary:
# ==============================================================================
get_vocabulary.mixture_or_task_name = %MIXTURE_NAME

# Parameters for init_checkpoint_variable_mapping:
# ==============================================================================
init_checkpoint_variable_mapping.mapping_fn = None

# Parameters for decoder/LayerStack:
# ==============================================================================
decoder/LayerStack.dropout_rate = %dropout_rate
decoder/LayerStack.norm_epsilon = 1e-06
decoder/LayerStack.recompute_grads = False
decoder/LayerStack.sublayers_final = \
[@transformer.sublayer_rms_norm, @transformer.sublayer_dropout]
decoder/LayerStack.sublayers_initial = [@transformer.sublayer_dropout]
decoder/LayerStack.sublayers_per_layer = \
[@transformer.sublayer_rms_norm,
@transformer.sublayer_call_layer,
@transformer.sublayer_dropout,
@transformer.sublayer_residual]

# Parameters for encoder/LayerStack:
# ==============================================================================
encoder/LayerStack.dropout_rate = %dropout_rate
encoder/LayerStack.norm_epsilon = 1e-06
encoder/LayerStack.recompute_grads = False
encoder/LayerStack.sublayers_final = \
[@transformer.sublayer_rms_norm, @transformer.sublayer_dropout]
encoder/LayerStack.sublayers_initial = [@transformer.sublayer_dropout]
encoder/LayerStack.sublayers_per_layer = \
[@transformer.sublayer_rms_norm,
@transformer.sublayer_call_layer,
@transformer.sublayer_dropout,
@transformer.sublayer_residual]
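# The sublayer ordering above is the pre-norm residual pattern used by T5:
# each layer computes y = x + dropout(layer(rms_norm(x))), and the stack
# output gets a final RMS norm and dropout.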

# Parameters for make_bitransformer:
# ==============================================================================
make_bitransformer.decoder_name = 'decoder'
make_bitransformer.encoder_name = 'encoder'

# Parameters for decoder/make_layer_stack:
# ==============================================================================
decoder/make_layer_stack.block_scope = True
decoder/make_layer_stack.layers = \
[@mesh_tensorflow.transformer.transformer_layers.SelfAttention,
@mesh_tensorflow.transformer.transformer_layers.EncDecAttention,
@mesh_tensorflow.transformer.transformer_layers.DenseReluDense]
decoder/make_layer_stack.num_layers = %num_layers

# Parameters for encoder/make_layer_stack:
# ==============================================================================
encoder/make_layer_stack.block_scope = True
encoder/make_layer_stack.layers = \
[@mesh_tensorflow.transformer.transformer_layers.SelfAttention,
@mesh_tensorflow.transformer.transformer_layers.DenseReluDense]
encoder/make_layer_stack.num_layers = %num_layers

# Parameters for pack_dataset:
# ==============================================================================
pack_dataset.use_custom_ops = False

# Parameters for pack_or_pad:
# ==============================================================================
pack_or_pad.ensure_eos = False
pack_or_pad.feature_keys = None
pack_or_pad.pack = True

# Parameters for packed_parallel_tsv_dataset:
# ==============================================================================
packed_parallel_tsv_dataset.batch_size = None
packed_parallel_tsv_dataset.max_encoded_len = 0

# Parameters for rewrite_stack_variables:
# ==============================================================================
rewrite_stack_variables.max_combined_variable_size = 536870912

# Parameters for run:
# ==============================================================================
run.autostack = True
run.batch_size = ('tokens_per_batch', %tokens_per_batch)
run.checkpoint_input_pipeline = False
run.dataset_split = 'train'
run.ensemble_inputs = None
run.eval_checkpoint_step = None
run.eval_dataset_fn = None
run.eval_dir_suffix = None
run.eval_summary_dir = None
run.export_checkpoint_step = None
run.export_path = ''
run.init_checkpoint = %init_checkpoint
run.iterations_per_loop = 100
run.keep_checkpoint_max = None
run.layout_rules = \
'ensemble:ensemble,batch:batch,d_ff:model,heads:model,vocab:model,experts:batch'
run.learning_rate_schedule = @learning_rate_schedules.constant_learning_rate
run.mesh_devices = None
run.mesh_shape = @mesh_tensorflow.transformer.utils.tpu_mesh_shape()
run.mode = 'train'
run.model_type = 'bitransformer'
run.optimizer = @optimize.AdafactorOptimizer
run.output_eval_examples = True
run.perplexity_eval_steps = 10
run.predict_fn = None
run.save_checkpoints_steps = 2400
run.seen_data_init_step = 0
run.sequence_length = {'inputs': 2048, 'targets': 32}
run.skip_seen_data = False
run.total_run_steps = None
run.train_dataset_fn = @t5.models.mesh_transformer.tsv_dataset_fn
run.train_steps = 1003900
run.variable_filter = None
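# With batch_size = ('tokens_per_batch', 131072) and packed 2048-token inputs,
# each batch holds roughly 131072 / 2048 = 64 sequences. train_steps = 1003900
# runs 1000 steps past the init_checkpoint (step 1002900); the 2048-token
# inputs and 32-token targets fit a document-to-query generation task.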

# Parameters for decoder/SelfAttention:
# ==============================================================================
decoder/SelfAttention.attention_func = None
decoder/SelfAttention.attention_kwargs = None
decoder/SelfAttention.combine_dims = True
decoder/SelfAttention.dropout_rate = %dropout_rate
decoder/SelfAttention.fold_scaling_into_initializer = True
decoder/SelfAttention.hyperprompt_hidden_dim = None
decoder/SelfAttention.hyperprompt_length_decoder = None
decoder/SelfAttention.hyperprompt_length_encoder = None
decoder/SelfAttention.hyperprompt_mtlshare = False
decoder/SelfAttention.hyperprompt_task_num = 8
decoder/SelfAttention.keep_query_heads_dims = False
decoder/SelfAttention.key_value_size = %d_kv
decoder/SelfAttention.num_heads = %num_heads
decoder/SelfAttention.num_memory_heads = 0
decoder/SelfAttention.relative_attention_num_buckets = 32
decoder/SelfAttention.relative_attention_type = 'bias_shared'
decoder/SelfAttention.shared_kv = False
decoder/SelfAttention.use_hyperprompt = False
decoder/SelfAttention.z_loss_coeff = None

# Parameters for encoder/SelfAttention:
# ==============================================================================
encoder/SelfAttention.attention_func = None
encoder/SelfAttention.attention_kwargs = None
encoder/SelfAttention.combine_dims = True
encoder/SelfAttention.dropout_rate = %dropout_rate
encoder/SelfAttention.fold_scaling_into_initializer = True
encoder/SelfAttention.hyperprompt_hidden_dim = None
encoder/SelfAttention.hyperprompt_length_decoder = None
encoder/SelfAttention.hyperprompt_length_encoder = None
encoder/SelfAttention.hyperprompt_mtlshare = False
encoder/SelfAttention.hyperprompt_task_num = 8
encoder/SelfAttention.keep_query_heads_dims = False
encoder/SelfAttention.key_value_size = %d_kv
encoder/SelfAttention.num_heads = %num_heads
encoder/SelfAttention.num_memory_heads = 0
encoder/SelfAttention.relative_attention_num_buckets = 32
encoder/SelfAttention.relative_attention_type = 'bias_shared'
encoder/SelfAttention.shared_kv = False
encoder/SelfAttention.use_hyperprompt = False
encoder/SelfAttention.z_loss_coeff = None

# Parameters for serialize_num_microbatches:
# ==============================================================================
serialize_num_microbatches.tokens_per_microbatch_per_replica = 8192

# Parameters for should_load_variable:
# ==============================================================================
should_load_variable.filter_fn = None

# Parameters for SimdMeshImpl:
# ==============================================================================
SimdMeshImpl.allreduce_in_bfloat16_max_group_size = 8

# Parameters for sublayer_call_layer:
# ==============================================================================
# None.

# Parameters for sublayer_dropout:
# ==============================================================================
# None.

# Parameters for sublayer_legacy_dropout:
# ==============================================================================
# None.

# Parameters for sublayer_legacy_final_rms_norm:
# ==============================================================================
# None.

# Parameters for sublayer_legacy_rms_norm:
# ==============================================================================
# None.

# Parameters for sublayer_mask_padding:
# ==============================================================================
# None.

# Parameters for sublayer_residual:
# ==============================================================================
# None.

# Parameters for sublayer_rms_norm:
# ==============================================================================
sublayer_rms_norm.epsilon = 1e-06

# Parameters for tpu_estimator_model_fn:
# ==============================================================================
tpu_estimator_model_fn.hierarchical_tiling_spec = None
tpu_estimator_model_fn.init_variable_filter = ''
tpu_estimator_model_fn.model_info_file = None
tpu_estimator_model_fn.outer_batch_size = 1
tpu_estimator_model_fn.tpu_summaries = False
tpu_estimator_model_fn.weight_decay_checkpoint = None

# Parameters for tpu_mesh_shape:
# ==============================================================================
tpu_mesh_shape.ensemble_parallelism = None
tpu_mesh_shape.model_parallelism = 1
tpu_mesh_shape.tpu_topology = 'v3-8'

# Parameters for tsv_dataset_fn:
# ==============================================================================
tsv_dataset_fn.filename = 'gs://t5-zahraa/orcas/docs.query.document.train.tsv'
tsv_dataset_fn.shuffle_buffer_size = 10000
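# tsv_dataset_fn expects one tab-separated "input<TAB>target" example per
# line, so each line of the TSV presumably pairs a document with its query.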

# Parameters for unit_scaling_convention:
# ==============================================================================
unit_scaling_convention.value = False

# Parameters for decoder/Unitransformer:
# ==============================================================================
decoder/Unitransformer.d_model = %d_model
decoder/Unitransformer.ensemble = None
decoder/Unitransformer.input_full_attention = False
decoder/Unitransformer.label_smoothing = 0.0
decoder/Unitransformer.loss_denominator = 233472
decoder/Unitransformer.loss_fn = None
decoder/Unitransformer.loss_on_targets_only = False
decoder/Unitransformer.max_length = 512
decoder/Unitransformer.positional_embedding = False
decoder/Unitransformer.shared_embedding_and_softmax_weights = True
decoder/Unitransformer.sinusoid_positional_embedding = False
decoder/Unitransformer.token_dropout_rate = 0.0
decoder/Unitransformer.vocab_divisor = 128
decoder/Unitransformer.z_loss = 0.0001

# Parameters for encoder/Unitransformer:
# ==============================================================================
encoder/Unitransformer.d_model = %d_model
encoder/Unitransformer.ensemble = None
encoder/Unitransformer.input_full_attention = False
encoder/Unitransformer.label_smoothing = 0.0
encoder/Unitransformer.loss_denominator = None
encoder/Unitransformer.loss_fn = None
encoder/Unitransformer.loss_on_targets_only = False
encoder/Unitransformer.max_length = 512
encoder/Unitransformer.positional_embedding = False
encoder/Unitransformer.shared_embedding_and_softmax_weights = True
encoder/Unitransformer.sinusoid_positional_embedding = False
encoder/Unitransformer.token_dropout_rate = 0.0
encoder/Unitransformer.vocab_divisor = 128
encoder/Unitransformer.z_loss = 0.0001

# Parameters for VarianceScalingInitializer:
# ==============================================================================
VarianceScalingInitializer.distribution = 'normal'
VarianceScalingInitializer.mode = 'fan_in'
VarianceScalingInitializer.scale = 1.0

# Parameters for VocabEmbedding:
# ==============================================================================
VocabEmbedding.scale_variable_like_classifier_weights = False
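
The operative_config.gin above is the complete set of Gin bindings recorded at training time, so the run can be reproduced or extended by replaying it. A minimal sketch using the gin-config package, assuming a local copy of the file; the 'infer' override is illustrative:

    import gin

    # Parse the recorded bindings; the import statements at the top of the
    # file load the mesh_tensorflow modules whose functions are configured.
    gin.parse_config_file("operative_config.gin")

    # Individual bindings can be overridden after parsing, for example to
    # switch the configured run from training to inference.
    gin.parse_config("run.mode = 'infer'")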
