From ac0b8390ddad3b583de86689ef6543c2368890fa Mon Sep 17 00:00:00 2001
From: Luca Soldaini
Date: Wed, 23 Oct 2024 23:32:08 -0700
Subject: [PATCH] more configs

---
 classifiers/scripts/fineweb_40b.sh        | 45 +++++++++++++++++++++++
 classifiers/scripts/nvidia-deberta-40b.sh | 45 +++++++++++++++++++++++
 2 files changed, 90 insertions(+)
 create mode 100644 classifiers/scripts/fineweb_40b.sh
 create mode 100644 classifiers/scripts/nvidia-deberta-40b.sh

diff --git a/classifiers/scripts/fineweb_40b.sh b/classifiers/scripts/fineweb_40b.sh
new file mode 100644
index 00000000..4cef5644
--- /dev/null
+++ b/classifiers/scripts/fineweb_40b.sh
@@ -0,0 +1,45 @@
+#! /bin/bash
+
+DOCUMENTS='s3://ai2-llm/pretraining-data/sources/dclm/v0/documents/40b-split/*/*zstd'
+NUM_NODES=1
+MODEL_NAME="HuggingFaceFW/fineweb-edu-classifier"
+CLUSTER="ai2/jupiter*"
+BATCH_SIZE=1024
+PRIORITY="high"
+
+
+# Generate a hash for the run name by combining model name and documents
+RUN_HASH=$(echo -n "${MODEL_NAME}${DOCUMENTS}" | md5sum | awk '{print $1}')
+RUN_NAME="fineweb_classifier_${RUN_HASH:0:8}"
+
+# Set the run name as an environment variable
+export BEAKER_EXPERIMENT_NAME="${RUN_NAME}"
+
+
+gantry run \
+    --task-name "${RUN_NAME}" \
+    --description "Score ${DOCUMENTS} with ${MODEL_NAME}" \
+    --allow-dirty \
+    --workspace ai2/davidw-oe-annealing \
+    --beaker-image 'petew/olmo-torch23-gantry' \
+    --timeout -1 \
+    --show-logs \
+    --host-networking \
+    --venv 'base' \
+    --priority "${PRIORITY}" \
+    --leader-selection \
+    --gpus 8 \
+    --replicas ${NUM_NODES} \
+    --preemptible \
+    --cluster "${CLUSTER}" \
+    --budget ai2/oe-data \
+    --env LOG_FILTER_TYPE=local_rank0_only \
+    --env OMP_NUM_THREADS=8 \
+    --env BEAKER_USER_ID=$(beaker account whoami --format json | jq '.[0].name' -cr) \
+    --env-secret AWS_ACCESS_KEY_ID=lucas-AWS_ACCESS_KEY_ID \
+    --env-secret AWS_SECRET_ACCESS_KEY=lucas-AWS_SECRET_ACCESS_KEY \
+    --env-secret WANDB_API_KEY=lucas-WANDB_API_KEY \
+    --shared-memory 10GiB \
+    --install "pip install -e classifiers/" \
+    --yes \
+    -- /bin/bash -c "huggingface-cli download ${MODEL_NAME} && torchrun --nnodes "${NUM_NODES}:${NUM_NODES}" --nproc-per-node 8 --rdzv_id 12347 --rdzv_backend static --rdzv_endpoint "\${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" --node_rank "\${BEAKER_REPLICA_RANK}" --rdzv_conf 'read_timeout=420' -m dolma_classifiers.inference --source-prefix ${DOCUMENTS} --batch-size ${BATCH_SIZE} --use-wandb --wandb-project 'dolma-classifiers' --wandb-entity ai2-llm --model-name ${MODEL_NAME} --num-workers 4"
diff --git a/classifiers/scripts/nvidia-deberta-40b.sh b/classifiers/scripts/nvidia-deberta-40b.sh
new file mode 100644
index 00000000..101a5cd8
--- /dev/null
+++ b/classifiers/scripts/nvidia-deberta-40b.sh
@@ -0,0 +1,45 @@
+#! /bin/bash
+
+DOCUMENTS='s3://ai2-llm/pretraining-data/sources/dclm/v0/documents/40b-split/*/*zstd'
+
+NUM_NODES=2
+MODEL_NAME="nvidia/quality-classifier-deberta"
+CLUSTER="ai2/jupiter*"
+BATCH_SIZE=512
+PRIORITY="high"
+
+# Generate a hash for the run name by combining model name and documents
+RUN_HASH=$(echo -n "${MODEL_NAME}${DOCUMENTS}" | md5sum | awk '{print $1}')
+RUN_NAME="nvidia_deberta_${RUN_HASH:0:8}"
+
+# Set the run name as an environment variable
+export BEAKER_EXPERIMENT_NAME="${RUN_NAME}"
+
+
+gantry run \
+    --task-name "${RUN_NAME}" \
+    --description "Score ${DOCUMENTS} with ${MODEL_NAME}" \
+    --allow-dirty \
+    --workspace ai2/davidw-oe-annealing \
+    --beaker-image 'petew/olmo-torch23-gantry' \
+    --timeout -1 \
+    --show-logs \
+    --host-networking \
+    --venv 'base' \
+    --priority "${PRIORITY}" \
+    --leader-selection \
+    --gpus 8 \
+    --replicas ${NUM_NODES} \
+    --preemptible \
+    --cluster "${CLUSTER}" \
+    --budget ai2/oe-data \
+    --env LOG_FILTER_TYPE=local_rank0_only \
+    --env OMP_NUM_THREADS=8 \
+    --env BEAKER_USER_ID=$(beaker account whoami --format json | jq '.[0].name' -cr) \
+    --env-secret AWS_ACCESS_KEY_ID=lucas-AWS_ACCESS_KEY_ID \
+    --env-secret AWS_SECRET_ACCESS_KEY=lucas-AWS_SECRET_ACCESS_KEY \
+    --env-secret WANDB_API_KEY=lucas-WANDB_API_KEY \
+    --shared-memory 10GiB \
+    --install "pip install -e classifiers/" \
+    --yes \
+    -- /bin/bash -c "huggingface-cli download ${MODEL_NAME} && torchrun --nnodes "${NUM_NODES}:${NUM_NODES}" --nproc-per-node 8 --rdzv_id 12347 --rdzv_backend static --rdzv_endpoint "\${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" --node_rank "\${BEAKER_REPLICA_RANK}" --rdzv_conf 'read_timeout=420' -m dolma_classifiers.inference --source-prefix ${DOCUMENTS} --batch-size ${BATCH_SIZE} --use-wandb --wandb-project 'dolma-classifiers' --wandb-entity ai2-llm --model-name ${MODEL_NAME} --num-workers 4 --model-compile --max-length 1024"