Skip to content

Commit

Permalink
tokens
Browse files Browse the repository at this point in the history
  • Loading branch information
soldni committed Oct 29, 2024
1 parent e897c55 commit b5271a6
Show file tree
Hide file tree
Showing 3 changed files with 33 additions and 1 deletion.
2 changes: 1 addition & 1 deletion configs/peteish-anneal/mix-fw25.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ streams:
# Remove repetitions
- >-
(.attributes.HuggingFaceFW_fineweb_edu_classifier[0][-1] * 0.8) +
(.attributes.random_number_v1__random_number_v1__random[0][-1] * 5 * 0.2) >= 1.7
(.attributes.random_number_v1__random_number_v1__random[0][-1] * 5 * 0.2) >= 2
syntax: jq
span_replacement: []

Expand Down
16 changes: 16 additions & 0 deletions configs/peteish-anneal/tokens-fw25.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Tokenization job config for the dclm/v1_fwEdu25 document set.
# `${oc.env:HOME}` is an OmegaConf interpolation that reads the HOME
# environment variable when the config is resolved.

# Location the tokenized output is written to.
destination: ${oc.env:HOME}/ai2-llm/preprocessed/sources/dclm/v1_fwEdu25/documents/full/allenai/dolma2-tokenizer
# Input documents to tokenize (glob pattern).
documents:
  - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/dclm/v1_fwEdu25/documents/full/*

processes: 128
seed: 3920
# 2**32; presumably a size cap in bytes (4 GiB) -- TODO confirm the unit.
max_size: 4_294_967_296
# The special-token IDs below exceed 65535, so they would not fit in uint16.
dtype: uint32

# Keep these keys indented under `tokenizer`: at column 0 they would parse as
# top-level keys and `tokenizer` itself would be null.
tokenizer:
  name_or_path: allenai/dolma2-tokenizer
  bos_token_id: null  # no BOS token configured
  eos_token_id: 100257
  pad_token_id: 100277
  segment_before_tokenization: false
  encode_special_tokens: true
16 changes: 16 additions & 0 deletions configs/peteish-anneal/tokens-nvidia25.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Tokenization job config for the dclm/v1_nvidia25 document set.
# `${oc.env:HOME}` is an OmegaConf interpolation that reads the HOME
# environment variable when the config is resolved.

# Location the tokenized output is written to.
destination: ${oc.env:HOME}/ai2-llm/preprocessed/sources/dclm/v1_nvidia25/documents/full/allenai/dolma2-tokenizer
# Input documents to tokenize (glob pattern).
documents:
  - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/dclm/v1_nvidia25/documents/full/*

processes: 128
seed: 3920
# 2**32; presumably a size cap in bytes (4 GiB) -- TODO confirm the unit.
max_size: 4_294_967_296
# The special-token IDs below exceed 65535, so they would not fit in uint16.
dtype: uint32

# Keep these keys indented under `tokenizer`: at column 0 they would parse as
# top-level keys and `tokenizer` itself would be null.
tokenizer:
  name_or_path: allenai/dolma2-tokenizer
  bos_token_id: null  # no BOS token configured
  eos_token_id: 100257
  pad_token_id: 100277
  segment_before_tokenization: false
  encode_special_tokens: true

0 comments on commit b5271a6

Please sign in to comment.