From 824b63ca5c8c2abe00bab69523e3afefff9844e8 Mon Sep 17 00:00:00 2001
From: Ahmed Ahmed
Date: Mon, 18 Nov 2024 11:06:01 -0800
Subject: [PATCH] add token counter

---
Review notes (text in this section is not part of the commit message):
- The script prints `data_size`; a bare `a.data_size` expression statement
  has its value discarded when the file is run as a script.
- The new file ends with a trailing newline.
- The blob id on the `index` line below was not recomputed for this
  content; regenerate the patch if an exact id is required.

 examples/count_tokens.py | 8 ++++++++
 1 file changed, 8 insertions(+)
 create mode 100644 examples/count_tokens.py

diff --git a/examples/count_tokens.py b/examples/count_tokens.py
new file mode 100644
index 000000000..b6ddf79df
--- /dev/null
+++ b/examples/count_tokens.py
@@ -0,0 +1,8 @@
+"""Print the `data_size` of a tokenized dataset's `input_ids` store."""
+from levanter.store import JaggedArrayStore
+
+# Open the `input_ids` store of the tokenized algebraic-stack train split.
+a = JaggedArrayStore.open("gs://marin-us-central2/tokenized/dolma/algebraic-stack-cc00cf/train/input_ids", dtype=int)
+
+# Presumably the total number of stored token ids -- confirm against JaggedArrayStore.
+print(a.data_size)