
Merge pull request #22 from allenai/soldni/cond_stat
Scripts to compute statistics
soldni authored Aug 18, 2023
2 parents b43ed8a + 9205db2 commit 705d358
Showing 15 changed files with 1,225 additions and 61 deletions.
24 changes: 24 additions & 0 deletions .github/workflows/CI.yml
@@ -8,6 +8,9 @@ on:
tags:
- '*'
pull_request:
branches:
- main
- master
workflow_dispatch:


@@ -16,6 +19,21 @@ permissions:

jobs:

info:
runs-on: ubuntu-latest
steps:
- name: Echo environment variables
run: |
echo "reference: ${{ github.ref }}"
echo "event name: ${{ github.event_name }}"
echo "run tests: ${{ github.event_name == 'pull_request' || github.event_name == 'push' }}"
echo "is main: ${{ github.ref == 'refs/heads/main' || github.ref == 'refs/heads/master' }}"
echo "is release: ${{ startsWith(github.ref, 'refs/tags/') }}"
echo "commit: ${{ github.sha }}"
echo "-------------------------------------------------------------------------"
echo "Full PR details:"
echo "${{ toJson(github.event.pull_request) }}"
tests:
runs-on: ubuntu-latest
env:
@@ -93,7 +111,10 @@ jobs:
source .venv/bin/activate
pip uninstall -y dolma
build-linux:
if: ${{ github.ref == 'refs/heads/main' || github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/tags/') }}
runs-on: ubuntu-latest
env:
CC: gcc-11
@@ -128,6 +149,7 @@ jobs:
path: dist

build-windows:
if: ${{ github.ref == 'refs/heads/main' || github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/tags/') }}
runs-on: windows-latest
strategy:
matrix:
@@ -151,6 +173,7 @@
path: dist

build-macos:
if: ${{ github.ref == 'refs/heads/main' || github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/tags/') }}
runs-on: macos-latest
strategy:
matrix:
@@ -174,6 +197,7 @@

sdist:
runs-on: ubuntu-latest
if: ${{ github.ref == 'refs/heads/main' || github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/tags/') }}
steps:
- uses: actions/checkout@v3
- name: Build sdist
21 changes: 12 additions & 9 deletions README.md
@@ -1,12 +1,13 @@
# Dolma
<img alt="Dolma's official logo. It's dolma written in yellow, round lowercase letters over a blue background." src="https://github.com/allenai/dolma/blob/main/res/logo.png?raw=true" width="100%">

*Data to feed OLMo's Appetite*

<img alt="Dolma's official logo. It's dolma written in yellow, round lowercase letters over a blue background." src="https://github.com/allenai/dolma/blob/main/res/logo.png?raw=true" width="100%">
Dolma is an open dataset of 3 trillion tokens from a diverse mix of web content, academic publications, code, books, and encyclopedic materials.
It was created as a training corpus for [OLMo](https://allenai.org/olmo), AI2's language model.

Data and tools for generating and inspecting OLMo pre-training data.
Dolma is available for download on the HuggingFace 🤗 Hub: [`huggingface.co/datasets/allenai/dolma`](https://huggingface.co/datasets/allenai/dolma). To access Dolma, users must agree to the terms of the [AI2 ImpACT License for Medium Risk Artifacts](https://allenai.org/licenses/impact-mr).
You can also read more about Dolma in [our announcement](https://blog.allenai.org/dolma-3-trillion-tokens-open-llm-corpus-9a0ff4b8da64), as well as by consulting its [data sheet](https://drive.google.com/file/d/12gOf5I5RytsD159nSP7iim_5zN31FCXq/view?usp=drive_link).

To get started, install dolma using [pip](https://pypi.org/project/dolma/).
This repository contains tools for generating and inspecting Dolma. To get started, install the Dolma Python library from [PyPI](https://pypi.org/project/dolma/).

```shell
pip install dolma
@@ -28,7 +29,7 @@ For all commands, configurations can be specified from command line, or by passing…
dolma -c config.yaml dedupe --dedupe.name "test"
```
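The contents of `config.yaml` are not shown here. A minimal sketch, assuming the config keys mirror the dotted CLI overrides (so `--dedupe.name` corresponds to a nested `dedupe.name` key):

```yaml
# Hypothetical config.yaml; the nesting is assumed to mirror the --dedupe.name override above.
dedupe:
  name: test
```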

### `dolma tag`
### The `tag` command

The tag command is used to run any of the built-in taggers on a set of documents. For example:

@@ -44,7 +45,7 @@ dolma tag \

This command will run the `random_number_v1` tagger on all documents in the specified S3 paths. The results will be written to the `s3://ai2-llm/pretraining-data/sources/common-crawl/test/v0/attributes/sample` and `s3://ai2-llm/pretraining-data/sources/common-crawl/test/v1/attributes/sample` paths.
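The invocation itself is collapsed in this diff; based on the description above, it would presumably look something like the sketch below. The document paths, experiment name, and tagger come from the surrounding text, while the exact flag names are assumptions.

```shell
# Sketch only: flag names (--documents, --experiment, --taggers) are assumed;
# consult `dolma tag --help` for the actual interface.
dolma tag \
    --documents \
        's3://ai2-llm/pretraining-data/sources/common-crawl/test/v0/documents/*' \
        's3://ai2-llm/pretraining-data/sources/common-crawl/test/v1/documents/*' \
    --experiment sample \
    --taggers random_number_v1
```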

### `dolma dedupe`
### The `dedupe` command

The dedupe command is used to deduplicate a set of documents at the attribute level using a bloom filter.
For example configurations, see directory `tests/config`. For example:
@@ -53,14 +54,15 @@ For example configurations, see directory `tests/config`. For example:
dolma dedupe -c tests/config/dedupe-paragraphs.json
```
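A minimal sketch of what such a dedupe config might contain, modeled on the `pes2o_decontamination.json` file added later in this diff; the paths and numbers below are placeholders, not values from `tests/config`:

```json
{
  "documents": ["s3://my-bucket/my-dataset/documents/*.gz"],
  "dedupe": {
    "name": "dedupe_paragraphs",
    "paragraphs": {"attribute_name": "bff_duplicate_paragraph_spans"},
    "skip_empty": true
  },
  "bloom_filter": {
    "file": "/tmp/dedupe_bloom_filter.bin",
    "size_in_bytes": 8388608,
    "read_only": false,
    "estimated_doc_count": 1000000,
    "desired_false_positive_rate": 0.001
  },
  "processes": 4
}
```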

### `dolma mix`
### The `mix` command

The mix command is used to mix documents from multiple sources, optionally filtering by attributes and/or performing string replacement. For example configurations, see directory `tests/config`. For example:

```shell
dolma mix -c tests/config/mixer.json
```
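A rough sketch of what a mixer config might contain, assuming a stream-based layout with per-stream inputs, outputs, and attribute sets; the exact schema should be checked against `tests/config/mixer.json`:

```json
{
  "streams": [
    {
      "name": "my-mix",
      "documents": ["s3://my-bucket/my-dataset/documents/*.gz"],
      "attributes": ["sample"],
      "output": {
        "path": "s3://my-bucket/my-mix/documents",
        "max_size_in_bytes": 1000000000
      }
    }
  ],
  "processes": 4
}
```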


## Development

Create a conda environment with Python >= 3.8. In this case, we use Python 3.10 and use Anaconda to create the environment.
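The setup commands are collapsed in this diff; they presumably amount to something like the following sketch (the environment name is an assumption):

```shell
conda create -n dolma python=3.10
conda activate dolma
```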
@@ -90,6 +92,7 @@ make test

You can choose to run just the Python or Rust tests by calling `make test-python` or `make test-rust` respectively.


## Citation

If you use this repository, please cite it as:
@@ -98,7 +101,7 @@ If you use this repository, please cite it as:
@software{dolma,
author = {{Soldaini, Luca and Lo, Kyle and Kinney, Rodney and Naik, Aakanksha and Ravichander, Abhilasha and Bhagia, Akshita and Groeneveld, Dirk and Schwenk, Dustin and Magnusson, Ian and Chandu, Khyathi}},
license = {{Apache-2.0}},
title = {{DOLMa}},
title = {{Dolma}},
url = {https://github.com/allenai/dolma}
}
```
63 changes: 63 additions & 0 deletions configs/dedup/pes2o_decontamination.json
@@ -0,0 +1,63 @@
{
"documents": [
"s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2ag/split=train/part_id=0/*.gz",
"s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2ag/split=train/part_id=1/*.gz",
"s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2ag/split=train/part_id=2/*.gz",
"s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2ag/split=train/part_id=3/*.gz",
"s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2ag/split=train/part_id=4/*.gz",
"s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2ag/split=train/part_id=5/*.gz",
"s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2ag/split=train/part_id=6/*.gz",
"s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2ag/split=train/part_id=7/*.gz",
"s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2ag/split=train/part_id=8/*.gz",
"s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2ag/split=train/part_id=9/*.gz",
"s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc/split=train/part_id=0/*.gz",
"s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc/split=train/part_id=1/*.gz",
"s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc/split=train/part_id=2/*.gz",
"s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc/split=train/part_id=3/*.gz",
"s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc/split=train/part_id=4/*.gz",
"s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc/split=train/part_id=5/*.gz",
"s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc/split=train/part_id=6/*.gz",
"s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc/split=train/part_id=7/*.gz",
"s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc/split=train/part_id=8/*.gz",
"s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc/split=train/part_id=9/*.gz",
"s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2ag/split=valid/part_id=0/*.gz",
"s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2ag/split=valid/part_id=1/*.gz",
"s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2ag/split=valid/part_id=2/*.gz",
"s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2ag/split=valid/part_id=3/*.gz",
"s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2ag/split=valid/part_id=4/*.gz",
"s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2ag/split=valid/part_id=5/*.gz",
"s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2ag/split=valid/part_id=6/*.gz",
"s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2ag/split=valid/part_id=7/*.gz",
"s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2ag/split=valid/part_id=8/*.gz",
"s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2ag/split=valid/part_id=9/*.gz",
"s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc/split=valid/part_id=0/*.gz",
"s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc/split=valid/part_id=1/*.gz",
"s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc/split=valid/part_id=2/*.gz",
"s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc/split=valid/part_id=3/*.gz",
"s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc/split=valid/part_id=4/*.gz",
"s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc/split=valid/part_id=5/*.gz",
"s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc/split=valid/part_id=6/*.gz",
"s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc/split=valid/part_id=7/*.gz",
"s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc/split=valid/part_id=8/*.gz",
"s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc/split=valid/part_id=9/*.gz"
],
"work_dir": {
"input": "/tmp/s2/v3-fos/deduper/input",
"output": "/tmp/s2/v3-fos/deduper/output"
},
"dedupe": {
"name": "decontamination",
"paragraphs": {
"attribute_name": "bff_duplicate_paragraph_spans"
},
"skip_empty": true
},
"bloom_filter": {
"file": "/tmp/decontamination/deduper_decontamination_lucas_20230525.bin",
"size_in_bytes": 8388608,
"read_only": true,
"estimated_doc_count": 3898706,
"desired_false_positive_rate": 0.001
},
"processes": 120
}
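Given the `dedupe` subcommand documented in the README changes above, this decontamination config would presumably be run as:

```shell
dolma dedupe -c configs/dedup/pes2o_decontamination.json
```

The `"read_only": true` setting suggests the bloom filter at the given path is pre-built (presumably from the evaluation data to decontaminate against) and is only consulted, not updated, while duplicate paragraph spans are tagged.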
3 changes: 1 addition & 2 deletions pyproject.toml
@@ -31,11 +31,10 @@ dependencies = [
"numpy"
]
classifiers = [
"Development Status :: 3 - Alpha",
"Development Status :: 4 - Beta",
"Typing :: Typed",
"Programming Language :: Rust",
"Programming Language :: Python :: Implementation :: CPython",
"Programming Language :: Python :: Implementation :: PyPy",
]

[[project.authors]]
3 changes: 3 additions & 0 deletions python/dolma/cli/__main__.py
@@ -7,6 +7,9 @@
from .analyzer import AnalyzerCli
from .deduper import DeduperCli
from .mixer import MixerCli

# must import these to register the resolvers
from .resolvers import * # noqa: F401,F403
from .tagger import ListTaggerCli, TaggerCli

AVAILABLE_COMMANDS = {
36 changes: 36 additions & 0 deletions python/dolma/cli/resolvers.py
@@ -0,0 +1,36 @@
import multiprocessing
from typing import List, TypeVar

from cached_path import cached_path
from omegaconf.omegaconf import OmegaConf as om
from omegaconf.omegaconf import Resolver

from ..core.paths import glob_path

__all__ = ["cache", "glob", "processes"]


C = TypeVar("C", bound=Resolver)


def resolver(resolver: C) -> C:
resolver_name = f"d.{resolver.__name__}"
om.register_new_resolver(resolver_name, resolver, replace=True)
return resolver


@resolver
def cache(path: str) -> str:
return str(cached_path(path))


@resolver
def glob(path: str) -> List[str]:
globbed = list(glob_path(path))
assert len(globbed) > 0, f"Path {path} does not match any files"
return globbed


@resolver
def processes(n: int = 0) -> int:
return max(1, multiprocessing.cpu_count() - n)
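Since each resolver is registered under the `d.` prefix, a dolma config could presumably reference them as shown in the sketch below; the field names are illustrative, only the resolver syntax follows from the code above.

```yaml
# Hypothetical config snippet using the OmegaConf resolvers registered above.
documents: ${d.glob:"s3://my-bucket/documents/*.gz"}    # expand a glob into the list of matching paths
bloom_filter:
  file: ${d.cache:"s3://my-bucket/filters/dedupe.bin"}  # fetch a remote file and substitute the local cached path
processes: ${d.processes:2}                             # cpu_count() minus 2, floored at 1
```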
32 changes: 17 additions & 15 deletions python/dolma/core/analyzer.py
@@ -2,7 +2,6 @@
import re
import shutil
from contextlib import ExitStack
from queue import Queue
from tempfile import TemporaryDirectory
from typing import Dict, List, Optional

@@ -13,18 +12,24 @@
from rich.console import Console
from rich.table import Table

from .binning import BucketsValTracker
from .binning import BaseBucketApi, FixedBucketsValTracker, InferBucketsValTracker
from .data_types import OutputSpec
from .errors import DolmaError
from .parallel import BaseParallelProcessor
from .parallel import BaseParallelProcessor, QueueType
from .paths import glob_path, mkdir_p

NUM_BINS = 100_000
BUFF_SIZE = 1_000


def _make_tracker() -> BucketsValTracker:
return BucketsValTracker(NUM_BINS, BUFF_SIZE)
def _make_tracker(type_: str = "fixed", **kwargs: int) -> BaseBucketApi:
"""Make a tracker of given type. Choose between `infer` or `fixed`"""
if type_ == "infer":
return InferBucketsValTracker(**{"n": NUM_BINS, "b": BUFF_SIZE, **kwargs})
elif type_ == "fixed":
return FixedBucketsValTracker(**{"n": NUM_BINS, **kwargs})
else:
raise ValueError(f"Unknown tracker type {type_}")


class SummarySpec(msgspec.Struct):
@@ -33,24 +38,21 @@ class SummarySpec(msgspec.Struct):
bins: List[float]

@classmethod
def from_tracker(self, name: str, tracker: "BucketsValTracker", n: int) -> "SummarySpec":
def from_tracker(cls, name: str, tracker: "BaseBucketApi", n: int) -> "SummarySpec":
counts, bins = tracker.summarize(n=n)
return SummarySpec(name=name, counts=counts, bins=bins)

def to_tracker(self) -> "BucketsValTracker":
def to_tracker(self) -> "BaseBucketApi":
tracker = _make_tracker()
try:
tracker.add_many(values=self.bins, counts=self.counts)
except ValueError:
breakpoint()
tracker.add_many(values=self.bins, counts=self.counts)
return tracker


class AnalyzerProcessor(BaseParallelProcessor):
@classmethod
def increment_progressbar( # type: ignore
cls,
queue, # queue must be the first argument, and it should be a positional-only argument
queue: QueueType, # queue must be the first argument, and it should be a positional-only argument
/,
files: int = 0,
documents: int = 0,
@@ -66,7 +68,7 @@ def process_single(
cls,
source_path: str,
destination_path: str,
queue: "Queue",
queue: QueueType,
**kwargs,
):
# instantiate a decoder for faster decoding
@@ -79,7 +81,7 @@ def process_single(
name_regex = re.compile(r) if (r := kwargs.get("name_regex", None)) else None

# keep track of the length and score of each attribute
trackers: Dict[str, BucketsValTracker] = {}
trackers: Dict[str, BaseBucketApi] = {}

# interval at which to update the progress bar; will double if queue is too full
update_interval = 1
@@ -142,7 +144,7 @@ def process_single(

def aggregate_summaries(summaries_path: str, num_bins: int = 1000) -> List[SummarySpec]:
# keep track of the length and score of each attribute
trackers: Dict[str, BucketsValTracker] = {}
trackers: Dict[str, BaseBucketApi] = {}

# instantiate a decoder for faster decoding
decoder = Decoder(SummarySpec)
