From da2e7ddd66d58fbfaa2a80d9f98263003f0ab6ff Mon Sep 17 00:00:00 2001
From: Xiaozhe Yao
Date: Fri, 15 Mar 2024 14:11:00 +0100
Subject: [PATCH 1/4] minor

---
 run_train.py                                | 10 +++-------
 src/nanotron/parallel/tensor_parallel/nn.py |  1 -
 2 files changed, 3 insertions(+), 8 deletions(-)

diff --git a/run_train.py b/run_train.py
index 1b3ab081..505d8e46 100644
--- a/run_train.py
+++ b/run_train.py
@@ -24,13 +24,9 @@
 from nanotron.trainer import DistributedTrainer
 from nanotron.utils import main_rank_first

-try:
-    from huggingface_hub import __version__ as hf_hub_version
-    from transformers import AutoTokenizer
-    from transformers import __version__ as tf_version
-except ImportError:
-    hf_hub_version = None
-    tf_version = None
+from huggingface_hub import __version__ as hf_hub_version
+from transformers import AutoTokenizer
+from transformers import __version__ as tf_version

 logger = logging.get_logger(__name__)

diff --git a/src/nanotron/parallel/tensor_parallel/nn.py b/src/nanotron/parallel/tensor_parallel/nn.py
index adbd15f5..6f22be1e 100644
--- a/src/nanotron/parallel/tensor_parallel/nn.py
+++ b/src/nanotron/parallel/tensor_parallel/nn.py
@@ -35,7 +35,6 @@
 from nanotron.parallel.tensor_parallel.functional import column_linear, row_linear
 from nanotron.parallel.tied_parameters import create_tied_parameter

-
 class TensorParallelColumnLinear(nn.Linear):
     def __init__(
         self,

From 92fa5ed99c26062503648cac4c9411fbfe046479 Mon Sep 17 00:00:00 2001
From: Xiaozhe Yao
Date: Fri, 15 Mar 2024 14:35:21 +0100
Subject: [PATCH 2/4] minor

---
 README.md                  |  86 +-------
 docs/3d_parallelism.md     | 406 -------------------------------------
 docs/docs.md               | 402 ------------------------------------
 src/nanotron/fp8/linear.py |  17 +-
 4 files changed, 21 insertions(+), 890 deletions(-)
 delete mode 100644 docs/3d_parallelism.md
 delete mode 100644 docs/docs.md

diff --git a/README.md b/README.md
index 672b1880..4fedbf20 100644
--- a/README.md
+++ b/README.md
@@ -1,82 +1,12 @@
-# ⚡️ Nanotron
-
-The objective of this library is to provide easy distributed primitives in order to train a variety of models efficiently using 3D parallelism. For more information about the internal design of the library or 3D parallelism in general, please check out [[docs.md]](./docs/docs.md) and [[3d_parallelism.md]](./docs/3d_parallelism.md).
-
-
-# Philosophy
-
-- Make it fast. At least as fast as other open source versions.
-- Make it minimal. We don't actually need to support all techniques and all versions of 3D parallelism. What matters is that we can efficiently use the "best" ones.
-- Make everything explicit instead of transparent. As we move forward, making things transparent works well when it works well but is a horrible debugging experience if one doesn't understand the implications of techniques used. In order to mitigate this, we choose to be explicit in the way it does things
-
-# Core Features
-
-We support the following:
- - 3D parallelism, including one-forward-one-backward pipeline engine
- - ZeRO-1 optimizer
- - FP32 gradient accumulation
- - Parameter tying/sharding
-
-# Installation
-
-Requirements:
- - Python >= 3.10
- - PyTorch >= 2.0.0
- - Flash-Attention >= 2.5.0
-
-To install (in a new env):
-```bash
-pip install torch
-pip install packaging; pip install "flash-attn>=2.5.0" --no-build-isolation
-git clone git@github.com:huggingface/nanotron.git
-cd nanotron
-pip install -e .
-``` - -Also nice to have `transformers` `datasets` `python-etcd` `tensorboardX`: `pip install transformers datasets python-etcd tensorboardX` - -We also support a set of flavors that you can install using `pip install -e [$FLAVOR]`: - - `dev`: Used is you are developping in `nanotron`. It installs in particular our linter mechanism. On top of that you have to run `pre-commit install` afterwards. - - `test`: We use `pytest` in order to run out testing suite. In order to run tests in parallel, it will install `pytest-xdist`, which you can leverage by running `pytest -n 12 tests` (12 is the number of parallel test) - - -# Quick examples - -In the `/examples` directory, you can find a few example configuration file, and a script to run it. - -You can run a sample training using: -```bash -torchrun --nproc_per_node=8 run_train.py --config-file examples/debug_run_train.yaml -``` - -And run a sample generation using: -```bash -torchrun --nproc_per_node=8 run_generation.py --ckpt-path checkpoints/text/4 -``` - -# Development guidelines - -If you plan on developing on `nanotron`, we suggest you install the `dev` flavor: `pip install -e ".[dev]"` - -We use pre-commit to run a bunch of callbacks on each commit, mostly normalization code in order for the codebase to stay consistent. Please do run `pre-commit install`. - -For the linting: -```bash -pre-commit install -pre-commit run --config .pre-commit-config.yaml --all-files -``` - -Features we would like to add: -- [ ] Support `torch.compile` -- [ ] Support `torch.distributed.rpc` -- [ ] More optimized kernels -- [ ] Support Zero3 -- [ ] Other PP schedules (such as Interleaved 1f1b...) -- [ ] Ring attention / Sequence Parallelism -- [ ] 3D Parallel MoEs -- [ ] Supporting more architectures (Mamba..) -- [ ] ... +FMEngine is our opinionated take on foundation model training framework. The first version of FMEngine is built on top of `PyTorch` and `DeepSpeed` and is designed to be a drop-in replacement for `DeepSpeed` with a few additional features. In the `v2` version we forked from HuggingFace's `nanotron` and added some features to make it easier to use. # Credits -We would like to thank everyone working on LLMs, especially those sharing their work openly from which we took great inspiration: Nvidia for `Megatron-LM/apex`, Microsoft for `DeepSpeed`, HazyResearch for `flash-attn` +We would like to thank everyone working on LLMs, especially those sharing their work openly from which we took great inspiration: + +- HuggingFace for `nanotron`, +- Nvidia for `Megatron-LM/apex`, +- Microsoft for `DeepSpeed`, +- HazyResearch for `flash-attn` diff --git a/docs/3d_parallelism.md b/docs/3d_parallelism.md deleted file mode 100644 index dfc7c4f2..00000000 --- a/docs/3d_parallelism.md +++ /dev/null @@ -1,406 +0,0 @@ -## The internals of nanotron - -### 1. Tensor Parallelism - -#### Asynchronous Tensor Parallelism - -Q: What are the two different tensor parallel linear modes in nanotron? - -A: All-reduce and Reduce-scatter - - -Q: How does asynchronous column parallel linear work differently than regular column parallel linear? - -A: In regular column parallel linear, each rank only computes its portion of the output matrix, then gathers the partial outputs at the end. - -In asynchronous column parallel, each rank kicks off an asynchronous all-gather on the input tensor at the start. While that communication is happening, the rank computes the portion of the output corresponding to its local shard of the weights. 
When the all-gather finishes, each rank can compute the remaining portions of the output matrix it's missing, using the parts of the gathered input from other ranks. - - -Q: In asynchronous column parallel, what exactly does it gather? - -A: In asynchronous column parallel, each rank kicks off an all-gather operation on the input tensor X at the start of the forward pass. This gathers the shards of X from all tensor parallel ranks into one large tensor. - -For example with 4 GPUs: -+ Input X is sharded as [X0, X1, X2, X3] across 4 ranks -+ Rank 0 all-gathers: [X0, X1, X2, X3] - -So each GPU gathers the complete input X from all GPUs. - - -Q: In nanotron, what is the core difference between regular and asynchronous tensor parallel linear layers in terms of computation? - -A: -- In regular column parallel, each rank only computes the portion of the output corresponding to its shard of weights. It does not compute the full output matrix. -- In asynchronous column parallel, each rank computes the entire output matrix locally using inputs and its shard of weights. - - -Q: What do before_shard and after_shard represent in asynchronous tensor parallel? - -A: -+ before_shard is the portion of the output matrix that a rank can compute using input shards that come before its own input shard. -+ after_shard is the portion of the output matrix that a rank can compute using input shards that come after its own input shard. - -For example, on rank 2 with input shards [X0, X1, X2, X3]: before_shard = X0 * W0 + X1 * W1 after_shard = X3 * W3 - -Q: What is the core tradeoff between asynchronous and regular tensor parallelism? - -A: Async trades off more floating point operations (FLOPs) for less communication. -It does more FLOPs by having each rank compute the full output matrix instead of just a partial shard. But it reduces communication by doing only a single collective communication. So async can improve performance if the model is communication bound, at the cost of increased FLOP requirements. - - -Q: Can you give a concrete example illustrating how asynchronous tensor parallelism works? (6 steps) - -A: -- Step 1: Let's look at an example with 4 GPU ranks: - + Input X sharded across ranks as [X0, X1, X2, X3] - + Weight matrix W sharded as [W0, W1, W2, W3] -- Step 2: Rank 2 kicks off async all-gather to get [X0, X1, X2, X3] -- Step 3: While gathering, rank 2 computes: local_output = X2 * W2 -- Step 4: All-gather completes, rank 2 has [X0, X1, X2, X3] -- Step 5: Rank 2 computes: before_local_output = X0 * W0 + X1 * W1, after_local_output = X3 * W3 -- Step 6: Rank 2's output = before_local_output + local_output + after_local_output - -So each rank computes the full output using the locally gathered X and its shard of W. - -#### Tied Linear - -Q: Why does brrr have only a single rank save tied linear weights instead of all ranks? - -A: Tied linear weights are replicated across ranks, meaning all ranks hold the same weight values. Having every rank save the tied weight would result in the same weight being saved multiple times redundantly. So brrr designates only one rank (such as rank 0), to save the weight to avoid duplicating the same weight in the checkpoint. - - -Q: How does Nanotron detect tied parameters? - -A: Nanotron has a base model class called NanotronModel. NanotronModel class implements a common method for accessing tied parameters (called .get_tied_parameters()) When initializing the model, Trainer calls this method to get a list of parameter names that should be tied. 
-For example, for a goose model, it may return ["lm_head.weight", "word_embeddings.weight"] indicating the lm head weight and word embedding weight should be tied. - -Q: How does a tied linear layer differ from a regular parallel linear layer in nanotron? -A: -+ In a regular parallel linear layer, the weight matrix is sharded across ranks. -+ In a tied linear layer, the entire weight matrix is replicated on all ranks. - - -Q: What is the difference between a tied parameter and a regular parameter in nanotron? - -A: -+ Tied parameters in nanotron are parameters that need to have their gradients synchronized (typically summed) across a specific set of ranks during training. -+ Regular parameters don't have any special synchronization requirements. - - -Q: When would you use tied parameters in a transformer model in nanotron? - -A: Tied parameters should be used when the same weights are replicated in multiple layers of the transformer. A common example is tying the weights of the embedding layer and the final linear layer in the language modeling head. - - -Q: What are the different types of linear layers in nanotron and how are they different? - -A: Tied linear, tensor parallel linear, and async tensor parallel linear - -### 2. Pipeline Parallelism - -Q: What are the four core components in brrr’s pipeline parallelism? - -A: -+ PipelineBlock: Contains model computation split up over devices. -+ PipelineEngine: Orchestrate overall forward/backward passes across blocks. -+ PipelineBatchState: Stores all P2P operations -+ TensorPointer: Pointer to a tensor produced on a different device. - - -Q: How does PipelineEngine allow implementing different schedules like 1F1B or GPipe? - -A: PipelineEngine has abstract methods like train_batch_iter and validate_batch_iter that are overridden by subclasses to implement different execution orderings. - -For example, AllForwardAllBackward does all forwards first, then all backwards. 1F1B interleaves them, doing 1 forward then 1 backward. The specific scheduling logic is handled in these methods. - - -Q: What is the advantage of TensorPointer compared to directly sending activations after computation? - -A: So TensorPointers allow pipeline stages to represent tensors produced on other ranks, and request them on-demand when needed for computation. The key benefit is lazy communication - tensors are only transferred between processes when really needed, not all upfront -The TensorPointers allow us to queue up a whole batch of communications that will happen later, instead of blocking and communicating each tensor as it is needed. - - -Q: How do TensorPointers interact with other components in brrr? (4 steps) - -A: TensorPointer is used to represent tensors that are not locally available on the current process. It contains metadata about which process rank actually holds the real tensor data. - -+ Step 1: Block A runs on rank 0, produces output tensor X -+ Step 2: Block B runs on rank 1, needs X as input -+ Step 3: In Block B's forward, X is represented as a TensorPointer pointing to rank 0. To actually get the X tensor data, Block B uses the TensorPointer to send a request to rank 0 to receive X. -+ Step 4: Rank 0 receives the request, sends X to rank 1, which populates it into Block B's input - -Similarly, if Block B produces an output Y that the next Block C on rank 2 needs, it will return Y wrapped in a TensorPointer pointing to rank 1. - - -Q: In the forward pass, how do the four core components in brrr's pipeline parallelism work together? 
(5 steps) - -A: -- Step 1: PipelineEngine coordinates executing the PipelineBlocks for each microbatch. -- Step 2: PipelineBlockA runs on device A, producing an activation x. It returns {"x": TensorPointer(rank=A)} -- Step 3: PipelineBlockB runs on device B. It sees the TensorPointer for x, telling it to retrieve x from device A. PipelineBlockB tells PipelineBatchState to receive x from device A. -- Step 4: PipelineEngine triggers PipelineBatchState to run communication. PipelineBatchState executes the receive operation, getting x from device A. -- Step 5: PipelineBlockB retrieves x from PipelineBatchState's buffer and continues its computation. - - -Q: What are the three core components of brrr's P2P communication? - -A: -- P2P class: Handles sending and receiving tensors between ranks. -- TensorMetaData: Stores tensor’s metadata like shape, dtype… to interpret raw tensor data. -- Communication buffers: Reusable buffers for sending metadata and tensor data. - - -Q: What is the difference between PipelineBatchState and BatchTensorSendRecvState? - -A: PipelineBatchState orchestrates pipeline communication across microbatches during training or inference. BatchTensorSendRecvState handles sending/receiving generic tensors in a batch. - -PipelineBatchState leverages BatchTensorSendRecvState under the hood for lower-level P2P communication but adds pipeline-specific logic on top like managing activations and gradients across stages. - - -Q: Why does pipeline engine batch p2p communication? Isn’t at each clock cycle, there is only a single send or recv in a microbatch? - -A: The pipeline engine batches P2P communication across microbatches, not within a microbatch. Within a microbatch there may be only a single send or receive between stages, but across microbatches the sends/receives can be batched. - -For example, say we have a model with two pipeline stages, A and B. In microbatch 1, A sends tensor X to B. In microbatch 2, A sends tensor Y to B. Instead of sending X and Y in separate P2P operations, the pipeline engine will batch them together into one send of [X,Y]. - - -Q: How does PipelineBlock's forward pass work? (4 steps) - -A: -- Step 1: It receives inputs, which can be Tensors or TensorPointers from other ranks. -- Step 2: For any TensorPointer inputs, it uses P2P communication to fetch the actual tensor from the rank specified. -- Step 3: It runs the forward pass of the module it encapsulates, passing the tensors as inputs. -- Step 4: It returns a dict containing the outputs of the module. For ranks that didn't run this block, it returns TensorPointers instead of real tensors. - - -Q: How does a PipelineBlock decide to return a Tensor vs a TensorPointer? Explain - -A: A PipelineBlock will return a TensorPointer if the block is running on a different pipeline rank from the one that is meant to output that tensor. Otherwise, it will return the actual Tensor -For example, say PipelineBlockA produces output X and is assigned to pipeline rank 2. -+ When running on pipeline rank 2, PipelineBlockA will return the actual Tensor X. -+ But when running on rank 1 or 3, PipelineBlockA will return a TensorPointer to rank 2 rather than the actual Tensor X data. - - -Q: In 3D parallelism, how does Nanotron calculate the overall loss when each microbatch has a different loss value? 
- -A: -- Step 1: Each microbatch has its own loss value -- Step 2: The losses for each microbatch are summed together -- Step 3: The total sum is averaged across data parallelism -This represents the mean loss across all microbatches in the global batch - - -Q: What does PipelineBlock.rank represent? - -A: PipelineBlock.rank specifies which pipeline parallel rank the block is assigned to. When initializing the model, each PipelineBlock's rank is set to place it on a particular pipeline rank. -For example, setting a block's rank to 2 means it will run on pipeline rank 2. The block's parameters will be instantiated on rank 2's device, and its forward pass will execute on rank 2. - - -Q: What do target_pp_ranks represent when initializing a nanotron model? - -A: -target_pp_ranks specifies which subset of pipeline ranks the model should be built on. By default, the model is built on all pipeline ranks (0 to pp_size-1). But you can pass a custom list like [0, 2, 3] to build the model only on those ranks. -Concrete example: pp_size = 8, target_pp_ranks = [0, 4, 7]. This will build the model only on pipeline ranks 0, 4, and 7 out of the total 8 ranks. The intermediate ranks 1-3 and 5-6 will not have the model built on them. - - -#### Loading data in 3D parallelism - -Q: In 3D parallelism, how does brrr sample training data for model replicas? (2 steps) - -A: For example, with 2 devices, 4 microbatch size, and 100 samples: -- Step 1: It first divides the full dataset into equal chunks, one chunk per GPU. - + Device 0 gets samples [0, 2, 4, .. 98] - + Device 1 gets samples [1, 3, 5, .. 99] - -- Step 2: Then within each GPU, samples are drawn sequentially to create micro-batches. The samples are accumulated into microbatches. - Epoch 1: - + Device 0 samples [0, 2, 4, 6] -> first microbatch - + Device 1 samples [1, 3, 5, 7] - - Epoch 2: - + Device 0 samples [8, 10, 12, 14] - + Device 1 samples [9, 11, 13, 15] - - -Q: In the BRRR dataloader, why are some tensor values replaced with TensorPointers? - -A: Dataloader is designed to work with BRRR's pipeline parallelism. Certain tensors like the input ids and attention mask are only needed by the first pipeline stage. Other ranks don't need the actual tensors - a TensorPointer is just a placeholder. - -For example, say rank 2 is where the model input is located. Dataloader will return: -+ Rank 2: {"input_ids": } -+ Other ranks: {"input_ids": TensorPointer(group_rank=2)} - - -Q: Given a dataset with: 100,000 samples, 10 model replicas, Micro-batch size = 16, Consumed samples so far = 10,000 -How does the MegatronPretrainingSampler work concretely? (4 steps) - -A: -+ Step 1: Available samples = 100,000 - 10,000 = 90,000 -+ Step 2 Each model replicas gets shard of 90,000 / 10 = 9,000 samples -+ Step 3: With a microbatch size of 16, each worker samples indices 0-15, 16-31 etc. from its shard (9,000 - 18,000)… -+ Step 4: Update consumed samples after each micro-batch of 16 - - -Q: In 3D parallelism, what's the difference between sequential and random pretraining samplers? - -A: For example, with 2 GPUs, 4 microbatch size, and 8 samples: -- Sequential sampler walks through its chunk sequentially. -+ GPU 0: [0, 2, 4, 6] -+ GPU 1: [1, 3, 5, 7] - -- Random sampler shuffles its chunk each epoch before sampling. -+ GPU 0: [6, 4, 0, 2] // shuffled shard -+ GPU 1: [5, 7, 1, 3] - - - -### 3. Distributed Serialization - -Q: What are the five things saved in a brrr checkpoint? 
- -A: Model weights, optimizer state, learning rate scheduler, random number generator state, and any other misc metadata required for restoring sharded weights - - -Q: What are the key differences when brrr saves the weights for the 3 types of parameters? - -A: -+ Regular parameters: Just directly save the full tensor normally. -+ Sharded parameters: Only save the shard owned by the first model replicas, to avoid redundancy across data parallelism. -+ Tied parameters: Only a rank in the tied group saves the weight. - - -Q: How does brrr reconstruct the full original unsharded tensor from the shards when loading a checkpoint? - -A: When saving a sharded weight, brrr stores metadata about how the shards map to the original tensor. This includes: - -Slices mapping info - Maps each shard's slice of the tensor to the corresponding slice in the original unsharded tensor. Like shard 1 covers unsharded tensor indices 0-50, etc. - -During loading, BRRR uses this mapping to copy each shard into the right location in the unsharded tensor to reconstruct it. - -- Step 1: Orig tensor A: [A1][A2][A3] -- Step 2: Checkpoint shards: A1 A2 A3 -- Step 3: Loading: - + A1 -> copy to indices 0-50 of A - + A2 -> copy to indices 51-100 of A - + A3 -> copy to indices 101-150 of A - - -Q: What are the three types of parameters that BRRR handles when saving checkpoints? - -A: Regular parameters, sharded parameters, tied/replicated parameters - - -Q: How does brrr ensure all ranks start with the same initial random state for determinism? (3 steps) - -A: -- Step 1: Rank 0 generates the initial state by seeding the RNG and grabbing the state tensor. -- Step 2: The state tensor is broadcast from rank 0 to all ranks. -- Step 3: Each rank loads the state tensor into its RNG. - -### 4. Trainer & Model Initialization - -#### Trainer - -Q: What's the main idea behind brrr’s model initialization? - -A: The main idea is to initialize models directly on the device and datatype we want by overriding PyTorch's default initialization. For example, by default PyTorch may initialize weights on CPU and in fp32. brrr overrides this so we can initialize directly in target precision format on GPUs from the start. - - -Q: How does brrr’s model initialization context manager work? (3 steps) - -A: -- Step 1: Enter context: Override nn.Module register methods and tensor creation functions -- Step 2: Inside context: Modules/tensors now use overridden methods, so they initialize directly on target device/dtype -- Step 3: Exit context: Restore original nn.Module methods and tensor creation functions - - -Q: Which two nn.Module methods does brrr override to implement its model initialization context manager? Explain - -A: brrr overrides nn.Module.register_parameter() and nn.Module.register_buffer() which are called when modules register parameters and buffers during initialization. - - -Q: What does kill switch do in Nanotron? - -A: Kill switch is a file that the trainer periodically checks during training. If the kill switch file is detected, Trainer will: -+ Step 1: Save a checkpoint -+ Step 2: Exit training gracefully - -Q: Why does brrr have the custom initialization context manager instead of just using module.to() to move models to the target device? - -A: module.to() moves existing tensors to a new device. BRRR's custom initialization context manager initializes tensors directly on the target device to begin with. 
For example, if we want mixed precision on GPU from the start, the context manager will initialize weights in fp16 on the GPU, instead of initializing in fp32 on CPU then moving. - - -Q: In FP16 training, how does nanotron updates in the accumulated FP32 gradients when each parameter has an FP16 gradient? (4 steps) - -A: -- Step 1: Each FP16 parameter has an associated FP32 gradient buffer allocated. -- Step 2: During backward, the FP16 gradients are accumulated into the FP32 buffer, instead of directly into the .grad attribute. -- Step 3: Before the optimizer step, nanotron copies the accumulated FP32 gradients into the .grad attribute of the FP32 copy of each parameter that will be updated. -- Step 4: The optimizer performs the update on the FP32 parameters. - - -#### Model Initialization - - -Q: In Nanotron, how does Trainer initialize a model from scratch using 3D parallelism? (5 steps) - -A: -- Step 1: Create an instance of the model -- Step 2: Initialize parameters randomly (using model.init_model_randomly()) -- Step 3: Mark tied parameters (using tie_parameters()) -- Step 4: Sync model parameters across data parallelism with all_reduce -- Step 5: Sync tied parameters across their tied groups with all_reduce - - -Q: What is the high-level flow of BRRR's training loop? (3 steps) (ignore schedulers, logging…) - -A: -- Step 1: Do a training step - run forward/backward pass through the model pipeline. -- Step 2: Check for kill switch file, exit if triggered. -- Step 3: Save checkpoint if current step matches interval. - - -Q: In 3D parallelism, how does Nanotron calculate the total number of parameters of a replicas? (2 steps) - -A: -- Step 1: Sum the parameters within each pipeline stage (across tensor parallelism) ⇒ The total params for that stage. -- Step 2: Sum the parameters across pipeline stages ⇒ The total model parameters - For example with 2 pipeline stages, 2 tensor parallel: - + Stage 1: (TP0): 10 params, (TP1): 15 params. Sum = 25 - + Stage 2: (TP0): 20 params, (TP1): 25 params. Sum = 45 - Total params = Stage 1 + Stage 2 = (10+15) + (20+25) = 35 + 45 = 70 - - -Q: Why does BRRR need a kill switch to terminate training? Can't we just Ctrl-C or cancel the job? - -A: Kill switch provides a graceful way to terminate training without losing progress: -+ Ctrl-C stops the process immediately, risking corrupted checkpoints. -+ Cancelling the job kills all processes abruptly. -The kill switch allows: checkpoint is safely saved before terminating - - -Q: Why is there a second all-reduce after the first DP all-reduce during model initialization? - -A: The first DP all-reduce syncs weights across data parallelism, but not within each replica. For example, it syncs embedding weights across DP ranks, but not between embeddings and lm_head within each rank. The second all-reduce specifically syncs tied weights like embeddings and lm_head within each replica. -For example, suppose we have: + [Embedding A1, LM Head A1], [Embedding A2, LM Head A2] -The first all-reduce makes -+ Embedding A1 == Embedding A2 -+ LM Head A1 == LM Head A2 -but not Embedding A1 == LM Head A1.The second all-reduce syncs Embedding A1 and LM Head A1, and Embedding A2 and LM Head A2. - - -Q: Why does BRRR issue an all-reduce across data parallelism dimension when initializing a model from scratch? - -A: When initializing a model randomly, each replica (data parallel rank) can end up with different initial values due to randomness. 
The all-reduce (or an equivalent operation) syncs up these initial values across data parallelism, so each replica starts with the same initial weights. -For example, with 2 data parallel ranks: -+ Replica 1: Embedding weights initially [0.1, 0.3, 0.2] -+ Replica 2: Embedding weights initially [0.4, 0.1, 0.5] -After all-reduce, both will have the same initialized weights, say [0.25, 0.2, 0.35]. - - -Q: What are the 3 pretraining samplers in brrr? - -A: -- Sequential sampler: Walks through each GPU's data shard sequentially -- Random sampler: Shuffles each GPU's shard before walking through it -- Cyclic sampler: After one pass through the datasets, loops back to the beginning diff --git a/docs/docs.md b/docs/docs.md deleted file mode 100644 index b7a6b4ae..00000000 --- a/docs/docs.md +++ /dev/null @@ -1,402 +0,0 @@ - -# Doc on collective operations - -This NVIDIA doc is nice on all collective operations (all_reduce, reduce_scatter, etc): https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/collectives.html - -# Usage - -We showcase usage in the `examples` directory. - -# Key concepts - -Let's go through some key concepts. - -## ParallelContext - -`ParallelContext` is the base class referencing all the process groups you might need when running parallel workloads. You can initialize it using the following: -```python -from nanotron.parallel import ParallelContext - -# define your topology -parallel_context = ParallelContext( - tensor_parallel_size=2, - data_parallel_size=2, - pipeline_parallel_size=2 -) -``` - -`ProcessGroups` is a mechanism in order to run distributed collectives (`all-reduce`, `all-gather`, ...) on a subgroup of all the ranks. It provides the granularity needed for 3D parallelism. - -From this dataclass you can access multiple process groups: - - `dp_pg`/`tp_pg`/`pp_pg`: This produces your typical process groups linked to 3D parallelism - - `world_pg`: ProcessGroup including all the processes. - - `world_rank_matrix`: This allows one to compute the world rank knowing the 3D ranks of a given process, or inversely when using `get_3d_ranks`. - - `world_ranks_to_pg`: This is a more generic pattern that allows you to store custom set of ProcessGroups, and querying it via a list of world ranks. - -## NanotronParameter - -Given a specific computation workload, we can freely define how we distribute workloads. For example: - -```python -from torch import nn -# Example: let's assume you want to run a Linear without bias -hidden_size = 8 - -# Single process way of running computation -module = nn.Linear(hidden_size, hidden_size) # Parameters: [H, H] -input = torch.randn(batch_size, hidden_size) -output = module(input) - -# Sharded ways of running computation across `tp_pg` (`ProcessGroup`) -# Version 1 -sharded_module = nn.Linear(hidden_size, hidden_size / tp_pg.size()) -input = torch.randn(batch_size, hidden_size) -sharded_output = module(input) -torch.distributed.all_gather(output, sharded_output, group=tp_pg.size()) - -# Version 2 -sharded_module = nn.Linear(hidden_size / tp_pg.size(), hidden_size) -sharded_input = torch.randn(batch_size, hidden_size / tp_pg.size()) -sharded_output = module(sharded_input) -torch.distributed.all_reduce(output, sharded_output, group=tp_pg.size()) - -# Version 3 -sharded_module = nn.Linear(hidden_size, hidden_size) -sharded_input = torch.randn(batch_size / tp_pg.size(), hidden_size) -torch.distributed.all_gather(input, sharded_input, group=tp_pg.size()) -output = module(input) # Duplicate workload - -# Version .... 
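# For reference, a runnable sketch of Version 1 with the actual collective
# signatures (an illustration, not nanotron code; it assumes `tp_pg` is an
# initialized torch.distributed ProcessGroup and that `hidden_size` is
# divisible by the TP world size):
sharded_module = nn.Linear(hidden_size, hidden_size // tp_pg.size())
input = torch.randn(batch_size, hidden_size)
sharded_output = sharded_module(input)
# all_gather expects a list of per-rank output tensors and the group itself
gathered = [torch.empty_like(sharded_output) for _ in range(tp_pg.size())]
torch.distributed.all_gather(gathered, sharded_output, group=tp_pg)
output = torch.cat(gathered, dim=-1)  # full [batch_size, hidden_size] output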
-``` -Distributed workloads have the tendency to generate tradeoffs between duplicated computation and extra communication. There's multiple ways to run the same computation, what we can optimize is the amount of communication we do, as well as duplicated work. Sometimes it's worth duplicating work in order to reduce communication significantly. - -As seen in previous example, sometimes the parameters are sharded across multiple devices, and sometimes they are duplicated. In `nanotron`, we decided to add those additional metadatas to `nn.Parameter`. We call our new datastructure: `NanotronParameter` - -## Sharded parameter - -A sharded parameter has the following metadata attached: - -```python -@dataclasses.dataclass -class SlicesPair: - local_slices: Tuple[slice, ...] - global_slices: Tuple[slice, ...] - -@dataclasses.dataclass -class ShardedInfo: - # All world ranks involved in the sharding. - global_ranks: Tuple[int, ...] - # Info of to what slice of the unsharded tensor (global_slices) the current sharded tensor corresponds (local_slices) - local_global_slices_pairs: Tuple[SlicesPair, ...] - # The shape of the unsharded tensor - unsharded_shape: Tuple[int, ...] -``` -Imagine we sharded a tensor t of shape [8, 64] across 2 ranks, 0 and 3, where rank 0 holds the first shard t[:, :32] and rank 3 holds the second shard t[:, 32:], then the sharded_info for them is: -```python -shard_info = ShardedInfo(global_ranks=(0,3), local_global_slices_pairs=(SlicesPair(local_slices=(slice(0,8), slice(0, 32),), global_slices=(slice(0,8), slice(0, 32)),),), unsharded_shape=(8, 64)) # world rank 0 -shard_info = ShardedInfo(global_ranks=(0,3), local_global_slices_pairs=(SlicesPair(local_slices=(slice(0,8), slice(0, 32),), global_slices=(slice(0,8), slice(32, 64)),),), unsharded_shape=(8, 64)) # world rank 3 -``` - -## Tied parameter - -This signifies that multiple occurrences of a given parameter are duplicated on multiple devices. Therefore we need a mechanism for them to be synced at all time. A typical example would be `lm_head` on top of transformers that's tied to the word embedding parameters. We attach the following metadata to the parameter: -```python -@dataclasses.dataclass -class TiedInfo: - # We usually arbitrarily choose a name of a parameter, either `lm_head.weight` or `wte.weight` for example. - name: str - # This allows us to define the scope in which `name` is valid. - root_module: nn.Module - # All world ranks involved in the tying. - global_ranks: Tuple[int, ...] - # In order to keep parameter synced, we add a `reduce_op` value that defines what kind of reduce operation we apply to the gradient. - # None signifies that we do not reduce - reduce_op: Optional[dist.ReduceOp] -``` - -Most interesting in this dataclass is the `reduce_op` parameter. Sometimes duplicated workload can remove the need to sync gradients as by design gradient computation would have already computed the correct gradient. A typical example of this is classic TP implementation using `all-reduce`/`identity`. - -Note: a parameter can be both sharded and tied. Both notion just have to involve different ranks. For example: lm_head and word embeddings can be sharded across TP, and tied between the first PP rank, and the last one. - -## Tensor parallelism - -Usually the go-to solution when models can't fit within a device. The basic idea is to figure out patterns where one can divide a single workload into multiple smaller workerloads that can run in parallel. We mimic tensor parallelism from Megatron-LM. 
Current supported modules: - - ColumnLinear/RowLinear - - ParallelVocabulary - - Cross-Entropy over sharded logits - - Distributed samplers for generation - -[Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) introduces that notion upon implementing one of the first large scale transformers: -![Tensor parallelism in transformer model](assets/tensor_parallel_in_transformer.png) -(Source: [link](https://arxiv.org/abs/1909.08053)) - -## Pipeline parallelism - -We can view the neural network as a sequence of operations. Instead of previous assumption where we split operations into smaller workloads that we can distribute. We take contiguous chunks and assign them to specific ranks. Instead of running parallel workloads, those are inherently sequential. In order to run them in parallel, we introduce fancy schedulers that process different batches in parallel:Rank 0 can be processing batch 1, while rank 1 is processing batch 0 - - Rank 0 starts to process batch 0 - - Rank 0 finishes to process batch 0 - - Rank 0 sends outputs to rank 1 - - Rank 1 starts to process batch 0 - - Rank 0 starts to process batch 1 (Rank 1 and Rank 0 are processing in parallel batches 1 and 0 respectively) - - Rank 1 finishes to process batch 0 - - Rank 0 finishes to process batch 1 - -### PipelineBlock - -The core component of our pipeline engine is a `PipelineBlock`. -It acts as the granularity for all our pipeline engines, we can define a specific workload that needs to happen on a specific device, ie rank. -Other ranks run a dummy `forward` where the forward pass returns `TensorPointer` which hold enough metadata in order to know where the output of the computation is. -```python -@dataclass -class TensorPointer: - group_rank: int -``` - -Module defined within `PipelineBlock` can be directly instantiated on the specific device. - -In short, what does `PipelineBlock` does: - - Receives either a set of `torch.Tensor`/`TensorPointer` as input - - In case of `TensorPointer`, query the tensor from the specified rank we extract from its state/context. - - Run the defined computation if current rank is responsible for running computation - - Return a dictionary `Dict[str, Union[torch.Tensor, TensorPointer]]`. - `TensorPointer` as output are for ranks that didn't run computation and require to know where the output of the computation is. - -```python -class PipelineBlock(nn.Module): - def __init__( - self, - p2p, # point-to-point communication class - module_builder, # module constructor in order to build module lazily - module_kwargs, # module constructor arguments in order to build module lazily - module_input_keys, # ranks that are not running compute to know the module input structure. Serves as a validation mechanism. - module_output_keys, # metadata for ranks that are not running compute to know the module output structure. - ): - pass - -# Example -# Lazy instantiation of a `nn.Linear` -model = PipelineBlock( - p2p=p2p, - module_builder=nn.Linear, - module_kwargs={"in_features":3, "out_feature": 5}, - module_input_keys={"input"}, - module_output_keys={"output"} -) - -model.build_and_set_rank(pp_rank) # Instantiate model parameters on `pp_rank` assigned device -``` - -In order to define which rank we use the `build_and_set_rank` method. It attaches the rank as a meta data, and builds the module on that specific rank. - -Models have to be defined using a "surface" of `PipelineBlock`. 
Typically, above `PipelineBlock` it's all about defining the `PipelineBlock` computational direct acyclic graph, below is where device specific computation is defined. - -As a non trivial example: -```python -class DummyModel(nn.Module): - def __init__( - self, - p2p: P2P, - ): - super().__init__() - self.dense1 = PipelineBlock( - p2p=p2p, - module_builder=nn.Linear, - module_kwargs={"in_features": 10, "out_features": 10}, - module_input_keys={"input"}, - module_output_keys={"output"}, - ) - self.dense2 = PipelineBlock( - p2p=p2p, - module_builder=nn.Linear, - module_kwargs={"in_features": 10, "out_features": 10}, - module_input_keys={"input"}, - module_output_keys={"output"}, - ) - # Doesn't hold any parameter, but we have to specify where the computation happens. - self.loss = PipelineBlock( - p2p=p2p, - module_builder=lambda: lambda x: x.sum(), - module_kwargs={}, - module_input_keys={"x"}, - module_output_keys={"output"}, - ) - - def forward(self, x: Union[torch.Tensor, TensorPointer]): - # x can be a `torch.Tensor` or a `TensorPointer` depending on the current rank, and where the pipeline blocks run their compute - x = self.dense1(input=x)["output"] - x = self.dense2(input=x)["output"] - x = self.loss(x=x)["output"] - return x -``` - - -### Pipeline engine - -We now support two kinds of engines: `AllForwardAllBackward`, `OneForwardOneBackward` - -Pipeline engines are different schedules for the set of workloads. A great illustration for the different schedules we support for training can be found in [Efficient Large-Scale Language Model Training on GPU Clusters Using Megatron-LM -](https://arxiv.org/abs/2104.04473). We support `All forward all backward` and `One forward one backward` currently (Figure 3 and top of figure 4). - -![Pipeline engine](assets/pipeline_engine.png) -(Source: [link](https://arxiv.org/abs/2104.04473)) - -> **_IMPORTANT NOTE:_** When preparing your dataloader, make sure every tensor lives on a single rank, and other ranks must have `TensorPointer` to that rank. This is a requirement for the pipeline engine to work. - -## ZeRO-1 optimizer - -ZeRO stands for "Zero Redundancy Optimizer", also known as "FSDP" in Pytorch. The goal of such techniques is to shard tensors across multiple devices instead of duplicating them. Consequently it allows for significant memory gains at the cost of some communication overhead (with potential ability to overlap computation and communication). Sharding is done across data parallel dimension There are three stages: - - `Stage 1`: The optimizer states are sharded. - - `Stage 2`: The gradients are sharded - - `Stage 3`: The model weight are sharded - -As of now, we currently only support `stage 1`. - -![ZeRO](assets/zero.png) -(Source: [link](https://www.microsoft.com/en-us/research/blog/zero-deepspeed-new-system-optimizations-enable-training-models-with-over-100-billion-parameters/)) - -# The awesome to have - -## Recomputation utilities - -Activation recomputation, also known as "activation checkpointing" is a memory saving technique. Pytorch automatically stores a set activation during the forward pass required for backward computation. However with large workloads, it might be worth recomputing specific activation in order to save memory. In `nanotron` we provide a decorator to implement this feature: - -```python -class MyFancyModule(nn.Module): - def __init__(self): - ... - self.do_checkpoint: bool = True - - @checkpoint_method(attr_name="do_checkpoint") - def forward(self, x): - ... 
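# Illustrative usage (a sketch, not part of the original example): with
# `do_checkpoint` set to True, the decorated forward does not keep its
# intermediate activations; they are recomputed when backward runs.
#
#   module = MyFancyModule()
#   out = module(x)          # forward pass, activations are not stored
#   out.sum().backward()     # forward is re-executed here to rebuild them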
-``` - -## On device initialization - -Usual pytorch module constructor instantiate weights on cpu and then move them to gpus. This can blow up cpu memory as well as being overall quite slow. - -```python -with init_on_device_and_dtype(device=torch.device("cuda"), dtype=torch.bfloat16): - module = MyFancyModule() # This directly instantiate the model on your device - -# If you want to bypass Pytorch weight initialization mechanism -with init_on_device_and_dtype(device=torch.device("meta"), dtype=torch.bfloat16): - module = MyFancyModule() -module.to_empty(torch.device("cuda")) # bfloat 16 model loaded in gpu with weight not initialized (only the storage buffers are allocated) -``` - -## Unified API for logging - -We provide a uniform API to logging, whether that's on tensorboard, on stdout or on Hugging Face hub: - -```python -@dataclass -class LogItem: - tag: str - scalar_value: Union[float, int] - log_format: Optional[str] = None -``` - -All logger need to implement a single method: -```python -class BaseLogger: - @abstractmethod - def add_scalars_from_list(self, log_entries: List[LogItem], iteration_step: int): - ... -``` - -If you want to have tensorboard logger support: `pip install -e ".[tb-logger]"`. -If you want to have huggingface-hub tensorboard logger support: `pip install -e ".[hf-logger]"`. - -## Random state handling primitives - -We currently have a mechanism to have an arbitrary number of `RandomState` in a `RandomStates`: -```python -class RandomState: - random - numpy - torch - torch_cuda - -class RandomStates(MutableMapping[str, RandomState]) - pass -``` - -At all time we get/set current random state in the current context -```python -def get_current_random_state(): - # This gets the current random_state from the current context - pass - -def set_random_state(random_state: RandomState): - # This sets random state in the current context - pass -``` - -In order to use specific `RandomState` for specific operations, typically when you want to synchronize `nn.Dropout` across multiple ranks for example, you can run `branch_random_state` context manager: -```python -def branch_random_state(random_states:RandomStates, key:str): - # Context manager which sets the random state associated with `key` when entering - # When exiting, we update the random state at `key` and restore previous random state. - pass - -# Usage -random_states = RandomStates({"my_own_random_state": get_current_random_state()}) -with branch_random_state(random_states, "my_own_random_state"): - output = nn.Dropout(0.1)(input) -``` - -Finally we provide a quick helper in order to get a synchronized random state across a process group. -```python -def get_synced_random_state(random_state: RandomState, pg: ProcessGroup): - # This allows us to get a synchronized random state with other ranks within a single group - -# Usage -random_states = RandomStates({"tp_synced_random_state": get_synced_random_state(random_state=get_current_random_state(), group=tp_pg)}) -with branch_random_state(random_states, "tp_synced_random_state"): - # Assuming that input is synced across TP, all ranks will apply the same random mask. - output = nn.Dropout(0.1)(input) -``` - -# Distributed serialization mechanism - -We rely on compute nodes having access to a single shared filesystem. - -We use `safetensors` to store our checkpoints. 
- -Current format: -```python -checkpoint_metadata.json # Stores version, topology, other metadata that would make the training resumable -optimizer - optimizer_config.json # Stores enough information to reinstantiate which optimizer this runs. - optimizer_tp-0-of-1_dp-0-of-1_pp-0-of-2.pt - optimizer_tp-0-of-1_dp-0-of-1_pp-0-of-2.pt -lr_scheduler - lr_scheduler_tp-0-of-1_dp-0-of-1_pp-0-of-2.pt - lr_scheduler_tp-0-of-1_dp-0-of-1_pp-0-of-2.pt -random # Stores random states from each process in order to resume training from the point on. - tp-0-of-1_dp-0-of-1_pp-0-of-2.pt - tp-0-of-1_dp-0-of-1_pp-1-of-2.pt -model - dense1 - model_weight.safetensors - model_bias.safetensors - dense2 - model_weight.safetensors - model_bias.safetensors -``` - -Some observations: - - checkpoints are NOT topology agnostic, this is due to both `random_states` and `sharded` tensors. - Instead of trying to reconcile those and obtain a topology agnostic one, we want to support a `checkpoint_reshape` method. - The motivations are the following: - - When training, one spends a LOT more time `saving` checkpoints than loading. In doing so, having the fastest saving mechanism helps. Consequently not having any distributed communication/locking will help this. - - Random states are not so easily reconcilable. Given random states for two separate processes when we have TP=2, it's not obvious what should be the random state if we set to TP=1. - - Optimizer states are aligned with parameters. It's usually the case where for each parameter you can define an optimizer state. But that's a limitation on the current serialization format. - - # Current restrictions: - -- `nn.Module` inside PipelineBlocks have to return a `Dict[str,torch.Tensor]` or `torch.Tensor`. -- No conditional flow on top of pipeline, or at least making sure that all the processes within a data parallel rank are performing the same sequence of operations: - - First all but one process will be things on `TensorPointer` which would make input dependent control flow quite hard. - - Second if you were to have input dependent control flow, causing two processes within a single data parallel rank to be different, then you might end up with weird communication issues. 
diff --git a/src/nanotron/fp8/linear.py b/src/nanotron/fp8/linear.py index b268ec62..2f03d959 100644 --- a/src/nanotron/fp8/linear.py +++ b/src/nanotron/fp8/linear.py @@ -2,7 +2,6 @@ import torch import torch.nn.functional as F -import transformer_engine as te # noqa from torch import nn from nanotron.fp8.constants import INITIAL_AMAX, INITIAL_SCALING_FACTOR @@ -11,8 +10,18 @@ from nanotron.fp8.meta import FP8Meta from nanotron.fp8.parameter import FP8Parameter from nanotron.fp8.tensor import FP8Tensor, update_scaling_factor - - +from nanotron.logging import log_rank +from nanotron import logging + +te_available = False +try: + import transformer_engine as te # noqa + te_available = True + Linear = te.Linear +except ImportError: + log_rank("Transformer Engine is not available", logging.INFO, rank=0) + Linear = nn.Linear + class FP8LinearMeta(TypedDict): """FP8 metadata for FP8Linear.""" @@ -21,7 +30,7 @@ class FP8LinearMeta(TypedDict): output_grad: FP8Meta -class FP8Linear(nn.Linear): +class FP8Linear(Linear): def __init__( self, in_features: int, From 675726c0944abf2c277f049fc1f3724e26c5fdac Mon Sep 17 00:00:00 2001 From: Xiaozhe Yao Date: Fri, 15 Mar 2024 14:36:27 +0100 Subject: [PATCH 3/4] forking nanotron --- examples/bench_llama_7b.py | 4 +- examples/config_tiny_llama.py | 4 +- examples/doremi/doremi/config.py | 4 +- examples/doremi/doremi/dataloader.py | 14 +++--- examples/doremi/doremi/llama.py | 20 ++++---- examples/doremi/doremi/loss.py | 4 +- examples/doremi/doremi/trainer.py | 18 +++---- examples/doremi/tests/test_doremi_loss.py | 6 +-- examples/doremi/tests/test_doremi_sampler.py | 6 +-- examples/doremi/train_doremi.py | 2 +- examples/doremi/train_reference.py | 2 +- examples/moe/config_llamoe.py | 6 +-- examples/moe/llamoe.py | 30 ++++++------ examples/moe/moe.py | 8 ++-- examples/moe/train_moe.py | 4 +- run_generate.py | 28 +++++------ run_train.py | 14 +++--- src/{nanotron => fmengine}/__init__.py | 0 src/fmengine/config/__init__.py | 5 ++ src/{nanotron => fmengine}/config/config.py | 16 +++---- .../config/lighteval_config.py | 6 +-- .../config/models_config.py | 0 .../config/parallelism_config.py | 6 +-- .../config/utils_config.py | 6 +-- src/{nanotron => fmengine}/constants.py | 0 .../dataloader/dataloader.py | 18 +++---- src/{nanotron => fmengine}/distributed.py | 2 +- src/{nanotron => fmengine}/fp8/__init__.py | 8 ++-- src/{nanotron => fmengine}/fp8/constants.py | 2 +- src/{nanotron => fmengine}/fp8/dtypes.py | 0 src/{nanotron => fmengine}/fp8/kernel.py | 4 +- src/{nanotron => fmengine}/fp8/linear.py | 16 +++---- src/{nanotron => fmengine}/fp8/meta.py | 4 +- src/{nanotron => fmengine}/fp8/parameter.py | 8 ++-- src/{nanotron => fmengine}/fp8/tensor.py | 6 +-- src/{nanotron => fmengine}/fp8/utils.py | 2 +- .../generation/__init__.py | 0 .../generation/decode.py | 30 ++++++------ .../generation/generate_store.py | 0 .../generation/sampler.py | 2 +- src/{nanotron => fmengine}/helpers.py | 28 +++++------ src/{nanotron => fmengine}/logging.py | 4 +- src/{nanotron => fmengine}/models/__init__.py | 0 src/{nanotron => fmengine}/models/base.py | 16 +++---- src/{nanotron => fmengine}/models/llama.py | 32 ++++++------- src/{nanotron => fmengine}/models/mistral.py | 32 ++++++------- .../models/starcoder2.py | 36 +++++++------- src/{nanotron => fmengine}/nn/__init__.py | 0 src/{nanotron => fmengine}/nn/activations.py | 2 +- src/{nanotron => fmengine}/nn/layer_norm.py | 0 src/fmengine/optim/__init__.py | 15 ++++++ src/{nanotron => fmengine}/optim/base.py | 0 .../optim/clip_grads.py | 8 
++-- .../optim/gradient_accumulator.py | 8 ++-- .../optim/inherit_from_other_optimizer.py | 2 +- .../optim/named_optimizer.py | 2 +- .../optimizer_from_gradient_accumulator.py | 8 ++-- src/{nanotron => fmengine}/optim/zero.py | 16 +++---- src/fmengine/parallel/__init__.py | 2 + .../parallel/context.py | 2 +- .../parallel/data_parallel/utils.py | 4 +- .../parallel/parameters.py | 6 +-- .../parallel/pipeline_parallel/README.md | 0 .../parallel/pipeline_parallel/block.py | 10 ++-- .../pipeline_parallel/context_manager.py | 4 +- .../parallel/pipeline_parallel/engine.py | 20 ++++---- .../parallel/pipeline_parallel/functional.py | 6 +-- .../parallel/pipeline_parallel/p2p.py | 6 +-- .../parallel/pipeline_parallel/state.py | 8 ++-- .../pipeline_parallel/tensor_pointer.py | 0 .../parallel/pipeline_parallel/utils.py | 4 +- .../parallel/sharded_parameters.py | 4 +- .../parallel/tensor_parallel/__init__.py | 0 .../distributed_differentiable_primitives.py | 4 +- .../parallel/tensor_parallel/enum.py | 0 .../parallel/tensor_parallel/functional.py | 8 ++-- .../parallel/tensor_parallel/nn.py | 16 +++---- .../parallel/tied_parameters.py | 14 +++--- src/{nanotron => fmengine}/parallel/utils.py | 6 +-- src/{nanotron => fmengine}/random.py | 4 +- src/{nanotron => fmengine}/sanity_checks.py | 16 +++---- src/fmengine/serialize/__init__.py | 4 ++ src/{nanotron => fmengine}/serialize/main.py | 24 +++++----- .../serialize/metadata.py | 8 ++-- .../serialize/optimizer.py | 16 +++---- .../serialize/random.py | 6 +-- src/{nanotron => fmengine}/serialize/utils.py | 6 +-- .../serialize/weights.py | 18 +++---- src/{nanotron => fmengine}/trainer.py | 48 +++++++++---------- src/{nanotron => fmengine}/utils.py | 2 +- src/nanotron/config/__init__.py | 5 -- src/nanotron/optim/__init__.py | 15 ------ src/nanotron/parallel/__init__.py | 2 - src/nanotron/serialize/__init__.py | 4 -- tests/fp8/test_fp8_parameter.py | 4 +- tests/fp8/test_linear.py | 2 +- tests/fp8/test_tensor.py | 6 +-- tests/helpers/distributed_tensor.py | 4 +- tests/helpers/dummy.py | 22 ++++----- tests/helpers/exception.py | 2 +- tests/helpers/utils.py | 4 +- tests/kernels/run_layer_norm_convergence.py | 4 +- tests/kernels/test_layer_norm.py | 2 +- tests/test_checkpointing.py | 4 +- tests/test_clip_grads.py | 26 +++++----- tests/test_data_parallel.py | 10 ++-- tests/test_distributed.py | 2 +- tests/test_p2p.py | 6 +-- tests/test_parameter.py | 4 +- ..._parameters_accumulate_gradient_in_fp32.py | 30 ++++++------ tests/test_pipeline_parallel.py | 14 +++--- tests/test_random_state.py | 6 +-- tests/test_serialize.py | 26 +++++----- tests/test_tensor_parallel.py | 10 ++-- tests/test_tie_weights.py | 8 ++-- tests/test_zero.py | 24 +++++----- 116 files changed, 523 insertions(+), 523 deletions(-) rename src/{nanotron => fmengine}/__init__.py (100%) create mode 100644 src/fmengine/config/__init__.py rename src/{nanotron => fmengine}/config/config.py (96%) rename src/{nanotron => fmengine}/config/lighteval_config.py (94%) rename src/{nanotron => fmengine}/config/models_config.py (100%) rename src/{nanotron => fmengine}/config/parallelism_config.py (89%) rename src/{nanotron => fmengine}/config/utils_config.py (95%) rename src/{nanotron => fmengine}/constants.py (100%) rename src/{nanotron => fmengine}/dataloader/dataloader.py (98%) rename src/{nanotron => fmengine}/distributed.py (99%) rename src/{nanotron => fmengine}/fp8/__init__.py (51%) rename src/{nanotron => fmengine}/fp8/constants.py (91%) rename src/{nanotron => fmengine}/fp8/dtypes.py (100%) rename src/{nanotron 
=> fmengine}/fp8/kernel.py (96%) rename src/{nanotron => fmengine}/fp8/linear.py (93%) rename src/{nanotron => fmengine}/fp8/meta.py (90%) rename src/{nanotron => fmengine}/fp8/parameter.py (90%) rename src/{nanotron => fmengine}/fp8/tensor.py (96%) rename src/{nanotron => fmengine}/fp8/utils.py (88%) rename src/{nanotron => fmengine}/generation/__init__.py (100%) rename src/{nanotron => fmengine}/generation/decode.py (98%) rename src/{nanotron => fmengine}/generation/generate_store.py (100%) rename src/{nanotron => fmengine}/generation/sampler.py (99%) rename src/{nanotron => fmengine}/helpers.py (96%) rename src/{nanotron => fmengine}/logging.py (99%) rename src/{nanotron => fmengine}/models/__init__.py (100%) rename src/{nanotron => fmengine}/models/base.py (96%) rename src/{nanotron => fmengine}/models/llama.py (98%) rename src/{nanotron => fmengine}/models/mistral.py (98%) rename src/{nanotron => fmengine}/models/starcoder2.py (98%) rename src/{nanotron => fmengine}/nn/__init__.py (100%) rename src/{nanotron => fmengine}/nn/activations.py (99%) rename src/{nanotron => fmengine}/nn/layer_norm.py (100%) create mode 100644 src/fmengine/optim/__init__.py rename src/{nanotron => fmengine}/optim/base.py (100%) rename src/{nanotron => fmengine}/optim/clip_grads.py (95%) rename src/{nanotron => fmengine}/optim/gradient_accumulator.py (98%) rename src/{nanotron => fmengine}/optim/inherit_from_other_optimizer.py (96%) rename src/{nanotron => fmengine}/optim/named_optimizer.py (97%) rename src/{nanotron => fmengine}/optim/optimizer_from_gradient_accumulator.py (92%) rename src/{nanotron => fmengine}/optim/zero.py (98%) create mode 100644 src/fmengine/parallel/__init__.py rename src/{nanotron => fmengine}/parallel/context.py (99%) rename src/{nanotron => fmengine}/parallel/data_parallel/utils.py (94%) rename src/{nanotron => fmengine}/parallel/parameters.py (98%) rename src/{nanotron => fmengine}/parallel/pipeline_parallel/README.md (100%) rename src/{nanotron => fmengine}/parallel/pipeline_parallel/block.py (96%) rename src/{nanotron => fmengine}/parallel/pipeline_parallel/context_manager.py (87%) rename src/{nanotron => fmengine}/parallel/pipeline_parallel/engine.py (96%) rename src/{nanotron => fmengine}/parallel/pipeline_parallel/functional.py (96%) rename src/{nanotron => fmengine}/parallel/pipeline_parallel/p2p.py (99%) rename src/{nanotron => fmengine}/parallel/pipeline_parallel/state.py (98%) rename src/{nanotron => fmengine}/parallel/pipeline_parallel/tensor_pointer.py (100%) rename src/{nanotron => fmengine}/parallel/pipeline_parallel/utils.py (92%) rename src/{nanotron => fmengine}/parallel/sharded_parameters.py (98%) rename src/{nanotron => fmengine}/parallel/tensor_parallel/__init__.py (100%) rename src/{nanotron => fmengine}/parallel/tensor_parallel/distributed_differentiable_primitives.py (98%) rename src/{nanotron => fmengine}/parallel/tensor_parallel/enum.py (100%) rename src/{nanotron => fmengine}/parallel/tensor_parallel/functional.py (98%) rename src/{nanotron => fmengine}/parallel/tensor_parallel/nn.py (95%) rename src/{nanotron => fmengine}/parallel/tied_parameters.py (95%) rename src/{nanotron => fmengine}/parallel/utils.py (87%) rename src/{nanotron => fmengine}/random.py (98%) rename src/{nanotron => fmengine}/sanity_checks.py (96%) create mode 100644 src/fmengine/serialize/__init__.py rename src/{nanotron => fmengine}/serialize/main.py (95%) rename src/{nanotron => fmengine}/serialize/metadata.py (95%) rename src/{nanotron => fmengine}/serialize/optimizer.py (97%) rename 
src/{nanotron => fmengine}/serialize/random.py (91%) rename src/{nanotron => fmengine}/serialize/utils.py (93%) rename src/{nanotron => fmengine}/serialize/weights.py (97%) rename src/{nanotron => fmengine}/trainer.py (96%) rename src/{nanotron => fmengine}/utils.py (99%) delete mode 100644 src/nanotron/config/__init__.py delete mode 100644 src/nanotron/optim/__init__.py delete mode 100644 src/nanotron/parallel/__init__.py delete mode 100644 src/nanotron/serialize/__init__.py diff --git a/examples/bench_llama_7b.py b/examples/bench_llama_7b.py index 1ad13b8c..646c9dee 100644 --- a/examples/bench_llama_7b.py +++ b/examples/bench_llama_7b.py @@ -4,7 +4,7 @@ import os -from nanotron.config import ( +from fmengine.config import ( CheckpointsArgs, Config, DataArgs, @@ -20,7 +20,7 @@ TokenizerArgs, TokensArgs, ) -from nanotron.logging import human_format +from fmengine.logging import human_format # Config for a llama model with 6.74M parameters model_config = LlamaConfig() diff --git a/examples/config_tiny_llama.py b/examples/config_tiny_llama.py index 62844c25..c487590b 100644 --- a/examples/config_tiny_llama.py +++ b/examples/config_tiny_llama.py @@ -2,7 +2,7 @@ import os -from nanotron.config import ( +from fmengine.config import ( CheckpointsArgs, Config, DataArgs, @@ -18,7 +18,7 @@ TokenizerArgs, TokensArgs, ) -from nanotron.logging import human_format +from fmengine.logging import human_format model_config = LlamaConfig( # Config for a tiny model model with 1.62M parameters diff --git a/examples/doremi/doremi/config.py b/examples/doremi/doremi/config.py index 83b0c7a8..d3ba01d5 100644 --- a/examples/doremi/doremi/config.py +++ b/examples/doremi/doremi/config.py @@ -5,7 +5,7 @@ import torch import yaml -from nanotron.config import ( +from fmengine.config import ( CheckpointsArgs, DataArgs, GeneralArgs, @@ -18,7 +18,7 @@ TokensArgs, get_config_from_file, ) -from nanotron.config.utils_config import serialize +from fmengine.config.utils_config import serialize @dataclass diff --git a/examples/doremi/doremi/dataloader.py b/examples/doremi/doremi/dataloader.py index 9799150c..f8612adc 100644 --- a/examples/doremi/doremi/dataloader.py +++ b/examples/doremi/doremi/dataloader.py @@ -9,13 +9,13 @@ from torch.utils.data.distributed import DistributedSampler from tqdm import tqdm -from nanotron import distributed as dist -from nanotron import logging -from nanotron.dataloader.dataloader import get_dataloader_worker_init -from nanotron.parallel import ParallelContext -from nanotron.parallel.pipeline_parallel.tensor_pointer import TensorPointer -from nanotron.parallel.pipeline_parallel.utils import get_input_output_pp_ranks -from nanotron.trainer import DistributedTrainer +from fmengine import distributed as dist +from fmengine import logging +from fmengine.dataloader.dataloader import get_dataloader_worker_init +from fmengine.parallel import ParallelContext +from fmengine.parallel.pipeline_parallel.tensor_pointer import TensorPointer +from fmengine.parallel.pipeline_parallel.utils import get_input_output_pp_ranks +from fmengine.trainer import DistributedTrainer from .doremi_context import DoReMiContext diff --git a/examples/doremi/doremi/llama.py b/examples/doremi/doremi/llama.py index 65c2c318..af403f57 100644 --- a/examples/doremi/doremi/llama.py +++ b/examples/doremi/doremi/llama.py @@ -3,16 +3,16 @@ import torch from transformers import LlamaConfig -from nanotron import logging -from nanotron.config import ParallelismArgs -from nanotron.models import NanotronModel -from nanotron.models.llama import 
LlamaModel -from nanotron.nn.layer_norm import TritonRMSNorm -from nanotron.parallel import ParallelContext -from nanotron.parallel.parameters import NanotronParameter -from nanotron.parallel.pipeline_parallel.block import PipelineBlock, TensorPointer -from nanotron.parallel.tensor_parallel.functional import sharded_cross_entropy -from nanotron.parallel.tensor_parallel.nn import ( +from fmengine import logging +from fmengine.config import ParallelismArgs +from fmengine.models import NanotronModel +from fmengine.models.llama import LlamaModel +from fmengine.nn.layer_norm import TritonRMSNorm +from fmengine.parallel import ParallelContext +from fmengine.parallel.parameters import NanotronParameter +from fmengine.parallel.pipeline_parallel.block import PipelineBlock, TensorPointer +from fmengine.parallel.tensor_parallel.functional import sharded_cross_entropy +from fmengine.parallel.tensor_parallel.nn import ( TensorParallelColumnLinear, TensorParallelEmbedding, TensorParallelRowLinear, diff --git a/examples/doremi/doremi/loss.py b/examples/doremi/doremi/loss.py index a0a190b3..8e4088a2 100644 --- a/examples/doremi/doremi/loss.py +++ b/examples/doremi/doremi/loss.py @@ -4,8 +4,8 @@ import torch.distributed as dist from torch import nn -from nanotron.parallel import ParallelContext -from nanotron.parallel.tensor_parallel.functional import sharded_cross_entropy +from fmengine.parallel import ParallelContext +from fmengine.parallel.tensor_parallel.functional import sharded_cross_entropy from .doremi_context import DoReMiContext from .utils import masked_mean diff --git a/examples/doremi/doremi/trainer.py b/examples/doremi/doremi/trainer.py index 13db358d..afb151b1 100644 --- a/examples/doremi/doremi/trainer.py +++ b/examples/doremi/doremi/trainer.py @@ -3,15 +3,15 @@ import torch from torch.nn.parallel import DistributedDataParallel -from nanotron import distributed as dist -from nanotron import logging -from nanotron.config import Config, get_config_from_file -from nanotron.logging import log_rank -from nanotron.models import NanotronModel -from nanotron.parallel.pipeline_parallel.tensor_pointer import TensorPointer -from nanotron.sanity_checks import assert_tensor_synced_across_pg -from nanotron.serialize import load_weights -from nanotron.trainer import DistributedTrainer +from fmengine import distributed as dist +from fmengine import logging +from fmengine.config import Config, get_config_from_file +from fmengine.logging import log_rank +from fmengine.models import NanotronModel +from fmengine.parallel.pipeline_parallel.tensor_pointer import TensorPointer +from fmengine.sanity_checks import assert_tensor_synced_across_pg +from fmengine.serialize import load_weights +from fmengine.trainer import DistributedTrainer from .config import DoReMiConfig from .doremi_context import DoReMiContext diff --git a/examples/doremi/tests/test_doremi_loss.py b/examples/doremi/tests/test_doremi_loss.py index b2a95bd2..547765ba 100644 --- a/examples/doremi/tests/test_doremi_loss.py +++ b/examples/doremi/tests/test_doremi_loss.py @@ -4,9 +4,9 @@ import torch.nn.functional as F from utils import set_system_path -from nanotron.parallel import ParallelContext -from nanotron.parallel.tensor_parallel.functional import sharded_cross_entropy -from nanotron.sanity_checks import assert_tensor_synced_across_pg +from fmengine.parallel import ParallelContext +from fmengine.parallel.tensor_parallel.functional import sharded_cross_entropy +from fmengine.sanity_checks import assert_tensor_synced_across_pg set_system_path() 
diff --git a/examples/doremi/tests/test_doremi_sampler.py b/examples/doremi/tests/test_doremi_sampler.py index d644b1c0..337d1be3 100644 --- a/examples/doremi/tests/test_doremi_sampler.py +++ b/examples/doremi/tests/test_doremi_sampler.py @@ -3,9 +3,9 @@ from torch.utils.data import DataLoader from utils import create_dummy_dataset, set_system_path -from nanotron import distributed as dist -from nanotron.parallel import ParallelContext -from nanotron.sanity_checks import assert_tensor_synced_across_pg +from fmengine import distributed as dist +from fmengine.parallel import ParallelContext +from fmengine.sanity_checks import assert_tensor_synced_across_pg set_system_path() diff --git a/examples/doremi/train_doremi.py b/examples/doremi/train_doremi.py index 58679b5d..bfbf6bf0 100644 --- a/examples/doremi/train_doremi.py +++ b/examples/doremi/train_doremi.py @@ -15,7 +15,7 @@ from doremi.trainer import DoReMiTrainer from doremi.utils import compute_domain_weights_based_on_token_count -from nanotron.config import get_config_from_file +from fmengine.config import get_config_from_file def get_args(): diff --git a/examples/doremi/train_reference.py b/examples/doremi/train_reference.py index 9e81290b..0ce6713e 100644 --- a/examples/doremi/train_reference.py +++ b/examples/doremi/train_reference.py @@ -15,7 +15,7 @@ from doremi.trainer import ReferenceTrainer from doremi.utils import compute_domain_weights_based_on_token_count -from nanotron.config import get_config_from_file +from fmengine.config import get_config_from_file def get_args(): diff --git a/examples/moe/config_llamoe.py b/examples/moe/config_llamoe.py index 96a7cca0..0c21052d 100644 --- a/examples/moe/config_llamoe.py +++ b/examples/moe/config_llamoe.py @@ -4,7 +4,7 @@ from dataclasses import dataclass from typing import Optional -from nanotron.config import ( +from fmengine.config import ( CheckpointsArgs, Config, DataArgs, @@ -18,8 +18,8 @@ TokenizerArgs, TokensArgs, ) -from nanotron.config.config import PretrainDatasetsArgs -from nanotron.logging import human_format +from fmengine.config.config import PretrainDatasetsArgs +from fmengine.logging import human_format @dataclass diff --git a/examples/moe/llamoe.py b/examples/moe/llamoe.py index dd3a89e9..a9acc66c 100644 --- a/examples/moe/llamoe.py +++ b/examples/moe/llamoe.py @@ -26,26 +26,26 @@ from moe import dMoE from torch import nn -from nanotron import distributed as dist -from nanotron import logging -from nanotron.config import ParallelismArgs -from nanotron.generation.generate_store import AttachableStore -from nanotron.logging import log_rank -from nanotron.models import NanotronModel -from nanotron.nn.layer_norm import TritonRMSNorm -from nanotron.parallel import ParallelContext -from nanotron.parallel.parameters import NanotronParameter -from nanotron.parallel.pipeline_parallel.block import PipelineBlock, TensorPointer -from nanotron.parallel.pipeline_parallel.p2p import P2P -from nanotron.parallel.tensor_parallel.functional import sharded_cross_entropy -from nanotron.parallel.tensor_parallel.nn import ( +from fmengine import distributed as dist +from fmengine import logging +from fmengine.config import ParallelismArgs +from fmengine.generation.generate_store import AttachableStore +from fmengine.logging import log_rank +from fmengine.models import NanotronModel +from fmengine.nn.layer_norm import TritonRMSNorm +from fmengine.parallel import ParallelContext +from fmengine.parallel.parameters import NanotronParameter +from fmengine.parallel.pipeline_parallel.block import 
PipelineBlock, TensorPointer +from fmengine.parallel.pipeline_parallel.p2p import P2P +from fmengine.parallel.tensor_parallel.functional import sharded_cross_entropy +from fmengine.parallel.tensor_parallel.nn import ( TensorParallelColumnLinear, TensorParallelEmbedding, TensorParallelLinearMode, TensorParallelRowLinear, ) -from nanotron.random import RandomStates -from nanotron.utils import checkpoint_method +from fmengine.random import RandomStates +from fmengine.utils import checkpoint_method logger = logging.get_logger(__name__) diff --git a/examples/moe/moe.py b/examples/moe/moe.py index 8f69fdf8..aff1c601 100644 --- a/examples/moe/moe.py +++ b/examples/moe/moe.py @@ -9,7 +9,7 @@ import torch.nn.functional as F from config_llamoe import LlaMoEConfig -from nanotron.parallel.tensor_parallel.enum import TensorParallelLinearMode +from fmengine.parallel.tensor_parallel.enum import TensorParallelLinearMode try: import megablocks.ops as ops @@ -22,9 +22,9 @@ from megablocks.layers.activation_fn import act_fn from torch import nn -from nanotron import distributed as dist -from nanotron import logging -from nanotron.config import ParallelismArgs +from fmengine import distributed as dist +from fmengine import logging +from fmengine.config import ParallelismArgs logger = logging.get_logger(__name__) diff --git a/examples/moe/train_moe.py b/examples/moe/train_moe.py index 362dd343..fc05b031 100644 --- a/examples/moe/train_moe.py +++ b/examples/moe/train_moe.py @@ -13,8 +13,8 @@ from config_llamoe import LlaMoEConfig from llamoe import LlaMoEForTraining -from nanotron import logging -from nanotron.trainer import DistributedTrainer +from fmengine import logging +from fmengine.trainer import DistributedTrainer sys.path.append(os.path.join(os.path.dirname(__file__), "..", "..")) diff --git a/run_generate.py b/run_generate.py index 153d07fc..d6799cc3 100644 --- a/run_generate.py +++ b/run_generate.py @@ -14,37 +14,37 @@ import torch -from nanotron import distributed as dist -from nanotron import logging -from nanotron.config import ( +from fmengine import distributed as dist +from fmengine import logging +from fmengine.config import ( GenerationArgs, LoggingArgs, ParallelismArgs, get_config_from_file, ) -from nanotron.generation.decode import ( +from fmengine.generation.decode import ( GenerationInput, TokenizerConfig, decode_text, decode_tokenized, ) -from nanotron.logging import log_rank, set_logger_verbosity_format -from nanotron.models import build_model -from nanotron.parallel import ParallelContext -from nanotron.parallel.parameters import sanity_check -from nanotron.parallel.pipeline_parallel.engine import ( +from fmengine.logging import log_rank, set_logger_verbosity_format +from fmengine.models import build_model +from fmengine.parallel import ParallelContext +from fmengine.parallel.parameters import sanity_check +from fmengine.parallel.pipeline_parallel.engine import ( OneForwardOneBackwardPipelineEngine, ) -from nanotron.parallel.pipeline_parallel.tensor_pointer import TensorPointer -from nanotron.parallel.tensor_parallel.enum import TensorParallelLinearMode -from nanotron.random import ( +from fmengine.parallel.pipeline_parallel.tensor_pointer import TensorPointer +from fmengine.parallel.tensor_parallel.enum import TensorParallelLinearMode +from fmengine.random import ( RandomStates, get_current_random_state, get_synced_random_state, set_random_seed, ) -from nanotron.serialize import load_weights -from nanotron.trainer import CONFIG_TO_MODEL_CLASS, mark_tied_parameters +from 
fmengine.serialize import load_weights +from fmengine.trainer import CONFIG_TO_MODEL_CLASS, mark_tied_parameters try: from transformers import AutoTokenizer diff --git a/run_train.py b/run_train.py index 505d8e46..29e0a850 100644 --- a/run_train.py +++ b/run_train.py @@ -11,18 +11,18 @@ import argparse import os -from nanotron import logging -from nanotron.config import PretrainDatasetsArgs -from nanotron.dataloader.dataloader import ( +from fmengine import logging +from fmengine.config import PretrainDatasetsArgs +from fmengine.dataloader.dataloader import ( clm_process, dummy_infinite_data_generator, get_datasets, get_train_dataloader, ) -from nanotron.logging import log_rank -from nanotron.parallel.pipeline_parallel.utils import get_input_output_pp_ranks -from nanotron.trainer import DistributedTrainer -from nanotron.utils import main_rank_first +from fmengine.logging import log_rank +from fmengine.parallel.pipeline_parallel.utils import get_input_output_pp_ranks +from fmengine.trainer import DistributedTrainer +from fmengine.utils import main_rank_first from huggingface_hub import __version__ as hf_hub_version from transformers import AutoTokenizer diff --git a/src/nanotron/__init__.py b/src/fmengine/__init__.py similarity index 100% rename from src/nanotron/__init__.py rename to src/fmengine/__init__.py diff --git a/src/fmengine/config/__init__.py b/src/fmengine/config/__init__.py new file mode 100644 index 00000000..fc59673a --- /dev/null +++ b/src/fmengine/config/__init__.py @@ -0,0 +1,5 @@ +# flake8: noqa +from fmengine.config.config import * +from fmengine.config.lighteval_config import * +from fmengine.config.models_config import * +from fmengine.config.utils_config import * diff --git a/src/nanotron/config/config.py b/src/fmengine/config/config.py similarity index 96% rename from src/nanotron/config/config.py rename to src/fmengine/config/config.py index c1efe0df..bf20e07d 100644 --- a/src/nanotron/config/config.py +++ b/src/fmengine/config/config.py @@ -10,23 +10,23 @@ from dacite import from_dict from yaml.loader import SafeLoader -from nanotron.config.lighteval_config import LightEvalConfig -from nanotron.config.models_config import ( +from fmengine.config.lighteval_config import LightEvalConfig +from fmengine.config.models_config import ( ExistingCheckpointInit, NanotronConfigs, RandomInit, ) -from nanotron.config.parallelism_config import ParallelismArgs -from nanotron.config.utils_config import ( +from fmengine.config.parallelism_config import ParallelismArgs +from fmengine.config.utils_config import ( RecomputeGranularity, cast_str_to_pipeline_engine, cast_str_to_torch_dtype, serialize, ) -from nanotron.generation.sampler import SamplerType -from nanotron.logging import get_logger -from nanotron.parallel.pipeline_parallel.engine import PipelineEngine -from nanotron.parallel.tensor_parallel.nn import TensorParallelLinearMode +from fmengine.generation.sampler import SamplerType +from fmengine.logging import get_logger +from fmengine.parallel.pipeline_parallel.engine import PipelineEngine +from fmengine.parallel.tensor_parallel.nn import TensorParallelLinearMode logger = get_logger(__name__) diff --git a/src/nanotron/config/lighteval_config.py b/src/fmengine/config/lighteval_config.py similarity index 94% rename from src/nanotron/config/lighteval_config.py rename to src/fmengine/config/lighteval_config.py index b5f12059..3bb7e649 100644 --- a/src/nanotron/config/lighteval_config.py +++ b/src/fmengine/config/lighteval_config.py @@ -2,9 +2,9 @@ from pathlib import Path from 
typing import Dict, Optional, Union -from nanotron.config.parallelism_config import ParallelismArgs -from nanotron.generation.sampler import SamplerType -from nanotron.logging import get_logger +from fmengine.config.parallelism_config import ParallelismArgs +from fmengine.generation.sampler import SamplerType +from fmengine.logging import get_logger logger = get_logger(__name__) diff --git a/src/nanotron/config/models_config.py b/src/fmengine/config/models_config.py similarity index 100% rename from src/nanotron/config/models_config.py rename to src/fmengine/config/models_config.py diff --git a/src/nanotron/config/parallelism_config.py b/src/fmengine/config/parallelism_config.py similarity index 89% rename from src/nanotron/config/parallelism_config.py rename to src/fmengine/config/parallelism_config.py index ab0fd2b3..79fe264b 100644 --- a/src/nanotron/config/parallelism_config.py +++ b/src/fmengine/config/parallelism_config.py @@ -1,12 +1,12 @@ from dataclasses import dataclass from typing import Optional -from nanotron.config.utils_config import cast_str_to_pipeline_engine -from nanotron.parallel.pipeline_parallel.engine import ( +from fmengine.config.utils_config import cast_str_to_pipeline_engine +from fmengine.parallel.pipeline_parallel.engine import ( AllForwardAllBackwardPipelineEngine, PipelineEngine, ) -from nanotron.parallel.tensor_parallel.nn import TensorParallelLinearMode +from fmengine.parallel.tensor_parallel.nn import TensorParallelLinearMode @dataclass diff --git a/src/nanotron/config/utils_config.py b/src/fmengine/config/utils_config.py similarity index 95% rename from src/nanotron/config/utils_config.py rename to src/fmengine/config/utils_config.py index f282440e..28433dd8 100644 --- a/src/nanotron/config/utils_config.py +++ b/src/fmengine/config/utils_config.py @@ -4,13 +4,13 @@ import torch -from nanotron.generation.sampler import SamplerType -from nanotron.parallel.pipeline_parallel.engine import ( +from fmengine.generation.sampler import SamplerType +from fmengine.parallel.pipeline_parallel.engine import ( AllForwardAllBackwardPipelineEngine, OneForwardOneBackwardPipelineEngine, PipelineEngine, ) -from nanotron.parallel.tensor_parallel.nn import TensorParallelLinearMode +from fmengine.parallel.tensor_parallel.nn import TensorParallelLinearMode class RecomputeGranularity(Enum): diff --git a/src/nanotron/constants.py b/src/fmengine/constants.py similarity index 100% rename from src/nanotron/constants.py rename to src/fmengine/constants.py diff --git a/src/nanotron/dataloader/dataloader.py b/src/fmengine/dataloader/dataloader.py similarity index 98% rename from src/nanotron/dataloader/dataloader.py rename to src/fmengine/dataloader/dataloader.py index 64faf628..0de51597 100644 --- a/src/nanotron/dataloader/dataloader.py +++ b/src/fmengine/dataloader/dataloader.py @@ -6,18 +6,18 @@ from torch.utils.data import BatchSampler, DataLoader from torch.utils.data.distributed import DistributedSampler -from nanotron import distributed as dist -from nanotron.config import Config -from nanotron.parallel import ParallelContext -from nanotron.parallel.pipeline_parallel.tensor_pointer import TensorPointer -from nanotron.random import set_random_seed -from nanotron.sanity_checks import ( +from fmengine import distributed as dist +from fmengine.config import Config +from fmengine.parallel import ParallelContext +from fmengine.parallel.pipeline_parallel.tensor_pointer import TensorPointer +from fmengine.random import set_random_seed +from fmengine.sanity_checks import ( 
assert_fail_except_rank_with, assert_tensor_synced_across_pg, ) -from nanotron import logging -from nanotron.logging import log_rank -from nanotron.utils import has_length +from fmengine import logging +from fmengine.logging import log_rank +from fmengine.utils import has_length import datasets from datasets import ( Dataset, diff --git a/src/nanotron/distributed.py b/src/fmengine/distributed.py similarity index 99% rename from src/nanotron/distributed.py rename to src/fmengine/distributed.py index 5f55690b..769bb775 100644 --- a/src/nanotron/distributed.py +++ b/src/fmengine/distributed.py @@ -9,7 +9,7 @@ from torch.distributed import * # noqa from torch.distributed.distributed_c10d import ProcessGroup -from nanotron.utils import find_free_port +from fmengine.utils import find_free_port torch_version_above_1_13 = version.parse(torch.__version__) >= version.parse("1.13.0") Work = dist.Work if torch_version_above_1_13 else dist._Work diff --git a/src/nanotron/fp8/__init__.py b/src/fmengine/fp8/__init__.py similarity index 51% rename from src/nanotron/fp8/__init__.py rename to src/fmengine/fp8/__init__.py index 2adc80c2..af4c49b6 100644 --- a/src/nanotron/fp8/__init__.py +++ b/src/fmengine/fp8/__init__.py @@ -1,9 +1,9 @@ import warnings -from nanotron.fp8.dtypes import DTypes # noqa -from nanotron.fp8.linear import FP8Linear # noqa -from nanotron.fp8.parameter import FP8Parameter # noqa -from nanotron.fp8.tensor import FP8Tensor # noqa +from fmengine.fp8.dtypes import DTypes # noqa +from fmengine.fp8.linear import FP8Linear # noqa +from fmengine.fp8.parameter import FP8Parameter # noqa +from fmengine.fp8.tensor import FP8Tensor # noqa try: import transformer_engine as te # noqa diff --git a/src/nanotron/fp8/constants.py b/src/fmengine/fp8/constants.py similarity index 91% rename from src/nanotron/fp8/constants.py rename to src/fmengine/fp8/constants.py index cfaa9fb7..8901514b 100644 --- a/src/nanotron/fp8/constants.py +++ b/src/fmengine/fp8/constants.py @@ -1,6 +1,6 @@ import torch -from nanotron.fp8.dtypes import DTypes +from fmengine.fp8.dtypes import DTypes FP8_GPU_NAMES = ["h100", "rtx 4090"] diff --git a/src/nanotron/fp8/dtypes.py b/src/fmengine/fp8/dtypes.py similarity index 100% rename from src/nanotron/fp8/dtypes.py rename to src/fmengine/fp8/dtypes.py diff --git a/src/nanotron/fp8/kernel.py b/src/fmengine/fp8/kernel.py similarity index 96% rename from src/nanotron/fp8/kernel.py rename to src/fmengine/fp8/kernel.py index 8747d6f7..3d8b681d 100644 --- a/src/nanotron/fp8/kernel.py +++ b/src/fmengine/fp8/kernel.py @@ -2,8 +2,8 @@ import transformer_engine as te # noqa import transformer_engine_extensions as tex -from nanotron.fp8.meta import FP8Meta -from nanotron.fp8.tensor import FP8Tensor +from fmengine.fp8.meta import FP8Meta +from fmengine.fp8.tensor import FP8Tensor @torch.no_grad() diff --git a/src/nanotron/fp8/linear.py b/src/fmengine/fp8/linear.py similarity index 93% rename from src/nanotron/fp8/linear.py rename to src/fmengine/fp8/linear.py index 2f03d959..692936a0 100644 --- a/src/nanotron/fp8/linear.py +++ b/src/fmengine/fp8/linear.py @@ -4,14 +4,14 @@ import torch.nn.functional as F from torch import nn -from nanotron.fp8.constants import INITIAL_AMAX, INITIAL_SCALING_FACTOR -from nanotron.fp8.dtypes import DTypes -from nanotron.fp8.kernel import fp8_matmul_kernel -from nanotron.fp8.meta import FP8Meta -from nanotron.fp8.parameter import FP8Parameter -from nanotron.fp8.tensor import FP8Tensor, update_scaling_factor -from nanotron.logging import log_rank -from nanotron 
import logging +from fmengine.fp8.constants import INITIAL_AMAX, INITIAL_SCALING_FACTOR +from fmengine.fp8.dtypes import DTypes +from fmengine.fp8.kernel import fp8_matmul_kernel +from fmengine.fp8.meta import FP8Meta +from fmengine.fp8.parameter import FP8Parameter +from fmengine.fp8.tensor import FP8Tensor, update_scaling_factor +from fmengine.logging import log_rank +from fmengine import logging te_available = False try: diff --git a/src/nanotron/fp8/meta.py b/src/fmengine/fp8/meta.py similarity index 90% rename from src/nanotron/fp8/meta.py rename to src/fmengine/fp8/meta.py index 8b2f776c..12b267ea 100644 --- a/src/nanotron/fp8/meta.py +++ b/src/fmengine/fp8/meta.py @@ -5,8 +5,8 @@ import transformer_engine as te # noqa import transformer_engine_extensions as tex -from nanotron.fp8.constants import DTYPE_TO_FP8_MAX -from nanotron.fp8.tensor import convert_torch_dtype_to_te_dtype +from fmengine.fp8.constants import DTYPE_TO_FP8_MAX +from fmengine.fp8.tensor import convert_torch_dtype_to_te_dtype @dataclass diff --git a/src/nanotron/fp8/parameter.py b/src/fmengine/fp8/parameter.py similarity index 90% rename from src/nanotron/fp8/parameter.py rename to src/fmengine/fp8/parameter.py index aabe3a70..45509b20 100644 --- a/src/nanotron/fp8/parameter.py +++ b/src/fmengine/fp8/parameter.py @@ -1,10 +1,10 @@ import torch from torch import nn -from nanotron.fp8.constants import FP8_DTYPES -from nanotron.fp8.dtypes import DTypes -from nanotron.fp8.meta import FP8Meta -from nanotron.fp8.tensor import FP8Tensor +from fmengine.fp8.constants import FP8_DTYPES +from fmengine.fp8.dtypes import DTypes +from fmengine.fp8.meta import FP8Meta +from fmengine.fp8.tensor import FP8Tensor class FP8Parameter(nn.Parameter): diff --git a/src/nanotron/fp8/tensor.py b/src/fmengine/fp8/tensor.py similarity index 96% rename from src/nanotron/fp8/tensor.py rename to src/fmengine/fp8/tensor.py index 40ba50b5..679ab9a9 100644 --- a/src/nanotron/fp8/tensor.py +++ b/src/fmengine/fp8/tensor.py @@ -2,8 +2,8 @@ import transformer_engine as te # noqa import transformer_engine_extensions as tex -from nanotron.fp8.constants import DTYPE_TO_FP8_MAX, FP8_DTYPES, INITIAL_SCALING_FACTOR -from nanotron.fp8.dtypes import DTypes +from fmengine.fp8.constants import DTYPE_TO_FP8_MAX, FP8_DTYPES, INITIAL_SCALING_FACTOR +from fmengine.fp8.dtypes import DTypes class FP8Tensor(torch.Tensor): @@ -15,7 +15,7 @@ def __new__(cls, tensor: torch.Tensor, dtype: DTypes) -> torch.Tensor: # TODO(xrsrke): there is a circular import issue # between tensor.py and meta.py fix this - from nanotron.fp8.meta import FP8Meta + from fmengine.fp8.meta import FP8Meta # TODO(xrsrke): if the tensor is on cpu, then bypass the quantization # because the current kernels only support gpu tensor diff --git a/src/nanotron/fp8/utils.py b/src/fmengine/fp8/utils.py similarity index 88% rename from src/nanotron/fp8/utils.py rename to src/fmengine/fp8/utils.py index 1f0e23ea..d9fe6b08 100644 --- a/src/nanotron/fp8/utils.py +++ b/src/fmengine/fp8/utils.py @@ -1,7 +1,7 @@ import torch import transformer_engine as te # noqa -from nanotron.fp8.constants import FP8_GPU_NAMES +from fmengine.fp8.constants import FP8_GPU_NAMES def is_fp8_available() -> bool: diff --git a/src/nanotron/generation/__init__.py b/src/fmengine/generation/__init__.py similarity index 100% rename from src/nanotron/generation/__init__.py rename to src/fmengine/generation/__init__.py diff --git a/src/nanotron/generation/decode.py b/src/fmengine/generation/decode.py similarity index 98% rename from 
src/nanotron/generation/decode.py rename to src/fmengine/generation/decode.py index ed997855..f3d42412 100644 --- a/src/nanotron/generation/decode.py +++ b/src/fmengine/generation/decode.py @@ -5,32 +5,32 @@ import torch -from nanotron import distributed as dist -from nanotron import logging -from nanotron.config import BenchArgs, GenerationArgs -from nanotron.distributed import ProcessGroup, get_global_rank -from nanotron.generation.generate_store import Store, attach_store -from nanotron.generation.sampler import ( +from fmengine import distributed as dist +from fmengine import logging +from fmengine.config import BenchArgs, GenerationArgs +from fmengine.distributed import ProcessGroup, get_global_rank +from fmengine.generation.generate_store import Store, attach_store +from fmengine.generation.sampler import ( BasicSampler, GreedySampler, SamplerType, TopKSampler, TopPSampler, ) -from nanotron.helpers import log_throughput -from nanotron.models.llama import LlamaModel -from nanotron.parallel import ParallelContext -from nanotron.parallel.pipeline_parallel.block import get_min_max_rank -from nanotron.parallel.pipeline_parallel.context_manager import ( +from fmengine.helpers import log_throughput +from fmengine.models.llama import LlamaModel +from fmengine.parallel import ParallelContext +from fmengine.parallel.pipeline_parallel.block import get_min_max_rank +from fmengine.parallel.pipeline_parallel.context_manager import ( attach_pipeline_state_to_model, ) -from nanotron.parallel.pipeline_parallel.p2p import ( +from fmengine.parallel.pipeline_parallel.p2p import ( P2PTensorMetaData, view_as_contiguous, ) -from nanotron.parallel.pipeline_parallel.state import PipelineEvalBatchState -from nanotron.parallel.pipeline_parallel.tensor_pointer import TensorPointer -from nanotron.utils import get_untyped_storage +from fmengine.parallel.pipeline_parallel.state import PipelineEvalBatchState +from fmengine.parallel.pipeline_parallel.tensor_pointer import TensorPointer +from fmengine.utils import get_untyped_storage if TYPE_CHECKING: try: diff --git a/src/nanotron/generation/generate_store.py b/src/fmengine/generation/generate_store.py similarity index 100% rename from src/nanotron/generation/generate_store.py rename to src/fmengine/generation/generate_store.py diff --git a/src/nanotron/generation/sampler.py b/src/fmengine/generation/sampler.py similarity index 99% rename from src/nanotron/generation/sampler.py rename to src/fmengine/generation/sampler.py index e4ad6545..4ac97209 100644 --- a/src/nanotron/generation/sampler.py +++ b/src/fmengine/generation/sampler.py @@ -4,7 +4,7 @@ import torch -from nanotron import distributed as dist +from fmengine import distributed as dist def all_gather_batches( diff --git a/src/nanotron/helpers.py b/src/fmengine/helpers.py similarity index 96% rename from src/nanotron/helpers.py rename to src/fmengine/helpers.py index 88e2d002..a1b25f02 100644 --- a/src/nanotron/helpers.py +++ b/src/fmengine/helpers.py @@ -16,27 +16,27 @@ from torch.optim.lr_scheduler import LambdaLR from torch.profiler import ProfilerActivity, profile, tensorboard_trace_handler -from nanotron import distributed as dist -from nanotron import logging -from nanotron.config import Config, LRSchedulerArgs, OptimizerArgs, ParallelismArgs -from nanotron.distributed import ProcessGroup -from nanotron.logging import LogItem, log_rank -from nanotron.models.base import NanotronModel -from nanotron.optim.base import BaseOptimizer, Optimizer -from nanotron.optim.gradient_accumulator import ( +from 
fmengine import distributed as dist +from fmengine import logging +from fmengine.config import Config, LRSchedulerArgs, OptimizerArgs, ParallelismArgs +from fmengine.distributed import ProcessGroup +from fmengine.logging import LogItem, log_rank +from fmengine.models.base import NanotronModel +from fmengine.optim.base import BaseOptimizer, Optimizer +from fmengine.optim.gradient_accumulator import ( FP32GradBucketManager, FP32GradientAccumulator, GradientAccumulator, get_fp32_accum_hook, ) -from nanotron.optim.named_optimizer import NamedOptimizer -from nanotron.optim.optimizer_from_gradient_accumulator import ( +from fmengine.optim.named_optimizer import NamedOptimizer +from fmengine.optim.optimizer_from_gradient_accumulator import ( OptimizerFromGradientAccumulator, ) -from nanotron.optim.zero import ZeroDistributedOptimizer -from nanotron.parallel import ParallelContext -from nanotron.parallel.tensor_parallel.nn import TensorParallelLinearMode -from nanotron.random import ( +from fmengine.optim.zero import ZeroDistributedOptimizer +from fmengine.parallel import ParallelContext +from fmengine.parallel.tensor_parallel.nn import TensorParallelLinearMode +from fmengine.random import ( RandomStates, get_current_random_state, get_synced_random_state, diff --git a/src/nanotron/logging.py b/src/fmengine/logging.py similarity index 99% rename from src/nanotron/logging.py rename to src/fmengine/logging.py index d03b2df2..503a38e9 100644 --- a/src/nanotron/logging.py +++ b/src/fmengine/logging.py @@ -34,8 +34,8 @@ import torch from torch import distributed as torch_dist -from nanotron import distributed as dist -from nanotron.parallel import ParallelContext +from fmengine import distributed as dist +from fmengine.parallel import ParallelContext log_levels = { "debug": DEBUG, diff --git a/src/nanotron/models/__init__.py b/src/fmengine/models/__init__.py similarity index 100% rename from src/nanotron/models/__init__.py rename to src/fmengine/models/__init__.py diff --git a/src/nanotron/models/base.py b/src/fmengine/models/base.py similarity index 96% rename from src/nanotron/models/base.py rename to src/fmengine/models/base.py index 6fa7ac24..ac918e95 100644 --- a/src/nanotron/models/base.py +++ b/src/fmengine/models/base.py @@ -6,16 +6,16 @@ import torch from torch import nn -from nanotron import distributed as dist -from nanotron import logging -from nanotron.distributed import ProcessGroup -from nanotron.logging import log_rank -from nanotron.parallel.context import ParallelContext -from nanotron.parallel.pipeline_parallel.block import PipelineBlock +from fmengine import distributed as dist +from fmengine import logging +from fmengine.distributed import ProcessGroup +from fmengine.logging import log_rank +from fmengine.parallel.context import ParallelContext +from fmengine.parallel.pipeline_parallel.block import PipelineBlock if TYPE_CHECKING: - from nanotron.config import NanotronConfigs - from nanotron.parallel.parameters import NanotronParameter + from fmengine.config import NanotronConfigs + from fmengine.parallel.parameters import NanotronParameter logger = logging.get_logger(__name__) diff --git a/src/nanotron/models/llama.py b/src/fmengine/models/llama.py similarity index 98% rename from src/nanotron/models/llama.py rename to src/fmengine/models/llama.py index 1efd528b..21ac4066 100644 --- a/src/nanotron/models/llama.py +++ b/src/fmengine/models/llama.py @@ -25,27 +25,27 @@ from flash_attn.layers.rotary import RotaryEmbedding as FlashRotaryEmbedding from torch import nn -from nanotron 
import distributed as dist -from nanotron import logging -from nanotron.config import LlamaConfig, ParallelismArgs -from nanotron.generation.generate_store import AttachableStore -from nanotron.logging import log_rank -from nanotron.models import NanotronModel -from nanotron.nn.activations import ACT2FN -from nanotron.nn.layer_norm import TritonRMSNorm -from nanotron.parallel import ParallelContext -from nanotron.parallel.parameters import NanotronParameter -from nanotron.parallel.pipeline_parallel.block import PipelineBlock, TensorPointer -from nanotron.parallel.pipeline_parallel.p2p import P2P -from nanotron.parallel.tensor_parallel.functional import sharded_cross_entropy -from nanotron.parallel.tensor_parallel.nn import ( +from fmengine import distributed as dist +from fmengine import logging +from fmengine.config import LlamaConfig, ParallelismArgs +from fmengine.generation.generate_store import AttachableStore +from fmengine.logging import log_rank +from fmengine.models import NanotronModel +from fmengine.nn.activations import ACT2FN +from fmengine.nn.layer_norm import TritonRMSNorm +from fmengine.parallel import ParallelContext +from fmengine.parallel.parameters import NanotronParameter +from fmengine.parallel.pipeline_parallel.block import PipelineBlock, TensorPointer +from fmengine.parallel.pipeline_parallel.p2p import P2P +from fmengine.parallel.tensor_parallel.functional import sharded_cross_entropy +from fmengine.parallel.tensor_parallel.nn import ( TensorParallelColumnLinear, TensorParallelEmbedding, TensorParallelLinearMode, TensorParallelRowLinear, ) -from nanotron.random import RandomStates -from nanotron.utils import checkpoint_method +from fmengine.random import RandomStates +from fmengine.utils import checkpoint_method logger = logging.get_logger(__name__) diff --git a/src/nanotron/models/mistral.py b/src/fmengine/models/mistral.py similarity index 98% rename from src/nanotron/models/mistral.py rename to src/fmengine/models/mistral.py index bc023802..9d029b06 100644 --- a/src/nanotron/models/mistral.py +++ b/src/fmengine/models/mistral.py @@ -25,27 +25,27 @@ from flash_attn.layers.rotary import RotaryEmbedding as FlashRotaryEmbedding from torch import nn -from nanotron import distributed as dist -from nanotron import logging -from nanotron.config import MistralConfig, ParallelismArgs -from nanotron.generation.generate_store import AttachableStore -from nanotron.logging import log_rank -from nanotron.models import NanotronModel -from nanotron.nn.activations import ACT2FN -from nanotron.nn.layer_norm import TritonRMSNorm -from nanotron.parallel import ParallelContext -from nanotron.parallel.parameters import NanotronParameter -from nanotron.parallel.pipeline_parallel.block import PipelineBlock, TensorPointer -from nanotron.parallel.pipeline_parallel.p2p import P2P -from nanotron.parallel.tensor_parallel.functional import sharded_cross_entropy -from nanotron.parallel.tensor_parallel.nn import ( +from fmengine import distributed as dist +from fmengine import logging +from fmengine.config import MistralConfig, ParallelismArgs +from fmengine.generation.generate_store import AttachableStore +from fmengine.logging import log_rank +from fmengine.models import NanotronModel +from fmengine.nn.activations import ACT2FN +from fmengine.nn.layer_norm import TritonRMSNorm +from fmengine.parallel import ParallelContext +from fmengine.parallel.parameters import NanotronParameter +from fmengine.parallel.pipeline_parallel.block import PipelineBlock, TensorPointer +from 
fmengine.parallel.pipeline_parallel.p2p import P2P +from fmengine.parallel.tensor_parallel.functional import sharded_cross_entropy +from fmengine.parallel.tensor_parallel.nn import ( TensorParallelColumnLinear, TensorParallelEmbedding, TensorParallelLinearMode, TensorParallelRowLinear, ) -from nanotron.random import RandomStates -from nanotron.utils import checkpoint_method +from fmengine.random import RandomStates +from fmengine.utils import checkpoint_method logger = logging.get_logger(__name__) diff --git a/src/nanotron/models/starcoder2.py b/src/fmengine/models/starcoder2.py similarity index 98% rename from src/nanotron/models/starcoder2.py rename to src/fmengine/models/starcoder2.py index 97f78aec..770fcb1d 100644 --- a/src/nanotron/models/starcoder2.py +++ b/src/fmengine/models/starcoder2.py @@ -34,34 +34,34 @@ from torch.nn import functional as F from torch.nn import init -from nanotron import distributed as dist -from nanotron.config import ParallelismArgs, Starcoder2Config -from nanotron.generation.generate_store import AttachableStore -from nanotron.models import NanotronModel -from nanotron.nn.activations import ACT2FN -from nanotron.nn.layer_norm import TritonLayerNorm -from nanotron.parallel import ParallelContext -from nanotron.parallel.parameters import NanotronParameter -from nanotron.parallel.pipeline_parallel.block import PipelineBlock -from nanotron.parallel.pipeline_parallel.p2p import P2P -from nanotron.parallel.pipeline_parallel.tensor_pointer import TensorPointer -from nanotron.parallel.sharded_parameters import ( +from fmengine import distributed as dist +from fmengine.config import ParallelismArgs, Starcoder2Config +from fmengine.generation.generate_store import AttachableStore +from fmengine.models import NanotronModel +from fmengine.nn.activations import ACT2FN +from fmengine.nn.layer_norm import TritonLayerNorm +from fmengine.parallel import ParallelContext +from fmengine.parallel.parameters import NanotronParameter +from fmengine.parallel.pipeline_parallel.block import PipelineBlock +from fmengine.parallel.pipeline_parallel.p2p import P2P +from fmengine.parallel.pipeline_parallel.tensor_pointer import TensorPointer +from fmengine.parallel.sharded_parameters import ( SplitConfig, mark_all_parameters_in_module_as_sharded, ) -from nanotron.parallel.tensor_parallel.enum import TensorParallelLinearMode -from nanotron.parallel.tensor_parallel.functional import ( +from fmengine.parallel.tensor_parallel.enum import TensorParallelLinearMode +from fmengine.parallel.tensor_parallel.functional import ( column_linear, sharded_cross_entropy, ) -from nanotron.parallel.tensor_parallel.nn import ( +from fmengine.parallel.tensor_parallel.nn import ( TensorParallelColumnLinear, TensorParallelEmbedding, TensorParallelRowLinear, ) -from nanotron.parallel.tied_parameters import tie_parameters -from nanotron.random import RandomStates, branch_random_state -from nanotron.utils import checkpoint_method +from fmengine.parallel.tied_parameters import tie_parameters +from fmengine.random import RandomStates, branch_random_state +from fmengine.utils import checkpoint_method _flash_supports_window_size = "window_size" in list( inspect.signature(flash_attn_varlen_func).parameters diff --git a/src/nanotron/nn/__init__.py b/src/fmengine/nn/__init__.py similarity index 100% rename from src/nanotron/nn/__init__.py rename to src/fmengine/nn/__init__.py diff --git a/src/nanotron/nn/activations.py b/src/fmengine/nn/activations.py similarity index 99% rename from src/nanotron/nn/activations.py 
rename to src/fmengine/nn/activations.py index aaaba467..d3552e90 100644 --- a/src/nanotron/nn/activations.py +++ b/src/fmengine/nn/activations.py @@ -19,7 +19,7 @@ from packaging import version from torch import Tensor, nn -from nanotron import logging +from fmengine import logging logger = logging.get_logger(__name__) diff --git a/src/nanotron/nn/layer_norm.py b/src/fmengine/nn/layer_norm.py similarity index 100% rename from src/nanotron/nn/layer_norm.py rename to src/fmengine/nn/layer_norm.py diff --git a/src/fmengine/optim/__init__.py b/src/fmengine/optim/__init__.py new file mode 100644 index 00000000..4f51ada7 --- /dev/null +++ b/src/fmengine/optim/__init__.py @@ -0,0 +1,15 @@ +from fmengine.optim.base import BaseOptimizer +from fmengine.optim.inherit_from_other_optimizer import InheritFromOtherOptimizer +from fmengine.optim.named_optimizer import NamedOptimizer +from fmengine.optim.optimizer_from_gradient_accumulator import ( + OptimizerFromGradientAccumulator, +) +from fmengine.optim.zero import ZeroDistributedOptimizer + +__all__ = [ + "BaseOptimizer", + "InheritFromOtherOptimizer", + "NamedOptimizer", + "OptimizerFromGradientAccumulator", + "ZeroDistributedOptimizer", +] diff --git a/src/nanotron/optim/base.py b/src/fmengine/optim/base.py similarity index 100% rename from src/nanotron/optim/base.py rename to src/fmengine/optim/base.py diff --git a/src/nanotron/optim/clip_grads.py b/src/fmengine/optim/clip_grads.py similarity index 95% rename from src/nanotron/optim/clip_grads.py rename to src/fmengine/optim/clip_grads.py index e6aef19b..a3b94b36 100644 --- a/src/nanotron/optim/clip_grads.py +++ b/src/fmengine/optim/clip_grads.py @@ -2,10 +2,10 @@ import torch -import nanotron.distributed as dist -from nanotron import logging -from nanotron.optim.gradient_accumulator import GradientAccumulator -from nanotron.parallel.parameters import NanotronParameter +import fmengine.distributed as dist +from fmengine import logging +from fmengine.optim.gradient_accumulator import GradientAccumulator +from fmengine.parallel.parameters import NanotronParameter logger = logging.get_logger(__name__) diff --git a/src/nanotron/optim/gradient_accumulator.py b/src/fmengine/optim/gradient_accumulator.py similarity index 98% rename from src/nanotron/optim/gradient_accumulator.py rename to src/fmengine/optim/gradient_accumulator.py index 02d49076..de49cbed 100644 --- a/src/nanotron/optim/gradient_accumulator.py +++ b/src/fmengine/optim/gradient_accumulator.py @@ -7,10 +7,10 @@ import torch from torch.distributed import GradBucket -import nanotron.distributed as dist -from nanotron import logging -from nanotron.parallel.parameters import NanotronParameter -from nanotron.utils import get_untyped_storage, tensor_from_untyped_storage +import fmengine.distributed as dist +from fmengine import logging +from fmengine.parallel.parameters import NanotronParameter +from fmengine.utils import get_untyped_storage, tensor_from_untyped_storage logger = logging.get_logger(__name__) diff --git a/src/nanotron/optim/inherit_from_other_optimizer.py b/src/fmengine/optim/inherit_from_other_optimizer.py similarity index 96% rename from src/nanotron/optim/inherit_from_other_optimizer.py rename to src/fmengine/optim/inherit_from_other_optimizer.py index 2ddd36d0..42194ae2 100644 --- a/src/nanotron/optim/inherit_from_other_optimizer.py +++ b/src/fmengine/optim/inherit_from_other_optimizer.py @@ -3,7 +3,7 @@ import torch -from nanotron.optim.base import BaseOptimizer, Optimizer +from fmengine.optim.base import BaseOptimizer, 
Optimizer class InheritFromOtherOptimizer(BaseOptimizer): diff --git a/src/nanotron/optim/named_optimizer.py b/src/fmengine/optim/named_optimizer.py similarity index 97% rename from src/nanotron/optim/named_optimizer.py rename to src/fmengine/optim/named_optimizer.py index ca8a3ff7..5e9a8b91 100644 --- a/src/nanotron/optim/named_optimizer.py +++ b/src/fmengine/optim/named_optimizer.py @@ -2,7 +2,7 @@ import torch -from nanotron.optim.inherit_from_other_optimizer import InheritFromOtherOptimizer +from fmengine.optim.inherit_from_other_optimizer import InheritFromOtherOptimizer class NamedOptimizer(InheritFromOtherOptimizer): diff --git a/src/nanotron/optim/optimizer_from_gradient_accumulator.py b/src/fmengine/optim/optimizer_from_gradient_accumulator.py similarity index 92% rename from src/nanotron/optim/optimizer_from_gradient_accumulator.py rename to src/fmengine/optim/optimizer_from_gradient_accumulator.py index 111c96e5..c363c560 100644 --- a/src/nanotron/optim/optimizer_from_gradient_accumulator.py +++ b/src/fmengine/optim/optimizer_from_gradient_accumulator.py @@ -3,10 +3,10 @@ import torch -from nanotron.optim.base import BaseOptimizer -from nanotron.optim.gradient_accumulator import GradientAccumulator -from nanotron.optim.inherit_from_other_optimizer import InheritFromOtherOptimizer -from nanotron.parallel.parameters import NanotronParameter +from fmengine.optim.base import BaseOptimizer +from fmengine.optim.gradient_accumulator import GradientAccumulator +from fmengine.optim.inherit_from_other_optimizer import InheritFromOtherOptimizer +from fmengine.parallel.parameters import NanotronParameter class OptimizerFromGradientAccumulator(InheritFromOtherOptimizer): diff --git a/src/nanotron/optim/zero.py b/src/fmengine/optim/zero.py similarity index 98% rename from src/nanotron/optim/zero.py rename to src/fmengine/optim/zero.py index 74766aa0..69581246 100644 --- a/src/nanotron/optim/zero.py +++ b/src/fmengine/optim/zero.py @@ -10,14 +10,14 @@ from torch import nn from tqdm import tqdm -from nanotron import distributed as dist -from nanotron import logging -from nanotron.distributed import ProcessGroup -from nanotron.logging import human_format, log_rank, warn_once -from nanotron.optim.base import BaseOptimizer -from nanotron.optim.inherit_from_other_optimizer import InheritFromOtherOptimizer -from nanotron.parallel import ParallelContext -from nanotron.parallel.parameters import NanotronParameter +from fmengine import distributed as dist +from fmengine import logging +from fmengine.distributed import ProcessGroup +from fmengine.logging import human_format, log_rank, warn_once +from fmengine.optim.base import BaseOptimizer +from fmengine.optim.inherit_from_other_optimizer import InheritFromOtherOptimizer +from fmengine.parallel import ParallelContext +from fmengine.parallel.parameters import NanotronParameter logger = logging.get_logger(__name__) diff --git a/src/fmengine/parallel/__init__.py b/src/fmengine/parallel/__init__.py new file mode 100644 index 00000000..af97441a --- /dev/null +++ b/src/fmengine/parallel/__init__.py @@ -0,0 +1,2 @@ +# flake8: noqa +from fmengine.parallel.context import ParallelContext diff --git a/src/nanotron/parallel/context.py b/src/fmengine/parallel/context.py similarity index 99% rename from src/nanotron/parallel/context.py rename to src/fmengine/parallel/context.py index b9d3278b..831253a4 100644 --- a/src/nanotron/parallel/context.py +++ b/src/fmengine/parallel/context.py @@ -4,7 +4,7 @@ import numpy as np import torch -import nanotron.distributed as 
dist +import fmengine.distributed as dist DistributedBackend = Literal["gloo", "mpi", "nccl"] diff --git a/src/nanotron/parallel/data_parallel/utils.py b/src/fmengine/parallel/data_parallel/utils.py similarity index 94% rename from src/nanotron/parallel/data_parallel/utils.py rename to src/fmengine/parallel/data_parallel/utils.py index c6c5c935..03617992 100644 --- a/src/nanotron/parallel/data_parallel/utils.py +++ b/src/fmengine/parallel/data_parallel/utils.py @@ -4,8 +4,8 @@ import torch from torch import nn -from nanotron import distributed as dist -from nanotron.optim.gradient_accumulator import GradientAccumulator +from fmengine import distributed as dist +from fmengine.optim.gradient_accumulator import GradientAccumulator @contextmanager diff --git a/src/nanotron/parallel/parameters.py b/src/fmengine/parallel/parameters.py similarity index 98% rename from src/nanotron/parallel/parameters.py rename to src/fmengine/parallel/parameters.py index 02c44f52..bc207dd5 100644 --- a/src/nanotron/parallel/parameters.py +++ b/src/fmengine/parallel/parameters.py @@ -4,9 +4,9 @@ import torch from torch import nn -from nanotron import distributed as dist -from nanotron import logging -from nanotron.models import NanotronModel +from fmengine import distributed as dist +from fmengine import logging +from fmengine.models import NanotronModel logger = logging.get_logger(__name__) diff --git a/src/nanotron/parallel/pipeline_parallel/README.md b/src/fmengine/parallel/pipeline_parallel/README.md similarity index 100% rename from src/nanotron/parallel/pipeline_parallel/README.md rename to src/fmengine/parallel/pipeline_parallel/README.md diff --git a/src/nanotron/parallel/pipeline_parallel/block.py b/src/fmengine/parallel/pipeline_parallel/block.py similarity index 96% rename from src/nanotron/parallel/pipeline_parallel/block.py rename to src/fmengine/parallel/pipeline_parallel/block.py index dcfbf917..baa3bcce 100644 --- a/src/nanotron/parallel/pipeline_parallel/block.py +++ b/src/fmengine/parallel/pipeline_parallel/block.py @@ -3,17 +3,17 @@ import torch from torch import nn -from nanotron import distributed as dist -from nanotron.parallel.pipeline_parallel.functional import ( +from fmengine import distributed as dist +from fmengine.parallel.pipeline_parallel.functional import ( recv_from_pipeline_state_buffer, send_to_pipeline_state_buffer, ) -from nanotron.parallel.pipeline_parallel.p2p import P2P, BatchTensorSendRecvState -from nanotron.parallel.pipeline_parallel.state import ( +from fmengine.parallel.pipeline_parallel.p2p import P2P, BatchTensorSendRecvState +from fmengine.parallel.pipeline_parallel.state import ( PipelineBatchState, PipelineTrainBatchState, ) -from nanotron.parallel.pipeline_parallel.tensor_pointer import TensorPointer +from fmengine.parallel.pipeline_parallel.tensor_pointer import TensorPointer class PipelineBlock(nn.Module): diff --git a/src/nanotron/parallel/pipeline_parallel/context_manager.py b/src/fmengine/parallel/pipeline_parallel/context_manager.py similarity index 87% rename from src/nanotron/parallel/pipeline_parallel/context_manager.py rename to src/fmengine/parallel/pipeline_parallel/context_manager.py index 1eb1d654..8ceaa2b4 100644 --- a/src/nanotron/parallel/pipeline_parallel/context_manager.py +++ b/src/fmengine/parallel/pipeline_parallel/context_manager.py @@ -2,8 +2,8 @@ from torch import nn as torch_nn -from nanotron.parallel.pipeline_parallel.block import PipelineBlock -from nanotron.parallel.pipeline_parallel.state import PipelineBatchState +from 
fmengine.parallel.pipeline_parallel.block import PipelineBlock +from fmengine.parallel.pipeline_parallel.state import PipelineBatchState @contextmanager diff --git a/src/nanotron/parallel/pipeline_parallel/engine.py b/src/fmengine/parallel/pipeline_parallel/engine.py similarity index 96% rename from src/nanotron/parallel/pipeline_parallel/engine.py rename to src/fmengine/parallel/pipeline_parallel/engine.py index 14425ea2..24bdf3f3 100644 --- a/src/nanotron/parallel/pipeline_parallel/engine.py +++ b/src/fmengine/parallel/pipeline_parallel/engine.py @@ -5,18 +5,18 @@ from torch import nn as torch_nn from torch.nn.parallel import DistributedDataParallel -from nanotron import distributed as dist -from nanotron import logging -from nanotron.distributed import ProcessGroup -from nanotron.logging import log_rank -from nanotron.optim.gradient_accumulator import GradientAccumulator -from nanotron.parallel.data_parallel.utils import ddp_trigger_sync_in_bwd -from nanotron.parallel.pipeline_parallel.context_manager import ( +from fmengine import distributed as dist +from fmengine import logging +from fmengine.distributed import ProcessGroup +from fmengine.logging import log_rank +from fmengine.optim.gradient_accumulator import GradientAccumulator +from fmengine.parallel.data_parallel.utils import ddp_trigger_sync_in_bwd +from fmengine.parallel.pipeline_parallel.context_manager import ( attach_pipeline_state_to_model, ) -from nanotron.parallel.pipeline_parallel.state import PipelineTrainBatchState -from nanotron.parallel.pipeline_parallel.tensor_pointer import TensorPointer -from nanotron.utils import ContextManagers +from fmengine.parallel.pipeline_parallel.state import PipelineTrainBatchState +from fmengine.parallel.pipeline_parallel.tensor_pointer import TensorPointer +from fmengine.utils import ContextManagers logger = logging.get_logger(__name__) diff --git a/src/nanotron/parallel/pipeline_parallel/functional.py b/src/fmengine/parallel/pipeline_parallel/functional.py similarity index 96% rename from src/nanotron/parallel/pipeline_parallel/functional.py rename to src/fmengine/parallel/pipeline_parallel/functional.py index f43ff20b..159e6ada 100644 --- a/src/nanotron/parallel/pipeline_parallel/functional.py +++ b/src/fmengine/parallel/pipeline_parallel/functional.py @@ -1,8 +1,8 @@ import torch -from nanotron import logging -from nanotron.parallel.pipeline_parallel.p2p import P2P -from nanotron.parallel.pipeline_parallel.state import PipelineBatchState +from fmengine import logging +from fmengine.parallel.pipeline_parallel.p2p import P2P +from fmengine.parallel.pipeline_parallel.state import PipelineBatchState logger = logging.get_logger(__name__) diff --git a/src/nanotron/parallel/pipeline_parallel/p2p.py b/src/fmengine/parallel/pipeline_parallel/p2p.py similarity index 99% rename from src/nanotron/parallel/pipeline_parallel/p2p.py rename to src/fmengine/parallel/pipeline_parallel/p2p.py index fc9d1fa6..40620a3f 100644 --- a/src/nanotron/parallel/pipeline_parallel/p2p.py +++ b/src/fmengine/parallel/pipeline_parallel/p2p.py @@ -3,9 +3,9 @@ import torch -from nanotron import distributed as dist -from nanotron import logging -from nanotron.utils import get_untyped_storage, tensor_from_untyped_storage +from fmengine import distributed as dist +from fmengine import logging +from fmengine.utils import get_untyped_storage, tensor_from_untyped_storage logger = logging.get_logger(__name__) diff --git a/src/nanotron/parallel/pipeline_parallel/state.py b/src/fmengine/parallel/pipeline_parallel/state.py 
similarity index 98% rename from src/nanotron/parallel/pipeline_parallel/state.py rename to src/fmengine/parallel/pipeline_parallel/state.py index b722b276..81a182af 100644 --- a/src/nanotron/parallel/pipeline_parallel/state.py +++ b/src/fmengine/parallel/pipeline_parallel/state.py @@ -5,10 +5,10 @@ import torch -from nanotron import distributed as dist -from nanotron import logging -from nanotron.logging import log_rank -from nanotron.parallel.pipeline_parallel.p2p import P2P +from fmengine import distributed as dist +from fmengine import logging +from fmengine.logging import log_rank +from fmengine.parallel.pipeline_parallel.p2p import P2P logger = logging.get_logger(__name__) diff --git a/src/nanotron/parallel/pipeline_parallel/tensor_pointer.py b/src/fmengine/parallel/pipeline_parallel/tensor_pointer.py similarity index 100% rename from src/nanotron/parallel/pipeline_parallel/tensor_pointer.py rename to src/fmengine/parallel/pipeline_parallel/tensor_pointer.py diff --git a/src/nanotron/parallel/pipeline_parallel/utils.py b/src/fmengine/parallel/pipeline_parallel/utils.py similarity index 92% rename from src/nanotron/parallel/pipeline_parallel/utils.py rename to src/fmengine/parallel/pipeline_parallel/utils.py index eb4389d4..e97f6ed5 100644 --- a/src/nanotron/parallel/pipeline_parallel/utils.py +++ b/src/fmengine/parallel/pipeline_parallel/utils.py @@ -1,8 +1,8 @@ from torch import nn from torch.nn.parallel import DistributedDataParallel -from nanotron.models import NanotronModel -from nanotron.parallel.pipeline_parallel.block import PipelineBlock +from fmengine.models import NanotronModel +from fmengine.parallel.pipeline_parallel.block import PipelineBlock def get_input_output_pp_ranks(model: NanotronModel | DistributedDataParallel): diff --git a/src/nanotron/parallel/sharded_parameters.py b/src/fmengine/parallel/sharded_parameters.py similarity index 98% rename from src/nanotron/parallel/sharded_parameters.py rename to src/fmengine/parallel/sharded_parameters.py index 1abe398c..34cbe5d1 100644 --- a/src/nanotron/parallel/sharded_parameters.py +++ b/src/fmengine/parallel/sharded_parameters.py @@ -4,8 +4,8 @@ import numpy as np from torch import nn -from nanotron import distributed as dist -from nanotron.parallel.parameters import NanotronParameter, SlicesPair +from fmengine import distributed as dist +from fmengine.parallel.parameters import NanotronParameter, SlicesPair @dataclasses.dataclass diff --git a/src/nanotron/parallel/tensor_parallel/__init__.py b/src/fmengine/parallel/tensor_parallel/__init__.py similarity index 100% rename from src/nanotron/parallel/tensor_parallel/__init__.py rename to src/fmengine/parallel/tensor_parallel/__init__.py diff --git a/src/nanotron/parallel/tensor_parallel/distributed_differentiable_primitives.py b/src/fmengine/parallel/tensor_parallel/distributed_differentiable_primitives.py similarity index 98% rename from src/nanotron/parallel/tensor_parallel/distributed_differentiable_primitives.py rename to src/fmengine/parallel/tensor_parallel/distributed_differentiable_primitives.py index dabaed72..b6b7d9bb 100644 --- a/src/nanotron/parallel/tensor_parallel/distributed_differentiable_primitives.py +++ b/src/fmengine/parallel/tensor_parallel/distributed_differentiable_primitives.py @@ -17,8 +17,8 @@ import torch from torch import distributed as torch_dist -from nanotron import distributed as dist -from nanotron.distributed import ProcessGroup +from fmengine import distributed as dist +from fmengine.distributed import ProcessGroup class 
DifferentiableIdentity(torch.autograd.Function): diff --git a/src/nanotron/parallel/tensor_parallel/enum.py b/src/fmengine/parallel/tensor_parallel/enum.py similarity index 100% rename from src/nanotron/parallel/tensor_parallel/enum.py rename to src/fmengine/parallel/tensor_parallel/enum.py diff --git a/src/nanotron/parallel/tensor_parallel/functional.py b/src/fmengine/parallel/tensor_parallel/functional.py similarity index 98% rename from src/nanotron/parallel/tensor_parallel/functional.py rename to src/fmengine/parallel/tensor_parallel/functional.py index 5b643b2e..b661c39c 100644 --- a/src/nanotron/parallel/tensor_parallel/functional.py +++ b/src/fmengine/parallel/tensor_parallel/functional.py @@ -18,15 +18,15 @@ import torch from torch.nn import functional as F -import nanotron.distributed as dist -from nanotron.parallel.tensor_parallel.distributed_differentiable_primitives import ( +import fmengine.distributed as dist +from fmengine.parallel.tensor_parallel.distributed_differentiable_primitives import ( differentiable_all_gather, differentiable_all_reduce_sum, differentiable_identity, differentiable_reduce_scatter_sum, ) -from nanotron.parallel.tensor_parallel.enum import TensorParallelLinearMode -from nanotron.parallel.utils import assert_cuda_max_connections_set_to_1 +from fmengine.parallel.tensor_parallel.enum import TensorParallelLinearMode +from fmengine.parallel.utils import assert_cuda_max_connections_set_to_1 class _ShardedCrossEntropy(torch.autograd.Function): diff --git a/src/nanotron/parallel/tensor_parallel/nn.py b/src/fmengine/parallel/tensor_parallel/nn.py similarity index 95% rename from src/nanotron/parallel/tensor_parallel/nn.py rename to src/fmengine/parallel/tensor_parallel/nn.py index 6f22be1e..6f85a8a4 100644 --- a/src/nanotron/parallel/tensor_parallel/nn.py +++ b/src/fmengine/parallel/tensor_parallel/nn.py @@ -17,23 +17,23 @@ import torch from torch import nn -from nanotron import distributed as dist -from nanotron.distributed import get_global_rank -from nanotron.parallel.parameters import NanotronParameter -from nanotron.parallel.sharded_parameters import ( +from fmengine import distributed as dist +from fmengine.distributed import get_global_rank +from fmengine.parallel.parameters import NanotronParameter +from fmengine.parallel.sharded_parameters import ( SplitConfig, create_sharded_parameter_from_config, mark_all_parameters_in_module_as_sharded, ) -from nanotron.parallel.tensor_parallel.distributed_differentiable_primitives import ( +from fmengine.parallel.tensor_parallel.distributed_differentiable_primitives import ( differentiable_all_gather, differentiable_all_reduce_sum, differentiable_identity, differentiable_reduce_scatter_sum, ) -from nanotron.parallel.tensor_parallel.enum import TensorParallelLinearMode -from nanotron.parallel.tensor_parallel.functional import column_linear, row_linear -from nanotron.parallel.tied_parameters import create_tied_parameter +from fmengine.parallel.tensor_parallel.enum import TensorParallelLinearMode +from fmengine.parallel.tensor_parallel.functional import column_linear, row_linear +from fmengine.parallel.tied_parameters import create_tied_parameter class TensorParallelColumnLinear(nn.Linear): def __init__( diff --git a/src/nanotron/parallel/tied_parameters.py b/src/fmengine/parallel/tied_parameters.py similarity index 95% rename from src/nanotron/parallel/tied_parameters.py rename to src/fmengine/parallel/tied_parameters.py index 70c2b4ea..aa334bf1 100644 --- a/src/nanotron/parallel/tied_parameters.py +++ 
b/src/fmengine/parallel/tied_parameters.py @@ -3,13 +3,13 @@ from torch import nn -from nanotron import distributed as dist -from nanotron import logging -from nanotron.logging import log_rank -from nanotron.optim.gradient_accumulator import GradientAccumulator -from nanotron.parallel import ParallelContext -from nanotron.parallel.parameters import NanotronParameter -from nanotron.utils import get_parameter_and_parent_module +from fmengine import distributed as dist +from fmengine import logging +from fmengine.logging import log_rank +from fmengine.optim.gradient_accumulator import GradientAccumulator +from fmengine.parallel import ParallelContext +from fmengine.parallel.parameters import NanotronParameter +from fmengine.utils import get_parameter_and_parent_module logger = logging.get_logger(__name__) diff --git a/src/nanotron/parallel/utils.py b/src/fmengine/parallel/utils.py similarity index 87% rename from src/nanotron/parallel/utils.py rename to src/fmengine/parallel/utils.py index 91c07035..cdba9a54 100644 --- a/src/nanotron/parallel/utils.py +++ b/src/fmengine/parallel/utils.py @@ -3,9 +3,9 @@ from torch import nn -from nanotron import distributed as dist -from nanotron.parallel import ParallelContext -from nanotron.parallel.tied_parameters import get_tied_id_to_param +from fmengine import distributed as dist +from fmengine.parallel import ParallelContext +from fmengine.parallel.tied_parameters import get_tied_id_to_param def assert_cuda_max_connections_set_to_1(func): diff --git a/src/nanotron/random.py b/src/fmengine/random.py similarity index 98% rename from src/nanotron/random.py rename to src/fmengine/random.py index 383907da..d97535e8 100644 --- a/src/nanotron/random.py +++ b/src/fmengine/random.py @@ -6,8 +6,8 @@ import numpy as np import torch -from nanotron import distributed as dist -from nanotron.distributed import ProcessGroup +from fmengine import distributed as dist +from fmengine.distributed import ProcessGroup @dataclass diff --git a/src/nanotron/sanity_checks.py b/src/fmengine/sanity_checks.py similarity index 96% rename from src/nanotron/sanity_checks.py rename to src/fmengine/sanity_checks.py index dbfa0939..e9abd7af 100644 --- a/src/nanotron/sanity_checks.py +++ b/src/fmengine/sanity_checks.py @@ -3,14 +3,14 @@ import torch -from nanotron import distributed as dist -from nanotron import logging, optim -from nanotron.config import Config -from nanotron.logging import get_logger, log_rank -from nanotron.models import NanotronModel -from nanotron.optim.gradient_accumulator import GradientAccumulator -from nanotron.parallel import ParallelContext -from nanotron.parallel.tied_parameters import get_tied_id_to_param +from fmengine import distributed as dist +from fmengine import logging, optim +from fmengine.config import Config +from fmengine.logging import get_logger, log_rank +from fmengine.models import NanotronModel +from fmengine.optim.gradient_accumulator import GradientAccumulator +from fmengine.parallel import ParallelContext +from fmengine.parallel.tied_parameters import get_tied_id_to_param logger = get_logger(__name__) diff --git a/src/fmengine/serialize/__init__.py b/src/fmengine/serialize/__init__.py new file mode 100644 index 00000000..1a1049d9 --- /dev/null +++ b/src/fmengine/serialize/__init__.py @@ -0,0 +1,4 @@ +# flake8: noqa +from fmengine.serialize.main import * +from fmengine.serialize.optimizer import * +from fmengine.serialize.random import * diff --git a/src/nanotron/serialize/main.py b/src/fmengine/serialize/main.py similarity index 95% rename 
from src/nanotron/serialize/main.py rename to src/fmengine/serialize/main.py index c57365ad..05497b42 100644 --- a/src/nanotron/serialize/main.py +++ b/src/fmengine/serialize/main.py @@ -5,26 +5,26 @@ from torch import nn from torch.nn.parallel import DistributedDataParallel -from nanotron import distributed as dist -from nanotron import logging -from nanotron import optim as optim -from nanotron.config import Config -from nanotron.distributed import get_global_rank -from nanotron.logging import log_rank -from nanotron.parallel import ParallelContext -from nanotron.parallel.parameters import NanotronParameter -from nanotron.sanity_checks import ( +from fmengine import distributed as dist +from fmengine import logging +from fmengine import optim as optim +from fmengine.config import Config +from fmengine.distributed import get_global_rank +from fmengine.logging import log_rank +from fmengine.parallel import ParallelContext +from fmengine.parallel.parameters import NanotronParameter +from fmengine.sanity_checks import ( assert_tensor_synced_across_pg, check_optim_state_in_sync, ) -from nanotron.serialize.metadata import CheckpointMetadata, load_meta, save_meta -from nanotron.serialize.optimizer import ( +from fmengine.serialize.metadata import CheckpointMetadata, load_meta, save_meta +from fmengine.serialize.optimizer import ( load_lr_scheduler, load_optimizer, save_lr_scheduler, save_optimizer, ) -from nanotron.serialize.weights import load_weights, save_weights +from fmengine.serialize.weights import load_weights, save_weights """ We're going to use safetensors. The reason is that loading segments is going to be much easier diff --git a/src/nanotron/serialize/metadata.py b/src/fmengine/serialize/metadata.py similarity index 95% rename from src/nanotron/serialize/metadata.py rename to src/fmengine/serialize/metadata.py index 3673cd50..82d66e3c 100644 --- a/src/nanotron/serialize/metadata.py +++ b/src/fmengine/serialize/metadata.py @@ -8,10 +8,10 @@ from dacite import from_dict from packaging.version import Version -from nanotron import distributed as dist -from nanotron.constants import CHECKPOINT_VERSION -from nanotron.parallel import ParallelContext -from nanotron.parallel.parameters import SlicesPair +from fmengine import distributed as dist +from fmengine.constants import CHECKPOINT_VERSION +from fmengine.parallel import ParallelContext +from fmengine.parallel.parameters import SlicesPair @dataclasses.dataclass diff --git a/src/nanotron/serialize/optimizer.py b/src/fmengine/serialize/optimizer.py similarity index 97% rename from src/nanotron/serialize/optimizer.py rename to src/fmengine/serialize/optimizer.py index ffc2ab5b..c15f5326 100644 --- a/src/nanotron/serialize/optimizer.py +++ b/src/fmengine/serialize/optimizer.py @@ -6,20 +6,20 @@ from torch import nn from tqdm import tqdm -from nanotron import distributed as dist -from nanotron import optim -from nanotron.optim.zero import ( +from fmengine import distributed as dist +from fmengine import optim +from fmengine.optim.zero import ( ZeroDistributedOptimizer, extract_parallel_ranks_from_shard_path, find_optim_index_from_param_name, get_sliced_tensor, merge_dp_shard_in_zero1_optimizer, ) -from nanotron.parallel import ParallelContext -from nanotron.parallel.parameters import NanotronParameter -from nanotron.sanity_checks import check_optim_state_in_sync -from nanotron.serialize.metadata import TensorMetadata -from nanotron.serialize.utils import ObjectType, merge_and_shard_tp_tensors +from fmengine.parallel import ParallelContext 
+from fmengine.parallel.parameters import NanotronParameter +from fmengine.sanity_checks import check_optim_state_in_sync +from fmengine.serialize.metadata import TensorMetadata +from fmengine.serialize.utils import ObjectType, merge_and_shard_tp_tensors # TODO(xrsrke): take rank instead of parallel_context diff --git a/src/nanotron/serialize/random.py b/src/fmengine/serialize/random.py similarity index 91% rename from src/nanotron/serialize/random.py rename to src/fmengine/serialize/random.py index 79046248..27433afe 100644 --- a/src/nanotron/serialize/random.py +++ b/src/fmengine/serialize/random.py @@ -2,9 +2,9 @@ import torch -from nanotron import distributed as dist -from nanotron.parallel import ParallelContext -from nanotron.random import RandomStates +from fmengine import distributed as dist +from fmengine.parallel import ParallelContext +from fmengine.random import RandomStates def save_random_states( diff --git a/src/nanotron/serialize/utils.py b/src/fmengine/serialize/utils.py similarity index 93% rename from src/nanotron/serialize/utils.py rename to src/fmengine/serialize/utils.py index 335d75ff..22d33013 100644 --- a/src/nanotron/serialize/utils.py +++ b/src/fmengine/serialize/utils.py @@ -5,9 +5,9 @@ import torch -from nanotron.parallel import ParallelContext -from nanotron.parallel.parameters import SlicesPair -from nanotron.serialize.metadata import TensorMetadata +from fmengine.parallel import ParallelContext +from fmengine.parallel.parameters import SlicesPair +from fmengine.serialize.metadata import TensorMetadata class ObjectType(Enum): diff --git a/src/nanotron/serialize/weights.py b/src/fmengine/serialize/weights.py similarity index 97% rename from src/nanotron/serialize/weights.py rename to src/fmengine/serialize/weights.py index 1040d2a3..947b3f59 100644 --- a/src/nanotron/serialize/weights.py +++ b/src/fmengine/serialize/weights.py @@ -8,15 +8,15 @@ from torch import nn from tqdm import tqdm -from nanotron import distributed as dist -from nanotron import logging -from nanotron.constants import CHECKPOINT_VERSION -from nanotron.distributed import get_global_rank -from nanotron.logging import log_rank -from nanotron.parallel import ParallelContext -from nanotron.parallel.parameters import NanotronParameter, ShardedInfo, SlicesPair -from nanotron.serialize.metadata import CheckpointMetadata, TensorMetadata, load_meta -from nanotron.serialize.utils import ( +from fmengine import distributed as dist +from fmengine import logging +from fmengine.constants import CHECKPOINT_VERSION +from fmengine.distributed import get_global_rank +from fmengine.logging import log_rank +from fmengine.parallel import ParallelContext +from fmengine.parallel.parameters import NanotronParameter, ShardedInfo, SlicesPair +from fmengine.serialize.metadata import CheckpointMetadata, TensorMetadata, load_meta +from fmengine.serialize.utils import ( ObjectType, extract_tp_pp_rank_from_shard_path, get_path, diff --git a/src/nanotron/trainer.py b/src/fmengine/trainer.py similarity index 96% rename from src/nanotron/trainer.py rename to src/fmengine/trainer.py index 4cbccc63..c8754bec 100644 --- a/src/nanotron/trainer.py +++ b/src/fmengine/trainer.py @@ -21,17 +21,17 @@ import torch from torch.nn.parallel import DistributedDataParallel -from nanotron import distributed as dist -from nanotron import logging -from nanotron.config import ( +from fmengine import distributed as dist +from fmengine import logging +from fmengine.config import ( Config, ExistingCheckpointInit, ParallelismArgs, RandomInit, 
get_config_from_file, ) -from nanotron.dataloader.dataloader import sanity_check_dataloader -from nanotron.helpers import ( +from fmengine.dataloader.dataloader import sanity_check_dataloader +from fmengine.helpers import ( _vocab_size_with_padding, get_profiler, init_optimizer_and_grad_accumulator, @@ -39,7 +39,7 @@ log_throughput, lr_scheduler_builder, ) -from nanotron.logging import ( +from fmengine.logging import ( LoggerWriter, LogItem, human_format, @@ -47,36 +47,36 @@ log_rank, set_logger_verbosity_format, ) -from nanotron.models import NanotronModel, build_model -from nanotron.models.base import check_model_has_grad -from nanotron.models.llama import LlamaForTraining, RotaryEmbedding -from nanotron.models.starcoder2 import Starcoder2ForTraining -from nanotron.models.mistral import MistralForTraining -from nanotron.optim.clip_grads import clip_grad_norm -from nanotron.parallel import ParallelContext -from nanotron.parallel.data_parallel.utils import sync_gradients_across_dp -from nanotron.parallel.parameters import NanotronParameter, sanity_check -from nanotron.parallel.pipeline_parallel.engine import PipelineEngine -from nanotron.parallel.pipeline_parallel.tensor_pointer import TensorPointer -from nanotron.parallel.pipeline_parallel.utils import get_pp_rank_of -from nanotron.parallel.tensor_parallel.nn import ( +from fmengine.models import NanotronModel, build_model +from fmengine.models.base import check_model_has_grad +from fmengine.models.llama import LlamaForTraining, RotaryEmbedding +from fmengine.models.starcoder2 import Starcoder2ForTraining +from fmengine.models.mistral import MistralForTraining +from fmengine.optim.clip_grads import clip_grad_norm +from fmengine.parallel import ParallelContext +from fmengine.parallel.data_parallel.utils import sync_gradients_across_dp +from fmengine.parallel.parameters import NanotronParameter, sanity_check +from fmengine.parallel.pipeline_parallel.engine import PipelineEngine +from fmengine.parallel.pipeline_parallel.tensor_pointer import TensorPointer +from fmengine.parallel.pipeline_parallel.utils import get_pp_rank_of +from fmengine.parallel.tensor_parallel.nn import ( TensorParallelLinearMode, TensorParallelRowLinear, ) -from nanotron.parallel.tied_parameters import ( +from fmengine.parallel.tied_parameters import ( create_pg_for_tied_weights, get_tied_id_to_param, sync_tied_weights_gradients, tie_parameters, ) -from nanotron.random import set_random_seed -from nanotron.sanity_checks import ( +from fmengine.random import set_random_seed +from fmengine.sanity_checks import ( after_optim_step_sanity_checks, after_tbi_sanity_checks, before_optim_step_sanity_checks, before_tbi_sanity_checks, ) -from nanotron.serialize import ( +from fmengine.serialize import ( load_lr_scheduler, load_meta, load_optimizer, @@ -85,7 +85,7 @@ save, save_random_states, ) -from nanotron.utils import init_method_normal, scaled_init_method_normal +from fmengine.utils import init_method_normal, scaled_init_method_normal logger = logging.get_logger(__name__) diff --git a/src/nanotron/utils.py b/src/fmengine/utils.py similarity index 99% rename from src/nanotron/utils.py rename to src/fmengine/utils.py index d92a7e24..440bd80b 100644 --- a/src/nanotron/utils.py +++ b/src/fmengine/utils.py @@ -12,7 +12,7 @@ from torch import nn from torch.utils.checkpoint import checkpoint from datasets import Dataset, IterableDataset -from nanotron import distributed as dist +from fmengine import distributed as dist class ContextManagers: diff --git 
a/src/nanotron/config/__init__.py b/src/nanotron/config/__init__.py deleted file mode 100644 index 93bcde49..00000000 --- a/src/nanotron/config/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -# flake8: noqa -from nanotron.config.config import * -from nanotron.config.lighteval_config import * -from nanotron.config.models_config import * -from nanotron.config.utils_config import * diff --git a/src/nanotron/optim/__init__.py b/src/nanotron/optim/__init__.py deleted file mode 100644 index 74723e1d..00000000 --- a/src/nanotron/optim/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -from nanotron.optim.base import BaseOptimizer -from nanotron.optim.inherit_from_other_optimizer import InheritFromOtherOptimizer -from nanotron.optim.named_optimizer import NamedOptimizer -from nanotron.optim.optimizer_from_gradient_accumulator import ( - OptimizerFromGradientAccumulator, -) -from nanotron.optim.zero import ZeroDistributedOptimizer - -__all__ = [ - "BaseOptimizer", - "InheritFromOtherOptimizer", - "NamedOptimizer", - "OptimizerFromGradientAccumulator", - "ZeroDistributedOptimizer", -] diff --git a/src/nanotron/parallel/__init__.py b/src/nanotron/parallel/__init__.py deleted file mode 100644 index 8a704cf4..00000000 --- a/src/nanotron/parallel/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -# flake8: noqa -from nanotron.parallel.context import ParallelContext diff --git a/src/nanotron/serialize/__init__.py b/src/nanotron/serialize/__init__.py deleted file mode 100644 index ae6ef264..00000000 --- a/src/nanotron/serialize/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# flake8: noqa -from nanotron.serialize.main import * -from nanotron.serialize.optimizer import * -from nanotron.serialize.random import * diff --git a/tests/fp8/test_fp8_parameter.py b/tests/fp8/test_fp8_parameter.py index bdf5f564..9c77b0b9 100644 --- a/tests/fp8/test_fp8_parameter.py +++ b/tests/fp8/test_fp8_parameter.py @@ -1,7 +1,7 @@ import torch -from nanotron.fp8 import DTypes, FP8Parameter, FP8Tensor -from nanotron.fp8.meta import FP8Meta +from fmengine.fp8 import DTypes, FP8Parameter, FP8Tensor +from fmengine.fp8.meta import FP8Meta def test_create_fp8_parameter(): diff --git a/tests/fp8/test_linear.py b/tests/fp8/test_linear.py index 61584284..ac8e0ffd 100644 --- a/tests/fp8/test_linear.py +++ b/tests/fp8/test_linear.py @@ -3,7 +3,7 @@ from torch import nn from torch.optim import Adam -from nanotron.fp8 import DTypes, FP8Linear, FP8Parameter, FP8Tensor +from fmengine.fp8 import DTypes, FP8Linear, FP8Parameter, FP8Tensor @pytest.mark.parametrize("is_bias", [True, False]) diff --git a/tests/fp8/test_tensor.py b/tests/fp8/test_tensor.py index 3e4e6b2c..aa5c3174 100644 --- a/tests/fp8/test_tensor.py +++ b/tests/fp8/test_tensor.py @@ -6,9 +6,9 @@ import transformer_engine as te # noqa import transformer_engine_extensions as tex -from nanotron.fp8 import DTypes, FP8Tensor -from nanotron.fp8.meta import FP8Meta -from nanotron.fp8.tensor import convert_tensor_from_fp8 +from fmengine.fp8 import DTypes, FP8Tensor +from fmengine.fp8.meta import FP8Meta +from fmengine.fp8.tensor import convert_tensor_from_fp8 @pytest.mark.parametrize("size", [4, 8, 16, 64]) diff --git a/tests/helpers/distributed_tensor.py b/tests/helpers/distributed_tensor.py index e5a48d9c..4590193f 100644 --- a/tests/helpers/distributed_tensor.py +++ b/tests/helpers/distributed_tensor.py @@ -1,7 +1,7 @@ import torch -from nanotron import distributed as dist -from nanotron.distributed import ProcessGroup, get_global_rank +from fmengine import distributed as dist +from fmengine.distributed 
import ProcessGroup, get_global_rank def assert_tensor_equal_over_group( diff --git a/tests/helpers/dummy.py b/tests/helpers/dummy.py index 76f53949..959439c3 100644 --- a/tests/helpers/dummy.py +++ b/tests/helpers/dummy.py @@ -5,17 +5,17 @@ from torch import nn from torch.nn.parallel import DistributedDataParallel -from nanotron import distributed as dist -from nanotron.models import init_on_device_and_dtype -from nanotron.optim.base import BaseOptimizer -from nanotron.optim.named_optimizer import NamedOptimizer -from nanotron.parallel import ParallelContext -from nanotron.parallel.parameters import NanotronParameter -from nanotron.parallel.pipeline_parallel.block import PipelineBlock -from nanotron.parallel.pipeline_parallel.p2p import P2P -from nanotron.parallel.pipeline_parallel.tensor_pointer import TensorPointer -from nanotron.parallel.tied_parameters import tie_parameters -from nanotron.parallel.utils import initial_sync +from fmengine import distributed as dist +from fmengine.models import init_on_device_and_dtype +from fmengine.optim.base import BaseOptimizer +from fmengine.optim.named_optimizer import NamedOptimizer +from fmengine.parallel import ParallelContext +from fmengine.parallel.parameters import NanotronParameter +from fmengine.parallel.pipeline_parallel.block import PipelineBlock +from fmengine.parallel.pipeline_parallel.p2p import P2P +from fmengine.parallel.pipeline_parallel.tensor_pointer import TensorPointer +from fmengine.parallel.tied_parameters import tie_parameters +from fmengine.parallel.utils import initial_sync class DummyModel(nn.Module): diff --git a/tests/helpers/exception.py b/tests/helpers/exception.py index cd439b4a..315972bc 100644 --- a/tests/helpers/exception.py +++ b/tests/helpers/exception.py @@ -2,7 +2,7 @@ import signal from typing import Optional -from nanotron import distributed as dist +from fmengine import distributed as dist @contextlib.contextmanager diff --git a/tests/helpers/utils.py b/tests/helpers/utils.py index 343739b4..ffbf2bf5 100644 --- a/tests/helpers/utils.py +++ b/tests/helpers/utils.py @@ -8,7 +8,7 @@ import torch.multiprocessing as mp from packaging import version -from nanotron.parallel import ParallelContext +from fmengine.parallel import ParallelContext def available_gpus(): @@ -288,7 +288,7 @@ def setup_dist_env(rank, world_size, port): def init_distributed(tp: int, dp: int, pp: int): def _init_distributed(func): def wrapper(**kwargs): - from nanotron.utils import find_free_port + from fmengine.utils import find_free_port world_size = tp * pp * dp port = find_free_port() diff --git a/tests/kernels/run_layer_norm_convergence.py b/tests/kernels/run_layer_norm_convergence.py index 184d7cec..0c20a153 100644 --- a/tests/kernels/run_layer_norm_convergence.py +++ b/tests/kernels/run_layer_norm_convergence.py @@ -1,8 +1,8 @@ import torch from torch.nn import LayerNorm -from nanotron.logging import LoggerWriter -from nanotron.nn.layer_norm import TritonLayerNorm +from fmengine.logging import LoggerWriter +from fmengine.nn.layer_norm import TritonLayerNorm def get_time_name(): diff --git a/tests/kernels/test_layer_norm.py b/tests/kernels/test_layer_norm.py index 86a29153..f7b8e30e 100644 --- a/tests/kernels/test_layer_norm.py +++ b/tests/kernels/test_layer_norm.py @@ -2,7 +2,7 @@ import torch from torch.nn import LayerNorm -from nanotron.nn.layer_norm import TritonLayerNorm +from fmengine.nn.layer_norm import TritonLayerNorm @pytest.mark.fa2 diff --git a/tests/test_checkpointing.py b/tests/test_checkpointing.py index 
d4e66e28..70419d83 100644 --- a/tests/test_checkpointing.py +++ b/tests/test_checkpointing.py @@ -3,8 +3,8 @@ import torch from torch import nn -from nanotron.parallel.pipeline_parallel.tensor_pointer import TensorPointer -from nanotron.utils import checkpoint_method +from fmengine.parallel.pipeline_parallel.tensor_pointer import TensorPointer +from fmengine.utils import checkpoint_method class CheckpointedModel(nn.Module): diff --git a/tests/test_clip_grads.py b/tests/test_clip_grads.py index 7a27c852..308031b6 100644 --- a/tests/test_clip_grads.py +++ b/tests/test_clip_grads.py @@ -7,24 +7,24 @@ from helpers.utils import available_gpus, init_distributed, rerun_if_address_is_in_use from torch import nn -from nanotron import distributed as dist -from nanotron.models import init_on_device_and_dtype -from nanotron.optim.clip_grads import clip_grad_norm -from nanotron.optim.gradient_accumulator import FP32GradientAccumulator -from nanotron.parallel import ParallelContext -from nanotron.parallel.parameters import NanotronParameter, sanity_check -from nanotron.parallel.pipeline_parallel.engine import ( +from fmengine import distributed as dist +from fmengine.models import init_on_device_and_dtype +from fmengine.optim.clip_grads import clip_grad_norm +from fmengine.optim.gradient_accumulator import FP32GradientAccumulator +from fmengine.parallel import ParallelContext +from fmengine.parallel.parameters import NanotronParameter, sanity_check +from fmengine.parallel.pipeline_parallel.engine import ( AllForwardAllBackwardPipelineEngine, ) -from nanotron.parallel.pipeline_parallel.p2p import P2P -from nanotron.parallel.tensor_parallel.enum import TensorParallelLinearMode -from nanotron.parallel.tensor_parallel.nn import TensorParallelColumnLinear -from nanotron.parallel.tied_parameters import ( +from fmengine.parallel.pipeline_parallel.p2p import P2P +from fmengine.parallel.tensor_parallel.enum import TensorParallelLinearMode +from fmengine.parallel.tensor_parallel.nn import TensorParallelColumnLinear +from fmengine.parallel.tied_parameters import ( sync_tied_weights_gradients, tie_parameters, ) -from nanotron.parallel.utils import initial_sync -from nanotron.sanity_checks import assert_tensor_synced_across_pg +from fmengine.parallel.utils import initial_sync +from fmengine.sanity_checks import assert_tensor_synced_across_pg @pytest.mark.skipif( diff --git a/tests/test_data_parallel.py b/tests/test_data_parallel.py index b29acc0c..4c0450a7 100644 --- a/tests/test_data_parallel.py +++ b/tests/test_data_parallel.py @@ -7,11 +7,11 @@ from torch import nn from torch.distributed import GradBucket -from nanotron import distributed as dist -from nanotron.parallel import ParallelContext -from nanotron.parallel.data_parallel.utils import ddp_trigger_sync_in_bwd -from nanotron.parallel.parameters import NanotronParameter -from nanotron.sanity_checks import assert_tensor_synced_across_pg +from fmengine import distributed as dist +from fmengine.parallel import ParallelContext +from fmengine.parallel.data_parallel.utils import ddp_trigger_sync_in_bwd +from fmengine.parallel.parameters import NanotronParameter +from fmengine.sanity_checks import assert_tensor_synced_across_pg @pytest.mark.skipif( diff --git a/tests/test_distributed.py b/tests/test_distributed.py index b3d55513..ac7f17ea 100644 --- a/tests/test_distributed.py +++ b/tests/test_distributed.py @@ -9,7 +9,7 @@ ) from torch.distributed import ProcessGroup -from nanotron.parallel import ParallelContext +from fmengine.parallel import ParallelContext def 
_test_init_parallel_context(parallel_context: ParallelContext): diff --git a/tests/test_p2p.py b/tests/test_p2p.py index c155a161..3d106279 100644 --- a/tests/test_p2p.py +++ b/tests/test_p2p.py @@ -5,9 +5,9 @@ from helpers.exception import assert_fail_with from helpers.utils import available_gpus, init_distributed, rerun_if_address_is_in_use -from nanotron import distributed as dist -from nanotron.parallel import ParallelContext -from nanotron.parallel.pipeline_parallel.p2p import P2P +from fmengine import distributed as dist +from fmengine.parallel import ParallelContext +from fmengine.parallel.pipeline_parallel.p2p import P2P @pytest.mark.skipif( diff --git a/tests/test_parameter.py b/tests/test_parameter.py index 16fdaea5..1082b867 100644 --- a/tests/test_parameter.py +++ b/tests/test_parameter.py @@ -2,8 +2,8 @@ from helpers.exception import assert_fail_with from torch import nn -from nanotron.models.base import DTypeInvariantTensor, init_on_device_and_dtype -from nanotron.parallel.parameters import NanotronParameter +from fmengine.models.base import DTypeInvariantTensor, init_on_device_and_dtype +from fmengine.parallel.parameters import NanotronParameter def test_nanotron_parameter_does_not_override_some_parameter_variable(): diff --git a/tests/test_parameters_accumulate_gradient_in_fp32.py b/tests/test_parameters_accumulate_gradient_in_fp32.py index 4f405e11..b3118d53 100644 --- a/tests/test_parameters_accumulate_gradient_in_fp32.py +++ b/tests/test_parameters_accumulate_gradient_in_fp32.py @@ -7,35 +7,35 @@ from helpers.utils import available_gpus, init_distributed, rerun_if_address_is_in_use from torch import nn -import nanotron.distributed as dist -from nanotron.models import init_on_device_and_dtype -from nanotron.optim import ZeroDistributedOptimizer -from nanotron.optim.gradient_accumulator import ( +import fmengine.distributed as dist +from fmengine.models import init_on_device_and_dtype +from fmengine.optim import ZeroDistributedOptimizer +from fmengine.optim.gradient_accumulator import ( FP32GradBucketManager, FP32GradientAccumulator, get_fp32_accum_hook, ) -from nanotron.optim.named_optimizer import NamedOptimizer -from nanotron.optim.optimizer_from_gradient_accumulator import ( +from fmengine.optim.named_optimizer import NamedOptimizer +from fmengine.optim.optimizer_from_gradient_accumulator import ( OptimizerFromGradientAccumulator, ) -from nanotron.parallel import ParallelContext -from nanotron.parallel.parameters import NanotronParameter, sanity_check -from nanotron.parallel.pipeline_parallel.engine import ( +from fmengine.parallel import ParallelContext +from fmengine.parallel.parameters import NanotronParameter, sanity_check +from fmengine.parallel.pipeline_parallel.engine import ( AllForwardAllBackwardPipelineEngine, OneForwardOneBackwardPipelineEngine, PipelineEngine, ) -from nanotron.parallel.pipeline_parallel.p2p import P2P -from nanotron.parallel.pipeline_parallel.utils import get_pp_rank_of -from nanotron.parallel.tied_parameters import ( +from fmengine.parallel.pipeline_parallel.p2p import P2P +from fmengine.parallel.pipeline_parallel.utils import get_pp_rank_of +from fmengine.parallel.tied_parameters import ( get_tied_id_to_param, sync_tied_weights_gradients, tie_parameters, ) -from nanotron.parallel.utils import initial_sync -from nanotron.sanity_checks import assert_tensor_synced_across_pg -from nanotron.utils import ContextManagers +from fmengine.parallel.utils import initial_sync +from fmengine.sanity_checks import assert_tensor_synced_across_pg +from 
fmengine.utils import ContextManagers @pytest.mark.parametrize("half_precision", [torch.float16, torch.bfloat16]) diff --git a/tests/test_pipeline_parallel.py b/tests/test_pipeline_parallel.py index ab149d4e..308f8083 100644 --- a/tests/test_pipeline_parallel.py +++ b/tests/test_pipeline_parallel.py @@ -7,17 +7,17 @@ from torch import nn from torch.nn import functional as F -from nanotron import distributed as dist -from nanotron.models import init_on_device_and_dtype -from nanotron.parallel import ParallelContext -from nanotron.parallel.pipeline_parallel.block import PipelineBlock -from nanotron.parallel.pipeline_parallel.engine import ( +from fmengine import distributed as dist +from fmengine.models import init_on_device_and_dtype +from fmengine.parallel import ParallelContext +from fmengine.parallel.pipeline_parallel.block import PipelineBlock +from fmengine.parallel.pipeline_parallel.engine import ( AllForwardAllBackwardPipelineEngine, OneForwardOneBackwardPipelineEngine, PipelineEngine, ) -from nanotron.parallel.pipeline_parallel.p2p import P2P -from nanotron.parallel.pipeline_parallel.tensor_pointer import TensorPointer +from fmengine.parallel.pipeline_parallel.p2p import P2P +from fmengine.parallel.pipeline_parallel.tensor_pointer import TensorPointer @pytest.mark.skipif( diff --git a/tests/test_random_state.py b/tests/test_random_state.py index 656fe8c0..09ba6ebd 100644 --- a/tests/test_random_state.py +++ b/tests/test_random_state.py @@ -2,9 +2,9 @@ import torch from helpers.utils import available_gpus, init_distributed, rerun_if_address_is_in_use -from nanotron import distributed as dist -from nanotron.parallel import ParallelContext -from nanotron.random import ( +from fmengine import distributed as dist +from fmengine.parallel import ParallelContext +from fmengine.random import ( RandomStates, branch_random_state, get_current_random_state, diff --git a/tests/test_serialize.py b/tests/test_serialize.py index 2d307833..d88e2cea 100644 --- a/tests/test_serialize.py +++ b/tests/test_serialize.py @@ -11,29 +11,29 @@ ) from torch.nn.parallel import DistributedDataParallel -from nanotron import distributed as dist -from nanotron.constants import CHECKPOINT_VERSION -from nanotron.optim.gradient_accumulator import FP32GradientAccumulator -from nanotron.optim.named_optimizer import NamedOptimizer -from nanotron.optim.optimizer_from_gradient_accumulator import ( +from fmengine import distributed as dist +from fmengine.constants import CHECKPOINT_VERSION +from fmengine.optim.gradient_accumulator import FP32GradientAccumulator +from fmengine.optim.named_optimizer import NamedOptimizer +from fmengine.optim.optimizer_from_gradient_accumulator import ( OptimizerFromGradientAccumulator, ) -from nanotron.optim.zero import ZeroDistributedOptimizer -from nanotron.parallel import ParallelContext -from nanotron.parallel.pipeline_parallel.engine import ( +from fmengine.optim.zero import ZeroDistributedOptimizer +from fmengine.parallel import ParallelContext +from fmengine.parallel.pipeline_parallel.engine import ( AllForwardAllBackwardPipelineEngine, ) -from nanotron.parallel.sharded_parameters import ( +from fmengine.parallel.sharded_parameters import ( SplitConfig, create_sharded_parameter_from_config, ) -from nanotron.parallel.tied_parameters import sync_tied_weights_gradients -from nanotron.random import ( +from fmengine.parallel.tied_parameters import sync_tied_weights_gradients +from fmengine.random import ( RandomStates, get_current_random_state, get_synced_random_state, ) -from 
nanotron.serialize import ( +from fmengine.serialize import ( load_optimizer, load_random_states, load_weights, @@ -41,7 +41,7 @@ save_random_states, save_weights, ) -from nanotron.serialize.metadata import TensorMetadata +from fmengine.serialize.metadata import TensorMetadata def test_save_and_load_with_changed_topolgy(): diff --git a/tests/test_tensor_parallel.py b/tests/test_tensor_parallel.py index a3be892d..0d568fd5 100644 --- a/tests/test_tensor_parallel.py +++ b/tests/test_tensor_parallel.py @@ -5,11 +5,11 @@ from helpers.utils import available_gpus, init_distributed, rerun_if_address_is_in_use from torch import nn as torch_nn -from nanotron import distributed as dist -from nanotron.distributed import get_global_rank -from nanotron.parallel import ParallelContext -from nanotron.parallel.tensor_parallel.enum import TensorParallelLinearMode -from nanotron.parallel.tensor_parallel.nn import ( +from fmengine import distributed as dist +from fmengine.distributed import get_global_rank +from fmengine.parallel import ParallelContext +from fmengine.parallel.tensor_parallel.enum import TensorParallelLinearMode +from fmengine.parallel.tensor_parallel.nn import ( TensorParallelColumnLinear, TensorParallelEmbedding, TensorParallelRowLinear, diff --git a/tests/test_tie_weights.py b/tests/test_tie_weights.py index 130bf527..88a108e9 100644 --- a/tests/test_tie_weights.py +++ b/tests/test_tie_weights.py @@ -4,10 +4,10 @@ from helpers.utils import init_distributed, rerun_if_address_is_in_use from torch import nn -from nanotron import distributed as dist -from nanotron.parallel import ParallelContext -from nanotron.parallel.parameters import NanotronParameter -from nanotron.parallel.tied_parameters import ( +from fmengine import distributed as dist +from fmengine.parallel import ParallelContext +from fmengine.parallel.parameters import NanotronParameter +from fmengine.parallel.tied_parameters import ( get_tied_id_to_param, sync_tied_weights_gradients, tie_parameters, diff --git a/tests/test_zero.py b/tests/test_zero.py index 033ae09f..5d6f3181 100644 --- a/tests/test_zero.py +++ b/tests/test_zero.py @@ -9,20 +9,20 @@ from torch import nn as torch_nn from torch.nn.parallel import DistributedDataParallel -from nanotron import distributed as dist -from nanotron.optim import NamedOptimizer, ZeroDistributedOptimizer -from nanotron.optim.zero import SlicedFlatTensor -from nanotron.parallel import ParallelContext -from nanotron.parallel.data_parallel.utils import sync_gradients_across_dp -from nanotron.parallel.parameters import NanotronParameter -from nanotron.parallel.pipeline_parallel.engine import ( +from fmengine import distributed as dist +from fmengine.optim import NamedOptimizer, ZeroDistributedOptimizer +from fmengine.optim.zero import SlicedFlatTensor +from fmengine.parallel import ParallelContext +from fmengine.parallel.data_parallel.utils import sync_gradients_across_dp +from fmengine.parallel.parameters import NanotronParameter +from fmengine.parallel.pipeline_parallel.engine import ( AllForwardAllBackwardPipelineEngine, ) -from nanotron.parallel.pipeline_parallel.tensor_pointer import TensorPointer -from nanotron.parallel.tensor_parallel import nn -from nanotron.parallel.tensor_parallel.enum import TensorParallelLinearMode -from nanotron.parallel.tied_parameters import sync_tied_weights_gradients -from nanotron.random import ( +from fmengine.parallel.pipeline_parallel.tensor_pointer import TensorPointer +from fmengine.parallel.tensor_parallel import nn +from 
fmengine.parallel.tensor_parallel.enum import TensorParallelLinearMode +from fmengine.parallel.tied_parameters import sync_tied_weights_gradients +from fmengine.random import ( RandomStates, branch_random_state, get_current_random_state,

From 30ac9dd0124861a8da9e5e923fb3099b1f06c66e Mon Sep 17 00:00:00 2001
From: Xiaozhe Yao
Date: Fri, 15 Mar 2024 15:49:36 +0100
Subject: [PATCH 4/4] tiktoken

---
 README.md                                    |  1 +
 {examples => configs}/config_tiny_llama.yaml |  0
 .../config_tiny_mistral.yaml                 |  9 +++++----
 run_train.py                                 |  6 ++----
 src/fmengine/config/config.py                |  2 +-
 src/fmengine/dataloader/dataloader.py        | 20 ++++++++++++-------
 src/fmengine/tokenizer.py                    | 15 ++++++++++++++
 7 files changed, 37 insertions(+), 16 deletions(-)
 rename {examples => configs}/config_tiny_llama.yaml (100%)
 rename {examples => configs}/config_tiny_mistral.yaml (93%)
 create mode 100644 src/fmengine/tokenizer.py

diff --git a/README.md b/README.md
index 4fedbf20..c1b7d3df 100644
--- a/README.md
+++ b/README.md
@@ -2,6 +2,7 @@
 
 FMEngine is our opinionated take on foundation model training framework. The first version of FMEngine is built on top of `PyTorch` and `DeepSpeed` and is designed to be a drop-in replacement for `DeepSpeed` with a few additional features. In the `v2` version we forked from HuggingFace's `nanotron` and added some features to make it easier to use.
 
+
 # Credits
 
 We would like to thank everyone working on LLMs, especially those sharing their work openly from which we took great inspiration:
diff --git a/examples/config_tiny_llama.yaml b/configs/config_tiny_llama.yaml
similarity index 100%
rename from examples/config_tiny_llama.yaml
rename to configs/config_tiny_llama.yaml
diff --git a/examples/config_tiny_mistral.yaml b/configs/config_tiny_mistral.yaml
similarity index 93%
rename from examples/config_tiny_mistral.yaml
rename to configs/config_tiny_mistral.yaml
index 06c56794..bd6f678a 100644
--- a/examples/config_tiny_mistral.yaml
+++ b/configs/config_tiny_mistral.yaml
@@ -9,7 +9,7 @@ data:
     dataset_overwrite_cache: false
     dataset_processing_num_proc_per_process: 1
     hf_dataset_config_name: default
-    hf_dataset_or_datasets: cerebras/SlimPajama-627B
+    hf_dataset_or_datasets: DKYoon/SlimPajama-6B
     hf_dataset_splits: train
     text_column_name: text
   num_loading_workers: 1
@@ -51,7 +51,7 @@ model:
     sliding_window: 4096
     tie_word_embeddings: true
     use_cache: true
-    vocab_size: 32000
+    vocab_size: 102000
 optimizer:
   accumulate_grad_in_fp32: true
   adam_beta1: 0.9
@@ -70,7 +70,7 @@ optimizer:
   weight_decay: 0.01
   zero_stage: 0
 parallelism:
-  dp: 2
+  dp: 1
   pp: 1
   pp_engine: 1f1b
   tp: 1
@@ -78,8 +78,9 @@ parallelism:
   tp_mode: REDUCE_SCATTER
 profiler: null
 tokenizer:
+  tokenizer_type: openai
   tokenizer_max_length: null
-  tokenizer_name_or_path: mistralai/Mistral-7B-v0.1
+  tokenizer_name_or_path: cl100k_base
   tokenizer_revision: null
 tokens:
   batch_accumulation_per_replica: 1
diff --git a/run_train.py b/run_train.py
index 29e0a850..a0aa027b 100644
--- a/run_train.py
+++ b/run_train.py
@@ -23,7 +23,7 @@
 from fmengine.parallel.pipeline_parallel.utils import get_input_output_pp_ranks
 from fmengine.trainer import DistributedTrainer
 from fmengine.utils import main_rank_first
-
+from fmengine.tokenizer import get_tokenizer
 from huggingface_hub import __version__ as hf_hub_version
 from transformers import AutoTokenizer
 from transformers import __version__ as tf_version
@@ -76,9 +76,7 @@ def get_dataloader(trainer: DistributedTrainer):
         stream=True,
     )['train']
 
-    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
-    tokenizer.pad_token = tokenizer.eos_token
-    tokenizer.padding_side = "left"
+    tokenizer = get_tokenizer(trainer.config)
     # We apply the Causal Language Modeling preprocessing
     train_dataset = clm_process(
         raw_dataset=raw_dataset,
diff --git a/src/fmengine/config/config.py b/src/fmengine/config/config.py
index bf20e07d..a687079c 100644
--- a/src/fmengine/config/config.py
+++ b/src/fmengine/config/config.py
@@ -194,7 +194,7 @@ def __post_init__(self):
 @dataclass
 class TokenizerArgs:
     """Arguments related to the tokenizer"""
-
+    tokenizer_type: Optional[str] = "hf"
     tokenizer_name_or_path: Optional[str] = None
     tokenizer_revision: Optional[str] = None
     tokenizer_max_length: Optional[int] = None
diff --git a/src/fmengine/dataloader/dataloader.py b/src/fmengine/dataloader/dataloader.py
index 0de51597..36caf517 100644
--- a/src/fmengine/dataloader/dataloader.py
+++ b/src/fmengine/dataloader/dataloader.py
@@ -28,7 +28,7 @@
 )
 from transformers import PreTrainedTokenizerBase
 from transformers.trainer_pt_utils import DistributedSamplerWithLoop
-
+import tiktoken
 logger = logging.get_logger(__name__)
 
 def sanity_check_dataloader(
@@ -375,12 +375,18 @@ def _group_texts(
         return result
 
     def _tokenize_texts(texts: List[str]) -> Dict[str, List[np.ndarray]]:
-        tokenized_batch = tokenizer.encode(
-            texts,
-            return_attention_mask=False,
-            return_token_type_ids=False,
-            truncation=True,
-        )
+        if isinstance(tokenizer, PreTrainedTokenizerBase):
+            tokenized_batch = tokenizer.encode(
+                texts,
+                return_attention_mask=False,
+                return_token_type_ids=False,
+                truncation=True,
+            )
+            print(tokenized_batch)
+        elif isinstance(tokenizer, tiktoken.core.Encoding):
+            tokenized_batch = tokenizer.encode_batch(texts)
+            # flatten the list of lists
+            tokenized_batch = [item for sublist in tokenized_batch for item in sublist]
         return {"input_ids": tokenized_batch}
 
     train_dataset = raw_dataset.map(
diff --git a/src/fmengine/tokenizer.py b/src/fmengine/tokenizer.py
new file mode 100644
index 00000000..6f307666
--- /dev/null
+++ b/src/fmengine/tokenizer.py
@@ -0,0 +1,15 @@
+def get_tokenizer(trainer_config):
+    tokenizer_type = trainer_config.tokenizer.tokenizer_type.lower()
+    tokenizer_path = trainer_config.tokenizer.tokenizer_name_or_path
+    assert tokenizer_type in ['hf', 'openai'], f"Unknown tokenizer type {tokenizer_type}"
+    if tokenizer_type == 'openai':
+        import tiktoken
+        tokenizer = tiktoken.get_encoding(tokenizer_path)
+    elif tokenizer_type == 'hf':
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
+        tokenizer.pad_token = tokenizer.eos_token
+        tokenizer.padding_side = "left"
+    else:
+        raise NotImplementedError(f"Tokenizer type {tokenizer_type} not implemented")
+    return tokenizer
\ No newline at end of file
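Note (not part of the patches above): a minimal sketch of why `_tokenize_texts` branches on the tokenizer class after this change. It assumes `tiktoken` and `transformers` are installed; `cl100k_base` matches the config above, while `gpt2` is only an example HF tokenizer name.

```python
# Illustrative sketch only -- not part of the diff.
import tiktoken
from transformers import AutoTokenizer

texts = ["hello world", "foundation models"]

# tokenizer_type: openai -> a tiktoken Encoding; encode_batch returns one id list per text
enc = tiktoken.get_encoding("cl100k_base")
openai_ids = enc.encode_batch(texts)

# tokenizer_type: hf -> a PreTrainedTokenizerBase; encode is applied per text
tok = AutoTokenizer.from_pretrained("gpt2")
hf_ids = [tok.encode(t) for t in texts]

print(openai_ids)
print(hf_ids)
```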