diff --git a/dalm/datasets/backtranslation/LICENSE b/dalm/datasets/backtranslation/LICENSE new file mode 100644 index 0000000..261eeb9 --- /dev/null +++ b/dalm/datasets/backtranslation/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. 
Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/dalm/datasets/backtranslation/Makefile b/dalm/datasets/backtranslation/Makefile new file mode 100644 index 0000000..0ee0469 --- /dev/null +++ b/dalm/datasets/backtranslation/Makefile @@ -0,0 +1,30 @@ +all: format clean pre test + echo 'finished' + +.PHONY: format +format: + isort --profile black --filter-files . + black . + +.PHONY: test +test: + coverage run --source src -m pytest -vv . + coverage report -m + flake8 + +.PHONY: pre +pre: + pre-commit run --all-files + +.PHONY: debug +debug: + pytest -vv tests/utils/test_logging.py + +.PHONY: clean +clean: + rm -rf build/ + rm -rf dist/ + rm -rf *.egg-info/ + rm -f .coverage + rm -f coverage.xml + find . | grep -E '(__pycache__|\.pyc|\.pyo$$)' | xargs rm -rf \ No newline at end of file diff --git a/dalm/datasets/backtranslation/README.md b/dalm/datasets/backtranslation/README.md new file mode 100644 index 0000000..c33b690 --- /dev/null +++ b/dalm/datasets/backtranslation/README.md @@ -0,0 +1,198 @@ +# πŸ‹ Humback + +An **unofficial** implementation of [Self-Alignment with Instruction Backtranslation](https://arxiv.org/pdf/2308.06259.pdf) . + +The proposed Humback is a novel framework that can augment the instruction data for supervised fine-tuning with high quality. + +🚧 Currently, this repo is under construction and not finished. + +![Humback Framework](./figs/humback.png) + +## 🌴 Dependencies + +- Dependencies required for training are included in requirements_train.txt + - These dependencies are needed when running: + - `train_backward_Myx.sh` + - `train_seed.sh` +- Dependencies required for prediction are included in requirements_predict.txt + - These dependencies are needed when running: + - `self_aug.sh` + - `self_curation.sh` +- Create two separate conda environments for training and predicting. This is because vllm downgrades some of the libraries needed for training (e.g. torch). + +## πŸš€ QuickStart + +Procedure (2 iters): +1. Prepare seed data and unlabelled data. +2. Train the backward model $M_{yx}$ on the reversed seed data. +3. Self-augment the seed data via $M_{yx}$. +4. Train a forward model $M_{0}$ on the seed data. +5. Self-curate the unlabelled data $A_{k}^{(1)}$ via $M_{0}$ (tag quality scores). +6. Train a forward model $M_{1}$ on the self-curated unlabelled data $A_{k}^{(1)}$. +7. Use $M_{1}$ to self-curate the unlabelled data $A_{k}^{(2)}$. +8. Train a forward model $M_{2}$ on the self-curated unlabelled data $A_{k}^{(2)}$. + +### Seed Data Pre-processing + +We follow the original paper and use [oasst1](https://huggingface.co/datasets/OpenAssistant/oasst1) to construct the seed data. + +The processed data could be found [here](https://github.com/Spico197/Humback/releases/tag/data) . + +```bash +$ bash data/seed/download.sh +$ python data/seed/convert.py +# #data: 3286, #dump: 3200 +# Instruction len: 149Β±266, Response len: 1184Β±799 +``` + +### Unlabelled Data Pre-processing + +Since ClueWeb22 is not a free open-source dataset, we sample texts from [falcon-refinedweb](https://huggingface.co/datasets/tiiuae/falcon-refinedweb) instead. 
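+
+The exact filtering in `data/unlabelled/falcon_refinedweb.py` is not reproduced here; as a rough, hypothetical sketch (the `content` column name and the output schema are assumptions), streaming a small sample of unlabelled texts might look like this:
+
+```python
+# Rough sketch only (NOT the repo's data/unlabelled/falcon_refinedweb.py).
+# Streams a small sample of unlabelled texts from falcon-refinedweb to JSONL.
+import json
+
+from datasets import load_dataset
+
+
+def sample_unlabelled(n_samples: int = 1000, out_path: str = "unlabelled.jsonl") -> None:
+    # Streaming avoids downloading the full dataset.
+    ds = load_dataset("tiiuae/falcon-refinedweb", split="train", streaming=True)
+    with open(out_path, "w", encoding="utf8") as fout:
+        for i, row in enumerate(ds):
+            if i >= n_samples:
+                break
+            # Assumption: the text column is named "content". Each sampled text is
+            # later treated as a candidate response y for instruction backtranslation.
+            fout.write(json.dumps({"response": row["content"]}, ensure_ascii=False) + "\n")
+
+
+if __name__ == "__main__":
+    sample_unlabelled()
+```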
+ +The processed data could be found [here](https://github.com/Spico197/Humback/releases/tag/data) . + +```bash +$ python data/unlabelled/falcon_refinedweb.py +``` + +### Train Backward Model $M_{yx}$ + +| Item | Value | +| :--------------------- | :-------------------------------------------------------------------------- | +| Foundation Model | [meta-llama/Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b-hf) | +| GPUs | 8 * A100 40GB | +| Mixed Precision | bf16 | +| Gradient Checkpointing | on | +| ZeRO-Offload | Stage 2 | +| Batch size | 32 | +| Steps | 500 | + +```bash +# The first Myx training takes about 30min (on the seed data) +$ bash scripts/train_backward_Myx.sh +``` + +The pre-trained $M_{yx}$ is available at [Huggingface](https://huggingface.co/Spico/Humback-Myx). + +### Self-Augmentation via $M_{yx}$ + +The augmentation data is available at [Huggingface](https://huggingface.co/datasets/Spico/Humback/blob/main/unlabelled_gen_instruction.jsonl) . + +```bash +# Taking about 6:40:45 on the unlabelled data with 8*A100 +$ bash scripts/self_aug.sh +``` + +### Train Seed Model $M_{0}$ + +Hyper parameters are the same as $M_{yx}$. + +```bash +$ bash scripts/train_seed.sh +``` + +The pre-trained $M_{0}$ is available at [Huggingface](https://huggingface.co/Spico/Humback-M0) (Uploading). + +### Self-Curation Prompting + +The curated data is available at [Huggingface](https://huggingface.co/datasets/Spico/Humback/blob/main/m1_v2.jsonl) . + +```bash +# 33:54:45 with 8*A100 on 482,963 samples +$ bash scripts/self_curation.sh +# scores: [('None', 217203), ('4', 119211), ('3', 102756), ('5', 21301), ('1', 13083), ('2', 9288), ('8', 19), ('0', 15), ('9', 14), ('7', 11), ('6', 9), ('10', 4), ('91', 3), ('83', 2), ('20', 2), ('14', 2), ('75', 2), ('92', 2), ('72', 1), ('93', 1), ('28', 1), ('19', 1), ('728', 1), ('17', 1), ('16', 1), ('100', 1), ('237', 1), ('13', 1), ('73', 1), ('38', 1), ('87', 1), ('94', 1), ('98', 1), ('64', 1), ('52', 1), ('27', 1), ('24', 1), ('762', 1), ('266', 1), ('225', 1), ('80', 1), ('267', 1), ('99', 1), ('90', 1), ('63', 1), ('97', 1), ('78', 1), ('40', 1), ('1986', 1), ('47', 1), ('66', 1), ('45', 1), ('10502', 1), ('21', 1)] +# Number of qualified results (scores=5): 21301/482963 +# instruction len: 198 Β± 351 +# response len: 1601 Β± 345 +# --------------------------------------- +# v2: (Strict Curation Score Matching: add `$` to the matching regex): +# Scores: [('None', 322324), ('3', 71851), ('4', 53120), ('5', 16460), ('1', 11921), ('2', 7260), ('0', 10), ('7', 4), ('6', 3), ('19', 1), ('8', 1), ('16', 1), ('13', 1), ('10', 1), ('23', 1), ('9', 1), ('90', 1), ('92', 1), ('45', 1)] +# Number of qualified results (scores=5): 15521/482963 +# instruction len: 124 Β± 113 +# response len: 1611 Β± 345 +# --------------------------------------- +$ cat outputs/m1/unlabelled_curated_data.jsonl data/seed/seed.jsonl > data/curated/m1.jsonl +``` + +### Train Models $M_{i}$ + +Most hyper parameters are the same as $M_{yx}$ except for the number of steps (the original Humback trains 1600 steps on 512k samples). + +```bash +# change the `--data_path` in `scripts/train_seed.sh` +$ bash scripts/train_seed.sh +``` + +## πŸ“‘ Experimental Results + +Other models: [HuggingFaceH4/open_llm_leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) . 
+
+| Model | Average | ARC | HellaSwag | MMLU | TruthfulQA |
+| :------------------------------------------------------------------------------------------------ | ------: | ----: | --------: | ----: | ---------: |
+| Llama-2-7b | 54.32 | 53.07 | 78.59 | 46.87 | 38.76 |
+| Llama-2-7b-chat | 56.34 | 52.90 | 78.55 | 48.32 | 45.57 |
+| Vicuna-7b-v1.3 | 55.62 | 50.43 | 76.92 | 48.14 | 47.01 |
+| Humback $M_{0}$ | 58.13 | 56.31 | 81.20 | 47.45 | 47.59 |
+| Humback $M_{1}$ | 54.65 | 52.99 | 78.57 | 45.48 | 41.54 |
+| Humback $M_{1,\text{w/o DiffSysPrompt,TemplateVicuna1.1}}$ | 55.85 | 52.82 | 78.53 | 45.86 | 46.21 |
+| Humback $M_{1,\text{w/o DiffSysPrompt,TemplateVicuna1.1,StrictCurationScoreMatching}}$ | 54.26 | 53.50 | 78.52 | 45.19 | 39.83 |
+| Humback $M_{1,\text{w/o DiffSysPrompt,TemplateVicuna1.1,StrictCurationScoreMatching,1200steps}}$ | 56.67 | 56.23 | 81.10 | 46.46 | 42.89 |
+| Humback $M_{1,\text{w/o DiffSysPrompt,TemplateVicuna1.1,StrictCurationScoreMatching,1800steps}}$ | 57.58 | 57.68 | 81.78 | 46.13 | 44.74 |
+| Humback $M_{1,\text{w/o DiffSysPrompt,TemplateVicuna1.1,StrictCurationScoreMatching,2400steps}}$ | 56.96 | 55.89 | 80.83 | 45.84 | 45.30 |
+
+The results and the trend are not as good as in the original paper, but $M_{0}$ does outperform vanilla Llama-2-7b.
+Specifically, Humback $M_{1}$ is worse than $M_{0}$, and the different system prompts do not seem to help on these benchmarks.
+Note that although $M_{0}$ performs well on these benchmarks, it may not be good at generating high-quality and diverse responses on a wider range of tasks.
+Further experiments should be conducted to verify the effectiveness of the reproduced Humback $M_{0}$ (e.g. [alpaca_eval](https://github.com/tatsu-lab/alpaca_eval) with GPT-4 as the judge).
+
+Possible reasons are:
+1. The backward model $M_{yx}$ is not good enough to generate high-quality instructions.
+2. The seed model $M_{0}$ is not competent at judging the quality of the generated data (not all predicted scores fall between 1 and 5; see the parsing sketch after the TODO list below).
+
+Since I don't have GPT-4 API keys, `chatgpt_fn` is used as the evaluator here (as introduced in [alpaca_eval](https://github.com/tatsu-lab/alpaca_eval)):
+
+```
+                       win_rate  standard_error  n_total  avg_length
+gpt4                      73.79            1.54      805        1365
+claude                    70.37            1.60      805        1082
+chatgpt                   66.09            1.66      805         811
+wizardlm-13b              65.16            1.67      805         985
+vicuna-13b                64.10            1.69      805        1037
+guanaco-65b               62.36            1.71      805        1249
+oasst-rlhf-llama-33b      62.05            1.71      805        1079
+alpaca-farm-ppo-human     60.25            1.72      805         803
+falcon-40b-instruct       56.52            1.74      805         662
+text_davinci_003          50.00            0.00      805         307
+alpaca-7b                 45.22            1.74      805         396
+HumbackM0                 32.30            1.65      805         548
+text_davinci_001          28.07            1.56      805         296
+HumbackM1                 23.35            1.49      805        1522
+```
+
+🔥 Further discussion is very welcome.
+
+## 📝 TODO
+
+- [ ] train more steps on $M_{i}$.
+- [ ] remove system prompts when training $M_{0}$, $M_{i}$ and $M_{yx}$.
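+
+As referenced above, the self-curation step keeps only samples whose parsed quality score is exactly 5, and the stricter "v2" variant simply anchors the matching regex with `$`. A minimal, hypothetical sketch follows; the prompt format (`Score: <n>`) and the exact regex are assumptions, and the repo's actual self-curation script may differ:
+
+```python
+# Hypothetical sketch of the self-curation score filtering, not the repo's code.
+import re
+
+# Loose match: grabs the first integer after "Score:", so artefacts such as
+# "728" or "10502" in a rambling judgment can slip through.
+LOOSE = re.compile(r"[Ss]core:\s*(\d+)")
+# Strict ("v2") match: anchored with `$`, so only a trailing bare integer counts.
+STRICT = re.compile(r"[Ss]core:\s*(\d+)\s*$")
+
+
+def extract_score(judgment: str, strict: bool = True) -> str:
+    """Return the parsed curation score, or "None" when nothing matches."""
+    match = (STRICT if strict else LOOSE).search(judgment)
+    return match.group(1) if match else "None"
+
+
+def keep_sample(judgment: str) -> bool:
+    # Only samples rated exactly 5 by the forward model are kept for the next iteration.
+    return extract_score(judgment) == "5"
+```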
+ +## πŸ’Œ Acknowledgments + +- Paper: [Self-Alignment with Instruction Backtranslation](https://arxiv.org/pdf/2308.06259.pdf) +- Code: [FastChat](https://github.com/lm-sys/FastChat) +- Code: [vLLM](https://github.com/vllm-project/vllm) +- Code: [stanford_alpaca](https://github.com/tatsu-lab/stanford_alpaca) +- Code: [transformers](https://huggingface.co/transformers/) + +## πŸ“œ Reference + +```bibtex +@misc{li2023selfalignment, + title={Self-Alignment with Instruction Backtranslation}, + author={Xian Li and Ping Yu and Chunting Zhou and Timo Schick and Luke Zettlemoyer and Omer Levy and Jason Weston and Mike Lewis}, + year={2023}, + eprint={2308.06259}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} +``` diff --git a/dalm/datasets/backtranslation/conf/ds_zero1default.json b/dalm/datasets/backtranslation/conf/ds_zero1default.json new file mode 100644 index 0000000..10f23ac --- /dev/null +++ b/dalm/datasets/backtranslation/conf/ds_zero1default.json @@ -0,0 +1,14 @@ +{ + "bf16": { + "enabled": true + }, + "zero_optimization": { + "stage": 1 + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false +} diff --git a/dalm/datasets/backtranslation/conf/ds_zero2default.json b/dalm/datasets/backtranslation/conf/ds_zero2default.json new file mode 100644 index 0000000..1939231 --- /dev/null +++ b/dalm/datasets/backtranslation/conf/ds_zero2default.json @@ -0,0 +1,22 @@ +{ + "bf16": { + "enabled": true + }, + "zero_optimization": { + "stage": 2, + "offload_optimizer": { + "device": "cpu", + "pin_memory": true + }, + "offload_param": { + "device": "cpu", + "pin_memory": true + } + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false +} diff --git a/dalm/datasets/backtranslation/conf/fsdp_config.json b/dalm/datasets/backtranslation/conf/fsdp_config.json new file mode 100644 index 0000000..37baa74 --- /dev/null +++ b/dalm/datasets/backtranslation/conf/fsdp_config.json @@ -0,0 +1,3 @@ +{ + "fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer" +} \ No newline at end of file diff --git a/dalm/datasets/backtranslation/cuda_outofmemory_log.txt b/dalm/datasets/backtranslation/cuda_outofmemory_log.txt new file mode 100644 index 0000000..2328e3b --- /dev/null +++ b/dalm/datasets/backtranslation/cuda_outofmemory_log.txt @@ -0,0 +1,633 @@ +(/home/ubuntu/sachira/Humback/.env) ubuntu@ip-172-31-46-114:~/sachira/Humback$ scripts/train_backward_Myx.sh +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+***************************************** +[2023-11-10 21:31:40,744] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2023-11-10 21:31:40,749] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2023-11-10 21:31:40,751] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2023-11-10 21:31:40,763] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2023-11-10 21:31:40,810] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2023-11-10 21:31:40,838] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2023-11-10 21:31:40,842] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2023-11-10 21:31:40,868] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2023-11-10 21:31:40,918] [INFO] [comm.py:637:init_distributed] cdb=None +[2023-11-10 21:31:40,926] [INFO] [comm.py:637:init_distributed] cdb=None +[2023-11-10 21:31:40,929] [INFO] [comm.py:637:init_distributed] cdb=None +[2023-11-10 21:31:40,938] [INFO] [comm.py:637:init_distributed] cdb=None +[2023-11-10 21:31:41,000] [INFO] [comm.py:637:init_distributed] cdb=None +[2023-11-10 21:31:41,015] [INFO] [comm.py:637:init_distributed] cdb=None +[2023-11-10 21:31:41,039] [INFO] [comm.py:637:init_distributed] cdb=None +[2023-11-10 21:31:41,040] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +[2023-11-10 21:31:41,047] [INFO] [comm.py:637:init_distributed] cdb=None +Loading checkpoint shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:06<00:00, 3.18s/it] +Loading checkpoint shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:08<00:00, 4.09s/it] +Loading checkpoint shards: 
100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:08<00:00, 4.40s/it] +Loading checkpoint shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:08<00:00, 4.11s/it] +Loading checkpoint shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:08<00:00, 4.31s/it] +max_steps is given, it will override any value given in num_train_epochs +Using auto half precision backend +[2023-11-10 21:32:35,817] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed info: version=0.12.2, git-hash=unknown, git-branch=unknown +Loading checkpoint shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:08<00:00, 4.41s/it] +Loading checkpoint shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:08<00:00, 4.15s/it] +Loading checkpoint shards: 
100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:09<00:00, 4.54s/it] +[2023-11-10 21:33:05,414] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False +[2023-11-10 21:33:05,415] [INFO] [logging.py:96:log_dist] [Rank 0] Using client Optimizer as basic optimizer +[2023-11-10 21:33:05,415] [INFO] [logging.py:96:log_dist] [Rank 0] Removing param_group that has no 'params' in the basic Optimizer +[2023-11-10 21:33:05,423] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Basic Optimizer = AdamW +[2023-11-10 21:33:05,423] [INFO] [utils.py:56:is_zero_supported_optimizer] Checking ZeRO support for optimizer=AdamW type= +[2023-11-10 21:33:05,424] [INFO] [logging.py:96:log_dist] [Rank 0] Creating torch.bfloat16 ZeRO stage 2 optimizer +[2023-11-10 21:33:05,424] [INFO] [stage_1_and_2.py:147:__init__] Reduce bucket size 500,000,000 +[2023-11-10 21:33:05,424] [INFO] [stage_1_and_2.py:148:__init__] Allgather bucket size 500,000,000 +[2023-11-10 21:33:05,424] [INFO] [stage_1_and_2.py:149:__init__] CPU Offload: False +[2023-11-10 21:33:05,424] [INFO] [stage_1_and_2.py:150:__init__] Round robin gradient partitioning: False +/home/ubuntu/sachira/Humback/.env/lib/python3.11/site-packages/torch/utils/checkpoint.py:429: UserWarning: torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants. 
+ warnings.warn( +Traceback (most recent call last): + File "", line 198, in _run_module_as_main + File "", line 88, in _run_code + File "/home/ubuntu/sachira/Humback/src/core/train_flash_attn.py", line 10, in + train() + File "/home/ubuntu/sachira/Humback/src/core/train.py", line 61, in train + trainer.train() + File "/home/ubuntu/sachira/Humback/.env/lib/python3.11/site-packages/transformers/trainer.py", line 1555, in train + return inner_training_loop( + ^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/sachira/Humback/.env/lib/python3.11/site-packages/transformers/trainer.py", line 1860, in _inner_training_loop + tr_loss_step = self.training_step(model, inputs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/sachira/Humback/.env/lib/python3.11/site-packages/transformers/trainer.py", line 2725, in training_step + loss = self.compute_loss(model, inputs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/sachira/Humback/.env/lib/python3.11/site-packages/transformers/trainer.py", line 2748, in compute_loss + outputs = model(**inputs) + ^^^^^^^^^^^^^^^ + File "/home/ubuntu/sachira/Humback/.env/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/sachira/Humback/.env/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl + return forward_call(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/sachira/Humback/.env/lib/python3.11/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn + ret_val = func(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/sachira/Humback/.env/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 1814, in forward + loss = self.module(*inputs, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/sachira/Humback/.env/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/sachira/Humback/.env/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl + return forward_call(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/sachira/Humback/.env/lib/python3.11/site-packages/transformers/models/llama/modeling_llama.py", line 1034, in forward + outputs = self.model( + ^^^^^^^^^^^ + File "/home/ubuntu/sachira/Humback/.env/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/sachira/Humback/.env/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl + return forward_call(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/sachira/Humback/.env/lib/python3.11/site-packages/transformers/models/llama/modeling_llama.py", line 912, in forward + layer_outputs = self._gradient_checkpointing_func( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/sachira/Humback/.env/lib/python3.11/site-packages/torch/_compile.py", line 24, in inner + return torch._dynamo.disable(fn, recursive)(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/sachira/Humback/.env/lib/python3.11/site-packages/torch/_dynamo/eval_frame.py", line 328, in _fn + return fn(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^ + File 
"/home/ubuntu/sachira/Humback/.env/lib/python3.11/site-packages/torch/_dynamo/external_utils.py", line 17, in inner + return fn(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/sachira/Humback/.env/lib/python3.11/site-packages/torch/utils/checkpoint.py", line 451, in checkpoint + return CheckpointFunction.apply(function, preserve, *args) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/sachira/Humback/.env/lib/python3.11/site-packages/torch/autograd/function.py", line 539, in apply + return super().apply(*args, **kwargs) # type: ignore[misc] + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/sachira/Humback/.env/lib/python3.11/site-packages/torch/utils/checkpoint.py", line 230, in forward + outputs = run_function(*args) + ^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/sachira/Humback/.env/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/sachira/Humback/.env/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl + return forward_call(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/sachira/Humback/.env/lib/python3.11/site-packages/transformers/models/llama/modeling_llama.py", line 672, in forward + hidden_states, self_attn_weights, present_key_value = self.self_attn( + ^^^^^^^^^^^^^^^ + File "/home/ubuntu/sachira/Humback/.env/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/sachira/Humback/.env/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl + return forward_call(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/sachira/Humback/.env/lib/python3.11/site-packages/fastchat/train/llama_flash_attn_monkey_patch.py", line 77, in forward + qkv, indices, cu_q_lens, max_s = unpad_input(qkv, key_padding_mask) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/sachira/Humback/.env/lib/python3.11/site-packages/flash_attn/bert_padding.py", line 118, in unpad_input + index_first_axis(rearrange(hidden_states, "b s ... -> (b s) ..."), indices), + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/sachira/Humback/.env/lib/python3.11/site-packages/torch/autograd/function.py", line 539, in apply + return super().apply(*args, **kwargs) # type: ignore[misc] + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/sachira/Humback/.env/lib/python3.11/site-packages/flash_attn/bert_padding.py", line 17, in forward + return torch.gather( + ^^^^^^^^^^^^^ +torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 85.24 GiB. GPU 1 has a total capacty of 39.39 GiB of which 12.82 GiB is free. Including non-PyTorch memory, this process has 26.56 GiB memory in use. Of the allocated memory 22.20 GiB is allocated by PyTorch, and 2.97 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation. 
See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF +[2023-11-10 21:33:28,702] [INFO] [utils.py:802:see_memory_usage] Before initializing optimizer states +[2023-11-10 21:33:28,703] [INFO] [utils.py:803:see_memory_usage] MA 15.75 GB Max_MA 15.75 GB CA 15.75 GB Max_CA 16 GB +[2023-11-10 21:33:28,703] [INFO] [utils.py:810:see_memory_usage] CPU Virtual Memory: used = 66.02 GB, percent = 5.9% +[2023-11-10 21:33:28,897] [INFO] [utils.py:802:see_memory_usage] After initializing optimizer states +[2023-11-10 21:33:28,897] [INFO] [utils.py:803:see_memory_usage] MA 22.03 GB Max_MA 28.31 GB CA 28.31 GB Max_CA 28 GB +[2023-11-10 21:33:28,898] [INFO] [utils.py:810:see_memory_usage] CPU Virtual Memory: used = 53.14 GB, percent = 4.7% +[2023-11-10 21:33:28,898] [INFO] [stage_1_and_2.py:514:__init__] optimizer state initialized +/home/ubuntu/sachira/Humback/.env/lib/python3.11/site-packages/torch/utils/checkpoint.py:429: UserWarning: torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants. + warnings.warn( +[2023-11-10 21:33:29,003] [INFO] [utils.py:802:see_memory_usage] After initializing ZeRO optimizer +[2023-11-10 21:33:29,004] [INFO] [utils.py:803:see_memory_usage] MA 22.03 GB Max_MA 22.03 GB CA 28.31 GB Max_CA 28 GB +[2023-11-10 21:33:29,004] [INFO] [utils.py:810:see_memory_usage] CPU Virtual Memory: used = 46.17 GB, percent = 4.1% +[2023-11-10 21:33:29,006] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Final Optimizer = AdamW +[2023-11-10 21:33:29,006] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed using client LR scheduler +[2023-11-10 21:33:29,006] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed LR Scheduler = None +[2023-11-10 21:33:29,006] [INFO] [logging.py:96:log_dist] [Rank 0] step=0, skipped=0, lr=[1e-05, 1e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-11-10 21:33:29,006] [INFO] [config.py:972:print] DeepSpeedEngine configuration: +[2023-11-10 21:33:29,007] [INFO] [config.py:976:print] activation_checkpointing_config { + "partition_activations": false, + "contiguous_memory_optimization": false, + "cpu_checkpointing": false, + "number_checkpoints": null, + "synchronize_checkpoint_boundary": false, + "profile": false +} +[2023-11-10 21:33:29,007] [INFO] [config.py:976:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True} +[2023-11-10 21:33:29,007] [INFO] [config.py:976:print] amp_enabled .................. False +[2023-11-10 21:33:29,007] [INFO] [config.py:976:print] amp_params ................... False +[2023-11-10 21:33:29,007] [INFO] [config.py:976:print] autotuning_config ............ 
{ + "enabled": false, + "start_step": null, + "end_step": null, + "metric_path": null, + "arg_mappings": null, + "metric": "throughput", + "model_info": null, + "results_dir": "autotuning_results", + "exps_dir": "autotuning_exps", + "overwrite": true, + "fast": true, + "start_profile_step": 3, + "end_profile_step": 5, + "tuner_type": "gridsearch", + "tuner_early_stopping": 5, + "tuner_num_trials": 50, + "model_info_path": null, + "mp_size": 1, + "max_train_batch_size": null, + "min_train_batch_size": 1, + "max_train_micro_batch_size_per_gpu": 1.024000e+03, + "min_train_micro_batch_size_per_gpu": 1, + "num_tuning_micro_batch_sizes": 3 +} +[2023-11-10 21:33:29,007] [INFO] [config.py:976:print] bfloat16_enabled ............. True +[2023-11-10 21:33:29,007] [INFO] [config.py:976:print] checkpoint_parallel_write_pipeline False +[2023-11-10 21:33:29,007] [INFO] [config.py:976:print] checkpoint_tag_validation_enabled True +[2023-11-10 21:33:29,007] [INFO] [config.py:976:print] checkpoint_tag_validation_fail False +[2023-11-10 21:33:29,007] [INFO] [config.py:976:print] comms_config ................. +[2023-11-10 21:33:29,007] [INFO] [config.py:976:print] communication_data_type ...... None +[2023-11-10 21:33:29,007] [INFO] [config.py:976:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}} +[2023-11-10 21:33:29,007] [INFO] [config.py:976:print] curriculum_enabled_legacy .... False +[2023-11-10 21:33:29,007] [INFO] [config.py:976:print] curriculum_params_legacy ..... False +[2023-11-10 21:33:29,007] [INFO] [config.py:976:print] data_efficiency_config ....... {'enabled': False, 'seed': 1234, 'data_sampling': {'enabled': False, 'num_epochs': 1000, 'num_workers': 0, 'curriculum_learning': {'enabled': False}}, 'data_routing': {'enabled': False, 'random_ltd': {'enabled': False, 'layer_token_lr_schedule': {'enabled': False}}}} +[2023-11-10 21:33:29,007] [INFO] [config.py:976:print] data_efficiency_enabled ...... False +[2023-11-10 21:33:29,007] [INFO] [config.py:976:print] dataloader_drop_last ......... False +[2023-11-10 21:33:29,007] [INFO] [config.py:976:print] disable_allgather ............ False +[2023-11-10 21:33:29,007] [INFO] [config.py:976:print] dump_state ................... False +[2023-11-10 21:33:29,007] [INFO] [config.py:976:print] dynamic_loss_scale_args ...... None +[2023-11-10 21:33:29,007] [INFO] [config.py:976:print] eigenvalue_enabled ........... 
False +[2023-11-10 21:33:29,007] [INFO] [config.py:976:print] eigenvalue_gas_boundary_resolution 1 +[2023-11-10 21:33:29,007] [INFO] [config.py:976:print] eigenvalue_layer_name ........ bert.encoder.layer +[2023-11-10 21:33:29,007] [INFO] [config.py:976:print] eigenvalue_layer_num ......... 0 +[2023-11-10 21:33:29,007] [INFO] [config.py:976:print] eigenvalue_max_iter .......... 100 +[2023-11-10 21:33:29,007] [INFO] [config.py:976:print] eigenvalue_stability ......... 1e-06 +[2023-11-10 21:33:29,007] [INFO] [config.py:976:print] eigenvalue_tol ............... 0.01 +[2023-11-10 21:33:29,007] [INFO] [config.py:976:print] eigenvalue_verbose ........... False +[2023-11-10 21:33:29,007] [INFO] [config.py:976:print] elasticity_enabled ........... False +[2023-11-10 21:33:29,007] [INFO] [config.py:976:print] flops_profiler_config ........ { + "enabled": false, + "recompute_fwd_factor": 0.0, + "profile_step": 1, + "module_depth": -1, + "top_modules": 1, + "detailed": true, + "output_file": null +} +[2023-11-10 21:33:29,008] [INFO] [config.py:976:print] fp16_auto_cast ............... None +[2023-11-10 21:33:29,008] [INFO] [config.py:976:print] fp16_enabled ................. False +[2023-11-10 21:33:29,008] [INFO] [config.py:976:print] fp16_master_weights_and_gradients False +[2023-11-10 21:33:29,008] [INFO] [config.py:976:print] global_rank .................. 0 +[2023-11-10 21:33:29,008] [INFO] [config.py:976:print] grad_accum_dtype ............. None +[2023-11-10 21:33:29,008] [INFO] [config.py:976:print] gradient_accumulation_steps .. 1 +[2023-11-10 21:33:29,008] [INFO] [config.py:976:print] gradient_clipping ............ 1.0 +[2023-11-10 21:33:29,008] [INFO] [config.py:976:print] gradient_predivide_factor .... 1.0 +[2023-11-10 21:33:29,008] [INFO] [config.py:976:print] hybrid_engine ................ enabled=False max_out_tokens=512 inference_tp_size=1 release_inference_cache=False pin_parameters=True tp_gather_partition_size=8 +[2023-11-10 21:33:29,008] [INFO] [config.py:976:print] initial_dynamic_scale ........ 1 +[2023-11-10 21:33:29,008] [INFO] [config.py:976:print] load_universal_checkpoint .... False +[2023-11-10 21:33:29,008] [INFO] [config.py:976:print] loss_scale ................... 1.0 +[2023-11-10 21:33:29,008] [INFO] [config.py:976:print] memory_breakdown ............. False +[2023-11-10 21:33:29,008] [INFO] [config.py:976:print] mics_hierarchial_params_gather False +[2023-11-10 21:33:29,008] [INFO] [config.py:976:print] mics_shard_size .............. -1 +[2023-11-10 21:33:29,008] [INFO] [config.py:976:print] monitor_config ............... tensorboard=TensorBoardConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') wandb=WandbConfig(enabled=False, group=None, team=None, project='deepspeed') csv_monitor=CSVConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') enabled=False +[2023-11-10 21:33:29,008] [INFO] [config.py:976:print] nebula_config ................ { + "enabled": false, + "persistent_storage_path": null, + "persistent_time_interval": 100, + "num_of_version_in_retention": 2, + "enable_nebula_load": true, + "load_path": null +} +[2023-11-10 21:33:29,008] [INFO] [config.py:976:print] optimizer_legacy_fusion ...... False +[2023-11-10 21:33:29,008] [INFO] [config.py:976:print] optimizer_name ............... None +[2023-11-10 21:33:29,008] [INFO] [config.py:976:print] optimizer_params ............. None +[2023-11-10 21:33:29,008] [INFO] [config.py:976:print] pipeline ..................... 
{'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} +[2023-11-10 21:33:29,008] [INFO] [config.py:976:print] pld_enabled .................. False +[2023-11-10 21:33:29,008] [INFO] [config.py:976:print] pld_params ................... False +[2023-11-10 21:33:29,008] [INFO] [config.py:976:print] prescale_gradients ........... False +[2023-11-10 21:33:29,008] [INFO] [config.py:976:print] scheduler_name ............... None +[2023-11-10 21:33:29,008] [INFO] [config.py:976:print] scheduler_params ............. None +[2023-11-10 21:33:29,008] [INFO] [config.py:976:print] seq_parallel_communication_data_type torch.float32 +[2023-11-10 21:33:29,008] [INFO] [config.py:976:print] sparse_attention ............. None +[2023-11-10 21:33:29,008] [INFO] [config.py:976:print] sparse_gradients_enabled ..... False +[2023-11-10 21:33:29,008] [INFO] [config.py:976:print] steps_per_print .............. inf +[2023-11-10 21:33:29,008] [INFO] [config.py:976:print] train_batch_size ............. 8 +[2023-11-10 21:33:29,008] [INFO] [config.py:976:print] train_micro_batch_size_per_gpu 1 +[2023-11-10 21:33:29,008] [INFO] [config.py:976:print] use_node_local_storage ....... False +[2023-11-10 21:33:29,008] [INFO] [config.py:976:print] wall_clock_breakdown ......... False +[2023-11-10 21:33:29,008] [INFO] [config.py:976:print] weight_quantization_config ... None +[2023-11-10 21:33:29,008] [INFO] [config.py:976:print] world_size ................... 8 +[2023-11-10 21:33:29,008] [INFO] [config.py:976:print] zero_allow_untested_optimizer True +[2023-11-10 21:33:29,008] [INFO] [config.py:976:print] zero_config .................. stage=2 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=500,000,000 allgather_partitions=True allgather_bucket_size=500,000,000 overlap_comm=False load_from_fp32_weights=True elastic_checkpoint=False offload_param=None offload_optimizer=None sub_group_size=1,000,000,000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50,000,000 param_persistence_threshold=100,000 model_persistence_threshold=sys.maxsize max_live_parameters=1,000,000,000 max_reuse_distance=1,000,000,000 gather_16bit_weights_on_model_save=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False zero_hpz_partition_size=1 zero_quantized_weights=False zero_quantized_nontrainable_weights=False zero_quantized_gradients=False mics_shard_size=-1 mics_hierarchical_params_gather=False memory_efficient_linear=True pipeline_loading_checkpoint=False override_module_apply=True +[2023-11-10 21:33:29,008] [INFO] [config.py:976:print] zero_enabled ................. True +[2023-11-10 21:33:29,008] [INFO] [config.py:976:print] zero_force_ds_cpu_optimizer .. True +[2023-11-10 21:33:29,008] [INFO] [config.py:976:print] zero_optimization_stage ...... 2 +[2023-11-10 21:33:29,008] [INFO] [config.py:962:print_user_config] json = { + "bf16": { + "enabled": true + }, + "zero_optimization": { + "stage": 2 + }, + "gradient_accumulation_steps": 1, + "gradient_clipping": 1.0, + "steps_per_print": inf, + "train_batch_size": 8, + "train_micro_batch_size_per_gpu": 1, + "wall_clock_breakdown": false, + "fp16": { + "enabled": false + }, + "zero_allow_untested_optimizer": true +} +***** Running training ***** + Num examples = 3,200 + Num Epochs = 2 + Instantaneous batch size per device = 1 + Total train batch size (w. 
parallel, distributed & accumulation) = 8 + Gradient Accumulation steps = 1 + Total optimization steps = 500 + Number of trainable parameters = 6,738,415,616 + 0%| | 0/500 [00:00", line 198, in _run_module_as_main + File "", line 88, in _run_code + File "/home/ubuntu/sachira/Humback/src/core/train_flash_attn.py", line 10, in + train() + File "/home/ubuntu/sachira/Humback/src/core/train.py", line 61, in train + trainer.train() + File "/home/ubuntu/sachira/Humback/.env/lib/python3.11/site-packages/transformers/trainer.py", line 1555, in train + return inner_training_loop( + ^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/sachira/Humback/.env/lib/python3.11/site-packages/transformers/trainer.py", line 1860, in _inner_training_loop + tr_loss_step = self.training_step(model, inputs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/sachira/Humback/.env/lib/python3.11/site-packages/transformers/trainer.py", line 2725, in training_step + loss = self.compute_loss(model, inputs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/sachira/Humback/.env/lib/python3.11/site-packages/transformers/trainer.py", line 2748, in compute_loss + outputs = model(**inputs) + ^^^^^^^^^^^^^^^ + File "/home/ubuntu/sachira/Humback/.env/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/sachira/Humback/.env/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl + return forward_call(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/sachira/Humback/.env/lib/python3.11/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn + ret_val = func(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/sachira/Humback/.env/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 1814, in forward + loss = self.module(*inputs, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/sachira/Humback/.env/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/sachira/Humback/.env/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl + return forward_call(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/sachira/Humback/.env/lib/python3.11/site-packages/transformers/models/llama/modeling_llama.py", line 1034, in forward + outputs = self.model( + ^^^^^^^^^^^ + File "/home/ubuntu/sachira/Humback/.env/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/sachira/Humback/.env/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl + return forward_call(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/sachira/Humback/.env/lib/python3.11/site-packages/transformers/models/llama/modeling_llama.py", line 912, in forward + layer_outputs = self._gradient_checkpointing_func( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/sachira/Humback/.env/lib/python3.11/site-packages/torch/_compile.py", line 24, in inner + return torch._dynamo.disable(fn, recursive)(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/sachira/Humback/.env/lib/python3.11/site-packages/torch/_dynamo/eval_frame.py", line 328, in _fn + return fn(*args, 
**kwargs) + ^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/sachira/Humback/.env/lib/python3.11/site-packages/torch/_dynamo/external_utils.py", line 17, in inner + return fn(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/sachira/Humback/.env/lib/python3.11/site-packages/torch/utils/checkpoint.py", line 451, in checkpoint + return CheckpointFunction.apply(function, preserve, *args) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/sachira/Humback/.env/lib/python3.11/site-packages/torch/autograd/function.py", line 539, in apply + return super().apply(*args, **kwargs) # type: ignore[misc] + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/sachira/Humback/.env/lib/python3.11/site-packages/torch/utils/checkpoint.py", line 230, in forward + outputs = run_function(*args) + ^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/sachira/Humback/.env/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/sachira/Humback/.env/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl + return forward_call(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/sachira/Humback/.env/lib/python3.11/site-packages/transformers/models/llama/modeling_llama.py", line 672, in forward + hidden_states, self_attn_weights, present_key_value = self.self_attn( + ^^^^^^^^^^^^^^^ + File "/home/ubuntu/sachira/Humback/.env/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/sachira/Humback/.env/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl + return forward_call(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/sachira/Humback/.env/lib/python3.11/site-packages/fastchat/train/llama_flash_attn_monkey_patch.py", line 77, in forward + qkv, indices, cu_q_lens, max_s = unpad_input(qkv, key_padding_mask) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/sachira/Humback/.env/lib/python3.11/site-packages/flash_attn/bert_padding.py", line 118, in unpad_input + index_first_axis(rearrange(hidden_states, "b s ... -> (b s) ..."), indices), + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/sachira/Humback/.env/lib/python3.11/site-packages/torch/autograd/function.py", line 539, in apply + return super().apply(*args, **kwargs) # type: ignore[misc] + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/sachira/Humback/.env/lib/python3.11/site-packages/flash_attn/bert_padding.py", line 17, in forward + return torch.gather( + ^^^^^^^^^^^^^ +torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 89.10 GiB. GPU 7 has a total capacty of 39.39 GiB of which 12.96 GiB is free. Including non-PyTorch memory, this process has 26.42 GiB memory in use. Of the allocated memory 22.20 GiB is allocated by PyTorch, and 2.97 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation. 
See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
+[tracebacks from the remaining ranks, interleaved in the original log, follow the same call stack as above; each ends in its own out-of-memory error:]
+torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 78.63 GiB. GPU 3 has a total capacty of 39.39 GiB of which 12.82 GiB is free. Including non-PyTorch memory, this process has 26.56 GiB memory in use. Of the allocated memory 22.20 GiB is allocated by PyTorch, and 2.97 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
+torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 67.81 GiB. GPU 2 has a total capacty of 39.39 GiB of which 12.82 GiB is free. Including non-PyTorch memory, this process has 26.56 GiB memory in use. Of the allocated memory 22.19 GiB is allocated by PyTorch, and 2.98 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.
See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF +[2023-11-10 21:33:29,813] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 215718 closing signal SIGTERM +[2023-11-10 21:33:29,813] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 215720 closing signal SIGTERM +[2023-11-10 21:33:29,813] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 215721 closing signal SIGTERM +[2023-11-10 21:33:29,814] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 215722 closing signal SIGTERM +[2023-11-10 21:33:29,814] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 215723 closing signal SIGTERM +[2023-11-10 21:33:29,814] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 215724 closing signal SIGTERM +[2023-11-10 21:33:29,814] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 215725 closing signal SIGTERM +[2023-11-10 21:33:31,158] torch.distributed.elastic.multiprocessing.api: [ERROR] failed (exitcode: 1) local_rank: 1 (pid: 215719) of binary: /home/ubuntu/sachira/Humback/.env/bin/python +Traceback (most recent call last): + File "", line 198, in _run_module_as_main + File "", line 88, in _run_code + File "/home/ubuntu/sachira/Humback/.env/lib/python3.11/site-packages/torch/distributed/run.py", line 810, in + main() + File "/home/ubuntu/sachira/Humback/.env/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper + return f(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/sachira/Humback/.env/lib/python3.11/site-packages/torch/distributed/run.py", line 806, in main + run(args) + File "/home/ubuntu/sachira/Humback/.env/lib/python3.11/site-packages/torch/distributed/run.py", line 797, in run + elastic_launch( + File "/home/ubuntu/sachira/Humback/.env/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 134, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/sachira/Humback/.env/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 264, in launch_agent + raise ChildFailedError( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +src.core.train_flash_attn FAILED +------------------------------------------------------------ +Failures: + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2023-11-10_21:33:29 + host : ip-172-31-46-114.ec2.internal + rank : 1 (local_rank: 1) + exitcode : 1 (pid: 215719) + error_file: + traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html +============================================================ +(/home/ubuntu/sachira/Humback/.env) ubuntu@ip-172-31-46-114:~/sachira/Humback$ \ No newline at end of file diff --git a/dalm/datasets/backtranslation/data/prompts/self_curation.txt b/dalm/datasets/backtranslation/data/prompts/self_curation.txt new file mode 100644 index 0000000..7e043a1 --- /dev/null +++ b/dalm/datasets/backtranslation/data/prompts/self_curation.txt @@ -0,0 +1,11 @@ +Below is an instruction from an user and a candidate answer. Evaluate whether or not the answer is a good example of how AI Assistant should respond to the user's instruction. 
Please assign a score using the following 5-point scale:
+1: It means the answer is incomplete, vague, off-topic, controversial, or not exactly what the user asked for. For example, some content seems missing, a numbered list does not start from the beginning, or the opening sentence repeats the user's question. Or the response is from another person's perspective with their personal experience (e.g. taken from blog posts), or looks like an answer from a forum. Or it contains promotional text, navigation text, or other irrelevant information.
+2: It means the answer addresses most of the asks from the user but does not directly address the user's question. For example, it only provides a high-level methodology instead of the exact solution to the user's question.
+3: It means the answer is helpful but not written by an AI Assistant. It addresses all the basic asks from the user. It is complete and self-contained, with the drawback that the response is not written from an AI Assistant's perspective but from another person's perspective. The content looks like an excerpt from a blog post, web page, or web search results. For example, it contains personal experience or opinion, mentions a comments section, or sharing on social media, etc.
+4: It means the answer is written from an AI Assistant's perspective with a clear focus on addressing the instruction. It provides a complete, clear, and comprehensive response to the user's question or instruction without missing or irrelevant information. It is well organized, self-contained, and written in a helpful tone. It has minor room for improvement, e.g. it could be more concise and focused.
+5: It means it is a perfect answer from an AI Assistant. It has a clear focus on being a helpful AI Assistant, where the response looks intentionally written to address the user's question or instruction without any irrelevant sentences. The answer provides high-quality content, demonstrating expert knowledge in the area, and is very well written, logical, easy to follow, engaging, and insightful.
+
+Please first provide a brief reasoning you used to derive the rating score, and then write "Score: <rating>" in the last line.
+ +{generated_instruction} +{response} diff --git a/dalm/datasets/backtranslation/data/seed/convert.py b/dalm/datasets/backtranslation/data/seed/convert.py new file mode 100644 index 0000000..be1e200 --- /dev/null +++ b/dalm/datasets/backtranslation/data/seed/convert.py @@ -0,0 +1,73 @@ +import statistics as sts + +import os +import sys + +sys.path.append(os.getcwd()) + +from src.utils.io import dump_jsonlines, load_jsonlines + + +def load_tree_data( + tree_filepath, + instruction_quality: float = 0.6, + response_quality: float = 0.6, + instruction_word_num: int = 5, + response_word_num: int = 5, + lang: str = "en", + response_rank: int = 0, +): + trees = load_jsonlines(tree_filepath) + pairs = [] + + def _traverse(ins: dict): + for reply in ins["replies"]: + if ( + ins.get("lang") == lang + and reply.get("lang") == lang + and reply.get("rank") == response_rank + ): + inst_qlt = ins["labels"].get("quality", {"value": 0.0})["value"] + resp_qlt = reply["labels"].get("quality", {"value": 0.0})["value"] + if inst_qlt > instruction_quality and resp_qlt > response_quality: + if ( + len(ins["text"].split()) > instruction_word_num + and len(reply["text"].split()) > response_word_num + ): + pairs.append( + { + "instruction": ins["text"], + "instruction_quality": inst_qlt, + "response": reply["text"], + "response_quality": resp_qlt, + } + ) + for reply in ins["replies"]: + _traverse(reply) + + for tree in trees: + prompt = tree["prompt"] + _traverse(prompt) + + return pairs + + +if __name__ == "__main__": + dump_num = 3200 + pairs = load_tree_data("data/seed/2023-04-12_oasst_ready.trees.jsonl") + print(f"#data: {len(pairs)}, #dump: {dump_num}") + pairs.sort( + key=lambda ins: ins["instruction_quality"] + ins["response_quality"], + reverse=True, + ) + dump_data = pairs[:dump_num] + instruction_lens = [] + response_lens = [] + for ins in dump_data: + instruction_lens.append(len(ins["instruction"])) + response_lens.append(len(ins["response"])) + print( + f"Instruction len: {sts.mean(instruction_lens):.0f}Β±{sts.stdev(instruction_lens):.0f}, " + f"Response len: {sts.mean(response_lens):.0f}Β±{sts.stdev(response_lens):.0f}" + ) + dump_jsonlines(dump_data, "data/seed/seed.jsonl") diff --git a/dalm/datasets/backtranslation/data/seed/download.sh b/dalm/datasets/backtranslation/data/seed/download.sh new file mode 100644 index 0000000..a069601 --- /dev/null +++ b/dalm/datasets/backtranslation/data/seed/download.sh @@ -0,0 +1,5 @@ +wget -P data/seed https://huggingface.co/datasets/OpenAssistant/oasst1/resolve/main/2023-04-12_oasst_ready.trees.jsonl.gz +gzip -d data/seed/2023-04-12_oasst_ready.trees.jsonl.gz + +# wget -P data/seed https://huggingface.co/datasets/OpenAssistant/oasst1/resolve/main/2023-04-12_oasst_ready.messages.jsonl.gz +# gzip -d data/seed/2023-04-12_oasst_ready.messages.jsonl.gz diff --git a/dalm/datasets/backtranslation/data/unlabelled/falcon_refinedweb.py b/dalm/datasets/backtranslation/data/unlabelled/falcon_refinedweb.py new file mode 100644 index 0000000..6506fae --- /dev/null +++ b/dalm/datasets/backtranslation/data/unlabelled/falcon_refinedweb.py @@ -0,0 +1,44 @@ +import json + +from datasets import load_dataset +from tqdm import tqdm + + +def is_clean_text(text: str) -> bool: + # according to Table 2, the response lengths of A_{5}^{(2)} range from 1047 to 2279 + if len(text) < 1000 or len(text) > 2200: + return False + + segs = text.split("\n") + if any(len(seg) < 5 for seg in segs): + return False + + return True + + +def main(): + num_samples = 502000 + dump_filepath = 
"data/unlabelled/falcon-refinedweb-sampled.jsonl" + + ds = load_dataset("tiiuae/falcon-refinedweb", streaming=True, split="train") + fout = open(dump_filepath, "a") + cnt = 0 + tot = 0 + pbar = tqdm(total=num_samples) + for ins in ds: + if cnt >= num_samples: + break + if is_clean_text(ins["content"]): + ins["timestamp"] = ins["timestamp"].strftime("%Y%m%d%H%M%S") + ins_str = json.dumps(ins, ensure_ascii=False) + fout.write(f"{ins_str}\n") + fout.flush() + cnt += 1 + pbar.update(1) + tot += 1 + pbar.set_postfix({"tot": tot, "valid": cnt}) + fout.close() + + +if __name__ == "__main__": + main() diff --git a/dalm/datasets/backtranslation/figs/humback.png b/dalm/datasets/backtranslation/figs/humback.png new file mode 100644 index 0000000..6eec5e4 Binary files /dev/null and b/dalm/datasets/backtranslation/figs/humback.png differ diff --git a/dalm/datasets/backtranslation/requirements_predict b/dalm/datasets/backtranslation/requirements_predict new file mode 100644 index 0000000..4de7fa0 --- /dev/null +++ b/dalm/datasets/backtranslation/requirements_predict @@ -0,0 +1,91 @@ +aiohttp==3.9.0 +aiosignal==1.3.1 +anyio==3.7.1 +async-timeout==4.0.3 +attrs==23.1.0 +certifi==2023.11.17 +charset-normalizer==3.3.2 +click==8.1.7 +cmake==3.27.7 +exceptiongroup==1.1.3 +fastapi==0.104.1 +filelock==3.13.1 +frozenlist==1.4.0 +fschat==0.2.32 +fsspec==2023.10.0 +h11==0.14.0 +httpcore==1.0.2 +httptools==0.6.1 +httpx==0.25.1 +huggingface-hub==0.19.4 +idna==3.4 +Jinja2==3.1.2 +jsonschema==4.20.0 +jsonschema-specifications==2023.11.1 +lit==17.0.5 +markdown-it-py==3.0.0 +markdown2==2.4.10 +MarkupSafe==2.1.3 +mdurl==0.1.2 +mpmath==1.3.0 +msgpack==1.0.7 +multidict==6.0.4 +networkx==3.2.1 +nh3==0.2.14 +ninja==1.11.1.1 +numpy==1.26.2 +nvidia-cublas-cu11==11.10.3.66 +nvidia-cuda-cupti-cu11==11.7.101 +nvidia-cuda-nvrtc-cu11==11.7.99 +nvidia-cuda-runtime-cu11==11.7.99 +nvidia-cudnn-cu11==8.5.0.96 +nvidia-cufft-cu11==10.9.0.58 +nvidia-curand-cu11==10.2.10.91 +nvidia-cusolver-cu11==11.4.0.1 +nvidia-cusparse-cu11==11.7.4.91 +nvidia-nccl-cu11==2.14.3 +nvidia-nvtx-cu11==11.7.91 +packaging==23.2 +pandas==2.1.3 +prompt-toolkit==3.0.41 +protobuf==4.25.1 +psutil==5.9.6 +pyarrow==14.0.1 +pydantic==1.10.13 +Pygments==2.17.0 +python-dateutil==2.8.2 +python-dotenv==1.0.0 +pytz==2023.3.post1 +PyYAML==6.0.1 +ray==2.8.0 +referencing==0.31.0 +regex==2023.10.3 +requests==2.31.0 +rich==13.7.0 +rpds-py==0.13.0 +safetensors==0.4.0 +sentencepiece==0.1.99 +shortuuid==1.0.11 +six==1.16.0 +sniffio==1.3.0 +starlette==0.27.0 +svgwrite==1.4.3 +sympy==1.12 +tiktoken==0.5.1 +tokenizers==0.15.0 +torch==2.0.1 +tqdm==4.66.1 +transformers==4.35.2 +triton==2.0.0 +typing_extensions==4.8.0 +tzdata==2023.3 +urllib3==2.1.0 +uvicorn==0.24.0.post1 +uvloop==0.19.0 +vllm==0.2.1.post1 +watchfiles==0.21.0 +wavedrom==2.0.3.post3 +wcwidth==0.2.10 +websockets==12.0 +xformers==0.0.22 +yarl==1.9.2 diff --git a/dalm/datasets/backtranslation/requirements_train.txt b/dalm/datasets/backtranslation/requirements_train.txt new file mode 100644 index 0000000..735b52a --- /dev/null +++ b/dalm/datasets/backtranslation/requirements_train.txt @@ -0,0 +1,90 @@ +accelerate==0.24.1 +aiohttp==3.8.6 +aiosignal==1.3.1 +annotated-types==0.6.0 +anyio==3.7.1 +async-timeout==4.0.3 +attrs==23.1.0 +certifi==2023.7.22 +charset-normalizer==3.3.2 +click==8.1.7 +datasets==2.15.0 +deepspeed==0.12.3 +dill==0.3.7 +einops==0.7.0 +exceptiongroup==1.1.3 +fastapi==0.104.1 +filelock==3.13.1 +flash-attn==2.3.3 +frozenlist==1.4.0 +fschat==0.2.32 +fsspec==2023.10.0 +h11==0.14.0 +hjson==3.1.0 +httpcore==1.0.2 
+httpx==0.25.1 +huggingface-hub==0.19.4 +idna==3.4 +Jinja2==3.1.2 +markdown-it-py==3.0.0 +markdown2==2.4.10 +MarkupSafe==2.1.3 +mdurl==0.1.2 +mpmath==1.3.0 +multidict==6.0.4 +multiprocess==0.70.15 +networkx==3.2.1 +nh3==0.2.14 +ninja==1.11.1.1 +numpy==1.26.2 +nvidia-cublas-cu12==12.1.3.1 +nvidia-cuda-cupti-cu12==12.1.105 +nvidia-cuda-nvrtc-cu12==12.1.105 +nvidia-cuda-runtime-cu12==12.1.105 +nvidia-cudnn-cu12==8.9.2.26 +nvidia-cufft-cu12==11.0.2.54 +nvidia-curand-cu12==10.3.2.106 +nvidia-cusolver-cu12==11.4.5.107 +nvidia-cusparse-cu12==12.1.0.106 +nvidia-nccl-cu12==2.18.1 +nvidia-nvjitlink-cu12==12.3.101 +nvidia-nvtx-cu12==12.1.105 +packaging==23.2 +pandas==2.1.3 +prompt-toolkit==3.0.41 +psutil==5.9.6 +py-cpuinfo==9.0.0 +pyarrow==14.0.1 +pyarrow-hotfix==0.5 +pydantic==1.10.13 +pydantic_core==2.14.3 +Pygments==2.16.1 +pynvml==11.5.0 +python-dateutil==2.8.2 +pytz==2023.3.post1 +PyYAML==6.0.1 +regex==2023.10.3 +requests==2.31.0 +rich==13.7.0 +safetensors==0.4.0 +sentencepiece==0.1.99 +shortuuid==1.0.11 +six==1.16.0 +sniffio==1.3.0 +starlette==0.27.0 +svgwrite==1.4.3 +sympy==1.12 +tiktoken==0.5.1 +tokenizers==0.15.0 +torch==2.1.0 +tqdm==4.66.1 +transformers==4.35.2 +triton==2.1.0 +typing_extensions==4.8.0 +tzdata==2023.3 +urllib3==2.1.0 +uvicorn==0.24.0.post1 +wavedrom==2.0.3.post3 +wcwidth==0.2.10 +xxhash==3.4.1 +yarl==1.9.2 diff --git a/dalm/datasets/backtranslation/scripts/falcon.sh b/dalm/datasets/backtranslation/scripts/falcon.sh new file mode 100644 index 0000000..6a0a9f4 --- /dev/null +++ b/dalm/datasets/backtranslation/scripts/falcon.sh @@ -0,0 +1,15 @@ +#!/usr/bin/bash + +#SBATCH --job-name=falcon +#SBATCH --output=logs/%x-%j.log +#SBATCH --error=logs/%x-%j.log + +#SBATCH --partition=Partition +#SBATCH --cpus-per-task=8 +#SBATCH -n 1 +#SBATCH -N 1 + + +source ~/anaconda3/bin/activate torch + +python data/unlabelled/falcon_refinedweb.py diff --git a/dalm/datasets/backtranslation/scripts/predict_instruction.sh b/dalm/datasets/backtranslation/scripts/predict_instruction.sh new file mode 100644 index 0000000..28b8720 --- /dev/null +++ b/dalm/datasets/backtranslation/scripts/predict_instruction.sh @@ -0,0 +1,23 @@ +# TODO: to add inference executive file with multiple GPUs +#!/usr/bin/bash + +export CUDA_VISIBLE_DEVICES=7 + +num_nodes=1 +num_gpu_per_node=1 + +bsz=4 +model_path="/dev/shm/tzhu/outputs/forward_model_on_seed_data_scheduled" + +bsz_per_dev=$(echo "${bsz} / ${num_nodes} / ${num_gpu_per_node}" | bc) + +torchrun \ + --nnodes ${num_nodes} \ + --nproc_per_node ${num_gpu_per_node} \ + -m src.core.predict \ + --mixed_precision="bf16" \ + --model_path=${model_path} \ + --data_filepath="data/seed/seed.jsonl" \ + --save_filepath="outputs/seed_pred.jsonl" \ + --prompt_column_name="instruction" \ + --batch_size=${bsz_per_dev} diff --git a/dalm/datasets/backtranslation/scripts/predict_instruction_vllm.sh b/dalm/datasets/backtranslation/scripts/predict_instruction_vllm.sh new file mode 100644 index 0000000..93ae6f8 --- /dev/null +++ b/dalm/datasets/backtranslation/scripts/predict_instruction_vllm.sh @@ -0,0 +1,31 @@ +#!/usr/bin/bash + +export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" + +# model_path="/dev/shm/tzhu/outputs/forward_model_on_seed_data_scheduled" +# data_filepath="data/seed/seed.jsonl" +# save_filepath="outputs/seed_pred.jsonl" +# prompt_column_name="instruction" + +# model_path="/dev/shm/tzhu/outputs/backward_model_on_seed_data_scheduled" +# data_filepath="data/unlabelled/sampled.jsonl" +# save_filepath="outputs/sampled_unlabelled_gen_instruction.jsonl" +# 
prompt_column_name="content" + +# model_path="/dev/shm/tzhu/Humback/models/m1_with_diff_sys_prompt" +# data_filepath="data/alpaca_eval/alpaca_eval.jsonl" +# save_filepath="outputs/m1_alpaca_eval_pred.jsonl" +# prompt_column_name="instruction" + +model_path="/dev/shm/tzhu/Humback/models/m0" +data_filepath="data/alpaca_eval/alpaca_eval.jsonl" +save_filepath="outputs/m0_alpaca_eval_pred.jsonl" +prompt_column_name="instruction" + +python -m src.core.predict_vllm \ + --reverse \ + --model_path=${model_path} \ + --data_filepath=${data_filepath} \ + --save_filepath=${save_filepath} \ + --prompt_column_name=${prompt_column_name} \ + --tensor_parallel_size=8 diff --git a/dalm/datasets/backtranslation/scripts/self_aug.sh b/dalm/datasets/backtranslation/scripts/self_aug.sh new file mode 100755 index 0000000..c5562b8 --- /dev/null +++ b/dalm/datasets/backtranslation/scripts/self_aug.sh @@ -0,0 +1,24 @@ +#!/usr/bin/bash + +export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" + +model_path="/dev/shm/Humpback/outputs/backward_model_on_seed_data_scheduled" +data_filepath="data/unlabelled/falcon-refinedweb-sampled.jsonl" +save_filepath="/dev/shm/Humpback/outputs/m1/unlabelled_gen_instruction.jsonl" +prompt_column_name="content" + +# python -m src.core.predict_vllm \ +# --reverse \ +# --model_path=${model_path} \ +# --data_filepath=${data_filepath} \ +# --save_filepath=${save_filepath} \ +# --prompt_column_name=${prompt_column_name} \ +# --tensor_parallel_size=8 + +python src/core/predict_vllm.py \ + --reverse \ + --model_path=${model_path} \ + --data_filepath=${data_filepath} \ + --save_filepath=${save_filepath} \ + --prompt_column_name=${prompt_column_name} \ + --tensor_parallel_size=8 diff --git a/dalm/datasets/backtranslation/scripts/self_curation.sh b/dalm/datasets/backtranslation/scripts/self_curation.sh new file mode 100755 index 0000000..6c7a259 --- /dev/null +++ b/dalm/datasets/backtranslation/scripts/self_curation.sh @@ -0,0 +1,34 @@ +#!/usr/bin/bash + +export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" + +model_path="/dev/shm/Humpback/models/m1_strict_score_matching_2400steps" +unlabelled_data_filepath="/dev/shm/Humpback/outputs/m1/unlabelled_gen_instruction.jsonl" +middle_save_filepath="/dev/shm/Humpback/outputs/m1/mid1_unlabelled_data_for_curation.jsonl" +predicted_save_filepath="/dev/shm/Humpback/outputs/m1/mid2_unlabelled_data_curation_predicted.jsonl" +curation_results_save_filepath="/dev/shm/Humpback/outputs/m1/mid3_unlabelled_data_curation_results.jsonl" +curated_save_filepath="/dev/shm/Humpback/outputs/m1/unlabelled_curated_data.jsonl" + +echo "(1/3) => Build dataset for curation ..." +python -m src.core.build_curation_dataset \ + --data_filepath=${unlabelled_data_filepath} \ + --save_filepath=${middle_save_filepath} \ + --curation_prompt_filepath="data/prompts/self_curation.txt" \ + --generated_instruction_column_name="response" \ + --response_column_name="prompt" + +echo "(2/3) => Predict curation results ..." +python -m src.core.predict_vllm \ + --model_path=${model_path} \ + --data_filepath=${middle_save_filepath} \ + --save_filepath=${predicted_save_filepath} \ + --prompt_column_name="prompt" \ + --tensor_parallel_size=8 + +echo "(3/3) => Curate results ..." 
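+# Keep only pairs whose curation response ends with "Score: 5"
+# (the score parsing and instruction-length filters live in src/core/filter_curation_results.py).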
+python -m src.core.filter_curation_results \ + --data_filepath=${predicted_save_filepath} \ + --middle_save_filepath=${curation_results_save_filepath} \ + --save_filepath=${curated_save_filepath} \ + --curation_response_column_name="response" \ + --score=5 diff --git a/dalm/datasets/backtranslation/scripts/train_backward_Myx.sh b/dalm/datasets/backtranslation/scripts/train_backward_Myx.sh new file mode 100755 index 0000000..b4c10c0 --- /dev/null +++ b/dalm/datasets/backtranslation/scripts/train_backward_Myx.sh @@ -0,0 +1,44 @@ +#!/usr/bin/bash + +num_nodes=1 +num_gpu_per_node=8 + +bsz=32 +output_dir="/dev/shm/Humpback/outputs/backward_model_on_seed_data_scheduled" + +mkdir -p $output_dir +bsz_per_dev=$(echo "${bsz} / ${num_nodes} / ${num_gpu_per_node}" | bc) + +python -m torch.distributed.run \ + --nnodes ${num_nodes} \ + --nproc_per_node ${num_gpu_per_node} \ + -m src.core.train_flash_attn \ + --reverse \ + --deepspeed conf/ds_zero2default.json \ + --model_name_or_path "mistralai/Mistral-7B-v0.1" \ + --data_path "data/seed/seed.jsonl" \ + --per_device_train_batch_size ${bsz_per_dev} \ + --per_device_eval_batch_size ${bsz_per_dev} \ + --adam_beta1 0.9 \ + --adam_beta2 0.95 \ + --learning_rate "1e-5" \ + --final_lr "9e-6" \ + --weight_decay 0.1 \ + --max_grad_norm 1.0 \ + --evaluation_strategy "no" \ + --logging_strategy steps \ + --logging_steps 1 \ + --max_steps 500 \ + --save_strategy steps \ + --save_steps 100 \ + --save_total_limit 1 \ + --output_dir ${output_dir} \ + --overwrite_output_dir \ + --ddp_timeout 30000 \ + --logging_first_step True \ + --bf16 True \ + --ddp_find_unused_parameters False \ + --gradient_checkpointing \ + --report_to none \ + --log_level info \ + --lazy_preprocess True diff --git a/dalm/datasets/backtranslation/scripts/train_backward_Myx_slurm.sh b/dalm/datasets/backtranslation/scripts/train_backward_Myx_slurm.sh new file mode 100644 index 0000000..1f5bfb6 --- /dev/null +++ b/dalm/datasets/backtranslation/scripts/train_backward_Myx_slurm.sh @@ -0,0 +1,74 @@ +#!/usr/bin/bash + +#SBATCH --job-name=backward +#SBATCH --output=logs/%x-%j.log +#SBATCH --error=logs/%x-%j.log + +#SBATCH --partition=Partition +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=32 +#SBATCH --mem=128G +#SBATCH -x SH-IDCA1404-10-140-54-116 + +#SBATCH --nodes=1 +#SBATCH --gres=gpu:4 + + +source ~/anaconda3/bin/activate torch + +num_nodes=1 +num_gpu_per_node=4 + +bsz=32 +output_dir="outputs/backward_model_on_seed_data_scheduled_ds1" + +mkdir -p $output_dir +bsz_per_dev=$(echo "${bsz} / ${num_nodes} / ${num_gpu_per_node}" | bc) + +nodes=( $( scontrol show hostnames $SLURM_JOB_NODELIS ) ) +nodes_array=($nodes) +head_node=${nodes_array[0]} +head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address) +echo "Node: $head_node" +echo "Node IP: $head_node_ip" + +srun torchrun \ + --nnodes ${num_nodes} \ + --nproc_per_node ${num_gpu_per_node} \ + --node_rank $SLURM_NODEID \ + --rdzv_id $RANDOM \ + --rdzv_backend c10d \ + --rdzv_endpoint $head_node:29518 \ + -m src.core.train_flash_attn \ + --reverse \ + --deepspeed conf/ds_zero2default.json \ + --model_name_or_path /home/zhutong/Llama-2-7b-hf \ + --data_path data/seed/seed.jsonl \ + --per_device_train_batch_size ${bsz_per_dev} \ + --per_device_eval_batch_size ${bsz_per_dev} \ + --num_train_epochs 15 \ + --adam_beta1 0.9 \ + --adam_beta2 0.95 \ + --learning_rate "1e-5" \ + --final_lr "9e-6" \ + --weight_decay 0.1 \ + --max_grad_norm 1.0 \ + --evaluation_strategy "no" \ + --logging_strategy steps \ + --logging_steps 1 \ + 
--save_strategy epoch \ + --save_total_limit 1 \ + --output_dir ${output_dir} \ + --overwrite_output_dir \ + --ddp_timeout 30000 \ + --logging_first_step True \ + --bf16 True \ + --tf32 True \ + --ddp_find_unused_parameters False \ + --gradient_checkpointing \ + --report_to none \ + --log_level info \ + --lazy_preprocess True + + # --fsdp "full_shard auto_wrap" \ + # --fsdp_config conf/fsdp_config.json \ diff --git a/dalm/datasets/backtranslation/scripts/train_seed.sh b/dalm/datasets/backtranslation/scripts/train_seed.sh new file mode 100755 index 0000000..d3cb0e7 --- /dev/null +++ b/dalm/datasets/backtranslation/scripts/train_seed.sh @@ -0,0 +1,57 @@ +#!/usr/bin/bash + +num_nodes=1 +num_gpu_per_node=8 + +bsz=32 +# max_steps=500 +# data_path="data/seed/seed.jsonl" +# output_dir="/dev/shm/tzhu/outputs/forward_model_on_seed_data_scheduled" +# max_steps=765 # 21,301 curated instances (score=5) + 3,200 seed data for M1 training +# data_path="data/curated/m1.jsonl" +# output_dir="/dev/shm/tzhu/Humback/models/m1_with_diff_sys_prompt" + +num_nodes=1 +num_gpu_per_node=8 +bsz=32 + +max_steps=2400 +data_path=data/seed/seed.jsonl +output_dir="/dev/shm/Humpback/models/m1_strict_score_matching_2400steps" + +mkdir -p $output_dir +bsz_per_dev=$(echo "${bsz} / ${num_nodes} / ${num_gpu_per_node}" | bc) + +python -m torch.distributed.run \ + --nnodes ${num_nodes} \ + --nproc_per_node ${num_gpu_per_node} \ + -m src.core.train_flash_attn \ + --deepspeed conf/ds_zero2default.json \ + --model_name_or_path "mistralai/Mistral-7B-v0.1" \ + --data_path ${data_path} \ + --per_device_train_batch_size ${bsz_per_dev} \ + --per_device_eval_batch_size ${bsz_per_dev} \ + --adam_beta1 0.9 \ + --adam_beta2 0.95 \ + --learning_rate "1e-5" \ + --final_lr "9e-6" \ + --weight_decay 0.1 \ + --max_grad_norm 1.0 \ + --evaluation_strategy "no" \ + --logging_strategy steps \ + --logging_steps 1 \ + --output_dir ${output_dir} \ + --overwrite_output_dir \ + --ddp_timeout 30000 \ + --logging_first_step True \ + --bf16 True \ + --tf32 True \ + --ddp_find_unused_parameters False \ + --gradient_checkpointing \ + --report_to none \ + --log_level info \ + --lazy_preprocess True \ + --save_total_limit 1 \ + --max_steps ${max_steps} \ + --save_strategy steps \ + --save_steps 100 diff --git a/dalm/datasets/backtranslation/src/__init__.py b/dalm/datasets/backtranslation/src/__init__.py new file mode 100644 index 0000000..c26fa2b --- /dev/null +++ b/dalm/datasets/backtranslation/src/__init__.py @@ -0,0 +1 @@ +import src.utils.template # noqa: F401 diff --git a/dalm/datasets/backtranslation/src/core/build_curation_dataset.py b/dalm/datasets/backtranslation/src/core/build_curation_dataset.py new file mode 100644 index 0000000..aba671f --- /dev/null +++ b/dalm/datasets/backtranslation/src/core/build_curation_dataset.py @@ -0,0 +1,42 @@ +import argparse + +from tqdm import tqdm + +from src.utils.io import dump_jsonlines, load_jsonlines + + +def main(args): + prompt_template = open(args.curation_prompt_filepath, "r").read().strip() + data = load_jsonlines(args.data_filepath) + results = [] + for ins in tqdm(data, desc="Building curation dataset"): + generated_instruction = ins[args.generated_instruction_column_name] + response = ins[args.response_column_name] + prompt = prompt_template.format( + generated_instruction=generated_instruction, + response=response, + ) + results.append( + { + "prompt": prompt, + "generated_instruction": generated_instruction, + "response": response, + } + ) + dump_jsonlines(results, args.save_filepath) + + +if __name__ == 
"__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--data_filepath", type=str) + parser.add_argument("--save_filepath", type=str) + parser.add_argument( + "--curation_prompt_filepath", type=str, default="data/prompts/self_curation.txt" + ) + parser.add_argument( + "--generated_instruction_column_name", type=str, default="response" + ) + parser.add_argument("--response_column_name", type=str, default="prompt") + args = parser.parse_args() + + main(args) diff --git a/dalm/datasets/backtranslation/src/core/filter_curation_results.py b/dalm/datasets/backtranslation/src/core/filter_curation_results.py new file mode 100644 index 0000000..657726f --- /dev/null +++ b/dalm/datasets/backtranslation/src/core/filter_curation_results.py @@ -0,0 +1,83 @@ +import argparse +import re +import statistics as sts +from collections import Counter + +from tqdm import tqdm + +from src.utils.io import dump_jsonlines, load_jsonlines + + +def main(args): + data = load_jsonlines(args.data_filepath) + regex = re.compile(r"[Ss]core:\s*(\d+)$") + tgt_scores = list(map(int, args.scores.split(","))) + + scores = [] + qualified_results = [] + all_results = [] + instruction_lens = [] + response_lens = [] + for ins in tqdm(data, desc="Filtering curation results"): + raw = ins["raw"] + curation_response = ins[args.curation_response_column_name] + score_matched = regex.search(curation_response) + if score_matched: + score = int(score_matched.group(1)) + else: + score = None + + scores.append(str(score)) + + all_results.append( + { + "instruction": raw["generated_instruction"], + "response": raw["response"], + "score": score, + } + ) + if isinstance(score, int) and score is not None and score in tgt_scores: + if ( + args.min_instruction_len + <= len(raw["generated_instruction"]) + <= args.max_instruction_len + ): + qualified_results.append( + { + "instruction": raw["generated_instruction"], + "response": raw["response"], + "score": score, + } + ) + instruction_lens.append(len(raw["generated_instruction"])) + response_lens.append(len(raw["response"])) + + dump_jsonlines(all_results, args.middle_save_filepath) + dump_jsonlines(qualified_results, args.save_filepath) + + print(f"Scores: {Counter(scores).most_common()}") + print( + f"Number of qualified results (scores={args.scores}): {len(qualified_results)}/{len(all_results)}" + ) + print( + f"instruction len: {sts.mean(instruction_lens):.0f} Β± {sts.stdev(instruction_lens):.0f}" + ) + print( + f"response len: {sts.mean(response_lens):.0f} Β± {sts.stdev(response_lens):.0f}" + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--data_filepath", type=str) + parser.add_argument("--middle_save_filepath", type=str) + parser.add_argument("--save_filepath", type=str) + parser.add_argument("--curation_response_column_name", type=str, default="response") + parser.add_argument( + "--scores", type=str, default="5", help="scores separated in `,`. e.g. `3,4,5`." 
+ ) + parser.add_argument("--min_instruction_len", type=int, default=10) + parser.add_argument("--max_instruction_len", type=int, default=800) + args = parser.parse_args() + + main(args) diff --git a/dalm/datasets/backtranslation/src/core/predict.py b/dalm/datasets/backtranslation/src/core/predict.py new file mode 100644 index 0000000..1e23041 --- /dev/null +++ b/dalm/datasets/backtranslation/src/core/predict.py @@ -0,0 +1,96 @@ +import argparse + +import torch +from accelerate import Accelerator +from torch.utils.data import DataLoader +from tqdm import tqdm +from transformers import AutoModelForCausalLM, AutoTokenizer + +from src.data import CollateFnWithTokenization, InferenceDataset +from src.utils.io import dump_jsonlines, load_jsonlines + + +@torch.inference_mode() +def main(args): + accelerator = Accelerator(mixed_precision=args.mixed_precision) + + tokenizer = AutoTokenizer.from_pretrained(args.model_path, padding_side="left") + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + tokenizer.pad_token_id = tokenizer.eos_token_id + model = AutoModelForCausalLM.from_pretrained(args.model_path) + model.half() + + # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch + # on a small vocab and want a smaller embedding size, remove this test. + embedding_size = model.get_input_embeddings().weight.shape[0] + if len(tokenizer) > embedding_size: + model.resize_token_embeddings(len(tokenizer)) + + dataset = InferenceDataset( + load_jsonlines(args.data_filepath), + content_name=args.prompt_column_name, + reverse=args.reverse, + ) + data_loader = DataLoader( + dataset, + batch_size=args.batch_size, + collate_fn=CollateFnWithTokenization(tokenizer), + shuffle=False, + ) + + model, data_loader = accelerator.prepare(model, data_loader) + + results = [] + + for batch in tqdm(data_loader): + output_ids = accelerator.unwrap_model(model).generate( + **batch, + do_sample=True if args.temperature > 1e-5 else False, + temperature=args.temperature, + repetition_penalty=args.repetition_penalty, + max_new_tokens=args.max_new_tokens, + ) + + output_ids = accelerator.pad_across_processes( + output_ids, dim=1, pad_index=tokenizer.pad_token_id + ) + input_ids = accelerator.gather(batch["input_ids"]).cpu().numpy() + output_ids = accelerator.gather(output_ids).cpu().numpy() + + decoded_inputs = tokenizer.batch_decode(input_ids, skip_special_tokens=True) + decoded_preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True) + for decoded_input, decoded_pred in zip(decoded_inputs, decoded_preds): + results.append( + { + "prompt": decoded_input, + "response": decoded_pred, + } + ) + results.append(decoded_pred) + + accelerator.wait_for_everyone() + if accelerator.is_main_process: + dump_jsonlines(results, args.save_filepath) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--temperature", type=float, default=0.7) + parser.add_argument("--top_p", type=float, default=0.9) + parser.add_argument("--repetition_penalty", type=float, default=1.0) + parser.add_argument("--max_new_tokens", type=int, default=512) + parser.add_argument("--data_filepath", type=str) + parser.add_argument("--save_filepath", type=str) + parser.add_argument("--prompt_column_name", type=str, default="instruction") + parser.add_argument("--reverse", action="store_true") + parser.add_argument("--model_path", type=str) + parser.add_argument("--batch_size", type=int, default=2) + parser.add_argument("--mixed_precision", 
type=str, default="bf16") + args = parser.parse_args() + + # Reset default repetition penalty for T5 models. + if "t5" in args.model_path and args.repetition_penalty == 1.0: + args.repetition_penalty = 1.2 + + main(args) diff --git a/dalm/datasets/backtranslation/src/core/predict_vllm.py b/dalm/datasets/backtranslation/src/core/predict_vllm.py new file mode 100644 index 0000000..cbbc153 --- /dev/null +++ b/dalm/datasets/backtranslation/src/core/predict_vllm.py @@ -0,0 +1,77 @@ + +import os +import sys + +sys.path.append(os.getcwd()) + +import argparse + +from vllm import LLM, SamplingParams + +from src.data import InferenceDataset +from src.utils.io import dump_jsonlines, load_jsonlines + + +def main(args): + print("LLM") + llm = LLM( + args.model_path, + tensor_parallel_size=args.tensor_parallel_size, + dtype=args.dtype, + ) + print("Sampling Params") + sampling_params = SamplingParams( + temperature=args.temperature, + top_p=args.top_p, + max_tokens=args.max_new_tokens, + ) + + print("Loading data") + + raw_data = load_jsonlines(args.data_filepath) + data = InferenceDataset( + raw_data, + content_name=args.prompt_column_name, + reverse=args.reverse, + ) + prompts = data.get_all() + + print("Generating") + + # 00:25 / 100 prompts on one GPU + results = llm.generate(prompts, use_tqdm=True, sampling_params=sampling_params) + + # 07:24 / 100 prompts on one GPU + # results = [] + # for prompt in tqdm(prompts): + # result = llm.generate(prompt, use_tqdm=False, sampling_params=sampling_params) + # results.append(result) + + dump_jsonl = [] + for raw, result in zip(raw_data, results): + dump_jsonl.append( + { + "raw": raw, + "full_prompt": result.prompt, + "prompt": raw[args.prompt_column_name], + "response": result.outputs[0].text, + } + ) + dump_jsonlines(dump_jsonl, args.save_filepath) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--temperature", type=float, default=0.7) + parser.add_argument("--top_p", type=float, default=0.9) + parser.add_argument("--max_new_tokens", type=int, default=512) + parser.add_argument("--data_filepath", type=str) + parser.add_argument("--save_filepath", type=str) + parser.add_argument("--prompt_column_name", type=str, default="instruction") + parser.add_argument("--reverse", action="store_true") + parser.add_argument("--model_path", type=str) + parser.add_argument("--dtype", type=str, default="bfloat16") + parser.add_argument("--tensor_parallel_size", type=int, default=1) + args = parser.parse_args() + + main(args) diff --git a/dalm/datasets/backtranslation/src/core/train.py b/dalm/datasets/backtranslation/src/core/train.py new file mode 100644 index 0000000..3412bd1 --- /dev/null +++ b/dalm/datasets/backtranslation/src/core/train.py @@ -0,0 +1,69 @@ +""" +The code is borrowed from https://github.com/lm-sys/FastChat/blob/main/fastchat/train/train.py + +Here is the original license: + https://github.com/lm-sys/FastChat/blob/main/LICENSE +""" + +import math +import pathlib + +import transformers + +from src.core.trainer import ScheduledTrainer +from src.data import make_supervised_data_module +from src.utils.config import DataArguments, ModelArguments, TrainingArguments +from src.utils.io import safe_save_model_for_hf_trainer + + +def train(): + parser = transformers.HfArgumentParser( + (ModelArguments, DataArguments, TrainingArguments) + ) + model_args, data_args, training_args = parser.parse_args_into_dataclasses() + + # Set RoPE scaling factor + config = transformers.AutoConfig.from_pretrained( + 
model_args.model_name_or_path, + cache_dir=training_args.cache_dir, + ) + orig_ctx_len = getattr(config, "max_position_embeddings", None) + if orig_ctx_len and training_args.model_max_length > orig_ctx_len: + scaling_factor = float(math.ceil(training_args.model_max_length / orig_ctx_len)) + config.rope_scaling = {"type": "linear", "factor": scaling_factor} + config.use_cache = False + + # Load model and tokenizer + model = transformers.AutoModelForCausalLM.from_pretrained( + model_args.model_name_or_path, + config=config, + cache_dir=training_args.cache_dir, + use_flash_attention_2=True, + ) + tokenizer = transformers.AutoTokenizer.from_pretrained( + model_args.model_name_or_path, + cache_dir=training_args.cache_dir, + model_max_length=training_args.model_max_length, + padding_side="right", + use_fast=False, + ) + tokenizer.pad_token = tokenizer.unk_token + + # Load data + data_module = make_supervised_data_module(tokenizer=tokenizer, data_args=data_args) + + # Start trainner + trainer = ScheduledTrainer( + model=model, tokenizer=tokenizer, args=training_args, **data_module + ) + if list(pathlib.Path(training_args.output_dir).glob("checkpoint-*")): + trainer.train(resume_from_checkpoint=True) + else: + trainer.train() + model.config.use_cache = True + trainer.save_state() + safe_save_model_for_hf_trainer(trainer=trainer, output_dir=training_args.output_dir) + + +if __name__ == "__main__": + train() diff --git a/dalm/datasets/backtranslation/src/core/train_flash_attn.py b/dalm/datasets/backtranslation/src/core/train_flash_attn.py new file mode 100644 index 0000000..8d9e208 --- /dev/null +++ b/dalm/datasets/backtranslation/src/core/train_flash_attn.py @@ -0,0 +1,13 @@ +# from fastchat.train.llama_flash_attn_monkey_patch import ( +# replace_llama_attn_with_flash_attn, +# ) + +# replace_llama_attn_with_flash_attn() + +import os +os.environ['TRANSFORMERS_CACHE'] = '/dev/shm/huggingface/' + +from src.core.train import train # noqa: E402 + +if __name__ == "__main__": + train() diff --git a/dalm/datasets/backtranslation/src/core/trainer.py b/dalm/datasets/backtranslation/src/core/trainer.py new file mode 100644 index 0000000..c6f0f33 --- /dev/null +++ b/dalm/datasets/backtranslation/src/core/trainer.py @@ -0,0 +1,77 @@ +from functools import partial + +import torch +from torch.optim.lr_scheduler import LambdaLR +from transformers import Trainer + + +def _get_linear_schedule_with_warmup_lr_lambda( + current_step: int, + *, + num_warmup_steps: int, + num_training_steps: int, + final_lr: float = 1e-6, +): + if current_step < num_warmup_steps: + return float(current_step) / float(max(1, num_warmup_steps)) + return max( + final_lr, + float(num_training_steps - current_step) + / float(max(1, num_training_steps - num_warmup_steps)), + ) + + +def get_linear_schedule_with_warmup( + optimizer, + num_warmup_steps, + num_training_steps, + last_epoch=-1, + final_lr: float = 1e-6, +): + """ + Create a schedule with a learning rate that decreases linearly from the initial lr set in the optimizer to 0, after + a warmup period during which it increases linearly from 0 to the initial lr set in the optimizer. + + Args: + optimizer ([`~torch.optim.Optimizer`]): + The optimizer for which to schedule the learning rate. + num_warmup_steps (`int`): + The number of steps for the warmup phase. + num_training_steps (`int`): + The total number of training steps. + last_epoch (`int`, *optional*, defaults to -1): + The index of the last epoch when resuming training. 
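+        final_lr (`float`, *optional*, defaults to 1e-6):
+            Floor applied to the decay multiplier after warmup. Note that `LambdaLR` multiplies this
+            value by the optimizer's initial learning rate, so it bounds the multiplier rather than
+            setting an absolute final learning rate.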
+ + Return: + `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule. + """ + + lr_lambda = partial( + _get_linear_schedule_with_warmup_lr_lambda, + num_warmup_steps=num_warmup_steps, + num_training_steps=num_training_steps, + final_lr=final_lr, + ) + return LambdaLR(optimizer, lr_lambda, last_epoch) + + +class ScheduledTrainer(Trainer): + def create_scheduler( + self, num_training_steps: int, optimizer: torch.optim.Optimizer = None + ): + """ + Setup the scheduler. The optimizer of the trainer must have been set up either before this method is called or + passed as an argument. + + Args: + num_training_steps (int): The number of training steps to do. + """ + if self.lr_scheduler is None: + self.lr_scheduler = get_linear_schedule_with_warmup( + self.optimizer if optimizer is None else optimizer, + num_warmup_steps=self.args.get_warmup_steps(num_training_steps), + num_training_steps=num_training_steps, + final_lr=self.args.final_lr, + ) + self._created_lr_scheduler = True + return self.lr_scheduler diff --git a/dalm/datasets/backtranslation/src/data.py b/dalm/datasets/backtranslation/src/data.py new file mode 100644 index 0000000..8ff4caf --- /dev/null +++ b/dalm/datasets/backtranslation/src/data.py @@ -0,0 +1,261 @@ +import torch +from fastchat.conversation import Conversation, SeparatorStyle, get_conv_template +from torch.utils.data import Dataset +from transformers import PreTrainedTokenizer + +from src.utils.constant import IGNORE_TOKEN_ID +from src.utils.io import load_jsonlines +from src.utils.print import rank0_print + + +def preprocess( + sources, + tokenizer: PreTrainedTokenizer, + reverse: bool = False, +) -> dict: + if reverse: + conv = get_conv_template("vicuna_v1.1_reverse") + aug_conv = conv + else: + conv = get_conv_template("vicuna_v1.1_seed") + aug_conv = get_conv_template("vicuna_v1.1_aug") + assert conv.roles == aug_conv.roles + assert conv.sep == aug_conv.sep + assert conv.sep2 == aug_conv.sep2 + + roles = {"human": conv.roles[0], "gpt": conv.roles[1]} + + # Apply prompt templates + conversations = [] + for i, source in enumerate(sources): + conv_src = source["source"] + if conv_src == "aug": + _conv = aug_conv + else: + _conv = conv + if roles[source["conversations"][0]["from"]] != _conv.roles[0]: + # Skip the first one if it is not from human + source["conversations"] = source["conversations"][1:] + + _conv.messages = [] + for j, sentence in enumerate(source["conversations"]): + role = roles[sentence["from"]] + assert role == _conv.roles[j % 2], f"{i}" + _conv.append_message(role, sentence["value"]) + conversations.append(_conv.get_prompt()) + + # Tokenize conversations + input_ids = tokenizer( + conversations, + return_tensors="pt", + padding="max_length", + max_length=tokenizer.model_max_length, + truncation=True, + ).input_ids + targets = input_ids.clone() + + assert conv.sep_style == SeparatorStyle.ADD_COLON_TWO + + # Mask targets. Only compute loss on the assistant outputs. + sep = conv.sep + conv.roles[1] + ": " + for conversation, target in zip(conversations, targets): + total_len = int(target.ne(tokenizer.pad_token_id).sum()) + + turns = conversation.split(conv.sep2) + cur_len = 1 + target[:cur_len] = IGNORE_TOKEN_ID + for i, turn in enumerate(turns): + if turn == "": + break + turn_len = len(tokenizer(turn).input_ids) + + parts = turn.split(sep) + if len(parts) != 2: + break + parts[0] += sep + # "-2" is hardcoded for the LLaMA tokenizer to make the offset correct. 
+ instruction_len = len(tokenizer(parts[0]).input_ids) - 2 + + # Ignore the user instructions + target[cur_len : cur_len + instruction_len] = IGNORE_TOKEN_ID + cur_len += turn_len + + target[cur_len:] = IGNORE_TOKEN_ID + + if cur_len < tokenizer.model_max_length: + if cur_len != total_len: + target[:] = IGNORE_TOKEN_ID + rank0_print( + f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}." + f" (ignored)" + ) + + return dict( + input_ids=input_ids, + labels=targets, + attention_mask=input_ids.ne(tokenizer.pad_token_id), + ) + + +def convert_inst_resp_pairs_into_fastchat(ins: dict, reverse: bool = False) -> dict: + inst = ins["instruction"] if not reverse else ins["response"] + resp = ins["response"] if not reverse else ins["instruction"] + if "score" in ins: + source = "aug" + else: + source = "seed" + return { + "id": "", + "source": source, + "conversations": [ + {"from": "human", "value": inst}, + {"from": "gpt", "value": resp}, + ], + } + + +class SupervisedDataset(Dataset): + """Dataset for supervised fine-tuning.""" + + def __init__( + self, + raw_data, + tokenizer: PreTrainedTokenizer, + reverse: bool = False, + ): + super(SupervisedDataset, self).__init__() + + rank0_print("Formatting inputs...") + sources = [ + convert_inst_resp_pairs_into_fastchat(example, reverse=reverse) + for example in raw_data + ] + data_dict = preprocess(sources, tokenizer, reverse=reverse) + + self.input_ids = data_dict["input_ids"] + self.labels = data_dict["labels"] + self.attention_mask = data_dict["attention_mask"] + + def __len__(self): + return len(self.input_ids) + + def __getitem__(self, i) -> dict[str, torch.Tensor]: + return dict( + input_ids=self.input_ids[i], + labels=self.labels[i], + attention_mask=self.attention_mask[i], + ) + + +class LazySupervisedDataset(Dataset): + """Dataset for supervised fine-tuning.""" + + def __init__( + self, + raw_data, + tokenizer: PreTrainedTokenizer, + reverse: bool = False, + ): + super(LazySupervisedDataset, self).__init__() + self.tokenizer = tokenizer + self.reverse = reverse + + rank0_print("Formatting inputs...Skip in lazy mode") + self.tokenizer = tokenizer + self.raw_data = raw_data + self.cached_data_dict = {} + + def __len__(self): + return len(self.raw_data) + + def __getitem__(self, i) -> dict[str, torch.Tensor]: + if i in self.cached_data_dict: + return self.cached_data_dict[i] + + ret = preprocess( + [ + convert_inst_resp_pairs_into_fastchat( + self.raw_data[i], reverse=self.reverse + ) + ], + self.tokenizer, + reverse=self.reverse, + ) + ret = dict( + input_ids=ret["input_ids"][0], + labels=ret["labels"][0], + attention_mask=ret["attention_mask"][0], + ) + self.cached_data_dict[i] = ret + + return ret + + +def make_supervised_data_module(tokenizer: PreTrainedTokenizer, data_args) -> dict: + """Make dataset and collator for supervised fine-tuning.""" + dataset_cls = ( + LazySupervisedDataset if data_args.lazy_preprocess else SupervisedDataset + ) + rank0_print("Loading data...") + + train_data = load_jsonlines(data_args.data_path) + train_dataset = dataset_cls( + train_data, tokenizer=tokenizer, reverse=data_args.reverse + ) + + if data_args.eval_data_path: + eval_data = load_jsonlines(data_args.eval_data_path) + eval_dataset = dataset_cls( + eval_data, tokenizer=tokenizer, reverse=data_args.reverse + ) + else: + eval_dataset = None + + return dict(train_dataset=train_dataset, eval_dataset=eval_dataset) + + +class InferenceDataset(Dataset): + def __init__( + self, + data, + content_name: str = "content", + reverse: bool = False, + ): + 
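+        # Renders each raw record into a single-turn prompt using the selected conversation template (the reverse flag switches to the response-to-instruction template used for backward-model inference).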
self.data = data + self.reverse = reverse + self.content_name = content_name + + if reverse: + self.conv: Conversation = get_conv_template("vicuna_v1.1_reverse") + else: + self.conv: Conversation = get_conv_template("vicuna_v1.1") + + def __len__(self): + return len(self.data) + + def __getitem__(self, idx): + ins = self.data[idx] + self.conv.messages.clear() + self.conv.append_message(self.conv.roles[0], ins[self.content_name]) + self.conv.append_message(self.conv.roles[1], None) + prompt = self.conv.get_prompt() + return prompt + + def get_all(self): + return [self[i] for i in range(len(self))] + + +class CollateFnWithTokenization: + def __init__(self, tokenizer: PreTrainedTokenizer, max_seq_len: int = 2048) -> None: + self.tokenizer = tokenizer + self.max_seq_len = max_seq_len + + def __call__(self, batch): + outputs = self.tokenizer( + batch, + return_tensors="pt", + max_length=self.max_seq_len, + padding=True, + truncation=True, + ) + return outputs diff --git a/dalm/datasets/backtranslation/src/utils/__init__.py b/dalm/datasets/backtranslation/src/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/dalm/datasets/backtranslation/src/utils/config.py b/dalm/datasets/backtranslation/src/utils/config.py new file mode 100644 index 0000000..6e1fd48 --- /dev/null +++ b/dalm/datasets/backtranslation/src/utils/config.py @@ -0,0 +1,36 @@ +from dataclasses import dataclass, field + +from transformers import TrainingArguments + + +@dataclass +class ModelArguments: + model_name_or_path: str = field(default="meta-llama/Llama-2-7b-hf") + + +@dataclass +class DataArguments: + data_path: str = field( + default=None, metadata={"help": "Path to the training data."} + ) + eval_data_path: str = field( + default=None, metadata={"help": "Path to the evaluation data."} + ) + lazy_preprocess: bool = field(default=False) + reverse: bool = field( + default=False, + metadata={"help": "Reverse inst-resp for backward model $M_{yx}$ training"}, + ) + + +@dataclass +class TrainingArguments(TrainingArguments): + cache_dir: str | None = field(default=None) + optim: str = field(default="adamw_torch") + model_max_length: int = field( + default=2048, + metadata={ + "help": "Maximum sequence length. Sequences will be right padded (and possibly truncated)." 
+ }, + ) + final_lr: float = field(default=1e-6, metadata={"help": "Final minimal lr"}) diff --git a/dalm/datasets/backtranslation/src/utils/constant.py b/dalm/datasets/backtranslation/src/utils/constant.py new file mode 100644 index 0000000..bd4fcf9 --- /dev/null +++ b/dalm/datasets/backtranslation/src/utils/constant.py @@ -0,0 +1,3 @@ +from transformers.trainer_pt_utils import LabelSmoother + +IGNORE_TOKEN_ID = LabelSmoother.ignore_index diff --git a/dalm/datasets/backtranslation/src/utils/convert_vllm_to_alpaca_eval.py b/dalm/datasets/backtranslation/src/utils/convert_vllm_to_alpaca_eval.py new file mode 100644 index 0000000..e87cdd8 --- /dev/null +++ b/dalm/datasets/backtranslation/src/utils/convert_vllm_to_alpaca_eval.py @@ -0,0 +1,27 @@ +import argparse + +from src.utils.io import dump_json, load_jsonlines + + +def main(args): + vllm_data = load_jsonlines(args.input_filepath) + alpaca_eval_data = [] + for ins in vllm_data: + alpaca_eval_data.append( + { + "dataset": ins["raw"]["dataset"], + "instruction": ins["raw"]["instruction"], + "output": ins["response"], + "generator": args.generator, + } + ) + dump_json(alpaca_eval_data, args.output_filepath) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--input_filepath", "-i", type=str, required=True) + parser.add_argument("--output_filepath", "-o", type=str, required=True) + parser.add_argument("--generator", "-g", default="Humback", type=str) + args = parser.parse_args() + main(args) diff --git a/dalm/datasets/backtranslation/src/utils/io.py b/dalm/datasets/backtranslation/src/utils/io.py new file mode 100644 index 0000000..7bbbf74 --- /dev/null +++ b/dalm/datasets/backtranslation/src/utils/io.py @@ -0,0 +1,46 @@ +import json +from pathlib import Path + +import transformers + + +def dump_jsonlines(obj, filepath, **kwargs): + path = Path(filepath) + path.parent.mkdir(parents=True, exist_ok=True) + + with open(filepath, "wt", encoding="utf-8") as fout: + for d in obj: + line_d = json.dumps(d, ensure_ascii=False, **kwargs) + fout.write("{}\n".format(line_d)) + + +def load_jsonlines(filepath, **kwargs): + data = list() + with open(filepath, "rt", encoding="utf-8") as fin: + for line in fin: + line_data = json.loads(line.strip()) + data.append(line_data) + return data + + +def load_json(filepath, **kwargs): + with open(filepath, "rt", encoding="utf-8") as fin: + data = json.load(fin, **kwargs) + return data + + +def dump_json(obj, filepath, **kwargs): + path = Path(filepath) + path.parent.mkdir(parents=True, exist_ok=True) + + with open(filepath, "wt", encoding="utf-8") as fout: + json.dump(obj, fout, ensure_ascii=False, **kwargs) + + +def safe_save_model_for_hf_trainer(trainer: transformers.Trainer, output_dir: str): + """Collects the state dict and dump to disk.""" + state_dict = trainer.model.state_dict() + if trainer.args.should_save: + cpu_state_dict = {key: value.cpu() for key, value in state_dict.items()} + del state_dict + trainer._save(output_dir, state_dict=cpu_state_dict) # noqa diff --git a/dalm/datasets/backtranslation/src/utils/print.py b/dalm/datasets/backtranslation/src/utils/print.py new file mode 100644 index 0000000..94086c6 --- /dev/null +++ b/dalm/datasets/backtranslation/src/utils/print.py @@ -0,0 +1,6 @@ +import os + + +def rank0_print(*args, **kwargs): + if os.environ.get("LOCAL_RANK", 0) == 0: + print(*args, **kwargs) diff --git a/dalm/datasets/backtranslation/src/utils/template.py b/dalm/datasets/backtranslation/src/utils/template.py new file mode 100644 index 
0000000..0441086 --- /dev/null +++ b/dalm/datasets/backtranslation/src/utils/template.py @@ -0,0 +1,49 @@ +from fastchat.conversation import Conversation, SeparatorStyle, register_conv_template + +register_conv_template( + Conversation( + name="vicuna_v1.1_reverse", + system_message=( + "This is a chat between a curious user and a helpful artificial intelligence assistant. " + "Given the assistant's response, please predict the user's instruction." + ), + roles=("RESPONSE", "INSTRUCTION"), + messages=(), + offset=0, + sep_style=SeparatorStyle.ADD_COLON_TWO, + sep=" ", + sep2="</s>", + ) +) + +register_conv_template( + Conversation( + name="vicuna_v1.1_aug", + system_message=( + "This is a chat between a curious user and a helpful artificial intelligence assistant. " + "Given the assistant's response, please predict the user's instruction in the style of an AI Assistant." + ), + roles=("USER", "ASSISTANT"), + messages=(), + offset=0, + sep_style=SeparatorStyle.ADD_COLON_TWO, + sep=" ", + sep2="</s>", + ) +) + +register_conv_template( + Conversation( + name="vicuna_v1.1_seed", + system_message=( + "This is a chat between a curious user and a helpful artificial intelligence assistant. " + "The assistant gives helpful, detailed, and polite answers to the user's questions with knowledge from web search." + ), + roles=("USER", "ASSISTANT"), + messages=(), + offset=0, + sep_style=SeparatorStyle.ADD_COLON_TWO, + sep=" ", + sep2="</s>", + ) +) diff --git a/dalm/datasets/backtranslation/tox.ini b/dalm/datasets/backtranslation/tox.ini new file mode 100644 index 0000000..e149be7 --- /dev/null +++ b/dalm/datasets/backtranslation/tox.ini @@ -0,0 +1,21 @@ +[flake8] +ignore= + # line length + E501, + # whitespace before ':' + E203, + # line break before binary operator + W503 +exclude = + # No need to traverse our git directory + .git, + # There's no value in checking cache directories + __pycache__, + # This contains our built documentation + build, + # This contains builds of flake8 that we don't want to check + dist, + bak, + data, + outputs, + debug.py \ No newline at end of file