From d35c01480d01be2aa8ac42a0d9424bfae78d3d3c Mon Sep 17 00:00:00 2001
From: Zhe Chen
Date: Wed, 21 Feb 2024 22:04:06 +0800
Subject: [PATCH] Bump version to v1.2.2 (#48)

* Remove repetition_penalty
* Update grounding evaluation
* Improve model code
* Don't use repetition_penalty
* Update BLOG.md
* Update README.md
* Update zero configs
* Update training code

---
 BLOG.md                                        | 29 +++++++-
 README.md                                      | 53 +++++++++++---
 internvl_chat/README.md                        | 59 ++++++++-------
 .../eval/llava_bench/evaluate_llava_bench.py   |  4 +-
 .../eval/mmbench/evaluate_mmbench.py           |  2 +-
 internvl_chat/eval/mmvet/evaluate_mmvet.py     |  2 +-
 .../eval/refcoco/evaluate_grounding.py         | 22 +++---
 internvl_chat/evaluate.sh                      | 10 +++
 internvl_chat/internvl/dist_utils.py           |  3 +-
 .../internvl_chat/modeling_internvl_chat.py    |  5 +-
 internvl_chat/internvl/serve/model_worker.py   |  2 +-
 .../internvl/train/internvl_chat_finetune.py   |  3 +-
 .../internvl/train/internvl_chat_pretrain.py   |  3 +-
 internvl_chat/pyproject.toml                   |  2 +-
 ...1_2_hermes2_yi34b_448_finetune_continue.sh  | 72 +++++++++++++++++++
 internvl_chat/zero_stage1_config.json          |  9 ---
 internvl_chat/zero_stage1_config_wo_opt.json   | 29 --------
 internvl_chat/zero_stage2_config.json          |  9 ---
 18 files changed, 212 insertions(+), 106 deletions(-)
 create mode 100644 internvl_chat/shell/hermes2_yi34b/internvl_chat_v1_2_hermes2_yi34b_448_finetune_continue.sh
 delete mode 100644 internvl_chat/zero_stage1_config_wo_opt.json

diff --git a/BLOG.md b/BLOG.md
index 22d33052..9cc905e5 100644
--- a/BLOG.md
+++ b/BLOG.md
@@ -1,5 +1,30 @@
 # InternVL's Blog
 
+## InternVL-Chat-V1.2-Plus
+
+> Date: 2024/02/21<br>
+> Developed by: Zhe Chen, Weiyun Wang, Wenhai Wang, Erfei Cui, Zhangwei Gao, Xizhou Zhu, Lewei Lu, Tong Lu, Yu Qiao, Jifeng Dai
+
+[InternVL-Chat-V1.2-Plus](https://huggingface.co/OpenGVLab/InternVL-Chat-Chinese-V1-2-Plus) uses the same model architecture as InternVL-Chat-V1.2; the difference lies in the SFT dataset. InternVL-Chat-V1.2 is trained on an SFT dataset of only 1.2M samples, while the Plus version uses an SFT dataset of 12M samples.
+
+### Performance
+
+\* Proprietary Model      † Training Set Observed
+
+| name | image size | MMMU<br>(val) | MMMU<br>(test) | MathVista<br>(testmini) | MMB<br>(test) | MMB−CN<br>(test) | MMVP | MME | ScienceQA<br>(image) | POPE | TextVQA<br>(val) | SEEDv1<br>(image) | VizWiz<br>(test) | GQA<br>(test) |
+| ----------------------- | ---------- | ------------- | -------------- | ----------------------- | ------------- | ---------------- | ---- | -------- | -------------------- | ---- | ---------------- | ----------------- | ---------------- | ------------- |
+| GPT-4V\* | unknown | 56.8 | 55.7 | 49.9 | 77.0 | 74.4 | 38.7 | 1409/517 | - | - | 78.0 | 71.6 | - | - |
+| Gemini Ultra\* | unknown | 59.4 | - | 53.0 | - | - | - | - | - | - | 82.3 | - | - | - |
+| Gemini Pro\* | unknown | 47.9 | - | 45.2 | 73.6 | 74.3 | 40.7 | 1497/437 | - | - | 74.6 | 70.7 | - | - |
+| Qwen−VL−Plus\* | unknown | 45.2 | 40.8 | 43.3 | 67.0 | 70.7 | - | 1681/502 | - | - | 78.9 | 65.7 | - | - |
+| Qwen−VL−Max\* | unknown | 51.4 | 46.8 | 51.0 | 77.6 | 75.7 | - | - | - | - | 79.5 | - | - | - |
+| | | | | | | | | | | | | | | |
+| LLaVA−NEXT−34B | 672x672 | 51.1 | 44.7 | 46.5 | 79.3 | 79.0 | - | 1631/397 | 81.8 | 87.7 | 69.5 | 75.9 | 63.8 | 67.1† |
+| InternVL−Chat−V1.2 | 448x448 | 51.6 | 46.2 | 47.7 | 82.2 | 81.2 | 56.7 | 1672/509 | 83.3 | 88.0 | 69.7 | 75.6 | 60.0 | 64.0† |
+| InternVL−Chat−V1.2−Plus | 448x448 | 50.3 | 45.6 | 59.9 | 83.8 | 82.0 | 58.7 | 1624/551 | 98.1† | 88.7 | 71.3† | 76.4 | - | 66.9† |
+
+- MMBench results are collected from the [leaderboard](https://mmbench.opencompass.org.cn/leaderboard).
+
 ## InternVL-Chat-V1.2
 
 > Date: 2024/02/12<br>
@@ -31,8 +56,8 @@ For more details about data preparation, please see [here](./internvl_chat#prepa
 | Qwen-VL-Plus\* | unknown | 45.2 | 40.8 | 43.3 | 67.0 | 70.7 | - | 1681/502 | - | - | 78.9 | 65.7 | - | - |
 | Qwen-VL-Max\* | unknown | 51.4 | 46.8 | 51.0 | 77.6 | 75.7 | - | - | - | - | 79.5 | - | - | - |
 | | | | | | | | | | | | | | | |
-| LLaVA-NEXT-34B | 672x672 | 51.1 | 44.7 | 46.5 | 79.3 | 79.0 | - | 1631/397 | 81.8 | 87.7 | 69.5 | 75.9 | 63.8 | 67.1 |
-| InternVL-Chat-V1.2 | 448x448 | 51.6 | 46.2 | 47.7 | 82.2 | 81.2 | 56.7 | 1672/509 | 83.3 | 88.0 | 69.7 | 75.6 | 60.0 | 64.0 |
+| LLaVA−NEXT−34B | 672x672 | 51.1 | 44.7 | 46.5 | 79.3 | 79.0 | - | 1631/397 | 81.8 | 87.7 | 69.5 | 75.9 | 63.8 | 67.1 |
+| InternVL−Chat−V1.2 | 448x448 | 51.6 | 46.2 | 47.7 | 82.2 | 81.2 | 56.7 | 1672/509 | 83.3 | 88.0 | 69.7 | 75.6 | 60.0 | 64.0 |
 
 - MMBench results are collected from the [leaderboard](https://mmbench.opencompass.org.cn/leaderboard).
 - In most benchmarks, InternVL-Chat-V1.2 achieves better performance than LLaVA-NeXT-34B.
diff --git a/README.md b/README.md
index bca7eef6..42e40dec 100644
--- a/README.md
+++ b/README.md
@@ -1,9 +1,10 @@
 # InternVL: Scaling up Vision Foundation Models and Aligning for Generic Visual-Linguistic Tasks —— An Open-Source Alternative to ViT-22B
 
-\[[InternVL-Chat-V1.2 Blog](./BLOG.md)\] \[[Paper](https://arxiv.org/abs/2312.14238)\] \[[Chat Demo](https://internvl.opengvlab.com/)\] \[[Quick Start](#quick-start-with-huggingface)\] \[[中文解读](https://mp.weixin.qq.com/s/bdfAJRqOF9tUk8Vy9KC_XQ)\]
+\[[Update Blog](./BLOG.md)\] \[[Paper](https://arxiv.org/abs/2312.14238)\] \[[Chat Demo](https://internvl.opengvlab.com/)\] \[[Quick Start](#quick-start-with-huggingface)\] \[[中文解读](https://mp.weixin.qq.com/s/bdfAJRqOF9tUk8Vy9KC_XQ)\]
 
 ## News🚀🚀🚀
 
+- `2024/02/21`: [InternVL-Chat-V1.2-Plus](https://huggingface.co/OpenGVLab/InternVL-Chat-Chinese-V1-2-Plus) achieves SOTA performance on MathVista (59.9), MMBench (83.8), and MMVP (58.7). See our [blog](BLOG.md) for more details.
 - `2024/02/12`: InternVL-Chat-V1.2 has been released, utilizing [Nous-Hermes-2-Yi-34B](https://huggingface.co/NousResearch/Nous-Hermes-2-Yi-34B) as the LLM. It achieves 51.6 on MMMU val and 82.2 on MMBench test. For more details, please refer to our [blog](BLOG.md) or try our [demo](https://internvl.opengvlab.com/). The model is now available on [HuggingFace](https://huggingface.co/OpenGVLab/InternVL-Chat-Chinese-V1-2), and both training/evaluation data and scripts are open-sourced.
 - `2024/02/04`: [InternVL-Chat-V1.1](https://huggingface.co/OpenGVLab/InternVL-Chat-Chinese-V1-1) achieves 44.67% on [MMVP](https://github.com/tsb0601/MMVP), higher than GPT-4V!
 - `2024/01/27`: We released the 448 resolution model, achieving 76.6 on MMBench dev; see [here](https://github.com/OpenGVLab/InternVL/tree/main/internvl_chat#-evaluation-chinese-models).
@@ -27,13 +28,14 @@ InternVL scales up the ViT to _**6B parameters**_ and aligns it with LLM.
 
 **Vision Large Language Model**
 
-| Model | Date | Download | Note |
-| ----------------------- | ---------- | ------------------------------------------------------------------------------------ | -------------------------------- |
-| InternVL-Chat-13B | 2023.12.25 | 🤗 [HF link](https://huggingface.co/OpenGVLab/InternVL-Chat-ViT-6B-Vicuna-7B) | English multimodal dialogue |
-| InternVL-Chat-19B | 2023.12.25 | 🤗 [HF link](https://huggingface.co/OpenGVLab/InternVL-Chat-ViT-6B-Vicuna-13B) | English multimodal dialogue |
-| InternVL-Chat-19B-448px | 2024.02.03 | 🤗 [HF link](https://huggingface.co/OpenGVLab/InternVL-Chat-ViT-6B-Vicuna-13B-448px) | 448 resolution |
-| InternVL-Chat-V1.1 | 2024.01.24 | 🤗 [HF link](https://huggingface.co/OpenGVLab/InternVL-Chat-Chinese-V1-1) | support Chinese and stronger OCR |
-| InternVL-Chat-V1.2 | 2024.02.11 | 🤗 [HF link](https://huggingface.co/OpenGVLab/InternVL-Chat-Chinese-V1-2) | scaling up LLM to 34B (🔥new) |
+| Model | Date | Download | Note |
+| ----------------------- | ---------- | ------------------------------------------------------------------------------------ | ---------------------------------- |
+| InternVL-Chat-13B | 2023.12.25 | 🤗 [HF link](https://huggingface.co/OpenGVLab/InternVL-Chat-ViT-6B-Vicuna-7B) | English multimodal dialogue |
+| InternVL-Chat-19B | 2023.12.25 | 🤗 [HF link](https://huggingface.co/OpenGVLab/InternVL-Chat-ViT-6B-Vicuna-13B) | English multimodal dialogue |
+| InternVL-Chat-19B-448px | 2024.02.03 | 🤗 [HF link](https://huggingface.co/OpenGVLab/InternVL-Chat-ViT-6B-Vicuna-13B-448px) | 448 resolution |
+| InternVL-Chat-V1.1 | 2024.01.24 | 🤗 [HF link](https://huggingface.co/OpenGVLab/InternVL-Chat-Chinese-V1-1) | support Chinese and stronger OCR |
+| InternVL-Chat-V1.2 | 2024.02.11 | 🤗 [HF link](https://huggingface.co/OpenGVLab/InternVL-Chat-Chinese-V1-2) | scaling up LLM to 34B (🔥new) |
+| InternVL-Chat-V1.2-Plus | 2024.02.21 | 🤗 [HF link](https://huggingface.co/OpenGVLab/InternVL-Chat-Chinese-V1-2-Plus) | more SFT data and stronger (🔥new) |
 
 ## What can InternVL do?
 
@@ -503,6 +505,41 @@ caption = tokenizer.decode(pred[0].cpu(), skip_special_tokens=True).strip()
 
 <summary>using InternVL-Chat (click to expand)</summary>
 
+- Single GPU
+
+```python
+import torch
+from PIL import Image
+from transformers import AutoModel, CLIPImageProcessor
+from transformers import AutoTokenizer
+
+path = "OpenGVLab/InternVL-Chat-Chinese-V1-1"
+model = AutoModel.from_pretrained(
+    path,
+    torch_dtype=torch.bfloat16,
+    low_cpu_mem_usage=True,
+    trust_remote_code=True).eval().cuda()
+
+tokenizer = AutoTokenizer.from_pretrained(path)
+image = Image.open('./examples/image2.jpg').convert('RGB')
+image = image.resize((448, 448))
+image_processor = CLIPImageProcessor.from_pretrained(path)
+
+pixel_values = image_processor(images=image, return_tensors='pt').pixel_values
+pixel_values = pixel_values.to(torch.bfloat16).cuda()
+
+generation_config = dict(
+    num_beams=1,
+    max_new_tokens=512,
+    do_sample=False,
+)
+
+question = "请详细描述图片"
+response = model.chat(tokenizer, pixel_values, question, generation_config)
+```
+
+- Multiple GPUs
+
 ```python
 import torch
 from PIL import Image
diff --git a/internvl_chat/README.md b/internvl_chat/README.md
index 85d94577..cbcd9338 100644
--- a/internvl_chat/README.md
+++ b/internvl_chat/README.md
@@ -133,41 +133,52 @@ PARTITION='your partition' GPUS=32 PER_DEVICE_BATCH_SIZE=8 sh shell/hermes2_yi34
 PARTITION='your partition' GPUS=64 PER_DEVICE_BATCH_SIZE=8 sh shell/hermes2_yi34b/internvl_chat_v1_2_hermes2_yi34b_448_finetune.sh
 ```
 
-The hyperparameters used for finetuning are listed in the following table.
+The hyperparameters used for fine-tuning are listed in the following table. You can view the training logs in TensorBoard [here](https://huggingface.co/OpenGVLab/InternVL-Chat-Chinese-V1-2/tensorboard).
 
-| Hyperparameter     | Trainable Param | Global Batch Size | Learning rate | Epochs | Max length | Weight decay |
-| ------------------ | --------------- | ----------------- | ------------- | ------ | ---------- | ------------ |
-| InternVL-Chat-V1.2 | 40B             | 512               | 1e-5          | 1      | 2048       | 0.05         |
+| Hyperparameter     | Trainable Param  | Global Batch Size | Learning rate | Epochs | Max length | Weight decay |
+| ------------------ | ---------------- | ----------------- | ------------- | ------ | ---------- | ------------ |
+| InternVL-Chat-V1.2 | 40B (full model) | 512               | 1e-5          | 1      | 2048       | 0.05         |
 
-## 📊 Evaluation
+## Continue Fine-tuning
 
-\* Training set observed.
+You can continue fine-tuning from the checkpoint of the previous training run using this [script](./shell/hermes2_yi34b/internvl_chat_v1_2_hermes2_yi34b_448_finetune_continue.sh).
+
+Before fine-tuning, set `--meta_path` to the path of your custom meta file of training data.
+
+```sh
+# using 16 GPUs
+PARTITION='your partition' GPUS=16 sh shell/hermes2_yi34b/internvl_chat_v1_2_hermes2_yi34b_448_finetune_continue.sh
+```
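The meta file format itself is not spelled out in this patch. As a rough sketch of what `--meta_path` expects, assuming the JSON layout used elsewhere in `internvl_chat` for training data, where each entry names a dataset and points at its image root and annotation file (treat the exact keys, paths, and counts below as hypothetical and verify them against `internvl/train/internvl_chat_finetune.py`):

```python
import json

# Hypothetical meta file for --meta_path: one entry per dataset.
meta = {
    'my_custom_sft_data': {
        'root': 'data/my_dataset/images/',            # image directory
        'annotation': 'data/my_dataset/train.jsonl',  # conversation annotations
        'data_augment': False,
        'repeat_time': 1,
        'length': 12000,                              # number of samples
    }
}
with open('my_meta.json', 'w') as f:
    json.dump(meta, f, indent=2)
```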
+
+## 📊 Evaluation
 
 **MultiModal Benchmark**
 
-| model | MME | MMB<sub>dev/test</sub> | MMB-CN<sub>dev/test</sub> | POPE | MMVP | MathVista |
-| --------------------------------------------------------------------------------- | -------------- | ---------------------- | ------------------------- | ---- | ---- | --------- |
-| [InternVL-Chat-V1.1](https://huggingface.co/OpenGVLab/InternVL-Chat-Chinese-V1-1) | 1672.3 / 341.1 | 76.6 / 75.4 | 71.5 / 70.1 | 87.2 | 44.7 | 34.5 |
-| [InternVL-Chat-V1.2](https://huggingface.co/OpenGVLab/InternVL-Chat-Chinese-V1-2) | 1672.1 / 509.3 | 81.4 / 82.2 | 79.5 / 81.2 | 88.0 | 56.7 | 47.7 |
+\* Training set observed.
 
-| model | MMMU<sub>val/test</sub> | CMMMU<sub>val/test</sub> | TinyLVLM | LLaVA<sub>bench</sub> | MM-Vet |
-| --------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------- | ------------------------ | ------------------- | --------------------- | ------ |
-| [InternVL-Chat-V1.1](https://huggingface.co/OpenGVLab/InternVL-Chat-Chinese-V1-1) | 39.1 / 35.3 | 34.8 / 34.0 | 344.5 | 76.3 | 45.0 |
-| [InternVL-Chat-V1.2](https://huggingface.co/OpenGVLab/InternVL-Chat-Chinese-V1-2) | 51.6 / [46.2](https://eval.ai/web/challenges/challenge-page/2179/leaderboard/5377) | - | 350.3 | - | 48.9 |
+| name | model size | MathVista<br>(testmini) | MMB<br>(dev/test) | MMB−CN<br>(dev/test) | MMMU<br>(val/test) | CMMMU<br>(val/test) | MMVP | MME | POPE | Tiny LVLM | SEEDv1<br>(image) | LLaVA Wild | MM−Vet |
+| ------------------------------------------------------------------------------------------- | ---------- | ----------------------- | ----------------- | -------------------- | ---------------------------------------------------------------------------------- | ------------------- | ---- | -------------- | ---- | --------- | ----------------- | ---------- | ------ |
+| [InternVL−Chat−V1.1](https://huggingface.co/OpenGVLab/InternVL-Chat-Chinese-V1-1) | 19B | 34.5 | 76.7 / 75.4 | 71.9 / 70.3 | 39.1 / 35.3 | 34.8 / 34.0 | 44.7 | 1675.1 / 348.6 | 87.1 | 343.2 | 73.2 | 73.2 | 46.7 |
+| [InternVL−Chat−V1.2](https://huggingface.co/OpenGVLab/InternVL-Chat-Chinese-V1-2) | 40B | 47.7 | 81.4 / 82.2 | 79.5 / 81.2 | 51.6 / [46.2](https://eval.ai/web/challenges/challenge-page/2179/leaderboard/5377) | TODO | 56.7 | 1672.1 / 509.3 | 88.0 | 350.3 | 75.6 | 85.0 | 48.9 |
+| [InternVL−Chat−V1.2−Plus](https://huggingface.co/OpenGVLab/InternVL-Chat-Chinese-V1-2-Plus) | 40B | 59.9 | 83.4 / 83.8 | 81.6 / 82.0 | 50.3 / 45.6 | TODO | 58.7 | 1623.6 / 550.7 | 88.7 | 353.9 | 76.4 | 84.6 | 47.9 |
 
-**Visual Question Answering**
+**Image Captioning & Visual Question Answering**
+
+\* Training set observed.
 
-| model | VQAv2<sub>test</sub> | OKVQA<sub>val</sub> | TextVQA<sub>val</sub> | VizWiz<sub>val/test</sub> | AI2D<sub>test</sub> | GQA<sub>test</sub> | SQA<sub>test</sub> |
-| --------------------------------------------------------------------------------- | -------------------- | ------------------- | --------------------- | ------------------------- | ------------------- | ------------------ | ------------------ |
-| [InternVL-Chat-V1.1](https://huggingface.co/OpenGVLab/InternVL-Chat-Chinese-V1-1) | 80.9\* | 64.2\* | 65.8 | 58.3 / 57.3 | 70.2\* | 62.4\* | 91.2\* |
-| [InternVL-Chat-V1.2](https://huggingface.co/OpenGVLab/InternVL-Chat-Chinese-V1-2) | - | 62.5\* | 69.7 | 61.9 / 60.0 | 71.6\* | 64.0\* | 83.3 |
+| name | model size | COCO<br>(test) | Flickr30K<br>(test) | NoCaps<br>(val) | VQAv2<br>(testdev) | OKVQA<br>(val) | TextVQA<br>(val) | VizWiz<br>(val/test) | AI2D<br>(test) | GQA<br>(test) | ScienceQA<br>(image) |
+| ------------------------------------------------------------------------------------------- | ---------- | -------------- | ------------------- | --------------- | ------------------ | -------------- | ---------------- | -------------------- | -------------- | ------------- | -------------------- |
+| [InternVL−Chat−V1.1](https://huggingface.co/OpenGVLab/InternVL-Chat-Chinese-V1-1) | 19B | 142.2\* | 85.3 | 120.8 | 80.9\* | 64.1\* | 65.9 | 59.0 / 57.3 | 70.3\* | 62.5\* | 90.1\* |
+| [InternVL−Chat−V1.2](https://huggingface.co/OpenGVLab/InternVL-Chat-Chinese-V1-2) | 40B | 113.9 | 92.4 | 112.5 | - | 62.5\* | 69.7 | 61.9 / 60.0 | 71.6\* | 64.0\* | 83.3 |
+| [InternVL−Chat−V1.2−Plus](https://huggingface.co/OpenGVLab/InternVL-Chat-Chinese-V1-2-Plus) | 40B | 143.4\* | 90.5 | 125.8 | - | 67.6\* | 71.3\* | 61.3 / - | 74.2\* | 66.9\* | 98.1\* |
 
-**Image Captioning**
+**Visual Grounding**
 
-| model | COCO<sub>test</sub> | Flickr30K<sub>test</sub> | NoCaps<sub>val</sub> |
-| --------------------------------------------------------------------------------- | ------------------- | ------------------------ | -------------------- |
-| [InternVL-Chat-V1.1](https://huggingface.co/OpenGVLab/InternVL-Chat-Chinese-V1-1) | 141.8\* | 84.3 | 120.4 |
-| [InternVL-Chat-V1.2](https://huggingface.co/OpenGVLab/InternVL-Chat-Chinese-V1-2) | 113.9 | 92.4 | 112.5 |
+| name | model size | RefCOCO<br>(val) | RefCOCO<br>(testA) | RefCOCO<br>(testB) | RefCOCO+<br>(val) | RefCOCO+<br>(testA) | RefCOCO+<br>(testB) | RefCOCO−g<br>(val) | RefCOCO−g<br>(test) |
+| ------------------------------------------------------------------------------------------- | ---------- | ---------------- | ------------------ | ------------------ | ----------------- | ------------------- | ------------------- | ------------------ | ------------------- |
+| [InternVL−Chat−V1.1](https://huggingface.co/OpenGVLab/InternVL-Chat-Chinese-V1-1) | 19B | 84.7 | 89.9 | 78.6 | 78.5 | 85.6 | 70.1 | 81.0 | 81.4 |
+| [InternVL−Chat−V1.2](https://huggingface.co/OpenGVLab/InternVL-Chat-Chinese-V1-2) | 40B | 74.4 | 80.3 | 66.5 | 70.7 | 77.6 | 62.0 | 69.2 | 70.0 |
+| [InternVL−Chat−V1.2−Plus](https://huggingface.co/OpenGVLab/InternVL-Chat-Chinese-V1-2-Plus) | 40B | 90.2 | 93.4 | 85.5 | 85.3 | 90.4 | 79.7 | 88.5 | 88.8 |
 
 ## 📊 Evaluation (Legacy Models)
diff --git a/internvl_chat/eval/llava_bench/evaluate_llava_bench.py b/internvl_chat/eval/llava_bench/evaluate_llava_bench.py
index 74cccc25..669f66b1 100644
--- a/internvl_chat/eval/llava_bench/evaluate_llava_bench.py
+++ b/internvl_chat/eval/llava_bench/evaluate_llava_bench.py
@@ -62,7 +62,7 @@ def evaluate_chat_model():
             max_new_tokens=ds_collections[ds_name]['max_new_tokens'],
             min_new_tokens=ds_collections[ds_name]['min_new_tokens'],
             length_penalty=1,
-            repetition_penalty=1.5,
+            # repetition_penalty=1.5,
             do_sample=True if args.temperature > 0 else False,
             temperature=args.temperature,
         )
@@ -97,7 +97,7 @@ def evaluate_chat_model():
     parser.add_argument('--batch-size', type=int, default=1)
     parser.add_argument('--num-workers', type=int, default=1)
     parser.add_argument('--num-beams', type=int, default=5)
-    parser.add_argument('--temperature', type=float, default=1.0)
+    parser.add_argument('--temperature', type=float, default=0.0)
     parser.add_argument('--out-dir', type=str, default='results')
     parser.add_argument('--seed', type=int, default=0)
     args = parser.parse_args()
diff --git a/internvl_chat/eval/mmbench/evaluate_mmbench.py b/internvl_chat/eval/mmbench/evaluate_mmbench.py
index 962fa53c..e77b543e 100644
--- a/internvl_chat/eval/mmbench/evaluate_mmbench.py
+++ b/internvl_chat/eval/mmbench/evaluate_mmbench.py
@@ -188,7 +188,7 @@ def evaluate_chat_model():
             max_new_tokens=ds_collections[ds_name]['max_new_tokens'],
             min_new_tokens=ds_collections[ds_name]['min_new_tokens'],
             length_penalty=1,
-            repetition_penalty=1.2,
+            # repetition_penalty=1.2,
             do_sample=True if args.temperature > 0 else False,
             temperature=args.temperature,
         )
diff --git a/internvl_chat/eval/mmvet/evaluate_mmvet.py b/internvl_chat/eval/mmvet/evaluate_mmvet.py
index 793923f1..377de0da 100644
--- a/internvl_chat/eval/mmvet/evaluate_mmvet.py
+++ b/internvl_chat/eval/mmvet/evaluate_mmvet.py
@@ -74,7 +74,7 @@ def evaluate_chat_model():
             max_new_tokens=ds_collections[ds_name]['max_new_tokens'],
             min_new_tokens=ds_collections[ds_name]['min_new_tokens'],
             length_penalty=1.0,
-            repetition_penalty=1.2,
+            # repetition_penalty=1.2,
             do_sample=True if args.temperature > 0 else False,
             temperature=args.temperature,
         )
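The same pattern repeats across the three evaluation diffs above: `repetition_penalty` is dropped and `do_sample` is keyed off the temperature, whose default moves to 0.0. The shared logic reduces to the following sketch (the function name and bundling are ours, not the scripts'):

```python
def build_generation_config(temperature: float, num_beams: int,
                            max_new_tokens: int, min_new_tokens: int) -> dict:
    # temperature == 0.0 (the new default) now means deterministic greedy or
    # beam search; repetition_penalty is no longer applied at all.
    return dict(
        num_beams=num_beams,
        max_new_tokens=max_new_tokens,
        min_new_tokens=min_new_tokens,
        length_penalty=1.0,
        do_sample=temperature > 0,
        temperature=temperature,
    )
```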
diff --git a/internvl_chat/eval/refcoco/evaluate_grounding.py b/internvl_chat/eval/refcoco/evaluate_grounding.py
index 8c55b043..f33baf70 100644
--- a/internvl_chat/eval/refcoco/evaluate_grounding.py
+++ b/internvl_chat/eval/refcoco/evaluate_grounding.py
@@ -128,16 +128,14 @@ def evaluate_chat_model():
     )
 
     outputs = []
-    for _, (pixel_values, questions, bboxes, hws) in tqdm(enumerate(dataloader)):
+    for _, (pixel_values, questions, bboxes, hws) in enumerate(tqdm(dataloader)):
         pixel_values = pixel_values.to(torch.bfloat16).cuda()
         generation_config = dict(
-            do_sample=args.sample,
             num_beams=args.num_beams,
             max_new_tokens=100,
-            min_new_tokens=20,
+            min_new_tokens=1,
             length_penalty=1,
-            top_k=args.top_k,
-            top_p=args.top_p,
+            do_sample=True if args.temperature > 0 else False,
             temperature=args.temperature,
         )
         pred = model.chat(
@@ -182,7 +180,8 @@ def evaluate_chat_model():
                                        dtype=torch.float32).view(-1, 4)
             predict_bbox = torch.tensor(predict_bbox, dtype=torch.float32).view(-1, 4)
-            predict_bbox = predict_bbox / divisor
+            if predict_bbox.sum() >= 4:
+                predict_bbox = predict_bbox / 1000
             predict_bbox[:, 0::2] *= output['hw'][1]
             predict_bbox[:, 1::2] *= output['hw'][0]
             iou, _ = box_iou(predict_bbox, target_bbox)
@@ -217,10 +216,8 @@ def evaluate_chat_model():
     parser.add_argument('--num-workers', type=int, default=1)
     parser.add_argument('--num-beams', type=int, default=5)
     parser.add_argument('--out-dir', type=str, default='results')
-    parser.add_argument('--top-k', type=int, default=50)
-    parser.add_argument('--top-p', type=float, default=0.9)
-    parser.add_argument('--sample', type=bool, default=True)
-    parser.add_argument('--temperature', type=float, default=1.0)
+    parser.add_argument('--sample', type=bool, default=False)
+    parser.add_argument('--temperature', type=float, default=0.0)
     parser.add_argument('--seed', type=int, default=0)
     args = parser.parse_args()
@@ -240,6 +237,7 @@ def evaluate_chat_model():
     torch.cuda.set_device(int(os.getenv('LOCAL_RANK', 0)))
 
     tokenizer = LlamaTokenizer.from_pretrained(args.checkpoint)
+    PATTERN = re.compile(r'\[*\[(.*?),(.*?),(.*?),(.*?)\]\]*')
 
     if 'qllama' in args.checkpoint.lower():
         from internvl.model.internvl_chat_with_qllama import InternVLChatModel
@@ -247,8 +245,6 @@ def evaluate_chat_model():
             args.checkpoint, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16).cuda().eval()
         image_size = model.internvl.config.force_image_size or model.config.internvl_config.vision_config.image_size
         pad2square = model.config.pad2square
-        PATTERN = re.compile(r'\[(.*?),(.*?),(.*?),(.*?)\]')
-        divisor = 1  # TODO: divisor
         prompt = 'Please provide the bounding box coordinate of the region this sentence describes: {}'
     else:
         from internvl.model.internvl_chat import InternVLChatModel
@@ -256,8 +252,6 @@ def evaluate_chat_model():
             args.checkpoint, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16).cuda().eval()
         image_size = model.config.force_image_size or model.config.vision_config.image_size
         pad2square = model.config.pad2square
-        PATTERN = re.compile(r'\[\[(.*?),(.*?),(.*?),(.*?)\]\]')
-        divisor = 1  # TODO: divisor
         prompt = 'Please provide the bounding box coordinate of the region this sentence describes: {}'
 
     total_params = sum(p.numel() for p in model.parameters()) / 1e9
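The unified `PATTERN` plus the conditional division by 1000 replaces the two per-branch regexes and the `divisor = 1  # TODO: divisor` placeholder. A standalone sketch of the resulting post-processing (using `torchvision.ops.box_iou` as a stand-in for the script's own `box_iou` helper, which also returns the union):

```python
import re
import torch
from torchvision.ops import box_iou  # stand-in for the script's box_iou helper

# Tolerates both [x1,y1,x2,y2] and [[x1,y1,x2,y2]] in the model output.
PATTERN = re.compile(r'\[*\[(.*?),(.*?),(.*?),(.*?)\]\]*')

def parse_box(answer: str, height: int, width: int) -> torch.Tensor:
    """Parse one predicted box and map it to pixel coordinates."""
    coords = PATTERN.findall(answer)[0]
    box = torch.tensor([float(v) for v in coords], dtype=torch.float32).view(-1, 4)
    # Boxes are emitted on a 0-1000 grid; the sum >= 4 guard skips rescaling
    # when the values are already normalized to 0-1.
    if box.sum() >= 4:
        box = box / 1000
    box[:, 0::2] *= width   # x1, x2  (hw[1] in the script)
    box[:, 1::2] *= height  # y1, y2  (hw[0] in the script)
    return box

pred = parse_box('[[120, 80, 560, 900]]', height=480, width=640)
target = torch.tensor([[80.0, 40.0, 350.0, 430.0]])
correct = box_iou(pred, target).item() >= 0.5  # grounding counts IoU >= 0.5 as a hit
```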
diff --git a/internvl_chat/evaluate.sh b/internvl_chat/evaluate.sh
index 1fa94c7f..e9a0d084 100644
--- a/internvl_chat/evaluate.sh
+++ b/internvl_chat/evaluate.sh
@@ -231,6 +231,16 @@ if [ ${DATASET} == "refcoco" ]; then
     eval/refcoco/evaluate_grounding.py --checkpoint ${CHECKPOINT}
 fi
 
+if [ ${DATASET} == "refcoco-val" ]; then
+  torchrun \
+    --nnodes=1 \
+    --node_rank=0 \
+    --master_addr=127.0.0.1 \
+    --nproc_per_node=${GPUS} \
+    --master_port=${MASTER_PORT} \
+    eval/refcoco/evaluate_grounding.py --checkpoint ${CHECKPOINT} --datasets refcoco_val
+fi
+
 if [ ${DATASET} == "llava-bench" ]; then
     rm -rf results/llava_bench_results_review.jsonl
     python eval/llava_bench/evaluate_llava_bench.py --checkpoint ${CHECKPOINT}
diff --git a/internvl_chat/internvl/dist_utils.py b/internvl_chat/internvl/dist_utils.py
index 81875e7e..0eb8ae27 100644
--- a/internvl_chat/internvl/dist_utils.py
+++ b/internvl_chat/internvl/dist_utils.py
@@ -47,7 +47,8 @@ def _init_dist_pytorch(backend, **kwargs):
     rank = int(os.environ['RANK'])
     num_gpus = torch.cuda.device_count()
     torch.cuda.set_device(rank % num_gpus)
-    dist.init_process_group(backend=backend, **kwargs)
+    # dist.init_process_group(backend=backend, **kwargs)
+    deepspeed.init_distributed(dist_backend=backend)
 
 
 def _init_dist_mpi(backend, **kwargs):
diff --git a/internvl_chat/internvl/model/internvl_chat/modeling_internvl_chat.py b/internvl_chat/internvl/model/internvl_chat/modeling_internvl_chat.py
index 3246741d..5697ff06 100644
--- a/internvl_chat/internvl/model/internvl_chat/modeling_internvl_chat.py
+++ b/internvl_chat/internvl/model/internvl_chat/modeling_internvl_chat.py
@@ -33,6 +33,7 @@ def __init__(self, config: InternVLChatConfig, vision_model=None, language_model
         self.select_layer = config.select_layer
         self.template = config.template
         self.num_image_token = int((image_size // patch_size) ** 2 * (config.downsample_ratio ** 2))
+        self.downsample_ratio = config.downsample_ratio
         logger.info(f'num_image_token: {self.num_image_token}')
         if vision_model is not None:
             self.vision_model = vision_model
@@ -182,13 +183,13 @@ def extract_feature(self, pixel_values):
         vit_embeds = self.vision_model(
             pixel_values=pixel_values,
             output_hidden_states=True,
-            return_dict=True).hidden_states[-4]
+            return_dict=True).hidden_states[self.select_layer]
         vit_embeds = vit_embeds[:, 1:, :]
         # if torch.distributed.get_rank() == 0:
         #     print("before pixel shuffle:", vit_embeds.shape)
         h = w = int(vit_embeds.shape[1] ** 0.5)
         vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1)
-        vit_embeds = self.pixel_shuffle(vit_embeds, scale_factor=0.5)
+        vit_embeds = self.pixel_shuffle(vit_embeds, scale_factor=self.downsample_ratio)
         vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], -1, vit_embeds.shape[-1])
         # if torch.distributed.get_rank() == 0:
         #     print("after pixel shuffle:", vit_embeds.shape)
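Together, `hidden_states[self.select_layer]` and `scale_factor=self.downsample_ratio` make the previously hard-coded `-4` layer index and `0.5` factor configurable. What the pixel shuffle does to the token count, as a self-contained sketch of the space-to-depth reshuffle (shapes assume InternViT-6B's 14-pixel patches and 3200-dim features, which are not shown in this diff; the repo's implementation may order the reshapes slightly differently):

```python
import torch

def pixel_shuffle(x: torch.Tensor, scale_factor: float = 0.5) -> torch.Tensor:
    # Space-to-depth: a scale_factor of 0.5 merges each 2x2 group of visual
    # tokens into one token with 4x the channels, quartering the token count.
    n, h, w, c = x.shape
    x = x.view(n, h, int(w * scale_factor), int(c / scale_factor))
    x = x.permute(0, 2, 1, 3).contiguous()
    x = x.view(n, int(w * scale_factor), int(h * scale_factor),
               int(c / (scale_factor ** 2)))
    return x

# 448px input with 14px patches -> 32x32 = 1024 ViT tokens.
vit_embeds = torch.randn(1, 32, 32, 3200)
out = pixel_shuffle(vit_embeds, scale_factor=0.5)
print(out.shape)  # torch.Size([1, 16, 16, 12800]) -> 256 image tokens
# Matches num_image_token = int((448 // 14) ** 2 * 0.5 ** 2) == 256
```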
diff --git a/internvl_chat/internvl/serve/model_worker.py b/internvl_chat/internvl/serve/model_worker.py
index 7f145c22..8fd9f045 100644
--- a/internvl_chat/internvl/serve/model_worker.py
+++ b/internvl_chat/internvl/serve/model_worker.py
@@ -189,7 +189,7 @@ def generate_stream(self, params):
             input_ids=input_ids,
             do_sample=do_sample,
             temperature=temperature,
-            repetition_penalty=1.1,
+            repetition_penalty=1.0,
             top_p=top_p,
             max_new_tokens=max_new_tokens,
             streamer=streamer,
diff --git a/internvl_chat/internvl/train/internvl_chat_finetune.py b/internvl_chat/internvl/train/internvl_chat_finetune.py
index a7a8c756..1d9b357b 100644
--- a/internvl_chat/internvl/train/internvl_chat_finetune.py
+++ b/internvl_chat/internvl/train/internvl_chat_finetune.py
@@ -514,7 +514,8 @@ def main():
     # Parse input arguments
     # See all possible arguments in src/transformers/training_args.py
     # If use DeepSpeed zero3, init_dist must before HfArgumentParser
-    init_dist(launcher='slurm', backend='nccl')
+    launcher = os.environ.get('LAUNCHER', 'slurm')
+    init_dist(launcher=launcher, backend='nccl')
     parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
     if len(sys.argv) == 2 and sys.argv[1].endswith('.json'):
         # If we pass only one argument to the script, and it's the path to a json file,
diff --git a/internvl_chat/internvl/train/internvl_chat_pretrain.py b/internvl_chat/internvl/train/internvl_chat_pretrain.py
index 21d48d4c..60039ecd 100644
--- a/internvl_chat/internvl/train/internvl_chat_pretrain.py
+++ b/internvl_chat/internvl/train/internvl_chat_pretrain.py
@@ -500,7 +500,8 @@ def main():
     # Parse input arguments
     # See all possible arguments in src/transformers/training_args.py
     # If use DeepSpeed zero3, init_dist must before HfArgumentParser
-    init_dist(launcher='slurm', backend='nccl')
+    launcher = os.environ.get('LAUNCHER', 'slurm')
+    init_dist(launcher=launcher, backend='nccl')
     parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
     if len(sys.argv) == 2 and sys.argv[1].endswith('.json'):
         # If we pass only one argument to the script, and it's the path to a json file,
diff --git a/internvl_chat/pyproject.toml b/internvl_chat/pyproject.toml
index f0491d13..e6b2ab0a 100644
--- a/internvl_chat/pyproject.toml
+++ b/internvl_chat/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "internvl_chat"
-version = "1.2.1"
+version = "1.2.2"
 description = "Scaling up Vision Foundation Models and Aligning for Generic Visual-Linguistic Tasks."
 readme = "README.md"
 requires-python = ">=3.8"
diff --git a/internvl_chat/shell/hermes2_yi34b/internvl_chat_v1_2_hermes2_yi34b_448_finetune_continue.sh b/internvl_chat/shell/hermes2_yi34b/internvl_chat_v1_2_hermes2_yi34b_448_finetune_continue.sh
new file mode 100644
index 00000000..8d387cfa
--- /dev/null
+++ b/internvl_chat/shell/hermes2_yi34b/internvl_chat_v1_2_hermes2_yi34b_448_finetune_continue.sh
@@ -0,0 +1,72 @@
+set -x
+
+PARTITION=${PARTITION:-"INTERN2"}
+GPUS=${GPUS:-16}
+GPUS_PER_NODE=${GPUS_PER_NODE:-8}
+QUOTA_TYPE=${QUOTA_TYPE:-"reserved"}
+NODES=$((GPUS / GPUS_PER_NODE))
+CPUS_PER_TASK=${CPUS_PER_TASK:-1}
+SRUN_ARGS=${SRUN_ARGS:-""}
+BATCH_SIZE=${BATCH_SIZE:-128}
+PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-4}
+GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS))
+
+
+export PYTHONPATH="${PYTHONPATH}:$(pwd)"
+export MASTER_PORT=34223
+
+OUTPUT_DIR='work_dirs/internvl_chat_v1_2_hermes2_yi34b_448_finetune_continue'
+
+if [ ! -d "$OUTPUT_DIR" ]; then
+  mkdir -p "$OUTPUT_DIR"
+fi
+
+# number of gpus: 16
+# batch size per gpu: 4
+# gradient accumulation steps: 2
+# total batch size: 128
+# epoch: 1
+srun -p ${PARTITION} \
+  --gres=gpu:${GPUS_PER_NODE} \
+  --nodes=${NODES} \
+  --ntasks=${GPUS} \
+  --ntasks-per-node=${GPUS_PER_NODE} \
+  --cpus-per-task=${CPUS_PER_TASK} \
+  --kill-on-bad-exit=1 \
+  --quotatype=${QUOTA_TYPE} \
+  ${SRUN_ARGS} \
+  python -u internvl/train/internvl_chat_finetune.py \
+  --model_name_or_path "./pretrained/InternVL-Chat-Chinese-V1-2" \
+  --conv_style "Hermes-2" \
+  --output_dir ${OUTPUT_DIR} \
+  --meta_path "./path/to/your/custom/meta/file" \
+  --overwrite_output_dir True \
+  --force_image_size 448 \
+  --down_sample_ratio 0.5 \
+  --drop_path_rate 0.0 \
+  --pad2square False \
+  --freeze_llm False \
+  --freeze_mlp False \
+  --freeze_backbone True \
+  --vision_select_layer -1 \
+  --use_data_resampling False \
+  --dataloader_num_workers 2 \
+  --bf16 True \
+  --num_train_epochs 1 \
+  --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \
+  --gradient_accumulation_steps ${GRADIENT_ACC} \
+  --evaluation_strategy "no" \
+  --save_strategy "steps" \
+  --save_steps 200 \
+  --save_total_limit 1 \
+  --learning_rate 1e-5 \
+  --weight_decay 0.05 \
+  --warmup_ratio 0.03 \
+  --lr_scheduler_type "cosine" \
+  --logging_steps 1 \
+  --max_seq_length 2048 \
+  --do_train True \
+  --grad_checkpoint True \
+  --deepspeed "zero_stage3_config.json" \
+  --report_to "tensorboard" \
+  2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt"
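The comment block in the script above is easiest to sanity-check against the script's own defaults, since `GRADIENT_ACC` is derived rather than set by hand:

```python
# Sanity check of the batch arithmetic in the script above, using its defaults.
GPUS = 16
PER_DEVICE_BATCH_SIZE = 4
BATCH_SIZE = 128  # target global batch size

gradient_acc = BATCH_SIZE // PER_DEVICE_BATCH_SIZE // GPUS
assert gradient_acc == 2
assert GPUS * PER_DEVICE_BATCH_SIZE * gradient_acc == BATCH_SIZE

# With PER_DEVICE_BATCH_SIZE=8 (as in the 32/64-GPU examples earlier),
# a single accumulation step already reaches the global batch:
assert BATCH_SIZE // 8 // GPUS == 1
```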
-d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +# number of gpus: 16 +# batch size per gpu: 4 +# gradient accumulation steps: 1 +# total batch size: 128 +# epoch: 1 +srun -p ${PARTITION} \ + --gres=gpu:${GPUS_PER_NODE} \ + --nodes=${NODES} \ + --ntasks=${GPUS} \ + --ntasks-per-node=${GPUS_PER_NODE} \ + --cpus-per-task=${CPUS_PER_TASK} \ + --kill-on-bad-exit=1 \ + --quotatype=${QUOTA_TYPE} \ + ${SRUN_ARGS} \ + python -u internvl/train/internvl_chat_finetune.py \ + --model_name_or_path "./pretrained/InternVL-Chat-Chinese-V1-2" \ + --conv_style "Hermes-2" \ + --output_dir ${OUTPUT_DIR} \ + --meta_path "./path/to/your/custom/meta/file" \ + --overwrite_output_dir True \ + --force_image_size 448 \ + --down_sample_ratio 0.5 \ + --drop_path_rate 0.0 \ + --pad2square False \ + --freeze_llm False \ + --freeze_mlp False \ + --freeze_backbone True \ + --vision_select_layer -1 \ + --use_data_resampling False \ + --dataloader_num_workers 2 \ + --bf16 True \ + --num_train_epochs 1 \ + --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ + --gradient_accumulation_steps ${GRADIENT_ACC} \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 200 \ + --save_total_limit 1 \ + --learning_rate 1e-5 \ + --weight_decay 0.05 \ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --max_seq_length 2048 \ + --do_train True \ + --grad_checkpoint True \ + --deepspeed "zero_stage3_config.json" \ + --report_to "tensorboard" \ + 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" diff --git a/internvl_chat/zero_stage1_config.json b/internvl_chat/zero_stage1_config.json index 71fca6e4..9cd513d0 100644 --- a/internvl_chat/zero_stage1_config.json +++ b/internvl_chat/zero_stage1_config.json @@ -20,15 +20,6 @@ "bf16": { "enabled": "auto" }, - "scheduler": { - "type": "WarmupDecayLR", - "params": { - "warmup_min_lr": "auto", - "warmup_max_lr": "auto", - "warmup_num_steps": "auto", - "total_num_steps": "auto" - } - }, "optimizer": { "type": "AdamW", "params": { diff --git a/internvl_chat/zero_stage1_config_wo_opt.json b/internvl_chat/zero_stage1_config_wo_opt.json deleted file mode 100644 index e832ceac..00000000 --- a/internvl_chat/zero_stage1_config_wo_opt.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "zero_optimization": { - "stage": 1, - "allgather_partitions": true, - "allgather_bucket_size": 1e9, - "overlap_comm": true, - "reduce_scatter": true, - "reduce_bucket_size": 1e9, - "contiguous_gradients": true - }, - "fp16": { - "enabled": "auto", - "auto_cast": true, - "loss_scale": 0, - "initial_scale_power": 32, - "loss_scale_window": 1000, - "hysteresis": 2, - "min_loss_scale": 1 - }, - "bf16": { - "enabled": "auto" - }, - "gradient_accumulation_steps": "auto", - "gradient_clipping": "auto", - "steps_per_print": 2000, - "train_batch_size": "auto", - "train_micro_batch_size_per_gpu": "auto", - "wall_clock_breakdown": false -} diff --git a/internvl_chat/zero_stage2_config.json b/internvl_chat/zero_stage2_config.json index b6f859ac..9e831dca 100644 --- a/internvl_chat/zero_stage2_config.json +++ b/internvl_chat/zero_stage2_config.json @@ -20,15 +20,6 @@ "bf16": { "enabled": "auto" }, - "scheduler": { - "type": "WarmupDecayLR", - "params": { - "warmup_min_lr": "auto", - "warmup_max_lr": "auto", - "warmup_num_steps": "auto", - "total_num_steps": "auto" - } - }, "optimizer": { "type": "AdamW", "params": {