From 766dfd8c68e41f1c33794477290567a3366bb185 Mon Sep 17 00:00:00 2001 From: tim-win Date: Sat, 31 Aug 2024 15:50:22 -0400 Subject: [PATCH 01/10] Basic --- Dockerfile | 16 ++++++++++++---- pyproject.toml | 10 +++++----- requirements/basic_requirements.txt | 8 ++++---- 3 files changed, 21 insertions(+), 13 deletions(-) diff --git a/Dockerfile b/Dockerfile index 4869c792..89009fa0 100644 --- a/Dockerfile +++ b/Dockerfile @@ -18,6 +18,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ python3-wheel RUN pip3 install --upgrade pip \ + && pip install torch==1.11.0+cu113 torchvision==0.12.0+cu113 torchaudio==0.11.0+cu113 --index-url https://download.pytorch.org/whl/cu113 \ && pip3 install \ gradio \ opencv-python \ @@ -25,7 +26,7 @@ RUN pip3 install --upgrade pip \ mmengine \ setuptools \ openmim \ - && mim install mmcv==2.0.0 \ + && mim install 'mmcv<=2.2.0' \ && pip3 install --no-cache-dir --index-url https://download.pytorch.org/whl/cu118 \ wheel \ torch \ @@ -35,9 +36,16 @@ RUN pip3 install --upgrade pip \ COPY . /yolo WORKDIR /yolo -RUN pip3 install -e . +RUN pip3 install -e .[demo] + +RUN pip3 install onnx + +RUN apt install -y curl +RUN mkdir weights RUN curl -o weights/$WEIGHT -L https://huggingface.co/wondervictor/YOLO-World/resolve/main/$WEIGHT -ENTRYPOINT [ "python3", "demo.py" ] -CMD ["configs/pretrain/$MODEL", "weights/$WEIGHT"] \ No newline at end of file +RUN pip3 install onnxsim + +ENTRYPOINT [ "python3", "demo/gradio_demo.py" ] +CMD ["configs/pretrain/$MODEL", "weights/$WEIGHT"] diff --git a/pyproject.toml b/pyproject.toml index 8351de59..42bd1838 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,13 +35,13 @@ dependencies = [ "tokenizers", "numpy", "opencv-python", - "supervision==0.19.0", + "supervision", "openmim", - "mmcv-lite>=2.0.0rc4", - "mmdet==3.0.0", + "mmcv-lite<2.2.0", + "mmdet>=3.0.0", "mmengine>=0.7.1", "openmim", - "mmcv", + "mmcv<2.2.0", 'mmyolo @ git+https://github.com/onuralpszr/mmyolo.git', ] @@ -54,4 +54,4 @@ zip-safe = true [tool.setuptools.packages.find] include = ["yolo_world*"] -exclude = ["docs*", "tests*","third_party*","assets*"] \ No newline at end of file +exclude = ["docs*", "tests*","third_party*","assets*"] diff --git a/requirements/basic_requirements.txt b/requirements/basic_requirements.txt index d9c56e20..4efa3d8d 100644 --- a/requirements/basic_requirements.txt +++ b/requirements/basic_requirements.txt @@ -1,9 +1,9 @@ opencv-python==4.9.0.80 opencv-python-headless==4.2.0.34 -mmcv==2.0.0 -mmdet==3.0.0 +mmcv +mmdet mmengine==0.10.3 -mmyolo==0.6.0 +mmyolo timm==0.6.13 transformers==4.36.2 -albumentations \ No newline at end of file +albumentations From d7bebb28107356bbe1f9210357153450ae4e1976 Mon Sep 17 00:00:00 2001 From: tim-win Date: Sat, 31 Aug 2024 18:09:06 -0400 Subject: [PATCH 02/10] Reasonable facimile of working dependencies --- .dockerignore | 3 +- Dockerfile | 45 +++++++++++++---------------- pyproject.toml | 21 +++++++------- requirements/basic_requirements.txt | 17 +++++++---- requirements/demo_requirements.txt | 2 +- 5 files changed, 45 insertions(+), 43 deletions(-) diff --git a/.dockerignore b/.dockerignore index 1aefdd17..94c2aed7 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,2 +1,3 @@ docs -Dockerfile \ No newline at end of file +Dockerfile +.idea \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index 89009fa0..fbffea49 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM nvidia/cuda:11.8.0-devel-ubuntu22.04 +FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 ARG 
MODEL="yolo_world_l_dual_vlpan_l2norm_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py" ARG WEIGHT="yolo_world_l_clip_base_dual_vlpan_2e-3adamw_32xb16_100e_o365_goldg_train_pretrained-0e566235.pth" @@ -15,37 +15,32 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ libglib2.0-0 \ git \ python3-dev \ - python3-wheel + python3-wheel \ + curl + +RUN mkdir weights +RUN curl -o weights/$WEIGHT -L https://huggingface.co/wondervictor/YOLO-World/resolve/main/$WEIGHT RUN pip3 install --upgrade pip \ - && pip install torch==1.11.0+cu113 torchvision==0.12.0+cu113 torchaudio==0.11.0+cu113 --index-url https://download.pytorch.org/whl/cu113 \ + && pip3 install wheel \ + && pip3 install torch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 --index-url https://download.pytorch.org/whl/cu121 \ && pip3 install \ - gradio \ - opencv-python \ - supervision \ - mmengine \ - setuptools \ - openmim \ - && mim install 'mmcv<=2.2.0' \ - && pip3 install --no-cache-dir --index-url https://download.pytorch.org/whl/cu118 \ - wheel \ - torch \ - torchvision \ - torchaudio + gradio==4.16.0 \ + opencv-python==4.9.0.80 \ + supervision \ + mmengine==0.10.4 \ + setuptools \ + openmim \ + && mim install mmcv==2.1.0 \ + && mim install mmdet==3.3.0 \ + && pip install git+https://github.com/onuralpszr/mmyolo.git COPY . /yolo WORKDIR /yolo RUN pip3 install -e .[demo] -RUN pip3 install onnx - -RUN apt install -y curl -RUN mkdir weights - -RUN curl -o weights/$WEIGHT -L https://huggingface.co/wondervictor/YOLO-World/resolve/main/$WEIGHT - -RUN pip3 install onnxsim +RUN pip3 install onnx onnxsim -ENTRYPOINT [ "python3", "demo/gradio_demo.py" ] -CMD ["configs/pretrain/$MODEL", "weights/$WEIGHT"] +CMD [ "python3", "demo/gradio_demo.py" ] +# CMD ["configs/pretrain/$MODEL", "weights/$WEIGHT"] \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 42bd1838..3ffdf156 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,21 +29,20 @@ requires-python = ">= 3.7" dependencies = [ "wheel", - "torch>=1.11.0", - "torchvision>=0.16.2", - "transformers", + "torch==2.1.2", + "torchvision==0.16.2", + "transformers==4.36.2", "tokenizers", "numpy", - "opencv-python", + "opencv-python==4.9.0.80", "supervision", "openmim", - "mmcv-lite<2.2.0", - "mmdet>=3.0.0", - "mmengine>=0.7.1", - "openmim", "mmcv<2.2.0", - 'mmyolo @ git+https://github.com/onuralpszr/mmyolo.git', - + "mmdet>=3.0.0", + "mmengine==0.10.4", + "mmyolo @ git+https://github.com/onuralpszr/mmyolo.git", + "timm==0.6.13", + "albumentations", ] [tool.setuptools] @@ -54,4 +53,4 @@ zip-safe = true [tool.setuptools.packages.find] include = ["yolo_world*"] -exclude = ["docs*", "tests*","third_party*","assets*"] +exclude = ["docs*", "tests*","third_party*","assets*"] \ No newline at end of file diff --git a/requirements/basic_requirements.txt b/requirements/basic_requirements.txt index 4efa3d8d..d05e66b2 100644 --- a/requirements/basic_requirements.txt +++ b/requirements/basic_requirements.txt @@ -1,9 +1,16 @@ +torch==2.1.2 +torchvision==0.16.2 +torchaudio==2.1.2 opencv-python==4.9.0.80 -opencv-python-headless==4.2.0.34 -mmcv -mmdet -mmengine==0.10.3 -mmyolo +mmcv<2.2.0 +mmdet>=3.0.0 +mmengine==0.10.4 +git+https://github.com/onuralpszr/mmyolo.git timm==0.6.13 transformers==4.36.2 albumentations +gradio==4.16.0 +supervision +onnx +onnxruntime +onnxsim \ No newline at end of file diff --git a/requirements/demo_requirements.txt b/requirements/demo_requirements.txt index 0268ad3c..30d26ba1 100644 --- a/requirements/demo_requirements.txt +++ 
b/requirements/demo_requirements.txt @@ -1,2 +1,2 @@ -gradio==4.16.0 +gradio supervision \ No newline at end of file From 91543e4077f09db9819fb3f25c22d6651aa7cbb4 Mon Sep 17 00:00:00 2001 From: tim-win Date: Sat, 31 Aug 2024 18:23:43 -0400 Subject: [PATCH 03/10] Use off the shelf clip --- ...3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py | 1 + ...3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py | 4 ++-- ...eg_l_dual_vlpan_2e-4_80e_8gpus_allmodules_finetune_lvis.py | 2 +- ..._v2_seg_m_vlpan_bn_2e-4_80e_8gpus_seghead_finetune_lvis.py | 2 +- 4 files changed, 5 insertions(+), 4 deletions(-) diff --git a/configs/pretrain/yolo_world_v2_m_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py b/configs/pretrain/yolo_world_v2_m_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py index 59507204..a1ad7aac 100644 --- a/configs/pretrain/yolo_world_v2_m_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py +++ b/configs/pretrain/yolo_world_v2_m_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py @@ -16,6 +16,7 @@ weight_decay = 0.05 / 2 train_batch_size_per_gpu = 16 text_model_name = '../pretrained_models/clip-vit-base-patch32-projection' +text_model_name = 'openai/clip-vit-base-patch32' img_scale = (1280, 1280) text_model_name = 'openai/clip-vit-base-patch32' diff --git a/configs/pretrain/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py b/configs/pretrain/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py index ab4cd23f..40c2e5c1 100644 --- a/configs/pretrain/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py +++ b/configs/pretrain/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py @@ -15,8 +15,8 @@ base_lr = 2e-3 weight_decay = 0.05 / 2 train_batch_size_per_gpu = 16 -text_model_name = '../pretrained_models/clip-vit-base-patch32-projection' -# text_model_name = 'openai/clip-vit-base-patch32' +# text_model_name = '../pretrained_models/clip-vit-base-patch32-projection' +text_model_name = 'openai/clip-vit-base-patch32' img_scale = (1280, 1280) # model settings diff --git a/configs/segmentation/yolo_world_seg_l_dual_vlpan_2e-4_80e_8gpus_allmodules_finetune_lvis.py b/configs/segmentation/yolo_world_seg_l_dual_vlpan_2e-4_80e_8gpus_allmodules_finetune_lvis.py index 01885dd5..21e25797 100644 --- a/configs/segmentation/yolo_world_seg_l_dual_vlpan_2e-4_80e_8gpus_allmodules_finetune_lvis.py +++ b/configs/segmentation/yolo_world_seg_l_dual_vlpan_2e-4_80e_8gpus_allmodules_finetune_lvis.py @@ -18,7 +18,7 @@ load_from = 'pretrained_models/yolo_world_l_clip_base_dual_vlpan_2e-3adamw_32xb16_100e_o365_goldg_train_pretrained-0e566235.pth' persistent_workers = False text_model_name = '../pretrained_models/clip-vit-base-patch32-projection' -# text_model_name = 'openai/clip-vit-base-patch32' +text_model_name = 'openai/clip-vit-base-patch32' # Polygon2Mask downsample_ratio = 4 mask_overlap = False diff --git a/configs/segmentation/yolo_world_v2_seg_m_vlpan_bn_2e-4_80e_8gpus_seghead_finetune_lvis.py b/configs/segmentation/yolo_world_v2_seg_m_vlpan_bn_2e-4_80e_8gpus_seghead_finetune_lvis.py index d196d4ee..f8ce6c96 100644 --- a/configs/segmentation/yolo_world_v2_seg_m_vlpan_bn_2e-4_80e_8gpus_seghead_finetune_lvis.py +++ b/configs/segmentation/yolo_world_v2_seg_m_vlpan_bn_2e-4_80e_8gpus_seghead_finetune_lvis.py @@ -17,7 +17,7 @@ train_batch_size_per_gpu = 8 load_from = 
'pretrained_models/yolo_world_m_clip_t2i_bn_2e-3adamw_32xb16-100e_obj365v1_goldg_train-c6237d5b.pth' text_model_name = '../pretrained_models/clip-vit-base-patch32-projection' -# text_model_name = 'openai/clip-vit-base-patch32' +text_model_name = 'openai/clip-vit-base-patch32' persistent_workers = False # Polygon2Mask From 76ffdc77816937c4fb5e1a48beb3e86ab954f471 Mon Sep 17 00:00:00 2001 From: tim-win Date: Sat, 31 Aug 2024 19:20:30 -0400 Subject: [PATCH 04/10] Latest working dockerfile --- Dockerfile | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index fbffea49..1df674d5 100644 --- a/Dockerfile +++ b/Dockerfile @@ -35,12 +35,19 @@ RUN pip3 install --upgrade pip \ && mim install mmdet==3.3.0 \ && pip install git+https://github.com/onuralpszr/mmyolo.git -COPY . /yolo +RUN git clone https://github.com/tim-win/YOLO-World /yolo/ + WORKDIR /yolo RUN pip3 install -e .[demo] RUN pip3 install onnx onnxsim +RUN cd third_party && git clone https://github.com/onuralpszr/mmyolo.git ./mmyolo/ + +RUN chmod a+rwx /weights/ +RUN chmod a+rwx /yolo/configs/*/* + + CMD [ "python3", "demo/gradio_demo.py" ] # CMD ["configs/pretrain/$MODEL", "weights/$WEIGHT"] \ No newline at end of file From a34bae56221f9f7493ed034226b547478f2906a8 Mon Sep 17 00:00:00 2001 From: tim-win Date: Sat, 31 Aug 2024 20:29:39 -0400 Subject: [PATCH 05/10] Experimental update to libraries --- Dockerfile | 6 +++--- build_and_run.sh | 10 ++++++++++ 2 files changed, 13 insertions(+), 3 deletions(-) create mode 100755 build_and_run.sh diff --git a/Dockerfile b/Dockerfile index 1df674d5..3b39ae4a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -35,14 +35,14 @@ RUN pip3 install --upgrade pip \ && mim install mmdet==3.3.0 \ && pip install git+https://github.com/onuralpszr/mmyolo.git -RUN git clone https://github.com/tim-win/YOLO-World /yolo/ - +# RUN git clone --recursive https://github.com/tim-win/YOLO-World /yolo/ +COPY . /yolo WORKDIR /yolo RUN pip3 install -e .[demo] RUN pip3 install onnx onnxsim -RUN cd third_party && git clone https://github.com/onuralpszr/mmyolo.git ./mmyolo/ +# RUN cd third_party/ && rm -rf ./mmyolo && git clone https://github.com/onuralpszr/mmyolo.git . RUN chmod a+rwx /weights/ RUN chmod a+rwx /yolo/configs/*/* diff --git a/build_and_run.sh b/build_and_run.sh new file mode 100755 index 00000000..b8677d48 --- /dev/null +++ b/build_and_run.sh @@ -0,0 +1,10 @@ +#!/usr/bin/env bash +set -e + +export MODEL=yolo_world_v2_seg_l_vlpan_bn_2e-4_80e_8gpus_seghead_finetune_lvis.py +export WEIGHT=yolo_world_seg_l_dual_vlpan_2e-4_80e_8gpus_allmodules_finetune_lvis-8c58c916.pth + +export MODEL=yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_cc3mlite_train_lvis_minival.py +export WEIGHT=yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain-8698fbfa.pth + +docker build -f ./Dockerfile --build-arg="MODEL=$MODEL" --build-arg="WEIGHT=$WEIGHT" -t yolo-demo . 
&& docker run --runtime nvidia -p 8080:8080 yolo-demo python3 demo/gradio_demo.py "configs/pretrain/$MODEL" "/weights/$WEIGHT" \ No newline at end of file From 57b9244adc8dba3340669ab4cff608ee28fde66b Mon Sep 17 00:00:00 2001 From: tim-win Date: Sat, 31 Aug 2024 20:31:56 -0400 Subject: [PATCH 06/10] Fully featured build and run script --- build_and_run.sh | 44 +++++++++++++++++++++++++++++++++++++++----- 1 file changed, 39 insertions(+), 5 deletions(-) diff --git a/build_and_run.sh b/build_and_run.sh index b8677d48..1202ce78 100755 --- a/build_and_run.sh +++ b/build_and_run.sh @@ -1,10 +1,44 @@ #!/usr/bin/env bash set -e -export MODEL=yolo_world_v2_seg_l_vlpan_bn_2e-4_80e_8gpus_seghead_finetune_lvis.py -export WEIGHT=yolo_world_seg_l_dual_vlpan_2e-4_80e_8gpus_allmodules_finetune_lvis-8c58c916.pth +declare -A models +models["seg-l"]="yolo_world_v2_seg_l_vlpan_bn_2e-4_80e_8gpus_seghead_finetune_lvis.py yolo_world_seg_l_dual_vlpan_2e-4_80e_8gpus_allmodules_finetune_lvis-8c58c916.pth" +models["pretrain-l-clip-800ft"]="yolo_world_v2_l_clip_large_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_800ft_lvis_minival.py yolo_world_v2_l_clip_large_o365v1_goldg_pretrain_800ft-9df82e55.pth" +models["pretrain-l-clip"]="yolo_world_v2_l_clip_large_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py yolo_world_v2_l_clip_large_o365v1_goldg_pretrain-8ff2e744.pth" +models["pretrain-l-1280ft"]="yolo_world_v2_l_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py yolo_world_v2_l_obj365v1_goldg_pretrain_1280ft-9babe3f6.pth" +models["pretrain-l"]="yolo_world_v2_l_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py yolo_world_v2_l_obj365v1_goldg_pretrain-a82b1fe3.pth" +models["pretrain-m-1280ft"]="yolo_world_v2_m_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py yolo_world_v2_m_obj365v1_goldg_pretrain_1280ft-77d0346d.pth" +models["pretrain-m"]="yolo_world_v2_m_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py yolo_world_v2_m_obj365v1_goldg_pretrain-c6237d5b.pth" +models["pretrain-s-1280ft"]="yolo_world_v2_s_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py yolo_world_v2_s_obj365v1_goldg_pretrain_1280ft-fc4ff4f7.pth" +models["pretrain-s"]="yolo_world_v2_s_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py yolo_world_v2_s_obj365v1_goldg_pretrain-55b943ea.pth" +models["pretrain-x-cc3mlite"]="yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_cc3mlite_train_lvis_minival.py yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain-8698fbfa.pth" +models["pretrain-x-1280ft"]="yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth" -export MODEL=yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_cc3mlite_train_lvis_minival.py -export WEIGHT=yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain-8698fbfa.pth +if [ $# -eq 0 ]; then + echo "Available model keys:" + for key in "${!models[@]}"; do + echo " $key" + done + echo "Usage: $0 " + exit 1 +fi -docker build -f ./Dockerfile --build-arg="MODEL=$MODEL" --build-arg="WEIGHT=$WEIGHT" -t yolo-demo . && docker run --runtime nvidia -p 8080:8080 yolo-demo python3 demo/gradio_demo.py "configs/pretrain/$MODEL" "/weights/$WEIGHT" \ No newline at end of file +model_key=$1 + +if [ -z "${models[$model_key]}" ]; then + echo "Invalid model key. 
Available keys are:" + for key in "${!models[@]}"; do + echo " $key" + done + exit 1 +fi + +read MODEL WEIGHT <<< "${models[$model_key]}" + +config_dir="configs/pretrain" +if [[ $model_key == seg-* ]]; then + config_dir="configs/segmentation" +fi + +docker build -f ./Dockerfile --build-arg="MODEL=$MODEL" --build-arg="WEIGHT=$WEIGHT" -t "yolo-demo:$model_key" . && \ +docker run --runtime nvidia -p 8080:8080 "yolo-demo:$model_key" python3 demo/gradio_demo.py "$config_dir/$MODEL" "/weights/$WEIGHT" \ No newline at end of file From 512e9a1264823c85eff46a3a891fefc6f7bfad6f Mon Sep 17 00:00:00 2001 From: tim-win Date: Sat, 31 Aug 2024 20:53:26 -0400 Subject: [PATCH 07/10] Add basic segmentation demo support --- Dockerfile | 9 +- build_and_run.sh | 6 +- demo/segmentation_demo.py | 167 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 176 insertions(+), 6 deletions(-) create mode 100644 demo/segmentation_demo.py diff --git a/Dockerfile b/Dockerfile index 3b39ae4a..4a733f9a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -18,8 +18,9 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ python3-wheel \ curl -RUN mkdir weights -RUN curl -o weights/$WEIGHT -L https://huggingface.co/wondervictor/YOLO-World/resolve/main/$WEIGHT +# Uncomment the following if you want to download a specific set of weights +# RUN mkdir weights +# RUN curl -o weights/$WEIGHT -L https://huggingface.co/wondervictor/YOLO-World/resolve/main/$WEIGHT RUN pip3 install --upgrade pip \ && pip3 install wheel \ @@ -47,7 +48,5 @@ RUN pip3 install onnx onnxsim RUN chmod a+rwx /weights/ RUN chmod a+rwx /yolo/configs/*/* - - -CMD [ "python3", "demo/gradio_demo.py" ] +CMD [ "python3", "demo/gradio_demo.py", "", ""] # CMD ["configs/pretrain/$MODEL", "weights/$WEIGHT"] \ No newline at end of file diff --git a/build_and_run.sh b/build_and_run.sh index 1202ce78..2aa5ad68 100755 --- a/build_and_run.sh +++ b/build_and_run.sh @@ -1,6 +1,8 @@ #!/usr/bin/env bash set -e +MODEL_DIR="../models/models-yoloworld" + declare -A models models["seg-l"]="yolo_world_v2_seg_l_vlpan_bn_2e-4_80e_8gpus_seghead_finetune_lvis.py yolo_world_seg_l_dual_vlpan_2e-4_80e_8gpus_allmodules_finetune_lvis-8c58c916.pth" models["pretrain-l-clip-800ft"]="yolo_world_v2_l_clip_large_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_800ft_lvis_minival.py yolo_world_v2_l_clip_large_o365v1_goldg_pretrain_800ft-9df82e55.pth" @@ -36,9 +38,11 @@ fi read MODEL WEIGHT <<< "${models[$model_key]}" config_dir="configs/pretrain" +demo_file=demo/gradio_demo.py if [[ $model_key == seg-* ]]; then config_dir="configs/segmentation" + demo_file="demo/segmentation_demo.py" fi docker build -f ./Dockerfile --build-arg="MODEL=$MODEL" --build-arg="WEIGHT=$WEIGHT" -t "yolo-demo:$model_key" . && \ -docker run --runtime nvidia -p 8080:8080 "yolo-demo:$model_key" python3 demo/gradio_demo.py "$config_dir/$MODEL" "/weights/$WEIGHT" \ No newline at end of file +docker run -it -v "$MODEL_DIR:/weights/" --runtime nvidia -p 8080:8080 "yolo-demo:$model_key" bash # python3 demo/gradio_demo.py "$config_dir/$MODEL" "/weights/$WEIGHT" \ No newline at end of file diff --git a/demo/segmentation_demo.py b/demo/segmentation_demo.py new file mode 100644 index 00000000..4ab7a465 --- /dev/null +++ b/demo/segmentation_demo.py @@ -0,0 +1,167 @@ +# Copyright (c) Tencent Inc. All rights reserved. 
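+# Gradio-based demo for YOLO-World instance segmentation. The script loads a
+# config and checkpoint through mmengine's Runner, runs open-vocabulary
+# inference for a comma-separated list of class names, filters predictions by
+# score and NMS thresholds, and renders boxes, masks and labels with
+# supervision annotators before returning the annotated image.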
+import os +import sys +import argparse +import os.path as osp +from functools import partial + +import cv2 +import torch +import numpy as np +import gradio as gr +from PIL import Image +import supervision as sv +from torchvision.ops import nms +from mmengine.runner import Runner +from mmengine.dataset import Compose +from mmengine.runner.amp import autocast +from mmengine.config import Config, DictAction +from mmdet.datasets import CocoDataset +from mmyolo.registry import RUNNERS + +sys.path.append('./deploy') + +BOUNDING_BOX_ANNOTATOR = sv.BoundingBoxAnnotator(thickness=1) +MASK_ANNOTATOR = sv.MaskAnnotator() +LABEL_ANNOTATOR = sv.LabelAnnotator(text_padding=4, text_scale=0.5, text_thickness=1) + +def parse_args(): + parser = argparse.ArgumentParser(description='YOLO-World Segmentation Demo') + parser.add_argument('config', help='test config file path') + parser.add_argument('checkpoint', help='checkpoint file') + parser.add_argument( + '--work-dir', + help='the directory to save the file containing evaluation metrics', + default='output') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. If the value to ' + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" ' + 'Note that the quotation marks are necessary and that no white space ' + 'is allowed.') + args = parser.parse_args() + return args + +def run_segmentation(runner, + image, + text, + max_num_boxes, + score_thr, + nms_thr): + texts = [[t.strip()] for t in text.split(',')] + [[' ']] + data_info = dict(img_id=0, img=np.array(image), texts=texts) + data_info = runner.pipeline(data_info) + data_batch = dict(inputs=data_info['inputs'].unsqueeze(0), + data_samples=[data_info['data_samples']]) + + with autocast(enabled=False), torch.no_grad(): + output = runner.model.test_step(data_batch)[0] + pred_instances = output.pred_instances + + keep = nms(pred_instances.bboxes, + pred_instances.scores, + iou_threshold=nms_thr) + pred_instances = pred_instances[keep] + pred_instances = pred_instances[pred_instances.scores.float() > score_thr] + + if len(pred_instances.scores) > max_num_boxes: + indices = pred_instances.scores.float().topk(max_num_boxes)[1] + pred_instances = pred_instances[indices] + + pred_instances = pred_instances.cpu().numpy() + masks = pred_instances['masks'] if 'masks' in pred_instances else None + detections = sv.Detections(xyxy=pred_instances['bboxes'], + class_id=pred_instances['labels'], + confidence=pred_instances['scores'], + mask=masks) + labels = [ + f"{texts[class_id][0]} {confidence:0.2f}" for class_id, confidence in + zip(detections.class_id, detections.confidence) + ] + + image = np.array(image) + image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR) + image = BOUNDING_BOX_ANNOTATOR.annotate(image, detections) + image = LABEL_ANNOTATOR.annotate(image, detections, labels=labels) + if masks is not None: + image = MASK_ANNOTATOR.annotate(image, detections) + image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + image = Image.fromarray(image) + return image + +def demo(runner, args): + with gr.Blocks(title="YOLO-World Segmentation") as demo: + gr.Markdown('
YOLO-World: Real-Time Open-Vocabulary ' + 'Object Detector and Segmentation
') + with gr.Row(): + with gr.Column(scale=0.5): + image = gr.Image(type='pil', label='Input Image') + input_text = gr.Textbox( + lines=3, + label='Enter the classes to be detected and segmented, ' + 'separated by comma', + value=', '.join(CocoDataset.METAINFO['classes'][:5]), + elem_id='textbox') + max_num_boxes = gr.Slider(minimum=1, + maximum=300, + value=100, + step=1, + interactive=True, + label='Maximum Number of Boxes') + score_thr = gr.Slider(minimum=0, + maximum=1, + value=0.3, + step=0.01, + interactive=True, + label='Score Threshold') + nms_thr = gr.Slider(minimum=0, + maximum=1, + value=0.5, + step=0.01, + interactive=True, + label='NMS Threshold') + submit = gr.Button('Submit') + clear = gr.Button('Clear') + with gr.Column(scale=0.5): + output_image = gr.Image(type='pil', label='Output Image') + + submit.click(partial(run_segmentation, runner), + [image, input_text, max_num_boxes, score_thr, nms_thr], + [output_image]) + clear.click(lambda: [None, '', None], None, + [image, input_text, output_image]) + + demo.launch(server_name='0.0.0.0', server_port=8080) + +if __name__ == '__main__': + args = parse_args() + + # load config + cfg = Config.fromfile(args.config) + if args.cfg_options is not None: + cfg.merge_from_dict(args.cfg_options) + + if args.work_dir is not None: + cfg.work_dir = args.work_dir + elif cfg.get('work_dir', None) is None: + cfg.work_dir = osp.join('./work_dirs', + osp.splitext(osp.basename(args.config))[0]) + + cfg.load_from = args.checkpoint + + if 'runner_type' not in cfg: + runner = Runner.from_cfg(cfg) + else: + runner = RUNNERS.build(cfg) + + runner.call_hook('before_run') + runner.load_or_resume() + pipeline = cfg.test_dataloader.dataset.pipeline + pipeline[0].type = 'mmdet.LoadImageFromNDArray' + runner.pipeline = Compose(pipeline) + runner.model.eval() + demo(runner, args) \ No newline at end of file From 797b29389d1ba599e8ffc1c80ca027cdbffe65f7 Mon Sep 17 00:00:00 2001 From: tim-win Date: Sat, 31 Aug 2024 21:00:13 -0400 Subject: [PATCH 08/10] Reference remote code to avoid duplicating build steps --- Dockerfile | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Dockerfile b/Dockerfile index 4a733f9a..1f7a3bbd 100644 --- a/Dockerfile +++ b/Dockerfile @@ -36,17 +36,17 @@ RUN pip3 install --upgrade pip \ && mim install mmdet==3.3.0 \ && pip install git+https://github.com/onuralpszr/mmyolo.git -# RUN git clone --recursive https://github.com/tim-win/YOLO-World /yolo/ -COPY . /yolo -WORKDIR /yolo +RUN git clone --recursive https://github.com/tim-win/YOLO-World /yolo/ +#COPY . /yolo +#WORKDIR /yolo RUN pip3 install -e .[demo] RUN pip3 install onnx onnxsim # RUN cd third_party/ && rm -rf ./mmyolo && git clone https://github.com/onuralpszr/mmyolo.git . 
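+# /weights is created empty in the image; model checkpoints are mounted into
+# it at run time by build_and_run.sh via -v "$MODEL_DIR:/weights/".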
-RUN chmod a+rwx /weights/ +RUN mkdir /weights/ RUN chmod a+rwx /yolo/configs/*/* CMD [ "python3", "demo/gradio_demo.py", "", ""] -# CMD ["configs/pretrain/$MODEL", "weights/$WEIGHT"] \ No newline at end of file +# CMD ["configs/pretrain/$MODEL", "weights/$WEIGHT"] From 40419490611fd80d83e9eeb5b40061d34fcdcc7f Mon Sep 17 00:00:00 2001 From: tim-win Date: Sat, 31 Aug 2024 21:31:33 -0400 Subject: [PATCH 09/10] Make it sort of work all together now --- Dockerfile | 11 ++++++----- build_and_run.sh | 11 +++++++---- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/Dockerfile b/Dockerfile index 1f7a3bbd..e53efcbd 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 +FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS DEPENDENCIES ARG MODEL="yolo_world_l_dual_vlpan_l2norm_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py" ARG WEIGHT="yolo_world_l_clip_base_dual_vlpan_2e-3adamw_32xb16_100e_o365_goldg_train_pretrained-0e566235.pth" @@ -36,17 +36,18 @@ RUN pip3 install --upgrade pip \ && mim install mmdet==3.3.0 \ && pip install git+https://github.com/onuralpszr/mmyolo.git +FROM DEPENDENCIES as INSTALLING_YOLO RUN git clone --recursive https://github.com/tim-win/YOLO-World /yolo/ #COPY . /yolo -#WORKDIR /yolo +WORKDIR /yolo RUN pip3 install -e .[demo] RUN pip3 install onnx onnxsim -# RUN cd third_party/ && rm -rf ./mmyolo && git clone https://github.com/onuralpszr/mmyolo.git . + +FROM INSTALLING_YOLO as OK_THIS_PART_IS_TRICKY_DONT_HATE RUN mkdir /weights/ RUN chmod a+rwx /yolo/configs/*/* -CMD [ "python3", "demo/gradio_demo.py", "", ""] -# CMD ["configs/pretrain/$MODEL", "weights/$WEIGHT"] +CMD [ "bash" ] diff --git a/build_and_run.sh b/build_and_run.sh index 2aa5ad68..edb73942 100755 --- a/build_and_run.sh +++ b/build_and_run.sh @@ -5,6 +5,9 @@ MODEL_DIR="../models/models-yoloworld" declare -A models models["seg-l"]="yolo_world_v2_seg_l_vlpan_bn_2e-4_80e_8gpus_seghead_finetune_lvis.py yolo_world_seg_l_dual_vlpan_2e-4_80e_8gpus_allmodules_finetune_lvis-8c58c916.pth" +models["seg-l-seghead"]="yolo_world_v2_seg_l_vlpan_bn_2e-4_80e_8gpus_seghead_finetune_lvis.py yolo_world_seg_l_dual_vlpan_2e-4_80e_8gpus_seghead_finetune_lvis-5a642d30.pth" +models["seg-m"]="yolo_world_v2_seg_m_vlpan_bn_2e-4_80e_8gpus_seghead_finetune_lvis.py yolo_world_seg_m_dual_vlpan_2e-4_80e_8gpus_allmodules_finetune_lvis-ca465825.pth" +models["seg-m-seghead"]="yolo_world_v2_seg_m_vlpan_bn_2e-4_80e_8gpus_seghead_finetune_lvis.py yolo_world_seg_m_dual_vlpan_2e-4_80e_8gpus_seghead_finetune_lvis-7bca59a7.pth" models["pretrain-l-clip-800ft"]="yolo_world_v2_l_clip_large_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_800ft_lvis_minival.py yolo_world_v2_l_clip_large_o365v1_goldg_pretrain_800ft-9df82e55.pth" models["pretrain-l-clip"]="yolo_world_v2_l_clip_large_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py yolo_world_v2_l_clip_large_o365v1_goldg_pretrain-8ff2e744.pth" models["pretrain-l-1280ft"]="yolo_world_v2_l_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py yolo_world_v2_l_obj365v1_goldg_pretrain_1280ft-9babe3f6.pth" @@ -40,9 +43,9 @@ read MODEL WEIGHT <<< "${models[$model_key]}" config_dir="configs/pretrain" demo_file=demo/gradio_demo.py if [[ $model_key == seg-* ]]; then - config_dir="configs/segmentation" - demo_file="demo/segmentation_demo.py" + export config_dir="configs/segmentation" + export demo_file="demo/segmentation_demo.py" fi -docker build -f ./Dockerfile --build-arg="MODEL=$MODEL" --build-arg="WEIGHT=$WEIGHT" -t 
"yolo-demo:$model_key" . && \ -docker run -it -v "$MODEL_DIR:/weights/" --runtime nvidia -p 8080:8080 "yolo-demo:$model_key" bash # python3 demo/gradio_demo.py "$config_dir/$MODEL" "/weights/$WEIGHT" \ No newline at end of file +# docker build -f ./Dockerfile --build-arg="MODEL=$MODEL" --build-arg="WEIGHT=$WEIGHT" -t "yolo-demo:latest" . && \ +docker run -it -v "$(readlink -f $MODEL_DIR):/weights/" --runtime nvidia -p 8080:8080 "yolo-demo:latest" python3 $demo_file "$config_dir/$MODEL" "/weights/$WEIGHT" \ No newline at end of file From bf71d2bfba1329949ee56acbcf76cfc5329c7aba Mon Sep 17 00:00:00 2001 From: tim-win Date: Sat, 31 Aug 2024 21:47:15 -0400 Subject: [PATCH 10/10] Cleanup MR so its a little more professional --- Dockerfile | 76 +++++++++++-------- README.md | 1 + build_and_run.sh | 70 +++++++++++++---- ...e-4_80e_8gpus_mask-refine_finetune_coco.py | 1 - ...bj365v1_goldg_train_1280ft_lvis_minival.py | 1 - ...8gpus_obj365v1_goldg_train_lvis_minival.py | 1 - ...e_4x8gpus_obj365v1_goldg_train_lvis_val.py | 1 - ...365v1_goldg_cc3mlite_train_lvis_minival.py | 1 - ...bj365v1_goldg_train_1280ft_lvis_minival.py | 1 - ...8gpus_obj365v1_goldg_train_lvis_minival.py | 1 - ...bn_2e-4_80e_8gpus_seghead_finetune_lvis.py | 1 - demo/README.md | 19 ++++- 12 files changed, 117 insertions(+), 57 deletions(-) diff --git a/Dockerfile b/Dockerfile index e53efcbd..7c43e743 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,53 +1,63 @@ -FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS DEPENDENCIES +# Base image with CUDA support +FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS base -ARG MODEL="yolo_world_l_dual_vlpan_l2norm_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py" -ARG WEIGHT="yolo_world_l_clip_base_dual_vlpan_2e-3adamw_32xb16_100e_o365_goldg_train_pretrained-0e566235.pth" - -ENV FORCE_CUDA="1" -ENV MMCV_WITH_OPS=1 +# Set environment variables +ENV FORCE_CUDA="1" \ + MMCV_WITH_OPS=1 \ + DEBIAN_FRONTEND=noninteractive +# Install system dependencies RUN apt-get update && apt-get install -y --no-install-recommends \ - python3-pip \ + python3-pip \ libgl1-mesa-glx \ - libsm6 \ - libxext6 \ - libxrender-dev \ - libglib2.0-0 \ - git \ - python3-dev \ - python3-wheel \ - curl - -# Uncomment the following if you want to download a specific set of weights -# RUN mkdir weights -# RUN curl -o weights/$WEIGHT -L https://huggingface.co/wondervictor/YOLO-World/resolve/main/$WEIGHT - -RUN pip3 install --upgrade pip \ - && pip3 install wheel \ - && pip3 install torch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 --index-url https://download.pytorch.org/whl/cu121 \ - && pip3 install \ + libsm6 \ + libxext6 \ + libxrender-dev \ + libglib2.0-0 \ + git \ + python3-dev \ + python3-wheel \ + curl \ + && rm -rf /var/lib/apt/lists/* + +# Install Python dependencies +FROM base AS python_deps + +RUN pip3 install --upgrade pip wheel \ + && pip3 install --no-cache-dir torch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 --index-url https://download.pytorch.org/whl/cu121 \ + && pip3 install --no-cache-dir \ gradio==4.16.0 \ opencv-python==4.9.0.80 \ supervision \ mmengine==0.10.4 \ setuptools \ openmim \ + onnx \ + onnxsim \ && mim install mmcv==2.1.0 \ && mim install mmdet==3.3.0 \ - && pip install git+https://github.com/onuralpszr/mmyolo.git + && pip3 install --no-cache-dir git+https://github.com/onuralpszr/mmyolo.git + +# Clone and install YOLO-World +FROM python_deps AS yolo_world -FROM DEPENDENCIES as INSTALLING_YOLO -RUN git clone --recursive https://github.com/tim-win/YOLO-World /yolo/ -#COPY . 
/yolo +RUN git clone --recursive https://github.com/AILab-CVC/YOLO-World /yolo/ WORKDIR /yolo RUN pip3 install -e .[demo] -RUN pip3 install onnx onnxsim +# Final stage +FROM yolo_world AS final + +ARG MODEL="yolo_world_l_dual_vlpan_l2norm_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py" +ARG WEIGHT="yolo_world_l_clip_base_dual_vlpan_2e-3adamw_32xb16_100e_o365_goldg_train_pretrained-0e566235.pth" -FROM INSTALLING_YOLO as OK_THIS_PART_IS_TRICKY_DONT_HATE +# Create weights directory and set permissions +RUN mkdir /weights/ \ + && chmod a+rwx /yolo/configs/*/* -RUN mkdir /weights/ -RUN chmod a+rwx /yolo/configs/*/* +# Optionally download weights (commented out by default) +# RUN curl -o /weights/$WEIGHT -L https://huggingface.co/wondervictor/YOLO-World/resolve/main/$WEIGHT -CMD [ "bash" ] +# Set the default command +CMD ["bash"] \ No newline at end of file diff --git a/README.md b/README.md index d89e9a4f..c213924d 100644 --- a/README.md +++ b/README.md @@ -37,6 +37,7 @@ We recommend that everyone **use English to communicate on issues**, as this hel For business licensing and other related inquiries, don't hesitate to contact `yixiaoge@tencent.com`. ## 🔥 Updates +`[2024-8-31]`: Segmentation demo added to the demo/ folder. Try it out in docker with `./build_and_run.sh seg-l`! `[2024-7-8]`: YOLO-World now has been integrated into [ComfyUI](https://github.com/StevenGrove/ComfyUI-YOLOWorld)! Come and try adding YOLO-World to your workflow now! You can access it at [StevenGrove/ComfyUI-YOLOWorld](https://github.com/StevenGrove/ComfyUI-YOLOWorld)! `[2024-5-18]:` YOLO-World models have been [integrated with the FiftyOne computer vision toolkit](https://docs.voxel51.com/integrations/ultralytics.html#open-vocabulary-detection) for streamlined open-vocabulary inference across image and video datasets. `[2024-5-16]:` Hey guys! Long time no see! This update contains (1) [fine-tuning guide](https://github.com/AILab-CVC/YOLO-World?#highlights--introduction) and (2) [TFLite Export](./docs/tflite_deploy.md) with INT8 Quantization. diff --git a/build_and_run.sh b/build_and_run.sh index edb73942..2ee55e8d 100755 --- a/build_and_run.sh +++ b/build_and_run.sh @@ -1,8 +1,33 @@ #!/usr/bin/env bash + +# Exit immediately if a command exits with a non-zero status. set -e -MODEL_DIR="../models/models-yoloworld" +# Set MODEL_DIR if not already set in the environment +: "${MODEL_DIR:="../models/models-yoloworld"}" + +# DocString for the script +: ' +This script builds and runs a Docker container for YOLO-World demos. +It supports various pre-trained models and configurations for object detection and segmentation. 
+ +Usage: + ./build_and_run.sh + +Environment Variables: + MODEL_DIR: Path to the directory containing model weights (default: "../models/models-yoloworld") +Arguments: + : Key for the desired model configuration (see available keys below) + +Available model keys: + seg-l, seg-l-seghead, seg-m, seg-m-seghead, + pretrain-l-clip-800ft, pretrain-l-clip, pretrain-l-1280ft, pretrain-l, + pretrain-m-1280ft, pretrain-m, pretrain-s-1280ft, pretrain-s, + pretrain-x-cc3mlite, pretrain-x-1280ft +' + +# Define associative array for model configurations declare -A models models["seg-l"]="yolo_world_v2_seg_l_vlpan_bn_2e-4_80e_8gpus_seghead_finetune_lvis.py yolo_world_seg_l_dual_vlpan_2e-4_80e_8gpus_allmodules_finetune_lvis-8c58c916.pth" models["seg-l-seghead"]="yolo_world_v2_seg_l_vlpan_bn_2e-4_80e_8gpus_seghead_finetune_lvis.py yolo_world_seg_l_dual_vlpan_2e-4_80e_8gpus_seghead_finetune_lvis-5a642d30.pth" @@ -19,33 +44,52 @@ models["pretrain-s"]="yolo_world_v2_s_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_ models["pretrain-x-cc3mlite"]="yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_cc3mlite_train_lvis_minival.py yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain-8698fbfa.pth" models["pretrain-x-1280ft"]="yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth" -if [ $# -eq 0 ]; then +# Function to display usage information +show_usage() { + echo "Usage: $0 " echo "Available model keys:" for key in "${!models[@]}"; do echo " $key" done - echo "Usage: $0 " +} + +# Check if a model key is provided +if [ $# -eq 0 ]; then + show_usage exit 1 fi model_key=$1 +# Validate the model key if [ -z "${models[$model_key]}" ]; then - echo "Invalid model key. Available keys are:" - for key in "${!models[@]}"; do - echo " $key" - done + echo "Invalid model key." + show_usage exit 1 fi -read MODEL WEIGHT <<< "${models[$model_key]}" +# Extract model and weight information +read -r MODEL WEIGHT <<< "${models[$model_key]}" +# Set configuration directory and demo file based on model type config_dir="configs/pretrain" -demo_file=demo/gradio_demo.py +demo_file="demo/gradio_demo.py" if [[ $model_key == seg-* ]]; then - export config_dir="configs/segmentation" - export demo_file="demo/segmentation_demo.py" + config_dir="configs/segmentation" + demo_file="demo/segmentation_demo.py" fi -# docker build -f ./Dockerfile --build-arg="MODEL=$MODEL" --build-arg="WEIGHT=$WEIGHT" -t "yolo-demo:latest" . && \ -docker run -it -v "$(readlink -f $MODEL_DIR):/weights/" --runtime nvidia -p 8080:8080 "yolo-demo:latest" python3 $demo_file "$config_dir/$MODEL" "/weights/$WEIGHT" \ No newline at end of file +# Build Docker image and run container +echo "Building Docker image..." +docker build -f ./Dockerfile --no-cache \ + --build-arg="MODEL=$MODEL" \ + --build-arg="WEIGHT=$WEIGHT" \ + -t "yolo-demo:latest" . + +echo "Running Docker container..." 
+docker run -it \ + -v "$(readlink -f "$MODEL_DIR"):/weights/" \ + --runtime nvidia \ + -p 8080:8080 \ + "yolo-demo:latest" \ + python3 "$demo_file" "$config_dir/$MODEL" "/weights/$WEIGHT" diff --git a/configs/finetune_coco/yolo_world_v2_m_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py b/configs/finetune_coco/yolo_world_v2_m_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py index 32fcc51c..714e1492 100644 --- a/configs/finetune_coco/yolo_world_v2_m_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py +++ b/configs/finetune_coco/yolo_world_v2_m_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py @@ -18,7 +18,6 @@ weight_decay = 0.05 train_batch_size_per_gpu = 16 load_from = 'pretrained_models/yolo_world_m_clip_t2i_bn_2e-3adamw_32xb16-100e_obj365v1_goldg_train-c6237d5b.pth' -# text_model_name = '../pretrained_models/clip-vit-base-patch32-projection' text_model_name = 'openai/clip-vit-base-patch32' persistent_workers = False diff --git a/configs/pretrain/yolo_world_v2_l_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py b/configs/pretrain/yolo_world_v2_l_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py index 1c34f3a4..630f5710 100644 --- a/configs/pretrain/yolo_world_v2_l_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py +++ b/configs/pretrain/yolo_world_v2_l_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py @@ -16,7 +16,6 @@ weight_decay = 0.025 train_batch_size_per_gpu = 4 load_from = "pretrained_models/yolo_world_v2_l_obj365v1_goldg_pretrain-a82b1fe3.pth" -# text_model_name = '../pretrained_models/clip-vit-base-patch32-projection' text_model_name = 'openai/clip-vit-base-patch32' img_scale = (1280, 1280) diff --git a/configs/pretrain/yolo_world_v2_l_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py b/configs/pretrain/yolo_world_v2_l_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py index cb8beec0..5a770bce 100644 --- a/configs/pretrain/yolo_world_v2_l_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py +++ b/configs/pretrain/yolo_world_v2_l_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py @@ -15,7 +15,6 @@ base_lr = 2e-3 weight_decay = 0.05 / 2 train_batch_size_per_gpu = 16 -# text_model_name = '../pretrained_models/clip-vit-base-patch32-projection' text_model_name = 'openai/clip-vit-base-patch32' # model settings model = dict( diff --git a/configs/pretrain/yolo_world_v2_l_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_val.py b/configs/pretrain/yolo_world_v2_l_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_val.py index 70b19b28..197289bb 100644 --- a/configs/pretrain/yolo_world_v2_l_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_val.py +++ b/configs/pretrain/yolo_world_v2_l_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_val.py @@ -15,7 +15,6 @@ base_lr = 2e-3 weight_decay = 0.05 / 2 train_batch_size_per_gpu = 16 -# text_model_name = '../pretrained_models/clip-vit-base-patch32-projection' text_model_name = 'openai/clip-vit-base-patch32' # model settings model = dict( diff --git a/configs/pretrain/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_cc3mlite_train_lvis_minival.py b/configs/pretrain/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_cc3mlite_train_lvis_minival.py index a2ba421e..4d8ff3aa 100644 --- a/configs/pretrain/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_cc3mlite_train_lvis_minival.py +++ 
b/configs/pretrain/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_cc3mlite_train_lvis_minival.py @@ -15,7 +15,6 @@ base_lr = 2e-3 weight_decay = 0.05 / 2 train_batch_size_per_gpu = 16 -# text_model_name = '../pretrained_models/clip-vit-base-patch32-projection' text_model_name = 'openai/clip-vit-base-patch32' # model settings model = dict( diff --git a/configs/pretrain/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py b/configs/pretrain/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py index 40c2e5c1..35050ecc 100644 --- a/configs/pretrain/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py +++ b/configs/pretrain/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py @@ -15,7 +15,6 @@ base_lr = 2e-3 weight_decay = 0.05 / 2 train_batch_size_per_gpu = 16 -# text_model_name = '../pretrained_models/clip-vit-base-patch32-projection' text_model_name = 'openai/clip-vit-base-patch32' img_scale = (1280, 1280) diff --git a/configs/pretrain/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py b/configs/pretrain/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py index e3c1226d..92afae3b 100644 --- a/configs/pretrain/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py +++ b/configs/pretrain/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py @@ -15,7 +15,6 @@ base_lr = 2e-3 weight_decay = 0.05 / 2 train_batch_size_per_gpu = 16 -# text_model_name = '../pretrained_models/clip-vit-base-patch32-projection' text_model_name = 'openai/clip-vit-base-patch32' # model settings model = dict( diff --git a/configs/segmentation/yolo_world_v2_seg_l_vlpan_bn_2e-4_80e_8gpus_seghead_finetune_lvis.py b/configs/segmentation/yolo_world_v2_seg_l_vlpan_bn_2e-4_80e_8gpus_seghead_finetune_lvis.py index 062c9e31..d2006659 100644 --- a/configs/segmentation/yolo_world_v2_seg_l_vlpan_bn_2e-4_80e_8gpus_seghead_finetune_lvis.py +++ b/configs/segmentation/yolo_world_v2_seg_l_vlpan_bn_2e-4_80e_8gpus_seghead_finetune_lvis.py @@ -16,7 +16,6 @@ weight_decay = 0.05 train_batch_size_per_gpu = 8 load_from = 'pretrained_models/yolo_world_l_clip_t2i_bn_2e-3adamw_32xb16-100e_obj365v1_goldg_cc3mlite_train-ca93cd1f.pth' -# text_model_name = '../pretrained_models/clip-vit-base-patch32-projection' text_model_name = 'openai/clip-vit-base-patch32' persistent_workers = False diff --git a/demo/README.md b/demo/README.md index c6f607c5..9fe600d9 100644 --- a/demo/README.md +++ b/demo/README.md @@ -19,11 +19,24 @@ pip install gradio==4.16.0 python demo/demo.py path/to/config path/to/weights ``` -Additionaly, you can use a Dockerfile to build an image with gradio. As a prerequisite, make sure you have respective drivers installed alongside [nvidia-container-runtime](https://stackoverflow.com/questions/59691207/docker-build-with-nvidia-runtime). Replace MODEL_NAME and WEIGHT_NAME with the respective values or ommit this and use default values from the [Dockerfile](Dockerfile#3) +Additionally, you can use our Docker build system for an easier setup: ```bash -docker build --build-arg="MODEL=MODEL_NAME" --build-arg="WEIGHT=WEIGHT_NAME" -t yolo_demo . 
-docker run --runtime nvidia -p 8080:8080 +./build_and_run.sh +``` + +Available model keys include: +- seg-l, seg-l-seghead, seg-m, seg-m-seghead +- pretrain-l-clip-800ft, pretrain-l-clip, pretrain-l-1280ft, pretrain-l +- pretrain-m-1280ft, pretrain-m, pretrain-s-1280ft, pretrain-s +- pretrain-x-cc3mlite, pretrain-x-1280ft + +This script will build the Docker image and run the container with the specified model configuration. The Gradio interface will be accessible at `http://localhost:8080`. + +You can also customize the model weights directory by setting the `MODEL_DIR` environment variable: + +```bash +MODEL_DIR=/path/to/your/weights ./build_and_run.sh ``` #### Image Demo