Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

processors support aistudio #330

Open
wants to merge 2 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion paddlemix/models/blip2/base_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,7 @@ def init_tokenizer(cls, tokenizer_name="bert-base-uncased"):
@classmethod
def refine_state_dict(cls, model, state_dict):
    """Adjust a loaded state dict in place so it fits ``model``.

    Interpolates the ViT position-embedding weights held in ``state_dict``
    to match the resolution ``model`` was configured with (via
    ``interpolate_pos_embed``).

    Args:
        model: BLIP-2 model instance the weights will be loaded into.
        state_dict (dict): Parameter-name -> tensor mapping; mutated in place.
    """
    # Local import — presumably avoids a circular import between the
    # base model and the EVA-ViT module; confirm before hoisting.
    from paddlemix.models.blip2.eva_vit import interpolate_pos_embed

    interpolate_pos_embed(model, state_dict)

def get_expected_keys(self, model_state_dict, name=None):
Expand Down Expand Up @@ -203,6 +204,7 @@ def from_pretrained(
subfolder = kwargs.pop("subfolder", "")
variant = kwargs.pop("variant", None)
use_safetensors = kwargs.pop("use_safetensors", None if is_safetensors_available() else False)
from_aistudio = kwargs.get("from_aistudio", False)

low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", False)
convert_from_torch = kwargs.pop("convert_from_torch", None)
Expand Down Expand Up @@ -269,6 +271,7 @@ def from_pretrained(
cache_dir=cache_dir,
subfolder=subfolder,
from_hf_hub=from_hf_hub,
from_aistudio=from_aistudio,
config=config,
convert_from_torch=convert_from_torch,
use_safetensors=use_safetensors,
Expand Down Expand Up @@ -322,7 +325,9 @@ def from_pretrained(
init_args = config["init_args"] or ()
with ContextManagers(init_contexts):
model = cls(config, *init_args, **model_kwargs)
cls.refine_state_dict(model, state_dict)
if state_dict is not None:
cls.refine_state_dict(model, state_dict)

if use_keep_in_fp32_modules:
# low_cpu_mem_usage = True
keep_in_fp32_modules = model._keep_in_fp32_modules
Expand Down
10 changes: 5 additions & 5 deletions paddlemix/models/blip2/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,16 +100,16 @@ def load_real_time_tokens():
return tokens


def create_tokenizer(text_model_name_or_path):
def create_tokenizer(text_model_name_or_path, **kwags):
if "opt" in text_model_name_or_path:
tokenizer_class = AutoTokenizer.from_pretrained(text_model_name_or_path, use_fast=False)
tokenizer_class = AutoTokenizer.from_pretrained(text_model_name_or_path, use_fast=False, **kwags)
elif "t5" in text_model_name_or_path:
tokenizer_class = T5Tokenizer.from_pretrained(text_model_name_or_path, use_fast=False)
tokenizer_class = T5Tokenizer.from_pretrained(text_model_name_or_path, use_fast=False, **kwags)
elif "llama" in text_model_name_or_path:
tokenizer_class = LlamaTokenizer.from_pretrained(text_model_name_or_path)
tokenizer_class = LlamaTokenizer.from_pretrained(text_model_name_or_path, **kwags)
tokenizer_class.pad_token = tokenizer_class.eos_token
elif "bloom" in text_model_name_or_path:
tokenizer_class = BloomTokenizer.from_pretrained(text_model_name_or_path)
tokenizer_class = BloomTokenizer.from_pretrained(text_model_name_or_path, **kwags)
tokenizer_class.pad_token = tokenizer_class.eos_token
else:
raise NotImplementedError
Expand Down
14 changes: 14 additions & 0 deletions paddlemix/processors/image_processing_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,14 @@
)
from paddlemix.utils.log import logger

# Optional capability: `aistudio_download` only exists in recent (develop)
# paddlenlp builds. Fall back to None so call sites can detect the missing
# feature instead of crashing at import time.
try:
    from paddlenlp.transformers.aistudio_utils import aistudio_download
except ImportError:  # narrow: only swallow the missing-module case
    logger.warning(
        "aistudio_download could not be imported; to download from AI Studio, "
        "install the paddlenlp develop build."
    )
    aistudio_download = None


IMAGE_PROCESSOR_NAME = "image_preprocessor_config.json"
TEXT_PROCESSOR_NAME = "text_processor_config.json"

Expand Down Expand Up @@ -272,6 +280,7 @@ def get_image_processor_dict(
"""
cache_dir = kwargs.pop("cache_dir", None)
from_hf_hub = kwargs.pop("from_hf_hub", False)
from_aistudio = kwargs.get("from_aistudio", False)
subfolder = kwargs.pop("subfolder", None)
cache_dir = resolve_cache_dir(pretrained_model_name_or_path, from_hf_hub, cache_dir)

Expand All @@ -292,6 +301,11 @@ def get_image_processor_dict(
library_name="PaddleNLP",
library_version=__version__,
)
elif from_aistudio and aistudio_download is not None:
image_processor_file = IMAGE_PROCESSOR_NAME
resolved_image_processor_file = aistudio_download(
repo_id=pretrained_model_name_or_path, filename=image_processor_file
)
else:
# Assuming from community-contributed pretrained models
image_processor_file = "/".join(
Expand Down
73 changes: 73 additions & 0 deletions paddlemix/processors/processing_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
logger.warning("aistudio_download not import, if you want to use , require paddlenlp develop")
aistudio_download = None
pass
import aistudio_sdk

PROCESSOR_CONFIG_MAPPING = {
"image": "image_preprocessor_config.json",
Expand Down Expand Up @@ -228,6 +229,69 @@ def save_to_hf_hub(
create_pr=create_pr,
)

def save_to_aistudio(
    self,
    repo_id,
    private=True,
    license="Apache License 2.0",
    exist_ok=True,
    safe_serialization=True,
    subfolder=None,
    merge_tensor_parallel=False,
    **kwargs
):
    """Upload this processor's serialized files to an AI Studio Hub repo.

    Creates the repository (if needed), saves the processor into a
    temporary directory via ``save_pretrained``, then uploads every
    produced file.

    Args:
        repo_id (str): Repository name on the AI Studio Hub.
        private (bool, optional): Whether the repo is private. Defaults to True.
        license (str, optional): License of the uploaded artifacts.
            Defaults to "Apache License 2.0".
        exist_ok (bool, optional): If True, an already-existing repo is
            reused and same-named files are overwritten. Defaults to True.
        safe_serialization (bool, optional): Kept for signature parity with
            model uploads; not used by processor serialization here.
        subfolder (str, optional): Push into this subfolder of the repo
            instead of the root.
        merge_tensor_parallel (bool, optional): Kept for signature parity
            with model uploads; not used here.
        **kwargs: Extra arguments (e.g. ``token``) forwarded to
            ``aistudio_sdk.hub.create_repo`` and ``aistudio_sdk.hub.upload``.
    """
    res = aistudio_sdk.hub.create_repo(repo_id=repo_id, private=private, license=license, **kwargs)
    if "error_code" in res:
        # error_code 10003 means the repo already exists; tolerate it
        # only when the caller opted in via exist_ok.
        if res["error_code"] == 10003 and exist_ok:
            logger.info(
                f"Repo {repo_id} already exists, it will override files with the same name. To avoid this, please set exist_ok=False"
            )
        else:
            logger.error(
                f"Failed to create repo {repo_id}, error_code: {res['error_code']}, error_msg: {res['error_msg']}"
            )
    else:
        logger.info(f"Successfully created repo {repo_id}")

    with tempfile.TemporaryDirectory() as root_dir:
        save_dir = os.path.join(root_dir, subfolder) if subfolder is not None else root_dir

        # Serialize the processor locally, then push each generated file.
        self.save_pretrained(save_dir)

        logger.info(f"Pushing to the {repo_id}. This might take a while")
        for filename in os.listdir(save_dir):
            path_in_repo = os.path.join(subfolder, filename) if subfolder is not None else filename
            res = aistudio_sdk.hub.upload(
                repo_id=repo_id,
                path_or_fileobj=os.path.join(save_dir, filename),
                path_in_repo=path_in_repo,
                **kwargs,
            )
            if "error_code" in res:
                # Name the failing file so partial uploads are diagnosable.
                logger.error(
                    f"Failed to upload {path_in_repo}, error_code: {res['error_code']}, error_msg: {res['error_msg']}"
                )
            else:
                logger.info(f"Uploaded {path_in_repo}: {res['message']}")

@classmethod
def get_processor_dict(
cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs
Expand Down Expand Up @@ -273,6 +337,15 @@ def get_processor_dict(
)
elif from_aistudio and aistudio_download is not None:
processor_file = PROCESSOR_CONFIG_MAPPING[cls.input_type]
if subfolder is not None:
processor_file = os.path.join(subfolder, processor_file)

pretrained_model_name_or_path_list = pretrained_model_name_or_path.split("/")
if len(pretrained_model_name_or_path_list) > 2:
pretrained_model_name_or_path = os.path.join(
pretrained_model_name_or_path_list[0], pretrained_model_name_or_path_list[1]
)

resolved_processor_file = aistudio_download(repo_id=pretrained_model_name_or_path, filename=processor_file)
else:
# Assuming from community-contributed pretrained models
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,4 @@ pycocoevalcap
ftfy
regex
einops>=0.6.1

aistudio-sdk>=0.1.3
13 changes: 13 additions & 0 deletions tests/processors/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
91 changes: 91 additions & 0 deletions tests/processors/test_from_aistudio.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import sys
import unittest

sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), "../.."))

from paddlemix.models.groundingdino.modeling import GroundingDinoModel
from paddlemix.processors.groundingdino_processing import (
GroudingDinoImageProcessor,
GroudingDinoTextProcessor,
)
from tests.testing_utils import ai_studio_token, slow

repo_id = "aistudio/groundingdino-swint-ogc"
bos_model_name = "GroundingDino/groundingdino-swint-ogc"


class FromAiStudioTester:
    """Shared fixture holding the BOS model name, AI Studio token and repo id."""

    def __init__(self):
        self.bos_model_name = bos_model_name
        self.token = ai_studio_token
        self.repo_id = repo_id

    def prepare_from_bos(self):
        """Load and return (model, image_processor, text_processor) from BOS."""
        name = self.bos_model_name
        model = GroundingDinoModel.from_pretrained(name)
        image_proc = GroudingDinoImageProcessor.from_pretrained(name)
        text_proc = GroudingDinoTextProcessor.from_pretrained(name)
        return model, image_proc, text_proc


class AIStudioUpTester(unittest.TestCase):
    """Uploads the GroundingDino model and processors to the AI Studio Hub."""

    def setUp(self):
        self.tester = FromAiStudioTester()
        self.model, self.image_processor, self.text_processor = self.tester.prepare_from_bos()

    # Fixed typo in the test name: "aistusio" -> "aistudio".
    @slow
    def test_model_up_aistudio(self):
        """Push the model weights to the AI Studio repo."""
        self.model.save_to_aistudio(
            repo_id=self.tester.repo_id,
            token=self.tester.token,
            private=True,
            license="Apache License 2.0",
            exist_ok=True,
            safe_serialization=True,
        )

    # Marked @slow for consistency with the model upload test — this also
    # needs a network connection and a valid AI Studio token.
    @slow
    def test_processor_up_aistudio(self):
        """Push both processor config files to the AI Studio repo."""
        self.image_processor.save_to_aistudio(
            repo_id=self.tester.repo_id,
            token=self.tester.token,
            private=True,
            license="Apache License 2.0",
            exist_ok=True,
        )
        self.text_processor.save_to_aistudio(
            repo_id=self.tester.repo_id,
            token=self.tester.token,
            private=True,
            license="Apache License 2.0",
            exist_ok=True,
        )


class AIStudioLoadTester(unittest.TestCase):
    """Loads the GroundingDino model and processors back from the AI Studio Hub."""

    def setUp(self):
        self.tester = FromAiStudioTester()

    # Fixed typo in the test name: "aistusio" -> "aistudio".
    @slow
    def test_model_load_aistudio(self):
        GroundingDinoModel.from_pretrained(self.tester.repo_id, from_aistudio=True)

    # @slow added for consistency: loading from AI Studio hits the network.
    @slow
    def test_processor_load_aistudio(self):
        GroudingDinoTextProcessor.from_pretrained(self.tester.repo_id, from_aistudio=True)

    # BUG FIX: the original method was named "image_processor_load_aistusio"
    # (no "test_" prefix), so unittest discovery never executed it.
    @slow
    def test_image_processor_load_aistudio(self):
        GroudingDinoImageProcessor.from_pretrained(self.tester.repo_id, from_aistudio=True)
1 change: 1 addition & 0 deletions tests/testing_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ def get_bool_from_env(key, default_value=False):


_run_slow_test = get_bool_from_env("RUN_SLOW_TEST")
ai_studio_token = os.getenv("AI_STUDIO_TOKEN")


def slow(test):
Expand Down