Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

processors support aistudio #330

Open
wants to merge 2 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion paddlemix/models/blip2/base_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,7 @@ def init_tokenizer(cls, tokenizer_name="bert-base-uncased"):
@classmethod
def refine_state_dict(cls, model, state_dict):
    """Adjust a loaded state dict in place so it fits ``model``.

    Interpolates the ViT position-embedding weights held in ``state_dict``
    to match the resolution ``model`` was configured with (via
    ``interpolate_pos_embed``).

    Args:
        model: BLIP-2 model instance the weights will be loaded into.
        state_dict (dict): Parameter-name -> tensor mapping; mutated in place.
    """
    # Local import — presumably avoids a circular import between the
    # base model and the EVA-ViT module; confirm before hoisting.
    from paddlemix.models.blip2.eva_vit import interpolate_pos_embed

    interpolate_pos_embed(model, state_dict)

def get_expected_keys(self, model_state_dict, name=None):
Expand Down Expand Up @@ -203,6 +204,7 @@ def from_pretrained(
subfolder = kwargs.pop("subfolder", "")
variant = kwargs.pop("variant", None)
use_safetensors = kwargs.pop("use_safetensors", None if is_safetensors_available() else False)
from_aistudio = kwargs.get("from_aistudio", False)

low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", False)
convert_from_torch = kwargs.pop("convert_from_torch", None)
Expand Down Expand Up @@ -269,6 +271,7 @@ def from_pretrained(
cache_dir=cache_dir,
subfolder=subfolder,
from_hf_hub=from_hf_hub,
from_aistudio=from_aistudio,
config=config,
convert_from_torch=convert_from_torch,
use_safetensors=use_safetensors,
Expand Down Expand Up @@ -322,7 +325,9 @@ def from_pretrained(
init_args = config["init_args"] or ()
with ContextManagers(init_contexts):
model = cls(config, *init_args, **model_kwargs)
cls.refine_state_dict(model, state_dict)
if state_dict is not None:
cls.refine_state_dict(model, state_dict)

if use_keep_in_fp32_modules:
# low_cpu_mem_usage = True
keep_in_fp32_modules = model._keep_in_fp32_modules
Expand Down
10 changes: 5 additions & 5 deletions paddlemix/models/blip2/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,16 +100,16 @@ def load_real_time_tokens():
return tokens


def create_tokenizer(text_model_name_or_path):
def create_tokenizer(text_model_name_or_path, **kwags):
if "opt" in text_model_name_or_path:
tokenizer_class = AutoTokenizer.from_pretrained(text_model_name_or_path, use_fast=False)
tokenizer_class = AutoTokenizer.from_pretrained(text_model_name_or_path, use_fast=False, **kwags)
elif "t5" in text_model_name_or_path:
tokenizer_class = T5Tokenizer.from_pretrained(text_model_name_or_path, use_fast=False)
tokenizer_class = T5Tokenizer.from_pretrained(text_model_name_or_path, use_fast=False, **kwags)
elif "llama" in text_model_name_or_path:
tokenizer_class = LlamaTokenizer.from_pretrained(text_model_name_or_path)
tokenizer_class = LlamaTokenizer.from_pretrained(text_model_name_or_path, **kwags)
tokenizer_class.pad_token = tokenizer_class.eos_token
elif "bloom" in text_model_name_or_path:
tokenizer_class = BloomTokenizer.from_pretrained(text_model_name_or_path)
tokenizer_class = BloomTokenizer.from_pretrained(text_model_name_or_path, **kwags)
tokenizer_class.pad_token = tokenizer_class.eos_token
else:
raise NotImplementedError
Expand Down
14 changes: 14 additions & 0 deletions paddlemix/processors/image_processing_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,14 @@
)
from paddlemix.utils.log import logger

# Optional capability: `aistudio_download` only exists in recent (develop)
# paddlenlp builds. Fall back to None so call sites can detect the missing
# feature instead of crashing at import time.
try:
    from paddlenlp.transformers.aistudio_utils import aistudio_download
except ImportError:  # narrow: only swallow the missing-module case
    logger.warning(
        "aistudio_download could not be imported; to download from AI Studio, "
        "install the paddlenlp develop build."
    )
    aistudio_download = None


IMAGE_PROCESSOR_NAME = "image_preprocessor_config.json"
TEXT_PROCESSOR_NAME = "text_processor_config.json"

Expand Down Expand Up @@ -272,6 +280,7 @@ def get_image_processor_dict(
"""
cache_dir = kwargs.pop("cache_dir", None)
from_hf_hub = kwargs.pop("from_hf_hub", False)
from_aistudio = kwargs.get("from_aistudio", False)
subfolder = kwargs.pop("subfolder", None)
cache_dir = resolve_cache_dir(pretrained_model_name_or_path, from_hf_hub, cache_dir)

Expand All @@ -292,6 +301,11 @@ def get_image_processor_dict(
library_name="PaddleNLP",
library_version=__version__,
)
elif from_aistudio and aistudio_download is not None:
image_processor_file = IMAGE_PROCESSOR_NAME
resolved_image_processor_file = aistudio_download(
repo_id=pretrained_model_name_or_path, filename=image_processor_file
)
else:
# Assuming from community-contributed pretrained models
image_processor_file = "/".join(
Expand Down
73 changes: 73 additions & 0 deletions paddlemix/processors/processing_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
logger.warning("aistudio_download not import, if you want to use , require paddlenlp develop")
aistudio_download = None
pass
import aistudio_sdk

PROCESSOR_CONFIG_MAPPING = {
"image": "image_preprocessor_config.json",
Expand Down Expand Up @@ -228,6 +229,69 @@ def save_to_hf_hub(
create_pr=create_pr,
)

def save_to_aistudio(
    self,
    repo_id,
    private=True,
    license="Apache License 2.0",
    exist_ok=True,
    safe_serialization=True,
    subfolder=None,
    merge_tensor_parallel=False,
    **kwargs
):
    """Upload this processor's serialized files to an AI Studio Hub repo.

    Creates the repository (if needed), saves the processor into a
    temporary directory via ``save_pretrained``, then uploads every
    produced file.

    Args:
        repo_id (str): Repository name on the AI Studio Hub.
        private (bool, optional): Whether the repo is private. Defaults to True.
        license (str, optional): License of the uploaded artifacts.
            Defaults to "Apache License 2.0".
        exist_ok (bool, optional): If True, an already-existing repo is
            reused and same-named files are overwritten. Defaults to True.
        safe_serialization (bool, optional): Kept for signature parity with
            model uploads; not used by processor serialization here.
        subfolder (str, optional): Push into this subfolder of the repo
            instead of the root.
        merge_tensor_parallel (bool, optional): Kept for signature parity
            with model uploads; not used here.
        **kwargs: Extra arguments (e.g. ``token``) forwarded to
            ``aistudio_sdk.hub.create_repo`` and ``aistudio_sdk.hub.upload``.
    """
    res = aistudio_sdk.hub.create_repo(repo_id=repo_id, private=private, license=license, **kwargs)
    if "error_code" in res:
        # error_code 10003 means the repo already exists; tolerate it
        # only when the caller opted in via exist_ok.
        if res["error_code"] == 10003 and exist_ok:
            logger.info(
                f"Repo {repo_id} already exists, it will override files with the same name. To avoid this, please set exist_ok=False"
            )
        else:
            logger.error(
                f"Failed to create repo {repo_id}, error_code: {res['error_code']}, error_msg: {res['error_msg']}"
            )
    else:
        logger.info(f"Successfully created repo {repo_id}")

    with tempfile.TemporaryDirectory() as root_dir:
        save_dir = os.path.join(root_dir, subfolder) if subfolder is not None else root_dir

        # Serialize the processor locally, then push each generated file.
        self.save_pretrained(save_dir)

        logger.info(f"Pushing to the {repo_id}. This might take a while")
        for filename in os.listdir(save_dir):
            path_in_repo = os.path.join(subfolder, filename) if subfolder is not None else filename
            res = aistudio_sdk.hub.upload(
                repo_id=repo_id,
                path_or_fileobj=os.path.join(save_dir, filename),
                path_in_repo=path_in_repo,
                **kwargs,
            )
            if "error_code" in res:
                # Name the failing file so partial uploads are diagnosable.
                logger.error(
                    f"Failed to upload {path_in_repo}, error_code: {res['error_code']}, error_msg: {res['error_msg']}"
                )
            else:
                logger.info(f"Uploaded {path_in_repo}: {res['message']}")

@classmethod
def get_processor_dict(
cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs
Expand Down Expand Up @@ -273,6 +337,15 @@ def get_processor_dict(
)
elif from_aistudio and aistudio_download is not None:
processor_file = PROCESSOR_CONFIG_MAPPING[cls.input_type]
if subfolder is not None:
processor_file = os.path.join(subfolder, processor_file)

pretrained_model_name_or_path_list = pretrained_model_name_or_path.split("/")
if len(pretrained_model_name_or_path_list) > 2:
pretrained_model_name_or_path = os.path.join(
pretrained_model_name_or_path_list[0], pretrained_model_name_or_path_list[1]
)

resolved_processor_file = aistudio_download(repo_id=pretrained_model_name_or_path, filename=processor_file)
else:
# Assuming from community-contributed pretrained models
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,4 @@ pycocoevalcap
ftfy
regex
einops>=0.6.1

aistudio-sdk>=0.1.3
13 changes: 13 additions & 0 deletions tests/processors/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
91 changes: 91 additions & 0 deletions tests/processors/test_from_aistudio.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import sys
import unittest

sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), "../.."))

from paddlemix.models.groundingdino.modeling import GroundingDinoModel
from paddlemix.processors.groundingdino_processing import (
GroudingDinoImageProcessor,
GroudingDinoTextProcessor,
)
from tests.testing_utils import ai_studio_token, slow

repo_id = "aistudio/groundingdino-swint-ogc"
bos_model_name = "GroundingDino/groundingdino-swint-ogc"


class FromAiStudioTester:
    """Shared fixture holding the BOS model name, AI Studio token and repo id."""

    def __init__(self):
        self.bos_model_name = bos_model_name
        self.token = ai_studio_token
        self.repo_id = repo_id

    def prepare_from_bos(self):
        """Load and return (model, image_processor, text_processor) from BOS."""
        name = self.bos_model_name
        model = GroundingDinoModel.from_pretrained(name)
        image_proc = GroudingDinoImageProcessor.from_pretrained(name)
        text_proc = GroudingDinoTextProcessor.from_pretrained(name)
        return model, image_proc, text_proc


class AIStudioUpTester(unittest.TestCase):
    """Uploads the GroundingDino model and processors to the AI Studio Hub."""

    def setUp(self):
        self.tester = FromAiStudioTester()
        self.model, self.image_processor, self.text_processor = self.tester.prepare_from_bos()

    # Fixed typo in the test name: "aistusio" -> "aistudio".
    @slow
    def test_model_up_aistudio(self):
        """Push the model weights to the AI Studio repo."""
        self.model.save_to_aistudio(
            repo_id=self.tester.repo_id,
            token=self.tester.token,
            private=True,
            license="Apache License 2.0",
            exist_ok=True,
            safe_serialization=True,
        )

    # Marked @slow for consistency with the model upload test — this also
    # needs a network connection and a valid AI Studio token.
    @slow
    def test_processor_up_aistudio(self):
        """Push both processor config files to the AI Studio repo."""
        self.image_processor.save_to_aistudio(
            repo_id=self.tester.repo_id,
            token=self.tester.token,
            private=True,
            license="Apache License 2.0",
            exist_ok=True,
        )
        self.text_processor.save_to_aistudio(
            repo_id=self.tester.repo_id,
            token=self.tester.token,
            private=True,
            license="Apache License 2.0",
            exist_ok=True,
        )


class AIStudioLoadTester(unittest.TestCase):
    """Loads the GroundingDino model and processors back from the AI Studio Hub."""

    def setUp(self):
        self.tester = FromAiStudioTester()

    # Fixed typo in the test name: "aistusio" -> "aistudio".
    @slow
    def test_model_load_aistudio(self):
        GroundingDinoModel.from_pretrained(self.tester.repo_id, from_aistudio=True)

    # @slow added for consistency: loading from AI Studio hits the network.
    @slow
    def test_processor_load_aistudio(self):
        GroudingDinoTextProcessor.from_pretrained(self.tester.repo_id, from_aistudio=True)

    # BUG FIX: the original method was named "image_processor_load_aistusio"
    # (no "test_" prefix), so unittest discovery never executed it.
    @slow
    def test_image_processor_load_aistudio(self):
        GroudingDinoImageProcessor.from_pretrained(self.tester.repo_id, from_aistudio=True)
1 change: 1 addition & 0 deletions tests/testing_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ def get_bool_from_env(key, default_value=False):


_run_slow_test = get_bool_from_env("RUN_SLOW_TEST")
ai_studio_token = os.getenv("AI_STUDIO_TOKEN")


def slow(test):
Expand Down