From 5bbb23bbc1e1a6377f97d2ac768d5e8caa466f30 Mon Sep 17 00:00:00 2001 From: MarleneKress79789 Date: Wed, 26 Jun 2024 10:02:15 +0200 Subject: [PATCH 01/31] started adding task_type --- doc/user_guide/user_guide.md | 1 + .../templates/model_downloader_udf.jinja.sql | 1 + .../udfs/models/base_model_udf.py | 3 +- .../udfs/models/model_downloader_udf.py | 12 ++-- exasol_transformers_extension/upload_model.py | 4 +- .../utils/current_model_specification.py | 12 ++-- ...gingface_hub_bucketfs_model_transfer_sp.py | 1 + .../utils/load_local_model.py | 4 +- .../utils/model_specification.py | 27 ++++++-- poetry.lock | 65 ++++++++++--------- tests/fixtures/database_connection_fixture.py | 2 +- tests/fixtures/model_fixture.py | 15 +++-- .../with_db/test_upload_model.py | 1 + .../udfs/test_model_downloader_udf_script.py | 4 +- .../test_prediction_with_downloader_udf.py | 5 +- .../udfs/test_model_downloader_udf.py | 7 +- .../without_db/utils/test_load_local_model.py | 7 +- tests/unit_tests/udfs/test_base_udf.py | 30 +++++---- .../udfs/test_model_downloader_udf.py | 13 ++-- .../unit_tests/utils/test_load_local_model.py | 9 ++- tests/utils/parameters.py | 12 ++-- 21 files changed, 142 insertions(+), 93 deletions(-) diff --git a/doc/user_guide/user_guide.md b/doc/user_guide/user_guide.md index dbcaa5b3..a2e9387b 100644 --- a/doc/user_guide/user_guide.md +++ b/doc/user_guide/user_guide.md @@ -355,6 +355,7 @@ Once you have internet access, invoke the UDF like this: ```sql SELECT TE_MODEL_DOWNLOADER_UDF( model_name, + task_name, #todo description sub_dir, bucketfs_conn, token_conn diff --git a/exasol_transformers_extension/resources/templates/model_downloader_udf.jinja.sql b/exasol_transformers_extension/resources/templates/model_downloader_udf.jinja.sql index 31e7470b..60e5af05 100644 --- a/exasol_transformers_extension/resources/templates/model_downloader_udf.jinja.sql +++ b/exasol_transformers_extension/resources/templates/model_downloader_udf.jinja.sql @@ -1,5 +1,6 @@ CREATE OR REPLACE {{ language_alias }} SET SCRIPT "TE_MODEL_DOWNLOADER_UDF"( model_name VARCHAR(2000000), + task_type VARCHAR(2000000), sub_dir VARCHAR(2000000), bfs_conn VARCHAR(2000000), token_conn VARCHAR(2000000) diff --git a/exasol_transformers_extension/udfs/models/base_model_udf.py b/exasol_transformers_extension/udfs/models/base_model_udf.py index 21b96d60..f70d675b 100644 --- a/exasol_transformers_extension/udfs/models/base_model_udf.py +++ b/exasol_transformers_extension/udfs/models/base_model_udf.py @@ -183,9 +183,10 @@ def check_cache(self, model_df: pd.DataFrame) -> None: bucketfs_connection, and sub_dir """ model_name = model_df["model_name"].iloc[0] + task_type = model_df["task_type"].iloc[0]#todo do we need as input? 
or do self.task_name bucketfs_conn = model_df["bucketfs_conn"].iloc[0] sub_dir = model_df["sub_dir"].iloc[0] - current_model_specification = CurrentModelSpecification(model_name, bucketfs_conn, sub_dir) + current_model_specification = CurrentModelSpecification(model_name, task_type, bucketfs_conn, sub_dir) if self.model_loader.current_model_specification != current_model_specification: bucketfs_location = \ diff --git a/exasol_transformers_extension/udfs/models/model_downloader_udf.py b/exasol_transformers_extension/udfs/models/model_downloader_udf.py index 8d313289..e504f5ef 100644 --- a/exasol_transformers_extension/udfs/models/model_downloader_udf.py +++ b/exasol_transformers_extension/udfs/models/model_downloader_udf.py @@ -22,15 +22,13 @@ class ModelDownloaderUDF: returns , """ - def __init__(self, + def __init__(self, #todo change calls and docu! exa, - base_model_factory: ModelFactoryProtocol = transformers.AutoModel, tokenizer_factory: ModelFactoryProtocol = transformers.AutoTokenizer, huggingface_hub_bucketfs_model_transfer: HuggingFaceHubBucketFSModelTransferSPFactory = HuggingFaceHubBucketFSModelTransferSPFactory(), current_model_specification_factory: CurrentModelSpecificationFactory = CurrentModelSpecificationFactory()): self._exa = exa - self._base_model_factory = base_model_factory self._tokenizer_factory = tokenizer_factory self._huggingface_hub_bucketfs_model_transfer = huggingface_hub_bucketfs_model_transfer self._current_model_specification_factory = current_model_specification_factory @@ -47,9 +45,11 @@ def _download_model(self, ctx) -> Tuple[str, str]: bfs_conn = ctx.bfs_conn # BucketFS connection token_conn = ctx.token_conn # name of token connection current_model_specification = self._current_model_specification_factory.create(ctx.model_name, - bfs_conn, - ctx.sub_dir) # specifies details of Huggingface model + ctx.task_type, + bfs_conn, + ctx.sub_dir) # specifies details of Huggingface model + model_factory = current_model_specification.get_model_factory() # extract token from the connection if token connection name is given. # note that, token is required for private models. It doesn't matter # whether there is a token for public model or even what the token is. 
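(For context: with this change the downloader no longer receives a fixed `base_model_factory`; the model class is instead derived from the task type via `current_model_specification.get_model_factory()`. A minimal sketch of that mapping as it is introduced later in this patch in `model_specification.py` — the standalone helper name below is hypothetical and used only for illustration:)

```python
import transformers

# Illustrative stand-in for ModelSpecification.get_model_factory() as added
# in this patch; the real method lives on the model specification object.
def model_factory_for_task(task_type: str):
    if task_type == "filling_mask":
        return transformers.AutoModelForMaskedLM   # masked-LM head for fill-mask models
    if task_type == "translation":
        return transformers.T5Model                # still flagged "#todo correct?" in the patch
    return transformers.AutoModel                  # fallback for all other task types
```

(The factory returned this way is what the next hunk passes to `downloader.download_from_huggingface_hub(...)` alongside the tokenizer factory.)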
@@ -72,7 +72,7 @@ def _download_model(self, ctx) -> Tuple[str, str]: model_path=model_path, token=token ) as downloader: - for model in [self._base_model_factory, self._tokenizer_factory]: + for model in [model_factory, self._tokenizer_factory]: downloader.download_from_huggingface_hub(model) # upload model files to BucketFS model_tar_file_path = downloader.upload_to_bucketfs() diff --git a/exasol_transformers_extension/upload_model.py b/exasol_transformers_extension/upload_model.py index 2c2207b4..01663246 100644 --- a/exasol_transformers_extension/upload_model.py +++ b/exasol_transformers_extension/upload_model.py @@ -13,6 +13,7 @@ @click.command() @click.option('--model-name', type=str, required=True, help="name of the model") +@click.option('--task_type', type=str, required=True) #todo change docu (needed to know where to safe model) @click.option('--sub-dir', type=str, required=True, help="directory where the model is stored in the BucketFS") @click.option('--local-model-path', type=click.Path(exists=True, file_okay=True), @@ -42,6 +43,7 @@ @click.option('--use-ssl-cert-validation/--no-use-ssl-cert-validation', type=bool, default=True) def main( model_name: str, + task_type: str, sub_dir: str, local_model_path: str, bucketfs_name: str, @@ -80,7 +82,7 @@ def main( use_ssl_cert_validation=use_ssl_cert_validation) # create CurrentModelSpecification for model to be loaded - current_model_specs = CurrentModelSpecification(model_name, "", Path(sub_dir)) + current_model_specs = CurrentModelSpecification(model_name, task_type, "", Path(sub_dir)) # upload the downloaded model files into bucketfs upload_path = current_model_specs.get_bucketfs_model_save_path() bucketfs_operations.upload_model_files_to_bucketfs( diff --git a/exasol_transformers_extension/utils/current_model_specification.py b/exasol_transformers_extension/utils/current_model_specification.py index fb1c14bd..d4aeb9af 100644 --- a/exasol_transformers_extension/utils/current_model_specification.py +++ b/exasol_transformers_extension/utils/current_model_specification.py @@ -8,9 +8,10 @@ class CurrentModelSpecification(ModelSpecification): """ def __init__(self, model_name: str, + task_type: str, bucketfs_conn_name: str, sub_dir: Path): - ModelSpecification.__init__(self, model_name) + ModelSpecification.__init__(self, model_name, task_type) self.bucketfs_conn_name = bucketfs_conn_name self.sub_dir = sub_dir @@ -26,17 +27,17 @@ def get_bucketfs_model_save_path(self) -> Path: """ path model is saved at in the bucketfs """ - model_name = self.get_model_specific_path_suffix() - return Path(self.sub_dir, model_name) - + model_path_suffix = self.get_model_specific_path_suffix() + return Path(self.sub_dir, model_path_suffix) class CurrentModelSpecificationFactory: def create(self, model_name: str, + task_type: str, bucketfs_conn_name: str, sub_dir: Path): - return CurrentModelSpecification(model_name, bucketfs_conn_name, sub_dir) + return CurrentModelSpecification(model_name, task_type, bucketfs_conn_name, sub_dir) class CurrentModelSpecificationFromModelSpecs: @@ -45,5 +46,6 @@ def transform(self, bucketfs_conn_name: str, sub_dir: Path): return CurrentModelSpecification(model_name=model_specification.model_name, + task_type=model_specification.task_type, bucketfs_conn_name=bucketfs_conn_name, sub_dir=sub_dir) diff --git a/exasol_transformers_extension/utils/huggingface_hub_bucketfs_model_transfer_sp.py b/exasol_transformers_extension/utils/huggingface_hub_bucketfs_model_transfer_sp.py index 0ae9a9a0..a5369175 100644 --- 
a/exasol_transformers_extension/utils/huggingface_hub_bucketfs_model_transfer_sp.py +++ b/exasol_transformers_extension/utils/huggingface_hub_bucketfs_model_transfer_sp.py @@ -58,6 +58,7 @@ def download_from_huggingface_hub(self, model_factory: ModelFactoryProtocol): use_auth_token=self._token) model.save_pretrained(self._save_pretrained_model_path) + def upload_to_bucketfs(self) -> Path: """ Upload the downloaded models into the BucketFS. diff --git a/exasol_transformers_extension/utils/load_local_model.py b/exasol_transformers_extension/utils/load_local_model.py index 211ed44b..fec0f0b5 100644 --- a/exasol_transformers_extension/utils/load_local_model.py +++ b/exasol_transformers_extension/utils/load_local_model.py @@ -26,7 +26,7 @@ def __init__(self, tokenizer_factory: ModelFactoryProtocol ): self.pipeline_factory = pipeline_factory - self.task_name = task_name + self.task_name = task_name #todo replace this? self.device = device self._base_model_factory = base_model_factory self._tokenizer_factory = tokenizer_factory @@ -53,7 +53,7 @@ def load_models(self) -> transformers.pipelines.Pipeline: :current_model_key: key of the model to be loaded """ - loaded_model = self._base_model_factory.from_pretrained(str(self._bucketfs_model_cache_dir)) + loaded_model = self._base_model_factory.from_pretrained(str(self._bucketfs_model_cache_dir)) #todo does this need change? loaded_tokenizer = self._tokenizer_factory.from_pretrained(str(self._bucketfs_model_cache_dir)) last_created_pipeline = self.pipeline_factory( diff --git a/exasol_transformers_extension/utils/model_specification.py b/exasol_transformers_extension/utils/model_specification.py index d8db8b11..c7a8d7b0 100644 --- a/exasol_transformers_extension/utils/model_specification.py +++ b/exasol_transformers_extension/utils/model_specification.py @@ -1,25 +1,42 @@ from pathlib import PurePosixPath, Path +import transformers + class ModelSpecification: """ Class describing a model. """ - def __init__(self, model_name: str): + def __init__(self, model_name: str, task_type: str): # task_type, model_version self.model_name = model_name + self.task_type = task_type - def get_model_specs_for_download(self): + def get_model_specs_for_download(self):#todo change usages? """ returns all attributes necessary for downloading the model from Huggingface. """ - return self.model_name + return self.model_name, self.task_type def __eq__(self, other): """Overrides the default implementation""" if isinstance(other, ModelSpecification): - return self.model_name == other.model_name + return (self.model_name == other.model_name + and self.task_type == other.task_type) return False def get_model_specific_path_suffix(self) -> PurePosixPath: - return PurePosixPath(self.model_name) #model_name-version-task + return PurePosixPath(self.model_name + "_" + self.task_type) #model_name-version-task + + def get_model_factory(self): + """ + sets model factory depending on the task_type of the specific model + """ + model_task_type = self.task_type + if model_task_type == "filling_mask": + model_factory = transformers.AutoModelForMaskedLM #todo make switchcase? + elif model_task_type == "translation": + model_factory = transformers.T5Model #todo correct? 
+ else: + model_factory = transformers.AutoModel + return model_factory diff --git a/poetry.lock b/poetry.lock index 9d0ed0a8..c0f60134 100644 --- a/poetry.lock +++ b/poetry.lock @@ -642,18 +642,18 @@ pytest = ["pytest (>=7)"] [[package]] name = "filelock" -version = "3.15.1" +version = "3.15.4" description = "A platform independent file lock." optional = false python-versions = ">=3.8" files = [ - {file = "filelock-3.15.1-py3-none-any.whl", hash = "sha256:71b3102950e91dfc1bb4209b64be4dc8854f40e5f534428d8684f953ac847fac"}, - {file = "filelock-3.15.1.tar.gz", hash = "sha256:58a2549afdf9e02e10720eaa4d4470f56386d7a6f72edd7d0596337af8ed7ad8"}, + {file = "filelock-3.15.4-py3-none-any.whl", hash = "sha256:6ca1fffae96225dab4c6eaf1c4f4f28cd2568d3ec2a44e15a08520504de468e7"}, + {file = "filelock-3.15.4.tar.gz", hash = "sha256:2207938cbc1844345cb01a5a95524dae30f0ce089eba5b00378295a17e3e90cb"}, ] [package.extras] docs = ["furo (>=2023.9.10)", "sphinx (>=7.2.6)", "sphinx-autodoc-typehints (>=1.25.2)"] -testing = ["covdefaults (>=2.3)", "coverage (>=7.3.2)", "diff-cover (>=8.0.1)", "pytest (>=7.4.3)", "pytest-asyncio (>=0.21)", "pytest-cov (>=4.1)", "pytest-mock (>=3.12)", "pytest-timeout (>=2.2)"] +testing = ["covdefaults (>=2.3)", "coverage (>=7.3.2)", "diff-cover (>=8.0.1)", "pytest (>=7.4.3)", "pytest-asyncio (>=0.21)", "pytest-cov (>=4.1)", "pytest-mock (>=3.12)", "pytest-timeout (>=2.2)", "virtualenv (>=20.26.2)"] typing = ["typing-extensions (>=4.8)"] [[package]] @@ -855,22 +855,22 @@ files = [ [[package]] name = "importlib-metadata" -version = "7.1.0" +version = "7.2.1" description = "Read metadata from Python packages" optional = false python-versions = ">=3.8" files = [ - {file = "importlib_metadata-7.1.0-py3-none-any.whl", hash = "sha256:30962b96c0c223483ed6cc7280e7f0199feb01a0e40cfae4d4450fc6fab1f570"}, - {file = "importlib_metadata-7.1.0.tar.gz", hash = "sha256:b78938b926ee8d5f020fc4772d487045805a55ddbad2ecf21c6d60938dc7fcd2"}, + {file = "importlib_metadata-7.2.1-py3-none-any.whl", hash = "sha256:ffef94b0b66046dd8ea2d619b701fe978d9264d38f3998bc4c27ec3b146a87c8"}, + {file = "importlib_metadata-7.2.1.tar.gz", hash = "sha256:509ecb2ab77071db5137c655e24ceb3eee66e7bbc6574165d0d114d9fc4bbe68"}, ] [package.dependencies] zipp = ">=0.5" [package.extras] -docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] +doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] perf = ["ipython"] -testing = ["flufl.flake8", "importlib-resources (>=1.3)", "jaraco.test (>=5.4)", "packaging", "pyfakefs", "pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-mypy", "pytest-perf (>=0.9.2)", "pytest-ruff (>=0.2.1)"] +test = ["flufl.flake8", "importlib-resources (>=1.3)", "jaraco.test (>=5.4)", "packaging", "pyfakefs", "pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-mypy", "pytest-perf (>=0.9.2)", "pytest-ruff (>=0.2.1)"] [[package]] name = "importlib-resources" @@ -953,13 +953,13 @@ files = [ [[package]] name = "jsonpickle" -version = "3.2.1" +version = "3.2.2" description = "Python library for serializing arbitrary object graphs into JSON" optional = false python-versions = ">=3.7" files = [ - {file = "jsonpickle-3.2.1-py3-none-any.whl", hash = "sha256:ec291e4719674dd35d390fbdb521ac6517fbe9f541d361c8bffc8131133b1661"}, - {file = "jsonpickle-3.2.1.tar.gz", hash = 
"sha256:4b6d7640974199f7acf9035295365b5a1a71a91109effa15ba170fbb48cf871c"}, + {file = "jsonpickle-3.2.2-py3-none-any.whl", hash = "sha256:87cd82d237fd72c5a34970e7222dddc0accc13fddf49af84111887ed9a9445aa"}, + {file = "jsonpickle-3.2.2.tar.gz", hash = "sha256:d425fd2b8afe9f5d7d57205153403fbf897782204437882a477e8eed60930f8c"}, ] [package.extras] @@ -1454,13 +1454,13 @@ testing = ["pytest", "pytest-benchmark"] [[package]] name = "portalocker" -version = "2.8.2" +version = "2.10.0" description = "Wraps the portalocker recipe for easy usage" optional = false python-versions = ">=3.8" files = [ - {file = "portalocker-2.8.2-py3-none-any.whl", hash = "sha256:cfb86acc09b9aa7c3b43594e19be1345b9d16af3feb08bf92f23d4dce513a28e"}, - {file = "portalocker-2.8.2.tar.gz", hash = "sha256:2b035aa7828e46c58e9b31390ee1f169b98e1066ab10b9a6a861fe7e25ee4f33"}, + {file = "portalocker-2.10.0-py3-none-any.whl", hash = "sha256:48944147b2cd42520549bc1bb8fe44e220296e56f7c3d551bc6ecce69d9b0de1"}, + {file = "portalocker-2.10.0.tar.gz", hash = "sha256:49de8bc0a2f68ca98bf9e219c81a3e6b27097c7bf505a87c5a112ce1aaeb9b81"}, ] [package.dependencies] @@ -1749,6 +1749,7 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, + {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"}, {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, @@ -2047,13 +2048,13 @@ torch = ["safetensors[numpy]", "torch (>=1.10)"] [[package]] name = "setuptools" -version = "70.1.0" +version = "70.1.1" description = "Easily download, build, install, upgrade, and uninstall Python packages" optional = false python-versions = ">=3.8" files = [ - {file = "setuptools-70.1.0-py3-none-any.whl", hash = "sha256:d9b8b771455a97c8a9f3ab3448ebe0b29b5e105f1228bba41028be116985a267"}, - {file = "setuptools-70.1.0.tar.gz", hash = "sha256:01a1e793faa5bd89abc851fa15d0a0db26f160890c7102cd8dce643e886b47f5"}, + {file = "setuptools-70.1.1-py3-none-any.whl", hash = "sha256:a58a8fde0541dab0419750bcc521fbdf8585f6e5cb41909df3a472ef7b81ca95"}, + {file = "setuptools-70.1.1.tar.gz", hash = "sha256:937a48c7cdb7a21eb53cd7f9b59e525503aa8abaf3584c730dc5f7a5bec3a650"}, ] [package.extras] @@ -2227,26 +2228,26 @@ mpmath = ">=1.1.0,<1.4.0" [[package]] name = "tbb" -version = "2021.12.0" +version = "2021.13.0" description = "Intel® oneAPI Threading Building Blocks (oneTBB)" optional = false python-versions = "*" files = [ - {file = "tbb-2021.12.0-py2.py3-none-manylinux1_i686.whl", hash = "sha256:f2cc9a7f8ababaa506cbff796ce97c3bf91062ba521e15054394f773375d81d8"}, - {file = "tbb-2021.12.0-py2.py3-none-manylinux1_x86_64.whl", hash = "sha256:a925e9a7c77d3a46ae31c34b0bb7f801c4118e857d137b68f68a8e458fcf2bd7"}, - {file = 
"tbb-2021.12.0-py3-none-win32.whl", hash = "sha256:b1725b30c174048edc8be70bd43bb95473f396ce895d91151a474d0fa9f450a8"}, - {file = "tbb-2021.12.0-py3-none-win_amd64.whl", hash = "sha256:fc2772d850229f2f3df85f1109c4844c495a2db7433d38200959ee9265b34789"}, + {file = "tbb-2021.13.0-py2.py3-none-manylinux1_i686.whl", hash = "sha256:a2567725329639519d46d92a2634cf61e76601dac2f777a05686fea546c4fe4f"}, + {file = "tbb-2021.13.0-py2.py3-none-manylinux1_x86_64.whl", hash = "sha256:aaf667e92849adb012b8874d6393282afc318aca4407fc62f912ee30a22da46a"}, + {file = "tbb-2021.13.0-py3-none-win32.whl", hash = "sha256:6669d26703e9943f6164c6407bd4a237a45007e79b8d3832fe6999576eaaa9ef"}, + {file = "tbb-2021.13.0-py3-none-win_amd64.whl", hash = "sha256:3528a53e4bbe64b07a6112b4c5a00ff3c61924ee46c9c68e004a1ac7ad1f09c3"}, ] [[package]] name = "tenacity" -version = "8.4.1" +version = "8.4.2" description = "Retry code until it succeeds" optional = false python-versions = ">=3.8" files = [ - {file = "tenacity-8.4.1-py3-none-any.whl", hash = "sha256:28522e692eda3e1b8f5e99c51464efcc0b9fc86933da92415168bc1c4e2308fa"}, - {file = "tenacity-8.4.1.tar.gz", hash = "sha256:54b1412b878ddf7e1f1577cd49527bad8cdef32421bd599beac0c6c3f10582fd"}, + {file = "tenacity-8.4.2-py3-none-any.whl", hash = "sha256:9e6f7cf7da729125c7437222f8a522279751cdfbe6b67bfe64f75d3a348661b2"}, + {file = "tenacity-8.4.2.tar.gz", hash = "sha256:cd80a53a79336edba8489e767f729e4f391c896956b57140b5d7511a64bbd3ef"}, ] [package.extras] @@ -2597,13 +2598,13 @@ test = ["mypy (>=1.2.0)", "pytest (>=7)"] [[package]] name = "types-requests" -version = "2.32.0.20240602" +version = "2.32.0.20240622" description = "Typing stubs for requests" optional = false python-versions = ">=3.8" files = [ - {file = "types-requests-2.32.0.20240602.tar.gz", hash = "sha256:3f98d7bbd0dd94ebd10ff43a7fbe20c3b8528acace6d8efafef0b6a184793f06"}, - {file = "types_requests-2.32.0.20240602-py3-none-any.whl", hash = "sha256:ed3946063ea9fbc6b5fc0c44fa279188bae42d582cb63760be6cb4b9d06c3de8"}, + {file = "types-requests-2.32.0.20240622.tar.gz", hash = "sha256:ed5e8a412fcc39159d6319385c009d642845f250c63902718f605cd90faade31"}, + {file = "types_requests-2.32.0.20240622-py3-none-any.whl", hash = "sha256:97bac6b54b5bd4cf91d407e62f0932a74821bc2211f22116d9ee1dd643826caf"}, ] [package.dependencies] @@ -2639,13 +2640,13 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "virtualenv" -version = "20.26.2" +version = "20.26.3" description = "Virtual Python Environment builder" optional = false python-versions = ">=3.7" files = [ - {file = "virtualenv-20.26.2-py3-none-any.whl", hash = "sha256:a624db5e94f01ad993d476b9ee5346fdf7b9de43ccaee0e0197012dc838a0e9b"}, - {file = "virtualenv-20.26.2.tar.gz", hash = "sha256:82bf0f4eebbb78d36ddaee0283d43fe5736b53880b8a8cdcd37390a07ac3741c"}, + {file = "virtualenv-20.26.3-py3-none-any.whl", hash = "sha256:8cc4a31139e796e9a7de2cd5cf2489de1217193116a8fd42328f1bd65f434589"}, + {file = "virtualenv-20.26.3.tar.gz", hash = "sha256:4c43a2a236279d9ea36a0d76f98d84bd6ca94ac4e0f4a3b9d46d05e10fea542a"}, ] [package.dependencies] diff --git a/tests/fixtures/database_connection_fixture.py b/tests/fixtures/database_connection_fixture.py index 0f8b43a5..49a27e7d 100644 --- a/tests/fixtures/database_connection_fixture.py +++ b/tests/fixtures/database_connection_fixture.py @@ -1,6 +1,6 @@ import pyexasol import pytest -from pytest_itde import config +from pytest_itde import config# todo not found where moved? 
@pytest.fixture(scope="module") diff --git a/tests/fixtures/model_fixture.py b/tests/fixtures/model_fixture.py index bd3b6336..eb4f2024 100644 --- a/tests/fixtures/model_fixture.py +++ b/tests/fixtures/model_fixture.py @@ -19,9 +19,10 @@ def download_model_to_standard_local_save_path(model_specification: ModelSpecifi local_model_save_path = bucketfs_operations.create_save_pretrained_model_path(tmpdir_name, model_specification) model_name = model_specification.model_name - for model_factory in [transformers.AutoModel, transformers.AutoTokenizer]: - model = model_factory.from_pretrained(model_name, cache_dir=tmpdir_name / "cache" / model_name) - model.save_pretrained(local_model_save_path) + model_factory = model_specification.get_model_factory() + for model in [model_factory, transformers.AutoTokenizer]: + downloaded_model = model.from_pretrained(model_name, cache_dir=tmpdir_name / "cache" / model_name) + downloaded_model.save_pretrained(local_model_save_path) return local_model_save_path @@ -61,8 +62,9 @@ def prepare_model_for_local_bucketfs(model_specification: ModelSpecification, @pytest.fixture(scope="session") -def prepare_base_model_for_local_bucketfs(tmpdir_factory) -> PurePosixPath: +def prepare_base_model_for_local_bucketfs(tmpdir_factory, task_type: str) -> PurePosixPath: #todo change usages model_specification = model_params.base_model_specs + model_specification.task_type = task_type bucketfs_path = prepare_model_for_local_bucketfs(model_specification, tmpdir_factory) yield bucketfs_path @@ -92,9 +94,10 @@ def upload_model_to_bucketfs( @pytest.fixture(scope="session") -def upload_base_model_to_bucketfs( - bucketfs_location: bfs.path.PathLike, tmpdir_factory) -> PurePosixPath: +def upload_base_model_to_bucketfs( #todo change usages + bucketfs_location: bfs.path.PathLike, tmpdir_factory, task_type:str ) -> PurePosixPath: base_model_specs = model_params.base_model_specs + base_model_specs.task_type = task_type tmpdir = tmpdir_factory.mktemp(base_model_specs.get_model_specific_path_suffix()) with upload_model_to_bucketfs( base_model_specs, tmpdir, bucketfs_location) as path: diff --git a/tests/integration_tests/with_db/test_upload_model.py b/tests/integration_tests/with_db/test_upload_model.py index 03ec356b..fea2e3db 100644 --- a/tests/integration_tests/with_db/test_upload_model.py +++ b/tests/integration_tests/with_db/test_upload_model.py @@ -51,6 +51,7 @@ def test_model_upload(setup_database, pyexasol_connection, tmp_path: Path, "--model-name", model_name, "--sub-dir", sub_dir, "--local-model-path", str(download_path), + "--task_type", "filling_mask" ] try: diff --git a/tests/integration_tests/with_db/udfs/test_model_downloader_udf_script.py b/tests/integration_tests/with_db/udfs/test_model_downloader_udf_script.py index ee884e0b..8749ca33 100644 --- a/tests/integration_tests/with_db/udfs/test_model_downloader_udf_script.py +++ b/tests/integration_tests/with_db/udfs/test_model_downloader_udf_script.py @@ -24,6 +24,7 @@ def test_model_downloader_udf_script( model_paths.append(current_model_specs.get_bucketfs_model_save_path()) input_data.append(( current_model_specs.model_name, + current_model_specs.task_type, sub_dir, bucketfs_conn_name, '' @@ -34,11 +35,12 @@ def test_model_downloader_udf_script( query = f""" SELECT TE_MODEL_DOWNLOADER_UDF( t.model_name, + t.task_type, t.sub_dir, t.bucketfs_conn_name, t.token_conn_name ) FROM (VALUES {str(tuple(input_data))} AS - t(model_name, sub_dir, bucketfs_conn_name, token_conn_name)); + t(model_name, task_type, sub_dir, bucketfs_conn_name, 
token_conn_name)); """ # execute downloader UDF diff --git a/tests/integration_tests/with_db/udfs/test_prediction_with_downloader_udf.py b/tests/integration_tests/with_db/udfs/test_prediction_with_downloader_udf.py index c57d7c2f..3e3401df 100644 --- a/tests/integration_tests/with_db/udfs/test_prediction_with_downloader_udf.py +++ b/tests/integration_tests/with_db/udfs/test_prediction_with_downloader_udf.py @@ -1,6 +1,7 @@ import time from tests.utils import postprocessing +TASK_TYPE = "filling_mask" SUB_DIR = 'test_downloader_with_prediction_sub_dir' MODEL_NAME = 'gaunernst/bert-tiny-uncased' @@ -13,6 +14,7 @@ def test_prediction_with_downloader_udf( # execute downloader UDF input_data = ( MODEL_NAME, + TASK_TYPE, SUB_DIR, bucketfs_conn_name, '' @@ -20,11 +22,12 @@ def test_prediction_with_downloader_udf( query = f""" SELECT TE_MODEL_DOWNLOADER_UDF( t.model_name, + t.task_type, t.sub_dir, t.bucketfs_conn_name, t.token_conn_name ) FROM (VALUES {str(input_data)} AS - t(model_name, sub_dir, bucketfs_conn_name, token_conn_name)); + t(model_name, task_type, sub_dir, bucketfs_conn_name, token_conn_name)); """ pyexasol_connection.execute(query).fetchall() diff --git a/tests/integration_tests/without_db/udfs/test_model_downloader_udf.py b/tests/integration_tests/without_db/udfs/test_model_downloader_udf.py index af6f229c..3ab3f9a2 100644 --- a/tests/integration_tests/without_db/udfs/test_model_downloader_udf.py +++ b/tests/integration_tests/without_db/udfs/test_model_downloader_udf.py @@ -13,7 +13,7 @@ create_mounted_bucketfs_connection, create_hf_token_connection) from tests.utils.bucketfs_file_list import get_bucketfs_file_list - +# todo add tests for checking if model metadata is correct? class ExaEnvironment: def __init__(self, connections: Dict[str, Connection] = None): self._connections = connections @@ -38,6 +38,10 @@ def model_name(self): def sub_dir(self): return self.ctx_data[self.index]['sub_dir'] + @property + def task_type(self): + return self.ctx_data[self.index]['task_type'] + @property def bfs_conn(self): return self.ctx_data[self.index]['bucketfs_conn_name'] @@ -68,6 +72,7 @@ def __init__(self, id: str, tmp_dir: Path, token_conn_name: str): self.token_conn_name = token_conn_name self.ctx_data = { 'tiny_model': current_model_specs.model_name, + 'task_type': current_model_specs.task_type, 'sub_dir': self.sub_dir, 'bucketfs_conn_name': self.bucketfs_conn_name, 'token_conn_name': self.token_conn_name diff --git a/tests/integration_tests/without_db/utils/test_load_local_model.py b/tests/integration_tests/without_db/utils/test_load_local_model.py index f44568c1..f5356c7e 100644 --- a/tests/integration_tests/without_db/utils/test_load_local_model.py +++ b/tests/integration_tests/without_db/utils/test_load_local_model.py @@ -28,7 +28,7 @@ def __init__(self): self.token = "token" self.model_specification = model_params.tiny_model_specs - self.mock_current_model_specification: Union[CurrentModelSpecification, MagicMock] = create_autospec(CurrentModelSpecification) + self.mock_current_model_specification: Union[CurrentModelSpecification, MagicMock] = create_autospec(CurrentModelSpecification)#todo need change? 
test_pipeline = pipeline self.loader = LoadLocalModel( test_pipeline, @@ -56,14 +56,13 @@ def test_load_local_model(tmp_path): model_specification = test_setup.model_specification model_save_path = create_save_pretrained_model_path(tmp_path, model_specification) # download a model - model = AutoModel.from_pretrained(model_specification.model_name) + model = AutoModel.from_pretrained(model_specification.model_name) #todo change? tokenizer = AutoTokenizer.from_pretrained(model_specification.model_name) model.save_pretrained(model_save_path) tokenizer.save_pretrained(model_save_path) test_setup.loader.set_current_model_specification(current_model_specification= test_setup.mock_current_model_specification) - #test_setup.loader.set_bucketfs_model_cache_dir(bucketfs_location=) #todo macke a mock? or add test for set_bucketfs_model_cache_dir test_setup.loader._bucketfs_model_cache_dir = model_save_path test_setup.loader.load_models() @@ -80,7 +79,7 @@ def test_load_local_model_with_huggingface_model_transfer(tmp_path): downloaded_model_path = download_model_with_huggingface_transfer( test_setup, mock_bucketfs_location) - sub_dir_path = tmp_path / sub_dir + sub_dir_path = tmp_path / sub_dir #todo better name? with tarfile.open(str(sub_dir_path / downloaded_model_path)) as tar: tar.extractall(path=str(sub_dir_path)) diff --git a/tests/unit_tests/udfs/test_base_udf.py b/tests/unit_tests/udfs/test_base_udf.py index 40a0d24a..b7d72463 100644 --- a/tests/unit_tests/udfs/test_base_udf.py +++ b/tests/unit_tests/udfs/test_base_udf.py @@ -36,6 +36,7 @@ def udf_wrapper(): Column("device_id", int, "INTEGER"), Column("model_name", str, "VARCHAR(2000000)"), Column("sub_dir", str, "VARCHAR(2000000)"), + Column("task_type", str, "VARCHAR(2000000)"), Column("bucketfs_conn", str, "VARCHAR(2000000)"), Column("token_conn", str, "VARCHAR(2000000)"), ], @@ -53,7 +54,7 @@ def udf_wrapper(): return meta -def setup_tests_and_run(bucketfs_conn_name, bucketfs_conn, sub_dir, model_name): +def setup_tests_and_run(bucketfs_conn_name, bucketfs_conn, sub_dir, model_name, task_type): mock_base_model_factory: Union[ModelFactoryProtocol, MagicMock] = create_autospec(ModelFactoryProtocol) mock_tokenizer_factory: Union[ModelFactoryProtocol, MagicMock] = create_autospec(ModelFactoryProtocol) @@ -62,6 +63,7 @@ def setup_tests_and_run(bucketfs_conn_name, bucketfs_conn, sub_dir, model_name): 1, model_name, sub_dir, + task_type, bucketfs_conn_name, '' ) @@ -86,44 +88,46 @@ def setup_tests_and_run(bucketfs_conn_name, bucketfs_conn, sub_dir, model_name): @pytest.mark.parametrize(["description", "bucketfs_conn_name", "bucketfs_conn", - "sub_dir", "model_name"], [ + "sub_dir", "model_name", "task_type"], [ ("all given", "test_bucketfs_con_name", Connection(address=f"file:///test"), - "test_subdir", "test_model") + "test_subdir", "test_model", "filling_mask") ]) @patch('exasol_transformers_extension.utils.bucketfs_operations.create_bucketfs_location_from_conn_object') @patch('exasol_transformers_extension.utils.bucketfs_operations.get_local_bucketfs_path') def test_model_downloader_all_parameters(mock_local_path, mock_create_loc, description, - bucketfs_conn_name, bucketfs_conn, sub_dir, model_name): + bucketfs_conn_name, bucketfs_conn, sub_dir, model_name, task_type): mock_create_loc.side_effect = fake_bucketfs_location_from_conn_object mock_local_path.side_effect = fake_local_bucketfs_path - res, mock_meta = setup_tests_and_run(bucketfs_conn_name, bucketfs_conn, sub_dir, model_name) + res, mock_meta = setup_tests_and_run(bucketfs_conn_name, 
bucketfs_conn, sub_dir, model_name, task_type) # check if no errors assert res[0][-1] is None and len(res[0]) == len(mock_meta.output_columns) @pytest.mark.parametrize(["description", "bucketfs_conn_name", "bucketfs_conn", - "sub_dir", "model_name"], [ - ("all null", None, None, None, None), + "sub_dir", "model_name", "task_type"], [ + ("all null", None, None, None, None, None), ("model name missing", "test_bucketfs_con_name", Connection(address=f"file:///test"), - "test_subdir", None), + "test_subdir", None, "filling_mask"), ("bucketfs_conn missing", None, None, - "test_subdir", "test_model"), + "test_subdir", "test_model", "filling_mask"), ("sub_dir missing", "test_bucketfs_con_name", Connection(address=f"file:///test"), - None, "test_model"), + None, "test_model", "filling_mask"), ("model_name missing", "test_bucketfs_con_name", Connection(address=f"file:///test"), - "test_subdir", None) + "test_subdir", None, "filling_mask"), + ("task_type missing", "test_bucketfs_con_name", Connection(address=f"file:///test"), + "test_subdir", "test_model", None) ]) @patch('exasol_transformers_extension.utils.bucketfs_operations.create_bucketfs_location_from_conn_object') @patch('exasol_transformers_extension.utils.bucketfs_operations.get_local_bucketfs_path') def test_model_downloader_missing_parameters(mock_local_path, mock_create_loc, description, - bucketfs_conn_name, bucketfs_conn, sub_dir, model_name): + bucketfs_conn_name, bucketfs_conn, sub_dir, model_name, task_type): mock_create_loc.side_effect = fake_bucketfs_location_from_conn_object mock_local_path.side_effect = fake_local_bucketfs_path - res, mock_meta = setup_tests_and_run(bucketfs_conn_name, bucketfs_conn, sub_dir, model_name) + res, mock_meta = setup_tests_and_run(bucketfs_conn_name, bucketfs_conn, sub_dir, model_name, task_type) error_field = res[0][-1] expected_error = regex_matcher(f".*For each model model_name, bucketfs_conn and sub_dir need to be provided." diff --git a/tests/unit_tests/udfs/test_model_downloader_udf.py b/tests/unit_tests/udfs/test_model_downloader_udf.py index 8dae4e06..a80fcc4a 100644 --- a/tests/unit_tests/udfs/test_model_downloader_udf.py +++ b/tests/unit_tests/udfs/test_model_downloader_udf.py @@ -18,7 +18,7 @@ from tests.utils.matchers import AnyOrder from tests.utils.mock_cast import mock_cast - +#todo add tests? 
def create_mock_metadata() -> MockMetaData: def udf_wrapper(): pass @@ -29,6 +29,7 @@ def udf_wrapper(): input_columns=[ Column("model_name", str, "VARCHAR(2000000)"), Column("sub_dir", str, "VARCHAR(2000000)"), + Column("task_type", str, "VARCHAR(2000000)"), Column("bfs_conn", str, "VARCHAR(2000000)"), Column("token_conn", str, "VARCHAR(2000000)"), ], @@ -49,7 +50,6 @@ def udf_wrapper(): @patch('exasol_transformers_extension.utils.bucketfs_operations.create_bucketfs_location_from_conn_object') def test_model_downloader(mock_create_loc, description, count, token_conn_name, token_conn_obj, expected_token): - mock_base_model_factory: Union[ModelFactoryProtocol, MagicMock] = create_autospec(ModelFactoryProtocol) mock_tokenizer_factory: Union[ModelFactoryProtocol, MagicMock] = create_autospec(ModelFactoryProtocol) mock_model_downloader_factory: Union[HuggingFaceHubBucketFSModelTransferSPFactory, MagicMock] = create_autospec( @@ -64,12 +64,15 @@ def test_model_downloader(mock_create_loc, description, count, token_conn_name, mock_create_loc.side_effect = mock_bucketfs_locations base_model_names = [f"base_model_name_{i}" for i in range(count)] sub_directory_names = [f"sub_dir_{i}" for i in range(count)] + task_type = [f"task_type_{i}" for i in range(count)] #todo just use real type? bucketfs_connections = [Connection(address=f"file:///test{i}") for i in range(count)] bfs_conn_name = [f"bfs_conn_name_{i}" for i in bucketfs_connections] mock_cmss = [create_autospec(CurrentModelSpecification, model_name=base_model_names[i], - sub_dir=Path(sub_directory_names[i])) for i in range(count)] + task_type=task_type[i], + sub_dir=Path(sub_directory_names[i]), + get_model_factory=CurrentModelSpecification.get_model_factory) for i in range(count)] for i in range(count): mock_cast(mock_cmss[i].get_bucketfs_model_save_path).side_effect = [f'{sub_directory_names[i]}/{base_model_names[i]}'] mock_current_model_specification_factory: Union[CurrentModelSpecificationFactory, MagicMock] = ( @@ -80,6 +83,7 @@ def test_model_downloader(mock_create_loc, description, count, token_conn_name, ( base_model_names[i], sub_directory_names[i], + task_type[i], bfs_conn_name[i], token_conn_name ) @@ -95,7 +99,6 @@ def test_model_downloader(mock_create_loc, description, count, token_conn_name, mock_ctx = create_mock_udf_context(input_data, mock_meta) udf = ModelDownloaderUDF(exa=mock_exa, - base_model_factory=mock_base_model_factory, tokenizer_factory=mock_tokenizer_factory, huggingface_hub_bucketfs_model_transfer=mock_model_downloader_factory, current_model_specification_factory=mock_current_model_specification_factory) @@ -109,7 +112,7 @@ def test_model_downloader(mock_create_loc, description, count, token_conn_name, ] for i in range(count): assert mock_cast(mock_model_downloaders[i].download_from_huggingface_hub).mock_calls == [ - call(mock_base_model_factory), + call(mock_cmss[i].get_model_factory), #todo add to call transformers.taskthing #todo d dont match how mock corecctly? 
call(mock_tokenizer_factory) ] assert call() in mock_cast(mock_model_downloaders[i].upload_to_bucketfs).mock_calls diff --git a/tests/unit_tests/utils/test_load_local_model.py b/tests/unit_tests/utils/test_load_local_model.py index af6284d9..98287972 100644 --- a/tests/unit_tests/utils/test_load_local_model.py +++ b/tests/unit_tests/utils/test_load_local_model.py @@ -17,17 +17,18 @@ class TestSetup: def __init__(self): - self.model_factory_mock: Union[ModelFactoryProtocol, MagicMock] = create_autospec(ModelFactoryProtocol) + self.model_factory_mock: Union[ModelFactoryProtocol, MagicMock] = create_autospec(ModelFactoryProtocol) #todo change? self.tokenizer_factory_mock: Union[ModelFactoryProtocol, MagicMock] = create_autospec(ModelFactoryProtocol) self.token = "token" self.model_name = "model_name" + self.model_task = "test_task" self.mock_current_model_specification: Union[CurrentModelSpecification, MagicMock] = create_autospec(CurrentModelSpecification) self.cache_dir = "test/Path" self.mock_pipeline = Mock() self.loader = LoadLocalModel( self.mock_pipeline, - task_name="test_task", + self.model_task, device="cpu", base_model_factory=self.model_factory_mock, tokenizer_factory=self.tokenizer_factory_mock) @@ -35,7 +36,9 @@ def __init__(self): def test_load_function_call(): test_setup = TestSetup() - model_save_path = create_save_pretrained_model_path(test_setup.cache_dir, ModelSpecification(test_setup.model_name)) + model_save_path = create_save_pretrained_model_path(test_setup.cache_dir, + ModelSpecification(test_setup.model_name, + test_setup.model_task)) test_setup.loader._bucketfs_model_cache_dir = model_save_path test_setup.loader.set_current_model_specification(test_setup.mock_current_model_specification) diff --git a/tests/utils/parameters.py b/tests/utils/parameters.py index e42a2a70..15c4b660 100644 --- a/tests/utils/parameters.py +++ b/tests/utils/parameters.py @@ -15,9 +15,9 @@ class BucketFSParams: @dataclass(frozen=True) class ModelParams: - base_model_specs: ModelSpecification - seq2seq_model_specs: ModelSpecification - tiny_model_specs: ModelSpecification + base_model_specs: ModelSpecification #this is used for other tests, task_type should be set per test + seq2seq_model_specs: ModelSpecification #tis model is used for testing translation_udf + tiny_model_specs: ModelSpecification #this model is used for upload/download tests text_data: str sub_dir: str @@ -29,8 +29,8 @@ class ModelParams: path_in_bucket="container") model_params = ModelParams( - base_model_specs=ModelSpecification('bert-base-uncased'), - seq2seq_model_specs=ModelSpecification("t5-small"), - tiny_model_specs=ModelSpecification("prajjwal1/bert-tiny"), + base_model_specs=ModelSpecification('bert-base-uncased', "need to set this task_type"), + seq2seq_model_specs=ModelSpecification("t5-small", "translation"), + tiny_model_specs=ModelSpecification("prajjwal1/bert-tiny", ""),#todo make work with empty task_zype or use a real one? 
text_data='The company Exasol is based in Nuremberg', sub_dir='model_sub_dir') From ea9478017b744e73d2059e2d0c0221141ae68dc9 Mon Sep 17 00:00:00 2001 From: MarleneKress79789 Date: Fri, 28 Jun 2024 15:35:33 +0200 Subject: [PATCH 02/31] started adding quality control tests --- doc/user_guide/user_guide.md | 2 +- .../udfs/models/base_model_udf.py | 3 +- .../udfs/models/model_downloader_udf.py | 2 +- exasol_transformers_extension/upload_model.py | 35 ++++-- .../utils/load_local_model.py | 4 +- .../utils/model_specification.py | 39 +++++- tests/fixtures/database_connection_fixture.py | 2 +- tests/fixtures/model_fixture.py | 113 +++++++++++++++++- .../with_db/test_upload_model.py | 13 +- .../with_db/udfs/test_filling_mask_script.py | 22 +++- .../test_prediction_with_downloader_udf.py | 14 +++ .../udfs/test_question_answering_script.py | 25 +++- ...uence_classification_single_text_script.py | 19 ++- ...equence_classification_text_pair_script.py | 19 ++- .../udfs/test_text_generation_script.py | 23 +++- .../udfs/test_token_classification_script.py | 19 ++- .../with_db/udfs/test_translation_script.py | 14 +++ ...st_zero_shot_text_classification_script.py | 21 +++- .../without_db/udfs/test_filling_mask_udf.py | 10 +- .../udfs/test_model_downloader_udf.py | 1 - .../udfs/test_question_answering_udf.py | 9 +- ...sequence_classification_single_text_udf.py | 9 +- ...t_sequence_classification_text_pair_udf.py | 9 +- .../udfs/test_text_generation_udf.py | 9 +- .../udfs/test_token_classification_udf.py | 14 +-- .../test_zero_shot_text_classification_udf.py | 13 +- .../without_db/utils/test_load_local_model.py | 8 +- ...rediction_multiple_model_multiple_batch.py | 4 +- tests/unit_tests/udfs/test_base_udf.py | 30 ++--- .../udfs/test_model_downloader_udf.py | 4 +- .../unit_tests/utils/test_load_local_model.py | 2 +- tests/utils/parameters.py | 4 +- 32 files changed, 414 insertions(+), 101 deletions(-) diff --git a/doc/user_guide/user_guide.md b/doc/user_guide/user_guide.md index a2e9387b..df7f95a5 100644 --- a/doc/user_guide/user_guide.md +++ b/doc/user_guide/user_guide.md @@ -397,7 +397,7 @@ function to ensure proper loading by the Transformers Extension UDFs. You can download the model using python like this: ```python - for model_factory in [transformers.AutoModel, transformers.AutoTokenizer]: + for model_factory in [transformers.AutoModel, transformers.AutoTokenizer]:#todo hange? # download the model and tokenizer from Hugging Face model = model_factory.from_pretrained(model_name) # save the downloaded model using the save_pretrained function diff --git a/exasol_transformers_extension/udfs/models/base_model_udf.py b/exasol_transformers_extension/udfs/models/base_model_udf.py index f70d675b..7e080a00 100644 --- a/exasol_transformers_extension/udfs/models/base_model_udf.py +++ b/exasol_transformers_extension/udfs/models/base_model_udf.py @@ -183,10 +183,9 @@ def check_cache(self, model_df: pd.DataFrame) -> None: bucketfs_connection, and sub_dir """ model_name = model_df["model_name"].iloc[0] - task_type = model_df["task_type"].iloc[0]#todo do we need as input? 
or do self.task_name bucketfs_conn = model_df["bucketfs_conn"].iloc[0] sub_dir = model_df["sub_dir"].iloc[0] - current_model_specification = CurrentModelSpecification(model_name, task_type, bucketfs_conn, sub_dir) + current_model_specification = CurrentModelSpecification(model_name, self.task_name, bucketfs_conn, sub_dir) if self.model_loader.current_model_specification != current_model_specification: bucketfs_location = \ diff --git a/exasol_transformers_extension/udfs/models/model_downloader_udf.py b/exasol_transformers_extension/udfs/models/model_downloader_udf.py index e504f5ef..3af0c883 100644 --- a/exasol_transformers_extension/udfs/models/model_downloader_udf.py +++ b/exasol_transformers_extension/udfs/models/model_downloader_udf.py @@ -22,7 +22,7 @@ class ModelDownloaderUDF: returns , """ - def __init__(self, #todo change calls and docu! + def __init__(self, #todo change docu! exa, tokenizer_factory: ModelFactoryProtocol = transformers.AutoTokenizer, huggingface_hub_bucketfs_model_transfer: HuggingFaceHubBucketFSModelTransferSPFactory = diff --git a/exasol_transformers_extension/upload_model.py b/exasol_transformers_extension/upload_model.py index 01663246..aaa925b0 100644 --- a/exasol_transformers_extension/upload_model.py +++ b/exasol_transformers_extension/upload_model.py @@ -2,12 +2,15 @@ from pathlib import Path import click +import transformers from exasol.python_extension_common.deployment.language_container_deployer_cli import ( SECRET_DISPLAY, SecretParams, secret_callback) from exasol_transformers_extension.utils import bucketfs_operations from exasol_transformers_extension.deployment import deployment_utils as utils from exasol_transformers_extension.utils.current_model_specification import CurrentModelSpecification +from exasol_transformers_extension.utils.huggingface_hub_bucketfs_model_transfer_sp import \ + HuggingFaceHubBucketFSModelTransferSP @click.command() @@ -16,6 +19,7 @@ @click.option('--task_type', type=str, required=True) #todo change docu (needed to know where to safe model) @click.option('--sub-dir', type=str, required=True, help="directory where the model is stored in the BucketFS") +@click.option('--token', type=str, default=None, help="Hugging Face hub token for private models") #todo chnage docu @click.option('--local-model-path', type=click.Path(exists=True, file_okay=True), required=True, help="local path where model is located") @click.option('--bucketfs-name', type=str) @@ -45,7 +49,8 @@ def main( model_name: str, task_type: str, sub_dir: str, - local_model_path: str, + token: str | None, + #local_model_path: str, bucketfs_name: str, bucketfs_host: str, bucketfs_port: int, @@ -61,9 +66,13 @@ def main( path_in_bucket: str, use_ssl_cert_validation: bool) -> None: """ - Script for uploading locally saved model files to BucketFS. Files should have been saved locally - using Transformers save_pretrained function. 
This ensures proper loading from the BucketFS later + Downloads model from Huggingface hub and the transfers model to database """ + # create CurrentModelSpecification for model to be loaded + current_model_specs = CurrentModelSpecification(model_name, task_type, "", Path(sub_dir)) + # upload the downloaded model files into bucketfs + upload_path = current_model_specs.get_bucketfs_model_save_path() + # create bucketfs location bucketfs_location = bucketfs_operations.create_bucketfs_location( bucketfs_name=bucketfs_name, @@ -81,12 +90,22 @@ def main( path_in_bucket=path_in_bucket, use_ssl_cert_validation=use_ssl_cert_validation) - # create CurrentModelSpecification for model to be loaded - current_model_specs = CurrentModelSpecification(model_name, task_type, "", Path(sub_dir)) # upload the downloaded model files into bucketfs - upload_path = current_model_specs.get_bucketfs_model_save_path() - bucketfs_operations.upload_model_files_to_bucketfs( - local_model_path, upload_path, bucketfs_location) + + #bucketfs_operations.upload_model_files_to_bucketfs( + # local_model_path, upload_path, bucketfs_location) + model_factory = current_model_specs.get_model_factory() + + downloader = HuggingFaceHubBucketFSModelTransferSP(bucketfs_location=bucketfs_location, + model_specification=current_model_specs, + bucketfs_model_path=upload_path, + token=token) + + for model in [model_factory, transformers.AutoTokenizer]: + downloader.download_from_huggingface_hub(model) + # upload model files to BucketFS + model_tar_file_path = downloader.upload_to_bucketfs() + print("your model or tokenizer has been saved in the BucketFS at: " + str(model_tar_file_path)) if __name__ == '__main__': diff --git a/exasol_transformers_extension/utils/load_local_model.py b/exasol_transformers_extension/utils/load_local_model.py index fec0f0b5..211ed44b 100644 --- a/exasol_transformers_extension/utils/load_local_model.py +++ b/exasol_transformers_extension/utils/load_local_model.py @@ -26,7 +26,7 @@ def __init__(self, tokenizer_factory: ModelFactoryProtocol ): self.pipeline_factory = pipeline_factory - self.task_name = task_name #todo replace this? + self.task_name = task_name self.device = device self._base_model_factory = base_model_factory self._tokenizer_factory = tokenizer_factory @@ -53,7 +53,7 @@ def load_models(self) -> transformers.pipelines.Pipeline: :current_model_key: key of the model to be loaded """ - loaded_model = self._base_model_factory.from_pretrained(str(self._bucketfs_model_cache_dir)) #todo does this need change? 
+ loaded_model = self._base_model_factory.from_pretrained(str(self._bucketfs_model_cache_dir)) loaded_tokenizer = self._tokenizer_factory.from_pretrained(str(self._bucketfs_model_cache_dir)) last_created_pipeline = self.pipeline_factory( diff --git a/exasol_transformers_extension/utils/model_specification.py b/exasol_transformers_extension/utils/model_specification.py index c7a8d7b0..06e9de5a 100644 --- a/exasol_transformers_extension/utils/model_specification.py +++ b/exasol_transformers_extension/utils/model_specification.py @@ -11,6 +11,28 @@ def __init__(self, model_name: str, task_type: str): self.model_name = model_name self.task_type = task_type + def set_task_type_from_udf_name(self, text): + """ + switches user input(matching udf name) to transformers task types + """ + if text == "filling_mask": + task_type = "fill-mask" + elif text == "question_answering": + task_type = "question-answering" + elif text == "sequence_classification": + task_type = "text-classification" + elif text == "text_generation": + task_type = "text-generation" + elif text == "token_classification": + task_type = "token-classification" + elif text == "translation": + task_type = "translation" + elif text == "zero_shot_classification": + task_type = "zero-shot-classification" + else: + task_type = text + self.task_type = task_type + def get_model_specs_for_download(self):#todo change usages? """ returns all attributes necessary for downloading the model from Huggingface. @@ -32,11 +54,20 @@ def get_model_factory(self): sets model factory depending on the task_type of the specific model """ model_task_type = self.task_type - if model_task_type == "filling_mask": - model_factory = transformers.AutoModelForMaskedLM #todo make switchcase? + if model_task_type == "fill-mask": + model_factory = transformers.AutoModelForMaskedLM elif model_task_type == "translation": - model_factory = transformers.T5Model #todo correct? + model_factory = transformers.T5Model #todo correct? se to seq in translation udf + elif model_task_type == "zero-shot-classification": + model_factory = transformers.AutoModelForSequenceClassification + elif model_task_type == "text-classification": + model_factory = transformers.AutoModelForSequenceClassification + elif model_task_type == "question-answering": + model_factory = transformers.AutoModelForQuestionAnswering + #elif model_task_type == "text-generation": + # model_factory = transformers.AutoModelFor + elif model_task_type == "token-classification": + model_factory = transformers.AutoModelForTokenClassification else: model_factory = transformers.AutoModel return model_factory - diff --git a/tests/fixtures/database_connection_fixture.py b/tests/fixtures/database_connection_fixture.py index 49a27e7d..0f8b43a5 100644 --- a/tests/fixtures/database_connection_fixture.py +++ b/tests/fixtures/database_connection_fixture.py @@ -1,6 +1,6 @@ import pyexasol import pytest -from pytest_itde import config# todo not found where moved? 
+from pytest_itde import config @pytest.fixture(scope="module") diff --git a/tests/fixtures/model_fixture.py b/tests/fixtures/model_fixture.py index eb4f2024..ef0ccd26 100644 --- a/tests/fixtures/model_fixture.py +++ b/tests/fixtures/model_fixture.py @@ -60,11 +60,52 @@ def prepare_model_for_local_bucketfs(model_specification: ModelSpecification, download_model_to_path(current_model_specs, bucketfs_path_for_model) return tmpdir +@pytest.fixture(scope="session") +def prepare_filling_mask_model_for_local_bucketfs(tmpdir_factory) -> PurePosixPath: + model_specification = model_params.base_model_specs + model_specification.task_type = "fill-mask" + bucketfs_path = prepare_model_for_local_bucketfs(model_specification, tmpdir_factory) + yield bucketfs_path + +@pytest.fixture(scope="session") +def prepare_question_answering_model_for_local_bucketfs(tmpdir_factory) -> PurePosixPath: + model_specification = model_params.base_model_specs + model_specification.task_type = "question-answering" + bucketfs_path = prepare_model_for_local_bucketfs(model_specification, tmpdir_factory) + yield bucketfs_path + +@pytest.fixture(scope="session") +def prepare_sequence_classification_model_for_local_bucketfs(tmpdir_factory) -> PurePosixPath: + model_specification = model_params.base_model_specs + model_specification.task_type = "text-classification" + bucketfs_path = prepare_model_for_local_bucketfs(model_specification, tmpdir_factory) + yield bucketfs_path + +@pytest.fixture(scope="session") +def prepare_text_generation_model_for_local_bucketfs(tmpdir_factory) -> PurePosixPath: + model_specification = model_params.base_model_specs + model_specification.task_type = "text-generation" + bucketfs_path = prepare_model_for_local_bucketfs(model_specification, tmpdir_factory) + yield bucketfs_path + +@pytest.fixture(scope="session") +def prepare_token_classification_model_for_local_bucketfs(tmpdir_factory) -> PurePosixPath: + model_specification = model_params.base_model_specs + model_specification.task_type = "token-classification" + bucketfs_path = prepare_model_for_local_bucketfs(model_specification, tmpdir_factory) + yield bucketfs_path + +@pytest.fixture(scope="session") +def prepare_translation_model_for_local_bucketfs(tmpdir_factory) -> PurePosixPath: + model_specification = model_params.base_model_specs + model_specification.task_type = "translation" + bucketfs_path = prepare_model_for_local_bucketfs(model_specification, tmpdir_factory) + yield bucketfs_path @pytest.fixture(scope="session") -def prepare_base_model_for_local_bucketfs(tmpdir_factory, task_type: str) -> PurePosixPath: #todo change usages +def prepare_zero_shot_classification_model_for_local_bucketfs(tmpdir_factory) -> PurePosixPath: model_specification = model_params.base_model_specs - model_specification.task_type = task_type + model_specification.task_type = "zero-shot-classification" bucketfs_path = prepare_model_for_local_bucketfs(model_specification, tmpdir_factory) yield bucketfs_path @@ -94,16 +135,78 @@ def upload_model_to_bucketfs( @pytest.fixture(scope="session") -def upload_base_model_to_bucketfs( #todo change usages - bucketfs_location: bfs.path.PathLike, tmpdir_factory, task_type:str ) -> PurePosixPath: +def upload_filling_mask_model_to_bucketfs( + bucketfs_location: bfs.path.PathLike, tmpdir_factory) -> PurePosixPath: + base_model_specs = model_params.base_model_specs + base_model_specs.task_type = "fill-mask" + tmpdir = tmpdir_factory.mktemp(base_model_specs.get_model_specific_path_suffix()) + with upload_model_to_bucketfs( + 
base_model_specs, tmpdir, bucketfs_location) as path: + yield path + + +@pytest.fixture(scope="session") +def upload_question_answering_model_to_bucketfs( + bucketfs_location: bfs.path.PathLike, tmpdir_factory) -> PurePosixPath: + base_model_specs = model_params.base_model_specs + base_model_specs.task_type = "question-answering" + tmpdir = tmpdir_factory.mktemp(base_model_specs.get_model_specific_path_suffix()) + with upload_model_to_bucketfs( + base_model_specs, tmpdir, bucketfs_location) as path: + yield path + +@pytest.fixture(scope="session") +def upload_sequence_classification_model_to_bucketfs( + bucketfs_location: bfs.path.PathLike, tmpdir_factory) -> PurePosixPath: + base_model_specs = model_params.base_model_specs + base_model_specs.task_type = "text-classification" + tmpdir = tmpdir_factory.mktemp(base_model_specs.get_model_specific_path_suffix()) + with upload_model_to_bucketfs( + base_model_specs, tmpdir, bucketfs_location) as path: + yield path + +@pytest.fixture(scope="session") +def upload_text_generation_model_to_bucketfs( + bucketfs_location: bfs.path.PathLike, tmpdir_factory) -> PurePosixPath: + base_model_specs = model_params.base_model_specs + base_model_specs.task_type = "text-generation" + tmpdir = tmpdir_factory.mktemp(base_model_specs.get_model_specific_path_suffix()) + with upload_model_to_bucketfs( + base_model_specs, tmpdir, bucketfs_location) as path: + yield path + +@pytest.fixture(scope="session") +def upload_token_classification_model_to_bucketfs( + bucketfs_location: bfs.path.PathLike, tmpdir_factory) -> PurePosixPath: + base_model_specs = model_params.base_model_specs + base_model_specs.task_type = "token-classification" + tmpdir = tmpdir_factory.mktemp(base_model_specs.get_model_specific_path_suffix()) + with upload_model_to_bucketfs( + base_model_specs, tmpdir, bucketfs_location) as path: + yield path + +@pytest.fixture(scope="session") +def upload_translation_model_to_bucketfs( + bucketfs_location: bfs.path.PathLike, tmpdir_factory) -> PurePosixPath: + base_model_specs = model_params.base_model_specs + base_model_specs.task_type = "translation" + tmpdir = tmpdir_factory.mktemp(base_model_specs.get_model_specific_path_suffix()) + with upload_model_to_bucketfs( + base_model_specs, tmpdir, bucketfs_location) as path: + yield path + +@pytest.fixture(scope="session") +def upload_zero_shot_classification_model_to_bucketfs( + bucketfs_location: bfs.path.PathLike, tmpdir_factory) -> PurePosixPath: base_model_specs = model_params.base_model_specs - base_model_specs.task_type = task_type + base_model_specs.task_type = "zero-shot-classification" tmpdir = tmpdir_factory.mktemp(base_model_specs.get_model_specific_path_suffix()) with upload_model_to_bucketfs( base_model_specs, tmpdir, bucketfs_location) as path: yield path + @pytest.fixture(scope="session") def upload_seq2seq_model_to_bucketfs( bucketfs_location: bfs.path.PathLike, tmpdir_factory) -> PurePosixPath: diff --git a/tests/integration_tests/with_db/test_upload_model.py b/tests/integration_tests/with_db/test_upload_model.py index fea2e3db..1e31b47c 100644 --- a/tests/integration_tests/with_db/test_upload_model.py +++ b/tests/integration_tests/with_db/test_upload_model.py @@ -13,8 +13,11 @@ from tests.integration_tests.with_db.udfs.python_rows_to_sql import python_rows_to_sql from tests.utils import postprocessing from tests.utils.parameters import bucketfs_params, model_params -from tests.fixtures.model_fixture import download_model_to_standard_local_save_path - +from tests.fixtures.model_fixture import * 
+from tests.fixtures.setup_database_fixture import * +from tests.fixtures.language_container_fixture import * +from tests.fixtures.bucketfs_fixture import * +from tests.fixtures.database_connection_fixture import * def adapt_file_to_upload(path: PosixPath, download_path: PosixPath): if path.is_dir(): @@ -31,14 +34,15 @@ def test_model_upload(setup_database, pyexasol_connection, tmp_path: Path, bucketfs_location: bfs.path.PathLike, bucketfs_config: config.BucketFs): sub_dir = 'sub_dir' model_specification = model_params.base_model_specs + model_specification.task_type = "filling_mask" model_name = model_specification.model_name - download_path = download_model_to_standard_local_save_path(model_specification, tmp_path) current_model_specs = CurrentModelSpecificationFromModelSpecs().transform(model_specification, "", Path(sub_dir)) upload_path = current_model_specs.get_bucketfs_model_save_path() parsed_url = urlparse(bucketfs_config.url) host = parsed_url.netloc.split(":")[0] port = parsed_url.netloc.split(":")[1] + print("path in bucket: "+ bucketfs_params.path_in_bucket) args_list = [ "--bucketfs-name", bucketfs_params.name, "--bucketfs-host", host, @@ -50,13 +54,13 @@ def test_model_upload(setup_database, pyexasol_connection, tmp_path: Path, "--path-in-bucket", bucketfs_params.path_in_bucket, "--model-name", model_name, "--sub-dir", sub_dir, - "--local-model-path", str(download_path), "--task_type", "filling_mask" ] try: runner = CliRunner() result = runner.invoke(upload_model.main, args_list) + print(result) assert result.exit_code == 0 bucketfs_upload_location = bucketfs_location / upload_path.with_suffix(".tar.gz") assert bucketfs_upload_location.is_file() @@ -90,3 +94,4 @@ def test_model_upload(setup_database, pyexasol_connection, tmp_path: Path, assert len(result) == 1 and result[0][-1] is None finally: postprocessing.cleanup_buckets(bucketfs_location, sub_dir) +#todo path is not corretc after upload? \ No newline at end of file diff --git a/tests/integration_tests/with_db/udfs/test_filling_mask_script.py b/tests/integration_tests/with_db/udfs/test_filling_mask_script.py index 4faf9b83..7e06a41b 100644 --- a/tests/integration_tests/with_db/udfs/test_filling_mask_script.py +++ b/tests/integration_tests/with_db/udfs/test_filling_mask_script.py @@ -1,11 +1,16 @@ from tests.integration_tests.with_db.udfs.python_rows_to_sql import python_rows_to_sql +from tests.fixtures.model_fixture import upload_filling_mask_model_to_bucketfs +from tests.fixtures.bucketfs_fixture import bucketfs_location +from tests.fixtures.database_connection_fixture import pyexasol_connection +from tests.fixtures.setup_database_fixture import setup_database, language_alias +from tests.fixtures.language_container_fixture import flavor_path, upload_slc, export_slc from tests.utils.parameters import model_params def test_filling_mask_script( - setup_database, pyexasol_connection, upload_base_model_to_bucketfs): + setup_database, pyexasol_connection, upload_filling_mask_model_to_bucketfs): bucketfs_conn_name, schema_name = setup_database - text_data = "Exasol is an analytics management software company." + text_data = "I you so much." 
n_rows = 100 top_k = 3 input_data = [] @@ -39,3 +44,16 @@ def test_filling_mask_script( n_rows_result = n_rows * top_k n_cols_result = len(input_data[0]) + (added_columns - removed_columns) assert len(result) == n_rows_result and len(result[0]) == n_cols_result + + # lenient test for quality of results, will be replaced by deterministic test later + results = [result[i][5] for i in range(len(result))] + acceptable_results = ["love", "miss", "want", "need"] + number_accepted_results = 0 + + def contains(string,list): + return any(map(lambda x: x in string, list)) + + for i in range(len(results)): + if contains(results[i], acceptable_results): + number_accepted_results += 1 + assert number_accepted_results > n_rows_result/2 diff --git a/tests/integration_tests/with_db/udfs/test_prediction_with_downloader_udf.py b/tests/integration_tests/with_db/udfs/test_prediction_with_downloader_udf.py index 3e3401df..94d0d65e 100644 --- a/tests/integration_tests/with_db/udfs/test_prediction_with_downloader_udf.py +++ b/tests/integration_tests/with_db/udfs/test_prediction_with_downloader_udf.py @@ -58,9 +58,23 @@ def test_prediction_with_downloader_udf( result = pyexasol_connection.execute(query).fetchall() + print(result) + # assertions assert len(result) == top_k assert all(row[-1] is None for row in result) + results = [result[i][5] for i in range(len(result))] + acceptable_results = ["love", "miss", "want", "need"] + number_accepted_results = 0 + + def contains(string, list): + return any(map(lambda x: x in string, list)) + + for i in range(len(results)): + if contains(results[i], acceptable_results): + number_accepted_results += 1 + assert number_accepted_results > top_k / 2 + finally: postprocessing.cleanup_buckets(bucketfs_location, SUB_DIR) diff --git a/tests/integration_tests/with_db/udfs/test_question_answering_script.py b/tests/integration_tests/with_db/udfs/test_question_answering_script.py index 39a7beb2..13914ddb 100644 --- a/tests/integration_tests/with_db/udfs/test_question_answering_script.py +++ b/tests/integration_tests/with_db/udfs/test_question_answering_script.py @@ -1,11 +1,18 @@ +from tests.fixtures.model_fixture import upload_question_answering_model_to_bucketfs from tests.integration_tests.with_db.udfs.python_rows_to_sql import python_rows_to_sql from tests.utils.parameters import model_params +from tests.fixtures.model_fixture import * +from tests.fixtures.setup_database_fixture import * +from tests.fixtures.language_container_fixture import * +from tests.fixtures.bucketfs_fixture import * +from tests.fixtures.database_connection_fixture import * + def test_question_answering_script( - setup_database, pyexasol_connection, upload_base_model_to_bucketfs): + setup_database, pyexasol_connection, upload_question_answering_model_to_bucketfs): bucketfs_conn_name, schema_name = setup_database - question = "Where is the Exasol?" + question = "How many syllables are in the word Syllable?" 
n_rows = 100 top_k = 1 input_data = [] @@ -42,3 +49,17 @@ def test_question_answering_script( n_rows_result = n_rows n_cols_result = len(input_data[0]) + (added_columns - removed_columns) assert len(result) == n_rows_result and len(result[0]) == n_cols_result + + for i in range(5): + print(result[i]) + results = [result[i][6] for i in range(len(result))] + acceptable_results = ["three", "3", "want", "need"] + number_accepted_results = 0 + + def contains(string, list): + return any(map(lambda x: x in string, list)) + + for i in range(len(results)): + if contains(results[i], acceptable_results): + number_accepted_results += 1 + assert number_accepted_results > top_k / 2 diff --git a/tests/integration_tests/with_db/udfs/test_sequence_classification_single_text_script.py b/tests/integration_tests/with_db/udfs/test_sequence_classification_single_text_script.py index c8306f6e..37405871 100644 --- a/tests/integration_tests/with_db/udfs/test_sequence_classification_single_text_script.py +++ b/tests/integration_tests/with_db/udfs/test_sequence_classification_single_text_script.py @@ -1,9 +1,10 @@ +from tests.fixtures.model_fixture import upload_sequence_classification_model_to_bucketfs from tests.integration_tests.with_db.udfs.python_rows_to_sql import python_rows_to_sql from tests.utils.parameters import model_params def test_sequence_classification_single_text_script( - setup_database, pyexasol_connection, upload_base_model_to_bucketfs): + setup_database, pyexasol_connection, upload_sequence_classification_model_to_bucketfs): bucketfs_conn_name, schema_name = setup_database n_labels = 2 n_rows = 100 @@ -36,3 +37,19 @@ def test_sequence_classification_single_text_script( n_rows_result = n_rows * n_labels n_cols_result = len(input_data[0]) + (added_columns - removed_columns) assert len(result) == n_rows_result and len(result[0]) == n_cols_result + + for i in range(10): + print(result[i]) + + # lenient test for quality of results, will be replaced by deterministic test later + results = [result[i][5] for i in range(len(result))] + acceptable_results = ["love", "miss", "want", "need"] + number_accepted_results = 0 + + def contains(string,list): + return any(map(lambda x: x in string, list)) + + for i in range(len(results)): + if contains(results[i], acceptable_results): + number_accepted_results += 1 + assert number_accepted_results > n_rows_result/2 diff --git a/tests/integration_tests/with_db/udfs/test_sequence_classification_text_pair_script.py b/tests/integration_tests/with_db/udfs/test_sequence_classification_text_pair_script.py index 8bd48f6b..ad0547d0 100644 --- a/tests/integration_tests/with_db/udfs/test_sequence_classification_text_pair_script.py +++ b/tests/integration_tests/with_db/udfs/test_sequence_classification_text_pair_script.py @@ -1,9 +1,10 @@ +from tests.fixtures.model_fixture import upload_sequence_classification_model_to_bucketfs from tests.integration_tests.with_db.udfs.python_rows_to_sql import python_rows_to_sql from tests.utils.parameters import model_params def test_sequence_classification_text_pair_script( - setup_database, pyexasol_connection, upload_base_model_to_bucketfs): + setup_database, pyexasol_connection, upload_sequence_classification_model_to_bucketfs): bucketfs_conn_name, schema_name = setup_database n_labels = 2 n_rows = 100 @@ -38,3 +39,19 @@ def test_sequence_classification_text_pair_script( n_rows_result = n_rows * n_labels n_cols_result = len(input_data[0]) + (added_columns - removed_columns) assert len(result) == n_rows_result and len(result[0]) == 
n_cols_result + + for i in range(10): + print(result[i]) + + # lenient test for quality of results, will be replaced by deterministic test later + results = [result[i][5] for i in range(len(result))] + acceptable_results = ["love", "miss", "want", "need"] + number_accepted_results = 0 + + def contains(string,list): + return any(map(lambda x: x in string, list)) + + for i in range(len(results)): + if contains(results[i], acceptable_results): + number_accepted_results += 1 + assert number_accepted_results > n_rows_result/2 diff --git a/tests/integration_tests/with_db/udfs/test_text_generation_script.py b/tests/integration_tests/with_db/udfs/test_text_generation_script.py index b085ac20..81d80430 100644 --- a/tests/integration_tests/with_db/udfs/test_text_generation_script.py +++ b/tests/integration_tests/with_db/udfs/test_text_generation_script.py @@ -1,9 +1,15 @@ +from tests.fixtures.model_fixture import upload_text_generation_model_to_bucketfs from tests.integration_tests.with_db.udfs.python_rows_to_sql import python_rows_to_sql from tests.utils.parameters import model_params +from tests.fixtures.model_fixture import * +from tests.fixtures.setup_database_fixture import * +from tests.fixtures.language_container_fixture import * +from tests.fixtures.bucketfs_fixture import * +from tests.fixtures.database_connection_fixture import * def test_text_generation_script( - setup_database, pyexasol_connection, upload_base_model_to_bucketfs): + setup_database, pyexasol_connection, upload_text_generation_model_to_bucketfs): bucketfs_conn_name, schema_name = setup_database text_data = "Exasol is an analytics database management" n_rows = 100 @@ -43,3 +49,18 @@ def test_text_generation_script( n_rows_result = n_rows n_cols_result = len(input_data[0]) + (added_columns - removed_columns) assert len(result) == n_rows_result and len(result[0]) == n_cols_result + + # lenient test for quality of results, will be replaced by deterministic test later + for i in range(5): + print(result[i]) + results = [result[i][6] for i in range(len(result))] + acceptable_results = ["software", "system", "solution"] + number_accepted_results = 0 + + def contains(string,list): + return any(map(lambda x: x in string, list)) + + for i in range(len(results)): + if contains(results[i], acceptable_results): + number_accepted_results += 1 + assert number_accepted_results > n_rows_result/2 diff --git a/tests/integration_tests/with_db/udfs/test_token_classification_script.py b/tests/integration_tests/with_db/udfs/test_token_classification_script.py index 85cf1227..622e62e4 100644 --- a/tests/integration_tests/with_db/udfs/test_token_classification_script.py +++ b/tests/integration_tests/with_db/udfs/test_token_classification_script.py @@ -1,9 +1,10 @@ +from tests.fixtures.model_fixture import upload_token_classification_model_to_bucketfs from tests.integration_tests.with_db.udfs.python_rows_to_sql import python_rows_to_sql from tests.utils.parameters import model_params def test_token_classification_script( - setup_database, pyexasol_connection, upload_base_model_to_bucketfs): + setup_database, pyexasol_connection, upload_token_classification_model_to_bucketfs): bucketfs_conn_name, schema_name = setup_database aggregation_strategy = "simple" n_rows = 100 @@ -38,3 +39,19 @@ def test_token_classification_script( removed_columns = 1 # device_id n_cols_result = len(input_data[0]) + (added_columns - removed_columns) assert len(result) >= n_rows and len(result[0]) == n_cols_result + + for i in range(10): + print(result[i]) + + # lenient test 
for quality of results, will be replaced by deterministic test later + results = [result[i][5] for i in range(len(result))] + acceptable_results = ["love", "miss", "want", "need"] + number_accepted_results = 0 + + def contains(string,list): + return any(map(lambda x: x in string, list)) + + for i in range(len(results)): + if contains(results[i], acceptable_results): + number_accepted_results += 1 + assert number_accepted_results > n_rows_result/2 \ No newline at end of file diff --git a/tests/integration_tests/with_db/udfs/test_translation_script.py b/tests/integration_tests/with_db/udfs/test_translation_script.py index 67bf965e..34174f34 100644 --- a/tests/integration_tests/with_db/udfs/test_translation_script.py +++ b/tests/integration_tests/with_db/udfs/test_translation_script.py @@ -45,3 +45,17 @@ def test_translation_script( n_rows_result = n_rows n_cols_result = len(input_data[0]) + (added_columns - removed_columns) assert len(result) == n_rows_result and len(result[0]) == n_cols_result + + print(result) + # lenient test for quality of results, will be replaced by deterministic test later + results = [result[i][7] for i in range(len(result))] + acceptable_results = ["Die Firma Exasol hat ihren Sitz in Nürnberg"] + number_accepted_results = 0 + + def contains(string,list): + return any(map(lambda x: x in string, list)) + + for i in range(len(results)): + if contains(results[i], acceptable_results): + number_accepted_results += 1 + assert number_accepted_results > n_rows_result/2 \ No newline at end of file diff --git a/tests/integration_tests/with_db/udfs/test_zero_shot_text_classification_script.py b/tests/integration_tests/with_db/udfs/test_zero_shot_text_classification_script.py index c44de521..5b40bbc8 100644 --- a/tests/integration_tests/with_db/udfs/test_zero_shot_text_classification_script.py +++ b/tests/integration_tests/with_db/udfs/test_zero_shot_text_classification_script.py @@ -1,9 +1,10 @@ +from tests.fixtures.model_fixture import upload_zero_shot_classification_model_to_bucketfs from tests.integration_tests.with_db.udfs.python_rows_to_sql import python_rows_to_sql from tests.utils.parameters import model_params -def test_sequence_classification_single_text_script( - setup_database, pyexasol_connection, upload_base_model_to_bucketfs): +def test_zero_shot_classification_single_text_script( + setup_database, pyexasol_connection, upload_zero_shot_classification_model_to_bucketfs): bucketfs_conn_name, schema_name = setup_database n_rows = 100 input_data = [] @@ -40,3 +41,19 @@ def test_sequence_classification_single_text_script( n_rows_result = n_rows * n_labels n_cols_result = len(input_data[0]) + (added_columns - removed_columns) assert len(result) == n_rows_result and len(result[0]) == n_cols_result + + for i in range(10): + print(result[i]) + + # lenient test for quality of results, will be replaced by deterministic test later + results = [result[i][5] for i in range(len(result))] + acceptable_results = ["love", "miss", "want", "need"] + number_accepted_results = 0 + + def contains(string,list): + return any(map(lambda x: x in string, list)) + + for i in range(len(results)): + if contains(results[i], acceptable_results): + number_accepted_results += 1 + assert number_accepted_results > n_rows_result/2 diff --git a/tests/integration_tests/without_db/udfs/test_filling_mask_udf.py b/tests/integration_tests/without_db/udfs/test_filling_mask_udf.py index ea1966ca..3aaf2506 100644 --- a/tests/integration_tests/without_db/udfs/test_filling_mask_udf.py +++ 
b/tests/integration_tests/without_db/udfs/test_filling_mask_udf.py @@ -11,7 +11,7 @@ NoErrorMessageMatcher, NewColumnsEmptyMatcher, ErrorMessageMatcher, RankMonotonicMatcher, ColumnsMatcher from tests.utils.parameters import model_params from tests.utils.mock_connections import create_mounted_bucketfs_connection -from tests.fixtures.model_fixture import prepare_base_model_for_local_bucketfs +from tests.fixtures.model_fixture import prepare_filling_mask_model_for_local_bucketfs class ExaEnvironment: @@ -54,12 +54,12 @@ def get_dataframe(self, num_rows='all', start_col=0): ("on GPU with single input", 0, 1) ]) def test_filling_mask_udf( - description, device_id, n_rows, prepare_base_model_for_local_bucketfs): + description, device_id, n_rows, prepare_filling_mask_model_for_local_bucketfs): if device_id is not None and not torch.cuda.is_available(): pytest.skip(f"There is no available device({device_id}) " f"to execute the test") - bucketfs_base_path = prepare_base_model_for_local_bucketfs + bucketfs_base_path = prepare_filling_mask_model_for_local_bucketfs bucketfs_conn_name = "bucketfs_connection" bucketfs_connection = create_mounted_bucketfs_connection(bucketfs_base_path) @@ -111,12 +111,12 @@ def test_filling_mask_udf( ("on GPU with single input", 0, 1) ]) def test_filling_mask_udf_on_error_handling( - description, device_id, n_rows, prepare_base_model_for_local_bucketfs): + description, device_id, n_rows, prepare_filling_mask_model_for_local_bucketfs): if device_id is not None and not torch.cuda.is_available(): pytest.skip(f"There is no available device({device_id}) " f"to execute the test") - bucketfs_base_path = prepare_base_model_for_local_bucketfs + bucketfs_base_path = prepare_filling_mask_model_for_local_bucketfs bucketfs_conn_name = "bucketfs_connection" bucketfs_connection = Connection(address=f"file://{bucketfs_base_path}") diff --git a/tests/integration_tests/without_db/udfs/test_model_downloader_udf.py b/tests/integration_tests/without_db/udfs/test_model_downloader_udf.py index 3ab3f9a2..2e525586 100644 --- a/tests/integration_tests/without_db/udfs/test_model_downloader_udf.py +++ b/tests/integration_tests/without_db/udfs/test_model_downloader_udf.py @@ -13,7 +13,6 @@ create_mounted_bucketfs_connection, create_hf_token_connection) from tests.utils.bucketfs_file_list import get_bucketfs_file_list -# todo add tests for checking if model metadata is correct? 
class ExaEnvironment: def __init__(self, connections: Dict[str, Connection] = None): self._connections = connections diff --git a/tests/integration_tests/without_db/udfs/test_question_answering_udf.py b/tests/integration_tests/without_db/udfs/test_question_answering_udf.py index b2598cea..02283eae 100644 --- a/tests/integration_tests/without_db/udfs/test_question_answering_udf.py +++ b/tests/integration_tests/without_db/udfs/test_question_answering_udf.py @@ -3,6 +3,7 @@ import pandas as pd from typing import Dict +from tests.fixtures.model_fixture import prepare_question_answering_model_for_local_bucketfs from tests.integration_tests.without_db.udfs.matcher import Result, ShapeMatcher, NewColumnsEmptyMatcher, \ ErrorMessageMatcher, ScoreMatcher, RankDTypeMatcher, NoErrorMessageMatcher, RankMonotonicMatcher, ColumnsMatcher from tests.utils.parameters import model_params @@ -57,12 +58,12 @@ def get_dataframe(self, num_rows='all', start_col=0): ]) def test_question_answering_udf( description, device_id, n_rows, - top_k, prepare_base_model_for_local_bucketfs): + top_k, prepare_question_answering_model_for_local_bucketfs): if device_id is not None and not torch.cuda.is_available(): pytest.skip(f"There is no available device({device_id}) " f"to execute the test") - bucketfs_base_path = prepare_base_model_for_local_bucketfs + bucketfs_base_path = prepare_question_answering_model_for_local_bucketfs bucketfs_conn_name = "bucketfs_connection" bucketfs_connection = create_mounted_bucketfs_connection(bucketfs_base_path) @@ -121,12 +122,12 @@ def test_question_answering_udf( ]) def test_question_answering_udf_on_error_handling( description, device_id, n_rows, - top_k, prepare_base_model_for_local_bucketfs): + top_k, prepare_question_answering_model_for_local_bucketfs): if device_id is not None and not torch.cuda.is_available(): pytest.skip(f"There is no available device({device_id}) " f"to execute the test") - bucketfs_base_path = prepare_base_model_for_local_bucketfs + bucketfs_base_path = prepare_question_answering_model_for_local_bucketfs bucketfs_conn_name = "bucketfs_connection" bucketfs_connection = Connection(address=f"file://{bucketfs_base_path}") diff --git a/tests/integration_tests/without_db/udfs/test_sequence_classification_single_text_udf.py b/tests/integration_tests/without_db/udfs/test_sequence_classification_single_text_udf.py index b819df1f..ad1affc8 100644 --- a/tests/integration_tests/without_db/udfs/test_sequence_classification_single_text_udf.py +++ b/tests/integration_tests/without_db/udfs/test_sequence_classification_single_text_udf.py @@ -5,6 +5,7 @@ from exasol_udf_mock_python.connection import Connection from exasol_transformers_extension.udfs.models.sequence_classification_single_text_udf import \ SequenceClassificationSingleTextUDF +from tests.fixtures.model_fixture import prepare_sequence_classification_model_for_local_bucketfs from tests.integration_tests.without_db.udfs.matcher import Result, ShapeMatcher, ColumnsMatcher, NoErrorMessageMatcher, \ NewColumnsEmptyMatcher, ErrorMessageMatcher from tests.utils.parameters import model_params @@ -49,12 +50,12 @@ def get_dataframe(self, num_rows='all', start_col=0): ("on GPU", 0) ]) def test_sequence_classification_single_text_udf( - description, device_id, prepare_base_model_for_local_bucketfs): + description, device_id, prepare_sequence_classification_model_for_local_bucketfs): if device_id is not None and not torch.cuda.is_available(): pytest.skip(f"There is no available device({device_id}) " f"to execute the test") - 
bucketfs_base_path = prepare_base_model_for_local_bucketfs + bucketfs_base_path = prepare_sequence_classification_model_for_local_bucketfs bucketfs_conn_name = "bucketfs_connection" bucketfs_connection = create_mounted_bucketfs_connection(bucketfs_base_path) @@ -105,12 +106,12 @@ def test_sequence_classification_single_text_udf( ("on GPU", 0) ]) def test_sequence_classification_single_text_udf_on_error_handling( - description, device_id, prepare_base_model_for_local_bucketfs): + description, device_id, prepare_sequence_classification_model_for_local_bucketfs): if device_id is not None and not torch.cuda.is_available(): pytest.skip(f"There is no available device({device_id}) " f"to execute the test") - bucketfs_base_path = prepare_base_model_for_local_bucketfs + bucketfs_base_path = prepare_sequence_classification_model_for_local_bucketfs bucketfs_conn_name = "bucketfs_connection" bucketfs_connection = Connection(address=f"file://{bucketfs_base_path}") diff --git a/tests/integration_tests/without_db/udfs/test_sequence_classification_text_pair_udf.py b/tests/integration_tests/without_db/udfs/test_sequence_classification_text_pair_udf.py index 7c2d5831..fa630503 100644 --- a/tests/integration_tests/without_db/udfs/test_sequence_classification_text_pair_udf.py +++ b/tests/integration_tests/without_db/udfs/test_sequence_classification_text_pair_udf.py @@ -5,6 +5,7 @@ from exasol_udf_mock_python.connection import Connection from exasol_transformers_extension.udfs.models.sequence_classification_text_pair_udf import \ SequenceClassificationTextPairUDF +from tests.fixtures.model_fixture import prepare_sequence_classification_model_for_local_bucketfs from tests.integration_tests.without_db.udfs.matcher import Result, ShapeMatcher, NewColumnsEmptyMatcher, \ ErrorMessageMatcher, ColumnsMatcher, NoErrorMessageMatcher from tests.utils.parameters import model_params @@ -49,12 +50,12 @@ def get_dataframe(self, num_rows='all', start_col=0): ("on GPU", 0) ]) def test_sequence_classification_text_pair_udf( - description, device_id, prepare_base_model_for_local_bucketfs): + description, device_id, prepare_sequence_classification_model_for_local_bucketfs): if device_id is not None and not torch.cuda.is_available(): pytest.skip(f"There is no available device({device_id}) " f"to execute the test") - bucketfs_base_path = prepare_base_model_for_local_bucketfs + bucketfs_base_path = prepare_sequence_classification_model_for_local_bucketfs bucketfs_conn_name = "bucketfs_connection" bucketfs_connection = create_mounted_bucketfs_connection(bucketfs_base_path) @@ -107,12 +108,12 @@ def test_sequence_classification_text_pair_udf( ("on GPU", 0) ]) def test_sequence_classification_text_pair_udf_on_error_handling( - description, device_id, prepare_base_model_for_local_bucketfs): + description, device_id, prepare_sequence_classification_model_for_local_bucketfs): if device_id is not None and not torch.cuda.is_available(): pytest.skip(f"There is no available device({device_id}) " f"to execute the test") - bucketfs_base_path = prepare_base_model_for_local_bucketfs + bucketfs_base_path = prepare_sequence_classification_model_for_local_bucketfs bucketfs_conn_name = "bucketfs_connection" bucketfs_connection = Connection(address=f"file://{bucketfs_base_path}") diff --git a/tests/integration_tests/without_db/udfs/test_text_generation_udf.py b/tests/integration_tests/without_db/udfs/test_text_generation_udf.py index 413c8dbc..3c2d81a1 100644 --- a/tests/integration_tests/without_db/udfs/test_text_generation_udf.py +++ 
b/tests/integration_tests/without_db/udfs/test_text_generation_udf.py @@ -7,6 +7,7 @@ from exasol_transformers_extension.udfs.models.text_generation_udf import \ TextGenerationUDF +from tests.fixtures.model_fixture import prepare_text_generation_model_for_local_bucketfs from tests.integration_tests.without_db.udfs.matcher import Result, ShapeMatcher, NewColumnsEmptyMatcher, \ ErrorMessageMatcher, ScoreMatcher, ColumnsMatcher, NoErrorMessageMatcher from tests.utils.parameters import model_params @@ -53,12 +54,12 @@ def get_dataframe(self, num_rows='all', start_col=0): ("on GPU with single input", 0, 1) ]) def test_text_generation_udf( - description, device_id, n_rows, prepare_base_model_for_local_bucketfs): + description, device_id, n_rows, prepare_text_generation_model_for_local_bucketfs): if device_id is not None and not torch.cuda.is_available(): pytest.skip(f"There is no available device({device_id}) " f"to execute the test") - bucketfs_base_path = prepare_base_model_for_local_bucketfs + bucketfs_base_path = prepare_text_generation_model_for_local_bucketfs bucketfs_conn_name = "bucketfs_connection" bucketfs_connection = create_mounted_bucketfs_connection(bucketfs_base_path) @@ -111,12 +112,12 @@ def test_text_generation_udf( ("on GPU with single input", 0, 1) ]) def test_text_generation_udf_on_error_handlig( - description, device_id, n_rows, prepare_base_model_for_local_bucketfs): + description, device_id, n_rows, prepare_text_generation_model_for_local_bucketfs): if device_id is not None and not torch.cuda.is_available(): pytest.skip(f"There is no available device({device_id}) " f"to execute the test") - bucketfs_base_path = prepare_base_model_for_local_bucketfs + bucketfs_base_path = prepare_text_generation_model_for_local_bucketfs bucketfs_conn_name = "bucketfs_connection" bucketfs_connection = Connection(address=f"file://{bucketfs_base_path}") diff --git a/tests/integration_tests/without_db/udfs/test_token_classification_udf.py b/tests/integration_tests/without_db/udfs/test_token_classification_udf.py index 337d47a5..f6f63efb 100644 --- a/tests/integration_tests/without_db/udfs/test_token_classification_udf.py +++ b/tests/integration_tests/without_db/udfs/test_token_classification_udf.py @@ -12,7 +12,7 @@ TokenClassificationUDF # debugging -from tests.fixtures.model_fixture import prepare_base_model_for_local_bucketfs +from tests.fixtures.model_fixture import prepare_token_classification_model_for_local_bucketfs class ExaEnvironment: @@ -64,12 +64,12 @@ def get_dataframe(self, num_rows='all', start_col=0): ]) def test_token_classification_udf( description, device_id, n_rows, agg, - prepare_base_model_for_local_bucketfs): + prepare_token_classification_model_for_local_bucketfs): if device_id is not None and not torch.cuda.is_available(): pytest.skip(f"There is no available device({device_id}) " f"to execute the test") - bucketfs_base_path = prepare_base_model_for_local_bucketfs + bucketfs_base_path = prepare_token_classification_model_for_local_bucketfs bucketfs_conn_name = "bucketfs_connection" bucketfs_connection = create_mounted_bucketfs_connection(bucketfs_base_path) @@ -115,12 +115,12 @@ def test_token_classification_udf( ("on GPU", 0) ]) def test_token_classification_udf_with_multiple_aggregation_strategies( - description, device_id, prepare_base_model_for_local_bucketfs): + description, device_id, prepare_token_classification_model_for_local_bucketfs): if device_id is not None and not torch.cuda.is_available(): pytest.skip(f"There is no available device({device_id}) " f"to 
execute the test") - bucketfs_base_path = prepare_base_model_for_local_bucketfs + bucketfs_base_path = prepare_token_classification_model_for_local_bucketfs bucketfs_conn_name = "bucketfs_connection" bucketfs_connection = create_mounted_bucketfs_connection(bucketfs_base_path) @@ -178,12 +178,12 @@ def test_token_classification_udf_with_multiple_aggregation_strategies( ]) def test_token_classification_udf_on_error_handling( description, device_id, n_rows, agg, - prepare_base_model_for_local_bucketfs): + prepare_token_classification_model_for_local_bucketfs): if device_id is not None and not torch.cuda.is_available(): pytest.skip(f"There is no available device({device_id}) " f"to execute the test") - bucketfs_base_path = prepare_base_model_for_local_bucketfs + bucketfs_base_path = prepare_token_classification_model_for_local_bucketfs bucketfs_conn_name = "bucketfs_connection" bucketfs_connection = create_mounted_bucketfs_connection(bucketfs_base_path) diff --git a/tests/integration_tests/without_db/udfs/test_zero_shot_text_classification_udf.py b/tests/integration_tests/without_db/udfs/test_zero_shot_text_classification_udf.py index f3091384..4f632b83 100644 --- a/tests/integration_tests/without_db/udfs/test_zero_shot_text_classification_udf.py +++ b/tests/integration_tests/without_db/udfs/test_zero_shot_text_classification_udf.py @@ -5,6 +5,7 @@ from exasol_udf_mock_python.connection import Connection from exasol_transformers_extension.udfs.models.zero_shot_text_classification_udf import \ ZeroShotTextClassificationUDF +from tests.fixtures.model_fixture import prepare_zero_shot_classification_model_for_local_bucketfs from tests.integration_tests.without_db.udfs.matcher import Result, NoErrorMessageMatcher, \ ShapeMatcher, RankMonotonicMatcher, RankDTypeMatcher, ScoreMatcher, NewColumnsEmptyMatcher, ErrorMessageMatcher, \ ColumnsMatcher @@ -49,13 +50,13 @@ def get_dataframe(self, num_rows='all', start_col=0): ("on CPU", None), ("on GPU", 0) ]) -def test_sequence_classification_single_text_udf( - description, device_id, prepare_base_model_for_local_bucketfs): +def test_zero_shot_classification_single_text_udf( + description, device_id, prepare_zero_shot_classification_model_for_local_bucketfs): if device_id is not None and not torch.cuda.is_available(): pytest.skip(f"There is no available device({device_id}) " f"to execute the test") - bucketfs_base_path = prepare_base_model_for_local_bucketfs + bucketfs_base_path = prepare_zero_shot_classification_model_for_local_bucketfs bucketfs_conn_name = "bucketfs_connection" bucketfs_connection = create_mounted_bucketfs_connection(bucketfs_base_path) @@ -112,13 +113,13 @@ def test_sequence_classification_single_text_udf( ("on CPU", None), ("on GPU", 0) ]) -def test_sequence_classification_single_text_udf_on_error_handling( - description, device_id, prepare_base_model_for_local_bucketfs): +def test_zero_shot_classification_single_text_udf_on_error_handling( + description, device_id, prepare_zero_shot_classification_model_for_local_bucketfs): if device_id is not None and not torch.cuda.is_available(): pytest.skip(f"There is no available device({device_id}) " f"to execute the test") - bucketfs_base_path = prepare_base_model_for_local_bucketfs + bucketfs_base_path = prepare_zero_shot_classification_model_for_local_bucketfs bucketfs_conn_name = "bucketfs_connection" bucketfs_connection = Connection(address=f"file://{bucketfs_base_path}") diff --git a/tests/integration_tests/without_db/utils/test_load_local_model.py 
b/tests/integration_tests/without_db/utils/test_load_local_model.py index f5356c7e..ef8eae41 100644 --- a/tests/integration_tests/without_db/utils/test_load_local_model.py +++ b/tests/integration_tests/without_db/utils/test_load_local_model.py @@ -16,8 +16,6 @@ from tests.utils.parameters import model_params from tests.utils.mock_connections import create_mounted_bucketfs_connection -#todo rename all modelspecification strings - class TestSetup: def __init__(self): @@ -28,7 +26,7 @@ def __init__(self): self.token = "token" self.model_specification = model_params.tiny_model_specs - self.mock_current_model_specification: Union[CurrentModelSpecification, MagicMock] = create_autospec(CurrentModelSpecification)#todo need change? + self.mock_current_model_specification: Union[CurrentModelSpecification, MagicMock] = create_autospec(CurrentModelSpecification) test_pipeline = pipeline self.loader = LoadLocalModel( test_pipeline, @@ -79,12 +77,12 @@ def test_load_local_model_with_huggingface_model_transfer(tmp_path): downloaded_model_path = download_model_with_huggingface_transfer( test_setup, mock_bucketfs_location) - sub_dir_path = tmp_path / sub_dir #todo better name? + sub_dir_path = tmp_path / sub_dir with tarfile.open(str(sub_dir_path / downloaded_model_path)) as tar: tar.extractall(path=str(sub_dir_path)) test_setup.loader.set_current_model_specification(current_model_specification= test_setup.mock_current_model_specification) - #test_setup.loader.set_bucketfs_model_cache_dir(bucketfs_location=) #todo macke a mock? or add test for set_bucketfs_model_cache_dir + test_setup.loader._bucketfs_model_cache_dir = sub_dir_path test_setup.loader.load_models() diff --git a/tests/unit_tests/udf_wrapper_params/filling_mask/error_on_prediction_multiple_model_multiple_batch.py b/tests/unit_tests/udf_wrapper_params/filling_mask/error_on_prediction_multiple_model_multiple_batch.py index a57ebf19..f0474156 100644 --- a/tests/unit_tests/udf_wrapper_params/filling_mask/error_on_prediction_multiple_model_multiple_batch.py +++ b/tests/unit_tests/udf_wrapper_params/filling_mask/error_on_prediction_multiple_model_multiple_batch.py @@ -52,9 +52,9 @@ class ErrorOnPredictionMultipleModelMultipleBatch: "bfs_conn2": Connection(address=f"file://{base_cache_dir2}") } mock_factory = MockFillingMaskFactory({ - PurePosixPath(base_cache_dir1, "sub_dir1", "model1"): + PurePosixPath(base_cache_dir1, "sub_dir1", "model1_fill-mask"): MockFillingMaskModel(sequence="text valid 1", score=0.1, rank=1), - PurePosixPath(base_cache_dir2, "sub_dir2", "model2"): + PurePosixPath(base_cache_dir2, "sub_dir2", "model2_fill-mask"): MockFillingMaskModel(sequence="text valid 2", score=0.2, rank=1) }) diff --git a/tests/unit_tests/udfs/test_base_udf.py b/tests/unit_tests/udfs/test_base_udf.py index b7d72463..46914d7e 100644 --- a/tests/unit_tests/udfs/test_base_udf.py +++ b/tests/unit_tests/udfs/test_base_udf.py @@ -36,7 +36,6 @@ def udf_wrapper(): Column("device_id", int, "INTEGER"), Column("model_name", str, "VARCHAR(2000000)"), Column("sub_dir", str, "VARCHAR(2000000)"), - Column("task_type", str, "VARCHAR(2000000)"), Column("bucketfs_conn", str, "VARCHAR(2000000)"), Column("token_conn", str, "VARCHAR(2000000)"), ], @@ -54,7 +53,7 @@ def udf_wrapper(): return meta -def setup_tests_and_run(bucketfs_conn_name, bucketfs_conn, sub_dir, model_name, task_type): +def setup_tests_and_run(bucketfs_conn_name, bucketfs_conn, sub_dir, model_name): mock_base_model_factory: Union[ModelFactoryProtocol, MagicMock] = create_autospec(ModelFactoryProtocol) 
mock_tokenizer_factory: Union[ModelFactoryProtocol, MagicMock] = create_autospec(ModelFactoryProtocol) @@ -63,7 +62,6 @@ def setup_tests_and_run(bucketfs_conn_name, bucketfs_conn, sub_dir, model_name, 1, model_name, sub_dir, - task_type, bucketfs_conn_name, '' ) @@ -88,46 +86,44 @@ def setup_tests_and_run(bucketfs_conn_name, bucketfs_conn, sub_dir, model_name, @pytest.mark.parametrize(["description", "bucketfs_conn_name", "bucketfs_conn", - "sub_dir", "model_name", "task_type"], [ + "sub_dir", "model_name"], [ ("all given", "test_bucketfs_con_name", Connection(address=f"file:///test"), - "test_subdir", "test_model", "filling_mask") + "test_subdir", "test_model") ]) @patch('exasol_transformers_extension.utils.bucketfs_operations.create_bucketfs_location_from_conn_object') @patch('exasol_transformers_extension.utils.bucketfs_operations.get_local_bucketfs_path') def test_model_downloader_all_parameters(mock_local_path, mock_create_loc, description, - bucketfs_conn_name, bucketfs_conn, sub_dir, model_name, task_type): + bucketfs_conn_name, bucketfs_conn, sub_dir, model_name): mock_create_loc.side_effect = fake_bucketfs_location_from_conn_object mock_local_path.side_effect = fake_local_bucketfs_path - res, mock_meta = setup_tests_and_run(bucketfs_conn_name, bucketfs_conn, sub_dir, model_name, task_type) + res, mock_meta = setup_tests_and_run(bucketfs_conn_name, bucketfs_conn, sub_dir, model_name) # check if no errors assert res[0][-1] is None and len(res[0]) == len(mock_meta.output_columns) @pytest.mark.parametrize(["description", "bucketfs_conn_name", "bucketfs_conn", - "sub_dir", "model_name", "task_type"], [ - ("all null", None, None, None, None, None), + "sub_dir", "model_name"], [ + ("all null", None, None, None, None), ("model name missing", "test_bucketfs_con_name", Connection(address=f"file:///test"), - "test_subdir", None, "filling_mask"), + "test_subdir", None), ("bucketfs_conn missing", None, None, - "test_subdir", "test_model", "filling_mask"), + "test_subdir", "test_model"), ("sub_dir missing", "test_bucketfs_con_name", Connection(address=f"file:///test"), - None, "test_model", "filling_mask"), + None, "test_model"), ("model_name missing", "test_bucketfs_con_name", Connection(address=f"file:///test"), - "test_subdir", None, "filling_mask"), - ("task_type missing", "test_bucketfs_con_name", Connection(address=f"file:///test"), - "test_subdir", "test_model", None) + "test_subdir", None), ]) @patch('exasol_transformers_extension.utils.bucketfs_operations.create_bucketfs_location_from_conn_object') @patch('exasol_transformers_extension.utils.bucketfs_operations.get_local_bucketfs_path') def test_model_downloader_missing_parameters(mock_local_path, mock_create_loc, description, - bucketfs_conn_name, bucketfs_conn, sub_dir, model_name, task_type): + bucketfs_conn_name, bucketfs_conn, sub_dir, model_name): mock_create_loc.side_effect = fake_bucketfs_location_from_conn_object mock_local_path.side_effect = fake_local_bucketfs_path - res, mock_meta = setup_tests_and_run(bucketfs_conn_name, bucketfs_conn, sub_dir, model_name, task_type) + res, mock_meta = setup_tests_and_run(bucketfs_conn_name, bucketfs_conn, sub_dir, model_name) error_field = res[0][-1] expected_error = regex_matcher(f".*For each model model_name, bucketfs_conn and sub_dir need to be provided." 
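The integration-test scripts added in this patch repeat the same lenient result-quality check inline: collect one result column, count rows containing an acceptable substring, and assert the count exceeds half the rows. Below is a minimal sketch of how that duplication could be consolidated into a shared test helper; the module path `tests/utils/lenient_assertions.py` and both function names are hypothetical and not part of this patch.

```python
# Hypothetical shared helper (e.g. tests/utils/lenient_assertions.py) for the
# lenient quality checks duplicated across the integration-test scripts above.
from typing import Sequence


def count_acceptable_results(results: Sequence[str],
                             acceptable_results: Sequence[str]) -> int:
    """Count result strings that contain at least one acceptable substring."""
    return sum(
        1 for result in results
        if any(acceptable in result for acceptable in acceptable_results)
    )


def assert_lenient_quality(results: Sequence[str],
                           acceptable_results: Sequence[str],
                           min_fraction: float = 0.5) -> None:
    """Fail unless more than min_fraction of the results look acceptable."""
    accepted = count_acceptable_results(results, acceptable_results)
    assert accepted > len(results) * min_fraction, (
        f"only {accepted} of {len(results)} results matched {acceptable_results}"
    )
```

With such a helper, the inline loop in, for example, the filling-mask script test would reduce to `assert_lenient_quality([row[5] for row in result], ["love", "miss", "want", "need"])`.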
diff --git a/tests/unit_tests/udfs/test_model_downloader_udf.py b/tests/unit_tests/udfs/test_model_downloader_udf.py index a80fcc4a..cc353f6c 100644 --- a/tests/unit_tests/udfs/test_model_downloader_udf.py +++ b/tests/unit_tests/udfs/test_model_downloader_udf.py @@ -64,7 +64,7 @@ def test_model_downloader(mock_create_loc, description, count, token_conn_name, mock_create_loc.side_effect = mock_bucketfs_locations base_model_names = [f"base_model_name_{i}" for i in range(count)] sub_directory_names = [f"sub_dir_{i}" for i in range(count)] - task_type = [f"task_type_{i}" for i in range(count)] #todo just use real type? + task_type = [f"task_type_{i}" for i in range(count)] bucketfs_connections = [Connection(address=f"file:///test{i}") for i in range(count)] bfs_conn_name = [f"bfs_conn_name_{i}" for i in bucketfs_connections] @@ -112,7 +112,7 @@ def test_model_downloader(mock_create_loc, description, count, token_conn_name, ] for i in range(count): assert mock_cast(mock_model_downloaders[i].download_from_huggingface_hub).mock_calls == [ - call(mock_cmss[i].get_model_factory), #todo add to call transformers.taskthing #todo d dont match how mock corecctly? + call(mock_cmss[i].get_model_factory()), #todo add to call transformers.taskthing #todo d dont match how mock corecctly? call(mock_tokenizer_factory) ] assert call() in mock_cast(mock_model_downloaders[i].upload_to_bucketfs).mock_calls diff --git a/tests/unit_tests/utils/test_load_local_model.py b/tests/unit_tests/utils/test_load_local_model.py index 98287972..1ef163a3 100644 --- a/tests/unit_tests/utils/test_load_local_model.py +++ b/tests/unit_tests/utils/test_load_local_model.py @@ -17,7 +17,7 @@ class TestSetup: def __init__(self): - self.model_factory_mock: Union[ModelFactoryProtocol, MagicMock] = create_autospec(ModelFactoryProtocol) #todo change? + self.model_factory_mock: Union[ModelFactoryProtocol, MagicMock] = create_autospec(ModelFactoryProtocol) self.tokenizer_factory_mock: Union[ModelFactoryProtocol, MagicMock] = create_autospec(ModelFactoryProtocol) self.token = "token" self.model_name = "model_name" diff --git a/tests/utils/parameters.py b/tests/utils/parameters.py index 15c4b660..86dd6308 100644 --- a/tests/utils/parameters.py +++ b/tests/utils/parameters.py @@ -29,8 +29,10 @@ class ModelParams: path_in_bucket="container") model_params = ModelParams( + #todo add aditional models for test tasks + # https://huggingface.co/dslim/bert-base-NER for token classification base_model_specs=ModelSpecification('bert-base-uncased', "need to set this task_type"), seq2seq_model_specs=ModelSpecification("t5-small", "translation"), - tiny_model_specs=ModelSpecification("prajjwal1/bert-tiny", ""),#todo make work with empty task_zype or use a real one? 
+ tiny_model_specs=ModelSpecification("prajjwal1/bert-tiny", "task"), text_data='The company Exasol is based in Nuremberg', sub_dir='model_sub_dir') From 27b6fb01cbc91a596655d47e260f16d542cde19d Mon Sep 17 00:00:00 2001 From: MarleneKress79789 Date: Thu, 18 Jul 2024 15:31:53 +0200 Subject: [PATCH 03/31] fixed integration tests, added separate models for tasks --- exasol_transformers_extension/upload_model.py | 3 +- .../utils/model_specification.py | 14 ++-- tests/fixtures/model_fixture.py | 71 ++++++++++--------- .../with_db/test_upload_model.py | 4 -- .../test_prediction_with_downloader_udf.py | 3 +- .../udfs/test_question_answering_script.py | 8 +-- ...uence_classification_single_text_script.py | 31 ++++---- ...equence_classification_text_pair_script.py | 34 +++++---- .../udfs/test_text_generation_script.py | 8 +-- .../udfs/test_token_classification_script.py | 22 +++--- ...st_zero_shot_text_classification_script.py | 27 ++++--- .../udfs/test_question_answering_udf.py | 2 +- ...sequence_classification_single_text_udf.py | 11 ++- ...t_sequence_classification_text_pair_udf.py | 12 ++-- .../udfs/test_text_generation_udf.py | 2 +- .../udfs/test_token_classification_udf.py | 4 +- .../test_zero_shot_text_classification_udf.py | 2 +- .../without_db/utils/test_load_local_model.py | 2 +- .../udfs/test_model_downloader_udf.py | 2 +- tests/utils/parameters.py | 24 +++++-- 20 files changed, 157 insertions(+), 129 deletions(-) diff --git a/exasol_transformers_extension/upload_model.py b/exasol_transformers_extension/upload_model.py index aaa925b0..e708c09a 100644 --- a/exasol_transformers_extension/upload_model.py +++ b/exasol_transformers_extension/upload_model.py @@ -19,7 +19,7 @@ @click.option('--task_type', type=str, required=True) #todo change docu (needed to know where to safe model) @click.option('--sub-dir', type=str, required=True, help="directory where the model is stored in the BucketFS") -@click.option('--token', type=str, default=None, help="Hugging Face hub token for private models") #todo chnage docu +@click.option('--token', type=str, help="Hugging Face hub token for private models") #todo chnage docu @click.option('--local-model-path', type=click.Path(exists=True, file_okay=True), required=True, help="local path where model is located") @click.option('--bucketfs-name', type=str) @@ -50,7 +50,6 @@ def main( task_type: str, sub_dir: str, token: str | None, - #local_model_path: str, bucketfs_name: str, bucketfs_host: str, bucketfs_port: int, diff --git a/exasol_transformers_extension/utils/model_specification.py b/exasol_transformers_extension/utils/model_specification.py index 06e9de5a..a0eb4169 100644 --- a/exasol_transformers_extension/utils/model_specification.py +++ b/exasol_transformers_extension/utils/model_specification.py @@ -9,9 +9,9 @@ class ModelSpecification: def __init__(self, model_name: str, task_type: str): # task_type, model_version self.model_name = model_name - self.task_type = task_type + self.task_type = self._set_task_type_from_udf_name(task_type) - def set_task_type_from_udf_name(self, text): + def _set_task_type_from_udf_name(self, text): """ switches user input(matching udf name) to transformers task types """ @@ -31,7 +31,7 @@ def set_task_type_from_udf_name(self, text): task_type = "zero-shot-classification" else: task_type = text - self.task_type = task_type + return task_type def get_model_specs_for_download(self):#todo change usages? 
""" @@ -47,7 +47,7 @@ def __eq__(self, other): return False def get_model_specific_path_suffix(self) -> PurePosixPath: - return PurePosixPath(self.model_name + "_" + self.task_type) #model_name-version-task + return PurePosixPath(self.model_name.replace(".", "_") + "_" + self.task_type) #model_name-version-task# todo change def get_model_factory(self): """ @@ -57,15 +57,15 @@ def get_model_factory(self): if model_task_type == "fill-mask": model_factory = transformers.AutoModelForMaskedLM elif model_task_type == "translation": - model_factory = transformers.T5Model #todo correct? se to seq in translation udf + model_factory = transformers.AutoModelForSeq2SeqLM elif model_task_type == "zero-shot-classification": model_factory = transformers.AutoModelForSequenceClassification elif model_task_type == "text-classification": model_factory = transformers.AutoModelForSequenceClassification elif model_task_type == "question-answering": model_factory = transformers.AutoModelForQuestionAnswering - #elif model_task_type == "text-generation": - # model_factory = transformers.AutoModelFor + elif model_task_type == "text-generation": + model_factory = transformers.AutoModelForCausalLM elif model_task_type == "token-classification": model_factory = transformers.AutoModelForTokenClassification else: diff --git a/tests/fixtures/model_fixture.py b/tests/fixtures/model_fixture.py index ef0ccd26..5f368486 100644 --- a/tests/fixtures/model_fixture.py +++ b/tests/fixtures/model_fixture.py @@ -53,10 +53,12 @@ def prepare_model_for_local_bucketfs(model_specification: ModelSpecification, current_model_specs = CurrentModelSpecificationFromModelSpecs().transform(model_specification, "", model_params.sub_dir) - tmpdir = tmpdir_factory.mktemp(current_model_specs.get_model_specific_path_suffix()) + + tmpdir = tmpdir_factory.mktemp(current_model_specs.task_type) model_path_in_bucketfs = current_model_specs.get_bucketfs_model_save_path() bucketfs_path_for_model = tmpdir / model_path_in_bucketfs + print(bucketfs_path_for_model) download_model_to_path(current_model_specs, bucketfs_path_for_model) return tmpdir @@ -69,43 +71,43 @@ def prepare_filling_mask_model_for_local_bucketfs(tmpdir_factory) -> PurePosixPa @pytest.fixture(scope="session") def prepare_question_answering_model_for_local_bucketfs(tmpdir_factory) -> PurePosixPath: - model_specification = model_params.base_model_specs - model_specification.task_type = "question-answering" + model_specification = model_params.q_a_model_specs bucketfs_path = prepare_model_for_local_bucketfs(model_specification, tmpdir_factory) yield bucketfs_path @pytest.fixture(scope="session") def prepare_sequence_classification_model_for_local_bucketfs(tmpdir_factory) -> PurePosixPath: - model_specification = model_params.base_model_specs - model_specification.task_type = "text-classification" + model_specification = model_params.sequence_class_model_specs + bucketfs_path = prepare_model_for_local_bucketfs(model_specification, tmpdir_factory) + yield bucketfs_path + +@pytest.fixture(scope="session") +def prepare_sequence_classification_pair_model_for_local_bucketfs(tmpdir_factory) -> PurePosixPath: + model_specification = model_params.sequence_class_pair_model_specs bucketfs_path = prepare_model_for_local_bucketfs(model_specification, tmpdir_factory) yield bucketfs_path @pytest.fixture(scope="session") def prepare_text_generation_model_for_local_bucketfs(tmpdir_factory) -> PurePosixPath: - model_specification = model_params.base_model_specs - model_specification.task_type = "text-generation" 
+ model_specification = model_params.text_gen_model_specs bucketfs_path = prepare_model_for_local_bucketfs(model_specification, tmpdir_factory) yield bucketfs_path @pytest.fixture(scope="session") def prepare_token_classification_model_for_local_bucketfs(tmpdir_factory) -> PurePosixPath: - model_specification = model_params.base_model_specs - model_specification.task_type = "token-classification" + model_specification = model_params.token_model_specs bucketfs_path = prepare_model_for_local_bucketfs(model_specification, tmpdir_factory) yield bucketfs_path @pytest.fixture(scope="session") def prepare_translation_model_for_local_bucketfs(tmpdir_factory) -> PurePosixPath: - model_specification = model_params.base_model_specs - model_specification.task_type = "translation" + model_specification = model_params.seq2seq_model_specs bucketfs_path = prepare_model_for_local_bucketfs(model_specification, tmpdir_factory) yield bucketfs_path @pytest.fixture(scope="session") def prepare_zero_shot_classification_model_for_local_bucketfs(tmpdir_factory) -> PurePosixPath: - model_specification = model_params.base_model_specs - model_specification.task_type = "zero-shot-classification" + model_specification = model_params.zero_shot_model_specs bucketfs_path = prepare_model_for_local_bucketfs(model_specification, tmpdir_factory) yield bucketfs_path @@ -139,7 +141,7 @@ def upload_filling_mask_model_to_bucketfs( bucketfs_location: bfs.path.PathLike, tmpdir_factory) -> PurePosixPath: base_model_specs = model_params.base_model_specs base_model_specs.task_type = "fill-mask" - tmpdir = tmpdir_factory.mktemp(base_model_specs.get_model_specific_path_suffix()) + tmpdir = tmpdir_factory.mktemp(base_model_specs.task_type) with upload_model_to_bucketfs( base_model_specs, tmpdir, bucketfs_location) as path: yield path @@ -148,9 +150,8 @@ def upload_filling_mask_model_to_bucketfs( @pytest.fixture(scope="session") def upload_question_answering_model_to_bucketfs( bucketfs_location: bfs.path.PathLike, tmpdir_factory) -> PurePosixPath: - base_model_specs = model_params.base_model_specs - base_model_specs.task_type = "question-answering" - tmpdir = tmpdir_factory.mktemp(base_model_specs.get_model_specific_path_suffix()) + base_model_specs = model_params.q_a_model_specs + tmpdir = tmpdir_factory.mktemp(base_model_specs.task_type) with upload_model_to_bucketfs( base_model_specs, tmpdir, bucketfs_location) as path: yield path @@ -158,9 +159,17 @@ def upload_question_answering_model_to_bucketfs( @pytest.fixture(scope="session") def upload_sequence_classification_model_to_bucketfs( bucketfs_location: bfs.path.PathLike, tmpdir_factory) -> PurePosixPath: - base_model_specs = model_params.base_model_specs - base_model_specs.task_type = "text-classification" - tmpdir = tmpdir_factory.mktemp(base_model_specs.get_model_specific_path_suffix()) + base_model_specs = model_params.sequence_class_model_specs + tmpdir = tmpdir_factory.mktemp(base_model_specs.task_type) + with upload_model_to_bucketfs( + base_model_specs, tmpdir, bucketfs_location) as path: + yield path + +@pytest.fixture(scope="session") +def upload_sequence_classification_pair_model_to_bucketfs( + bucketfs_location: bfs.path.PathLike, tmpdir_factory) -> PurePosixPath: + base_model_specs = model_params.sequence_class_pair_model_specs + tmpdir = tmpdir_factory.mktemp(base_model_specs.task_type) with upload_model_to_bucketfs( base_model_specs, tmpdir, bucketfs_location) as path: yield path @@ -168,9 +177,8 @@ def upload_sequence_classification_model_to_bucketfs( 
@pytest.fixture(scope="session") def upload_text_generation_model_to_bucketfs( bucketfs_location: bfs.path.PathLike, tmpdir_factory) -> PurePosixPath: - base_model_specs = model_params.base_model_specs - base_model_specs.task_type = "text-generation" - tmpdir = tmpdir_factory.mktemp(base_model_specs.get_model_specific_path_suffix()) + base_model_specs = model_params.text_gen_model_specs + tmpdir = tmpdir_factory.mktemp(base_model_specs.task_type) with upload_model_to_bucketfs( base_model_specs, tmpdir, bucketfs_location) as path: yield path @@ -178,9 +186,8 @@ def upload_text_generation_model_to_bucketfs( @pytest.fixture(scope="session") def upload_token_classification_model_to_bucketfs( bucketfs_location: bfs.path.PathLike, tmpdir_factory) -> PurePosixPath: - base_model_specs = model_params.base_model_specs - base_model_specs.task_type = "token-classification" - tmpdir = tmpdir_factory.mktemp(base_model_specs.get_model_specific_path_suffix()) + base_model_specs = model_params.token_model_specs + tmpdir = tmpdir_factory.mktemp(base_model_specs.task_type) with upload_model_to_bucketfs( base_model_specs, tmpdir, bucketfs_location) as path: yield path @@ -188,9 +195,8 @@ def upload_token_classification_model_to_bucketfs( @pytest.fixture(scope="session") def upload_translation_model_to_bucketfs( bucketfs_location: bfs.path.PathLike, tmpdir_factory) -> PurePosixPath: - base_model_specs = model_params.base_model_specs - base_model_specs.task_type = "translation" - tmpdir = tmpdir_factory.mktemp(base_model_specs.get_model_specific_path_suffix()) + base_model_specs = model_params.seq2seq_model_specs + tmpdir = tmpdir_factory.mktemp(base_model_specs.task_type) with upload_model_to_bucketfs( base_model_specs, tmpdir, bucketfs_location) as path: yield path @@ -198,9 +204,8 @@ def upload_translation_model_to_bucketfs( @pytest.fixture(scope="session") def upload_zero_shot_classification_model_to_bucketfs( bucketfs_location: bfs.path.PathLike, tmpdir_factory) -> PurePosixPath: - base_model_specs = model_params.base_model_specs - base_model_specs.task_type = "zero-shot-classification" - tmpdir = tmpdir_factory.mktemp(base_model_specs.get_model_specific_path_suffix()) + base_model_specs = model_params.zero_shot_model_specs + tmpdir = tmpdir_factory.mktemp(base_model_specs.task_type) with upload_model_to_bucketfs( base_model_specs, tmpdir, bucketfs_location) as path: yield path @@ -211,7 +216,7 @@ def upload_zero_shot_classification_model_to_bucketfs( def upload_seq2seq_model_to_bucketfs( bucketfs_location: bfs.path.PathLike, tmpdir_factory) -> PurePosixPath: model_specification = model_params.seq2seq_model_specs - tmpdir = tmpdir_factory.mktemp(model_specification.get_model_specific_path_suffix()) + tmpdir = tmpdir_factory.mktemp(model_specification.task_type) with upload_model_to_bucketfs( model_specification, tmpdir, bucketfs_location) as path: yield path diff --git a/tests/integration_tests/with_db/test_upload_model.py b/tests/integration_tests/with_db/test_upload_model.py index 1e31b47c..0c7da8f7 100644 --- a/tests/integration_tests/with_db/test_upload_model.py +++ b/tests/integration_tests/with_db/test_upload_model.py @@ -6,10 +6,6 @@ import exasol.bucketfs as bfs from exasol_transformers_extension import upload_model -from exasol_transformers_extension.utils import bucketfs_operations -from exasol_transformers_extension.utils.current_model_specification import CurrentModelSpecification, \ - CurrentModelSpecificationFromModelSpecs -from exasol_transformers_extension.utils.model_specification import 
ModelSpecification from tests.integration_tests.with_db.udfs.python_rows_to_sql import python_rows_to_sql from tests.utils import postprocessing from tests.utils.parameters import bucketfs_params, model_params diff --git a/tests/integration_tests/with_db/udfs/test_prediction_with_downloader_udf.py b/tests/integration_tests/with_db/udfs/test_prediction_with_downloader_udf.py index 94d0d65e..a6ea9c08 100644 --- a/tests/integration_tests/with_db/udfs/test_prediction_with_downloader_udf.py +++ b/tests/integration_tests/with_db/udfs/test_prediction_with_downloader_udf.py @@ -30,8 +30,9 @@ def test_prediction_with_downloader_udf( t(model_name, task_type, sub_dir, bucketfs_conn_name, token_conn_name)); """ - pyexasol_connection.execute(query).fetchall() + result = pyexasol_connection.execute(query).fetchall() time.sleep(10) + print(result) # execute the filling mask UDF text_data = "I you so much." diff --git a/tests/integration_tests/with_db/udfs/test_question_answering_script.py b/tests/integration_tests/with_db/udfs/test_question_answering_script.py index 13914ddb..70b4640e 100644 --- a/tests/integration_tests/with_db/udfs/test_question_answering_script.py +++ b/tests/integration_tests/with_db/udfs/test_question_answering_script.py @@ -12,7 +12,7 @@ def test_question_answering_script( setup_database, pyexasol_connection, upload_question_answering_model_to_bucketfs): bucketfs_conn_name, schema_name = setup_database - question = "How many syllables are in the word Syllable?" + question = "Where is Exasol based?" n_rows = 100 top_k = 1 input_data = [] @@ -21,9 +21,9 @@ def test_question_answering_script( '', bucketfs_conn_name, str(model_params.sub_dir), - model_params.base_model_specs.model_name, + model_params.q_a_model_specs.model_name, question, - ' '.join((model_params.text_data, str(i))), + model_params.text_data, top_k )) @@ -53,7 +53,7 @@ def test_question_answering_script( for i in range(5): print(result[i]) results = [result[i][6] for i in range(len(result))] - acceptable_results = ["three", "3", "want", "need"] + acceptable_results = ["Nuremberg", "Germany"] number_accepted_results = 0 def contains(string, list): diff --git a/tests/integration_tests/with_db/udfs/test_sequence_classification_single_text_script.py b/tests/integration_tests/with_db/udfs/test_sequence_classification_single_text_script.py index 37405871..77da9def 100644 --- a/tests/integration_tests/with_db/udfs/test_sequence_classification_single_text_script.py +++ b/tests/integration_tests/with_db/udfs/test_sequence_classification_single_text_script.py @@ -2,11 +2,17 @@ from tests.integration_tests.with_db.udfs.python_rows_to_sql import python_rows_to_sql from tests.utils.parameters import model_params +#debug +from tests.fixtures.model_fixture import * +from tests.fixtures.setup_database_fixture import * +from tests.fixtures.language_container_fixture import * +from tests.fixtures.bucketfs_fixture import * +from tests.fixtures.database_connection_fixture import * def test_sequence_classification_single_text_script( setup_database, pyexasol_connection, upload_sequence_classification_model_to_bucketfs): bucketfs_conn_name, schema_name = setup_database - n_labels = 2 + n_labels = 3 # negative, neutral, positive n_rows = 100 input_data = [] for i in range(n_rows): @@ -14,8 +20,8 @@ def test_sequence_classification_single_text_script( '', bucketfs_conn_name, str(model_params.sub_dir), - model_params.base_model_specs.model_name, - model_params.text_data)) + model_params.sequence_class_model_specs.model_name, + "I am so happy 
to be working on the Transformers Extension.")) query = f"SELECT TE_SEQUENCE_CLASSIFICATION_SINGLE_TEXT_UDF(" \ f"t.device_id, " \ @@ -38,18 +44,13 @@ def test_sequence_classification_single_text_script( n_cols_result = len(input_data[0]) + (added_columns - removed_columns) assert len(result) == n_rows_result and len(result[0]) == n_cols_result - for i in range(10): - print(result[i]) - # lenient test for quality of results, will be replaced by deterministic test later - results = [result[i][5] for i in range(len(result))] - acceptable_results = ["love", "miss", "want", "need"] - number_accepted_results = 0 - - def contains(string,list): - return any(map(lambda x: x in string, list)) - for i in range(len(results)): - if contains(results[i], acceptable_results): + number_accepted_results = 0 + for i in range(len(result)): + if (result[i][4] == "positive" and + result[i][5] > 0.8): #check if confidence resonably high + number_accepted_results += 1 + elif result[i][5] < 0.2: number_accepted_results += 1 - assert number_accepted_results > n_rows_result/2 + assert number_accepted_results > n_rows_result / 1.5 diff --git a/tests/integration_tests/with_db/udfs/test_sequence_classification_text_pair_script.py b/tests/integration_tests/with_db/udfs/test_sequence_classification_text_pair_script.py index ad0547d0..c13198c1 100644 --- a/tests/integration_tests/with_db/udfs/test_sequence_classification_text_pair_script.py +++ b/tests/integration_tests/with_db/udfs/test_sequence_classification_text_pair_script.py @@ -2,11 +2,17 @@ from tests.integration_tests.with_db.udfs.python_rows_to_sql import python_rows_to_sql from tests.utils.parameters import model_params +# debug +from tests.fixtures.model_fixture import * +from tests.fixtures.setup_database_fixture import * +from tests.fixtures.language_container_fixture import * +from tests.fixtures.bucketfs_fixture import * +from tests.fixtures.database_connection_fixture import * def test_sequence_classification_text_pair_script( - setup_database, pyexasol_connection, upload_sequence_classification_model_to_bucketfs): + setup_database, pyexasol_connection, upload_sequence_classification_pair_model_to_bucketfs): bucketfs_conn_name, schema_name = setup_database - n_labels = 2 + n_labels = 3 n_rows = 100 input_data = [] for i in range(n_rows): @@ -14,9 +20,9 @@ def test_sequence_classification_text_pair_script( '', bucketfs_conn_name, str(model_params.sub_dir), - model_params.base_model_specs.model_name, + model_params.sequence_class_pair_model_specs.model_name, model_params.text_data, - ' '.join((model_params.text_data, str(i))))) + 'The main Exasol office is located in Flensburg')) query = f"SELECT TE_SEQUENCE_CLASSIFICATION_TEXT_PAIR_UDF(" \ f"t.device_id, " \ @@ -32,6 +38,9 @@ def test_sequence_classification_text_pair_script( # execute sequence classification UDF result = pyexasol_connection.execute(query).fetchall() + for i in range(10): + print(result[i]) + # assertions assert result[0][-1] is None added_columns = 3 # label,score,error_message @@ -40,18 +49,13 @@ def test_sequence_classification_text_pair_script( n_cols_result = len(input_data[0]) + (added_columns - removed_columns) assert len(result) == n_rows_result and len(result[0]) == n_cols_result - for i in range(10): - print(result[i]) # lenient test for quality of results, will be replaced by deterministic test later - results = [result[i][5] for i in range(len(result))] - acceptable_results = ["love", "miss", "want", "need"] number_accepted_results = 0 - - def contains(string,list): - return 
any(map(lambda x: x in string, list)) - - for i in range(len(results)): - if contains(results[i], acceptable_results): + for i in range(len(result)): + if (result[i][5] == "contradiction" and # possible labels: contradiction, entailment, neutral + result[i][6] > 0.8): #check if confidence resonably high + number_accepted_results += 1 + elif result[i][6] < 0.2: number_accepted_results += 1 - assert number_accepted_results > n_rows_result/2 + assert number_accepted_results > n_rows_result / 1.5 diff --git a/tests/integration_tests/with_db/udfs/test_text_generation_script.py b/tests/integration_tests/with_db/udfs/test_text_generation_script.py index 81d80430..09eef2e3 100644 --- a/tests/integration_tests/with_db/udfs/test_text_generation_script.py +++ b/tests/integration_tests/with_db/udfs/test_text_generation_script.py @@ -2,6 +2,7 @@ from tests.integration_tests.with_db.udfs.python_rows_to_sql import python_rows_to_sql from tests.utils.parameters import model_params +#for debug from tests.fixtures.model_fixture import * from tests.fixtures.setup_database_fixture import * from tests.fixtures.language_container_fixture import * @@ -13,7 +14,7 @@ def test_text_generation_script( bucketfs_conn_name, schema_name = setup_database text_data = "Exasol is an analytics database management" n_rows = 100 - max_length = 10 + max_length = 12 return_full_text = True input_data = [] for i in range(n_rows): @@ -21,7 +22,7 @@ def test_text_generation_script( '', bucketfs_conn_name, str(model_params.sub_dir), - model_params.base_model_specs.model_name, + model_params.text_gen_model_specs.model_name, text_data, max_length, return_full_text @@ -54,9 +55,8 @@ def test_text_generation_script( for i in range(5): print(result[i]) results = [result[i][6] for i in range(len(result))] - acceptable_results = ["software", "system", "solution"] + acceptable_results = ["software", "system", "solution", "tool"] number_accepted_results = 0 - def contains(string,list): return any(map(lambda x: x in string, list)) diff --git a/tests/integration_tests/with_db/udfs/test_token_classification_script.py b/tests/integration_tests/with_db/udfs/test_token_classification_script.py index 622e62e4..88c2500d 100644 --- a/tests/integration_tests/with_db/udfs/test_token_classification_script.py +++ b/tests/integration_tests/with_db/udfs/test_token_classification_script.py @@ -2,6 +2,11 @@ from tests.integration_tests.with_db.udfs.python_rows_to_sql import python_rows_to_sql from tests.utils.parameters import model_params +from tests.fixtures.model_fixture import * +from tests.fixtures.setup_database_fixture import * +from tests.fixtures.language_container_fixture import * +from tests.fixtures.bucketfs_fixture import * +from tests.fixtures.database_connection_fixture import * def test_token_classification_script( setup_database, pyexasol_connection, upload_token_classification_model_to_bucketfs): @@ -14,7 +19,7 @@ def test_token_classification_script( '', bucketfs_conn_name, str(model_params.sub_dir), - model_params.base_model_specs.model_name, + model_params.token_model_specs.model_name, model_params.text_data, aggregation_strategy )) @@ -40,18 +45,13 @@ def test_token_classification_script( n_cols_result = len(input_data[0]) + (added_columns - removed_columns) assert len(result) >= n_rows and len(result[0]) == n_cols_result - for i in range(10): - print(result[i]) - # lenient test for quality of results, will be replaced by deterministic test later - results = [result[i][5] for i in range(len(result))] - acceptable_results = ["love", 
"miss", "want", "need"] + results = [[result[i][7], result[i][8]] for i in range(len(result))] + print(result) + acceptable_result_sets = [["Exasol", "ORG"], ["Nuremberg", "LOC"]] number_accepted_results = 0 - def contains(string,list): - return any(map(lambda x: x in string, list)) - for i in range(len(results)): - if contains(results[i], acceptable_results): + if results[i] in acceptable_result_sets: number_accepted_results += 1 - assert number_accepted_results > n_rows_result/2 \ No newline at end of file + assert number_accepted_results > len(result)/1.5 \ No newline at end of file diff --git a/tests/integration_tests/with_db/udfs/test_zero_shot_text_classification_script.py b/tests/integration_tests/with_db/udfs/test_zero_shot_text_classification_script.py index 5b40bbc8..6ef924e4 100644 --- a/tests/integration_tests/with_db/udfs/test_zero_shot_text_classification_script.py +++ b/tests/integration_tests/with_db/udfs/test_zero_shot_text_classification_script.py @@ -2,6 +2,12 @@ from tests.integration_tests.with_db.udfs.python_rows_to_sql import python_rows_to_sql from tests.utils.parameters import model_params +# debug +from tests.fixtures.model_fixture import * +from tests.fixtures.setup_database_fixture import * +from tests.fixtures.language_container_fixture import * +from tests.fixtures.bucketfs_fixture import * +from tests.fixtures.database_connection_fixture import * def test_zero_shot_classification_single_text_script( setup_database, pyexasol_connection, upload_zero_shot_classification_model_to_bucketfs): @@ -15,7 +21,7 @@ def test_zero_shot_classification_single_text_script( '', bucketfs_conn_name, str(model_params.sub_dir), - model_params.base_model_specs.model_name, + model_params.zero_shot_model_specs.model_name, model_params.text_data, candidate_labels )) @@ -42,18 +48,17 @@ def test_zero_shot_classification_single_text_script( n_cols_result = len(input_data[0]) + (added_columns - removed_columns) assert len(result) == n_rows_result and len(result[0]) == n_cols_result - for i in range(10): - print(result[i]) - # lenient test for quality of results, will be replaced by deterministic test later - results = [result[i][5] for i in range(len(result))] - acceptable_results = ["love", "miss", "want", "need"] - number_accepted_results = 0 + acceptable_results = ["Analytics", "Database", "Germany"] - def contains(string,list): + def contains(string, list): return any(map(lambda x: x in string, list)) - for i in range(len(results)): - if contains(results[i], acceptable_results): + number_accepted_results = 0 + for i in range(len(result)): + if (contains(result[i][5], acceptable_results) and + result[i][6] > 0.8): #check if confidence resonably high + number_accepted_results += 1 + elif result[i][6] < 0.2: number_accepted_results += 1 - assert number_accepted_results > n_rows_result/2 + assert number_accepted_results > n_rows_result / 1.5 diff --git a/tests/integration_tests/without_db/udfs/test_question_answering_udf.py b/tests/integration_tests/without_db/udfs/test_question_answering_udf.py index 02283eae..f1c051a5 100644 --- a/tests/integration_tests/without_db/udfs/test_question_answering_udf.py +++ b/tests/integration_tests/without_db/udfs/test_question_answering_udf.py @@ -73,7 +73,7 @@ def test_question_answering_udf( None, bucketfs_conn_name, model_params.sub_dir, - model_params.base_model_specs.model_name, + model_params.q_a_model_specs.model_name, question, model_params.text_data, top_k diff --git 
a/tests/integration_tests/without_db/udfs/test_sequence_classification_single_text_udf.py b/tests/integration_tests/without_db/udfs/test_sequence_classification_single_text_udf.py index ad1affc8..1189902b 100644 --- a/tests/integration_tests/without_db/udfs/test_sequence_classification_single_text_udf.py +++ b/tests/integration_tests/without_db/udfs/test_sequence_classification_single_text_udf.py @@ -11,6 +11,13 @@ from tests.utils.parameters import model_params from tests.utils.mock_connections import create_mounted_bucketfs_connection +# debug +from tests.fixtures.model_fixture import * +from tests.fixtures.setup_database_fixture import * +from tests.fixtures.language_container_fixture import * +from tests.fixtures.bucketfs_fixture import * +from tests.fixtures.database_connection_fixture import * + class ExaEnvironment: def __init__(self, connections: Dict[str, Connection] = None): @@ -65,7 +72,7 @@ def test_sequence_classification_single_text_udf( None, bucketfs_conn_name, model_params.sub_dir, - model_params.base_model_specs.model_name, + model_params.sequence_class_model_specs.model_name, model_params.text_data + str(i) ) for i in range(n_rows)] columns = [ @@ -89,7 +96,7 @@ def test_sequence_classification_single_text_udf( grouped_by_inputs = result_df.groupby('text_data') n_unique_labels_per_input = grouped_by_inputs['label'].nunique().to_list() - n_labels = 2 + n_labels = 3 n_labels_per_input_expected = [n_labels] * n_rows result = Result(result_df) assert ( diff --git a/tests/integration_tests/without_db/udfs/test_sequence_classification_text_pair_udf.py b/tests/integration_tests/without_db/udfs/test_sequence_classification_text_pair_udf.py index fa630503..a75247f3 100644 --- a/tests/integration_tests/without_db/udfs/test_sequence_classification_text_pair_udf.py +++ b/tests/integration_tests/without_db/udfs/test_sequence_classification_text_pair_udf.py @@ -5,7 +5,7 @@ from exasol_udf_mock_python.connection import Connection from exasol_transformers_extension.udfs.models.sequence_classification_text_pair_udf import \ SequenceClassificationTextPairUDF -from tests.fixtures.model_fixture import prepare_sequence_classification_model_for_local_bucketfs +from tests.fixtures.model_fixture import prepare_sequence_classification_pair_model_for_local_bucketfs from tests.integration_tests.without_db.udfs.matcher import Result, ShapeMatcher, NewColumnsEmptyMatcher, \ ErrorMessageMatcher, ColumnsMatcher, NoErrorMessageMatcher from tests.utils.parameters import model_params @@ -50,12 +50,12 @@ def get_dataframe(self, num_rows='all', start_col=0): ("on GPU", 0) ]) def test_sequence_classification_text_pair_udf( - description, device_id, prepare_sequence_classification_model_for_local_bucketfs): + description, device_id, prepare_sequence_classification_pair_model_for_local_bucketfs): if device_id is not None and not torch.cuda.is_available(): pytest.skip(f"There is no available device({device_id}) " f"to execute the test") - bucketfs_base_path = prepare_sequence_classification_model_for_local_bucketfs + bucketfs_base_path = prepare_sequence_classification_pair_model_for_local_bucketfs bucketfs_conn_name = "bucketfs_connection" bucketfs_connection = create_mounted_bucketfs_connection(bucketfs_base_path) @@ -65,7 +65,7 @@ def test_sequence_classification_text_pair_udf( None, bucketfs_conn_name, model_params.sub_dir, - model_params.base_model_specs.model_name, + model_params.sequence_class_pair_model_specs.model_name, model_params.text_data + str(i), model_params.text_data + str(i * i)) for i in 
range(n_rows)] @@ -108,12 +108,12 @@ def test_sequence_classification_text_pair_udf( ("on GPU", 0) ]) def test_sequence_classification_text_pair_udf_on_error_handling( - description, device_id, prepare_sequence_classification_model_for_local_bucketfs): + description, device_id, prepare_sequence_classification_pair_model_for_local_bucketfs): if device_id is not None and not torch.cuda.is_available(): pytest.skip(f"There is no available device({device_id}) " f"to execute the test") - bucketfs_base_path = prepare_sequence_classification_model_for_local_bucketfs + bucketfs_base_path = prepare_sequence_classification_pair_model_for_local_bucketfs bucketfs_conn_name = "bucketfs_connection" bucketfs_connection = Connection(address=f"file://{bucketfs_base_path}") diff --git a/tests/integration_tests/without_db/udfs/test_text_generation_udf.py b/tests/integration_tests/without_db/udfs/test_text_generation_udf.py index 3c2d81a1..90c22c9b 100644 --- a/tests/integration_tests/without_db/udfs/test_text_generation_udf.py +++ b/tests/integration_tests/without_db/udfs/test_text_generation_udf.py @@ -71,7 +71,7 @@ def test_text_generation_udf( None, bucketfs_conn_name, model_params.sub_dir, - model_params.base_model_specs.model_name, + model_params.text_gen_model_specs.model_name, text_data, max_length, return_full_text diff --git a/tests/integration_tests/without_db/udfs/test_token_classification_udf.py b/tests/integration_tests/without_db/udfs/test_token_classification_udf.py index f6f63efb..a53f72e8 100644 --- a/tests/integration_tests/without_db/udfs/test_token_classification_udf.py +++ b/tests/integration_tests/without_db/udfs/test_token_classification_udf.py @@ -78,7 +78,7 @@ def test_token_classification_udf( None, bucketfs_conn_name, model_params.sub_dir, - model_params.base_model_specs.model_name, + model_params.token_model_specs.model_name, model_params.text_data * (i + 1), agg ) for i in range(n_rows)] @@ -130,7 +130,7 @@ def test_token_classification_udf_with_multiple_aggregation_strategies( None, bucketfs_conn_name, model_params.sub_dir, - model_params.base_model_specs.model_name, + model_params.token_model_specs.model_name, model_params.text_data * (i + 1), agg_strategy ) for i, agg_strategy in enumerate(agg_strategies)] diff --git a/tests/integration_tests/without_db/udfs/test_zero_shot_text_classification_udf.py b/tests/integration_tests/without_db/udfs/test_zero_shot_text_classification_udf.py index 4f632b83..935c7d64 100644 --- a/tests/integration_tests/without_db/udfs/test_zero_shot_text_classification_udf.py +++ b/tests/integration_tests/without_db/udfs/test_zero_shot_text_classification_udf.py @@ -67,7 +67,7 @@ def test_zero_shot_classification_single_text_udf( None, bucketfs_conn_name, model_params.sub_dir, - model_params.base_model_specs.model_name, + model_params.zero_shot_model_specs.model_name, model_params.text_data + str(i), candidate_labels + str(i) ) for i in range(n_rows)] diff --git a/tests/integration_tests/without_db/utils/test_load_local_model.py b/tests/integration_tests/without_db/utils/test_load_local_model.py index ef8eae41..176e21aa 100644 --- a/tests/integration_tests/without_db/utils/test_load_local_model.py +++ b/tests/integration_tests/without_db/utils/test_load_local_model.py @@ -54,7 +54,7 @@ def test_load_local_model(tmp_path): model_specification = test_setup.model_specification model_save_path = create_save_pretrained_model_path(tmp_path, model_specification) # download a model - model = AutoModel.from_pretrained(model_specification.model_name) #todo change? 
+ model = AutoModel.from_pretrained(model_specification.model_name) tokenizer = AutoTokenizer.from_pretrained(model_specification.model_name) model.save_pretrained(model_save_path) tokenizer.save_pretrained(model_save_path) diff --git a/tests/unit_tests/udfs/test_model_downloader_udf.py b/tests/unit_tests/udfs/test_model_downloader_udf.py index cc353f6c..e15fe25b 100644 --- a/tests/unit_tests/udfs/test_model_downloader_udf.py +++ b/tests/unit_tests/udfs/test_model_downloader_udf.py @@ -112,7 +112,7 @@ def test_model_downloader(mock_create_loc, description, count, token_conn_name, ] for i in range(count): assert mock_cast(mock_model_downloaders[i].download_from_huggingface_hub).mock_calls == [ - call(mock_cmss[i].get_model_factory()), #todo add to call transformers.taskthing #todo d dont match how mock corecctly? + call(mock_cmss[i].get_model_factory()), call(mock_tokenizer_factory) ] assert call() in mock_cast(mock_model_downloaders[i].upload_to_bucketfs).mock_calls diff --git a/tests/utils/parameters.py b/tests/utils/parameters.py index 86dd6308..2edf15ae 100644 --- a/tests/utils/parameters.py +++ b/tests/utils/parameters.py @@ -15,9 +15,15 @@ class BucketFSParams: @dataclass(frozen=True) class ModelParams: - base_model_specs: ModelSpecification #this is used for other tests, task_type should be set per test - seq2seq_model_specs: ModelSpecification #tis model is used for testing translation_udf - tiny_model_specs: ModelSpecification #this model is used for upload/download tests + base_model_specs: ModelSpecification # this is used for other tests, task_type should be set per test + seq2seq_model_specs: ModelSpecification # this model is used for testing translation_udf + q_a_model_specs: ModelSpecification # this model is used for testing question answering + text_gen_model_specs: ModelSpecification # used for text generation tests + token_model_specs: ModelSpecification # this model is used for token classification tests + sequence_class_model_specs: ModelSpecification # this model is used for sequence classification single text tests + sequence_class_pair_model_specs: ModelSpecification # this model is used for sequence classification text pair tests + zero_shot_model_specs: ModelSpecification # this model is used for zero-shot-classification tests + tiny_model_specs: ModelSpecification # this model is used for upload/download tests text_data: str sub_dir: str @@ -29,10 +35,14 @@ class ModelParams: path_in_bucket="container") model_params = ModelParams( - #todo add aditional models for test tasks - # https://huggingface.co/dslim/bert-base-NER for token classification - base_model_specs=ModelSpecification('bert-base-uncased', "need to set this task_type"), + base_model_specs=ModelSpecification('bert-base-uncased', "need to set this task_type"), #fill mask seq2seq_model_specs=ModelSpecification("t5-small", "translation"), + q_a_model_specs=ModelSpecification("deepset/tinybert-6l-768d-squad2", "question-answering"), + text_gen_model_specs=ModelSpecification("openai-community/gpt2", "text-generation"), + token_model_specs=ModelSpecification("dslim/bert-base-NER", "token-classification"),#token-classification + sequence_class_model_specs=ModelSpecification("mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis", "text-classification"), + sequence_class_pair_model_specs=ModelSpecification("MoritzLaurer/multilingual-MiniLMv2-L6-mnli-xnli", "text-classification"),#Alireza1044/albert-base-v2-mnli + 
zero_shot_model_specs=ModelSpecification("MoritzLaurer/deberta-v3-xsmall-zeroshot-v1.1-all-33", "zero-shot-classification"),#text-class tiny_model_specs=ModelSpecification("prajjwal1/bert-tiny", "task"), - text_data='The company Exasol is based in Nuremberg', + text_data='The database software company Exasol is based in Nuremberg', sub_dir='model_sub_dir') From 61df5285d0cffbbf0a967253d376d13222d2eb4c Mon Sep 17 00:00:00 2001 From: MarleneKress79789 Date: Wed, 31 Jul 2024 13:39:06 +0200 Subject: [PATCH 04/31] cleanup and renamings, added docu for new parameters --- doc/user_guide/user_guide.md | 51 +++++++------- .../udfs/models/base_model_udf.py | 10 +-- .../udfs/models/filling_mask_udf.py | 2 +- .../udfs/models/model_downloader_udf.py | 8 +-- ...sequence_classification_single_text_udf.py | 2 +- .../sequence_classification_text_pair_udf.py | 2 +- .../udfs/models/text_generation_udf.py | 2 +- .../udfs/models/token_classification_udf.py | 2 +- .../udfs/models/translation_udf.py | 2 +- .../zero_shot_text_classification_udf.py | 2 +- exasol_transformers_extension/upload_model.py | 26 +++----- ...ion.py => bucketfs_model_specification.py} | 25 ++++--- .../utils/bucketfs_operations.py | 2 +- .../utils/load_local_model.py | 12 ++-- .../utils/model_specification.py | 8 +-- tests/fixtures/model_fixture.py | 66 +++++++++---------- .../with_db/test_upload_model.py | 14 ++-- .../udfs/test_model_downloader_udf_script.py | 6 +- .../udfs/test_question_answering_script.py | 2 +- ...equence_classification_text_pair_script.py | 2 +- .../udfs/test_token_classification_script.py | 2 +- .../with_db/udfs/test_translation_script.py | 4 +- ...st_zero_shot_text_classification_script.py | 2 +- .../udfs/test_model_downloader_udf.py | 6 +- ...t_sequence_classification_text_pair_udf.py | 8 ++- .../without_db/utils/test_load_local_model.py | 6 +- ...ot_cached_multiple_model_multiple_batch.py | 4 +- ..._not_cached_single_model_multiple_batch.py | 2 +- ..._prediction_single_model_multiple_batch.py | 2 +- ...ngle_subdir_single_model_multiple_batch.py | 4 +- ...single_subdir_single_model_single_batch.py | 4 +- .../multiple_model_multiple_batch_complete.py | 4 +- ...ultiple_model_multiple_batch_incomplete.py | 4 +- ...ultiple_batch_multiple_models_per_batch.py | 8 +-- .../multiple_model_single_batch_complete.py | 4 +- .../multiple_model_single_batch_incomplete.py | 4 +- ...ltiple_topk_single_model_multiple_batch.py | 2 +- ...multiple_topk_single_model_single_batch.py | 2 +- ...iple_subdir_single_model_multiple_batch.py | 4 +- ...ltiple_subdir_single_model_single_batch.py | 4 +- .../single_model_multiple_batch_complete.py | 2 +- .../single_model_multiple_batch_incomplete.py | 2 +- .../single_model_single_batch_complete.py | 2 +- .../single_model_single_batch_incomplete.py | 2 +- ...ngle_topk_multiple_model_multiple_batch.py | 4 +- ...single_topk_multiple_model_single_batch.py | 4 +- .../udfs/test_model_downloader_udf.py | 13 ++-- .../unit_tests/utils/test_load_local_model.py | 4 +- tests/utils/parameters.py | 4 +- 49 files changed, 180 insertions(+), 183 deletions(-) rename exasol_transformers_extension/utils/{current_model_specification.py => bucketfs_model_specification.py} (63%) diff --git a/doc/user_guide/user_guide.md b/doc/user_guide/user_guide.md index df7f95a5..bc19a7f7 100644 --- a/doc/user_guide/user_guide.md +++ b/doc/user_guide/user_guide.md @@ -355,15 +355,17 @@ Once you have internet access, invoke the UDF like this: ```sql SELECT TE_MODEL_DOWNLOADER_UDF( model_name, - task_name, #todo description + task_type, 
     sub_dir,
     bucketfs_conn,
     token_conn
 )
+
 ```
 - Parameters:
   - ```model_name```: The name of the model to use for prediction. You can find the
 details of the models on the [huggingface models page](https://huggingface.co/models).
+  - ```task_type```: The name of the task you want to use the model for.
   - ```sub_dir```: The directory where the model is stored in the BucketFS.
   - ```bucketfs_conn```: The BucketFS connection name.
   - ```token_conn```: The connection name containing the token required for
@@ -371,10 +373,17 @@ SELECT TE_MODEL_DOWNLOADER_UDF(
    on how to create a connection object with token information, please check
    [here](#getting-started).
 
-
+"task_type" is a variable for the type of task you plan to use the model for.
+Some models can be used for multiple types of tasks, but transformers stores
+different metadata depending on the task of the model, which affects how the model
+is loaded later. Setting an incorrect task_type or leaving the task_type empty may severely affect the
+model's performance. Available task_types are the same as the names of our available UDFs, namely:
+`filling_mask`, `question_answering`, `sequence_classification`, `text_generation`, `token_classification`,
+`translation` and `zero_shot_classification`.
+
 ### 2. Model Uploader Script
-You can invoke the python script as below which allows to load the transformer
-models from the local filesystem into BucketFS:
+You can invoke the python script as below, which allows you to download transformer
+models from the Hugging Face Hub to the local filesystem and then upload them from there to the BucketFS:
 
 ```buildoutcfg
 python -m exasol_transformers_extension.upload_model \
@@ -386,28 +395,24 @@ models from the local filesystem into BucketFS:
     --bucket <bucket-name> \
    --path-in-bucket <path-in-bucket> \
    --model-name <model-name> \
-    --subd-dir \
-    --local-model-path
+    --task_type <task-type> \
+    --token <your-huggingface-token> \ # optional
+    --sub-dir <directory-to-save-model>
 ```
 
 **Note:** The `--path-in-bucket` can not be empty.
-**Note**: The options --local-model-path needs to point to a path which contains the model and its tokenizer.
-These should have been saved using transformers [save_pretrained](https://huggingface.co/docs/transformers/v4.32.1/en/installation#fetch-models-and-tokenizers-to-use-offline)
-function to ensure proper loading by the Transformers Extension UDFs.
-You can download the model using python like this:
-
-```python
- for model_factory in [transformers.AutoModel, transformers.AutoTokenizer]:#todo hange?
-    # download the model and tokenizer from Hugging Face
-    model = model_factory.from_pretrained(model_name)
-    # save the downloaded model using the save_pretrained function
-    model_save_path =
-    model.save_pretrained(model_save_path)
-```
-***Note:*** Hugging Face models consist of two parts, the model and the tokenizer.
-Make sure to download and save both into the same save directory so the upload model script uploads them together.
-And then upload it using exasol_transformers_extension.upload_model script where ```--local-model-path = ```
-
+If you are using an Exasol SaaS database, you need to use the SaaS-related options. View them by using:
+ ```
+ python -m exasol_transformers_extension.upload_model -h
+ ```
+"task_type" is a variable for the type of task you plan to use the model for.
+Some models can be used for multiple types of tasks, but transformers stores
+different metadata depending on the task of the model, which affects how the model
+is loaded later. Setting an incorrect task_type or leaving the task_type empty may severely affect the
+model's performance. 
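+For illustration, a complete invocation might look like the sketch below. This is not taken
+verbatim from the repository: the BucketFS name, host, port, bucket, path and sub-directory are
+placeholder values, and the credential and SaaS-related options are omitted for brevity:
+
+```buildoutcfg
+python -m exasol_transformers_extension.upload_model \
+    --bucketfs-name bfsdefault \
+    --bucketfs-host localhost \
+    --bucketfs-port 2580 \
+    --bucket default \
+    --path-in-bucket my_path \
+    --model-name bert-base-uncased \
+    --task_type filling_mask \
+    --sub-dir my_models
+```
+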
Available task_types are the same as the names of our available UDFs, namely: +`filling_mask`, `question_answering`, `sequence_classification`, `text_generation`, `token_classification`, +`translation` and`zero_shot_classification`. + ## Using Prediction UDFs We provide 7 prediction UDFs in this Transformers Extension, each performing an NLP diff --git a/exasol_transformers_extension/udfs/models/base_model_udf.py b/exasol_transformers_extension/udfs/models/base_model_udf.py index 7e080a00..490c0f1e 100644 --- a/exasol_transformers_extension/udfs/models/base_model_udf.py +++ b/exasol_transformers_extension/udfs/models/base_model_udf.py @@ -10,7 +10,7 @@ from exasol_transformers_extension.deployment import constants from exasol_transformers_extension.utils import device_management, \ bucketfs_operations, dataframe_operations -from exasol_transformers_extension.utils.current_model_specification import CurrentModelSpecification +from exasol_transformers_extension.utils.bucketfs_model_specification import BucketFSModelSpecification from exasol_transformers_extension.utils.load_local_model import LoadLocalModel from exasol_transformers_extension.utils.model_factory_protocol import ModelFactoryProtocol from exasol_transformers_extension.utils.model_specification import ModelSpecification @@ -40,13 +40,13 @@ def __init__(self, pipeline: transformers.Pipeline, base_model: ModelFactoryProtocol, tokenizer: ModelFactoryProtocol, - task_name: str): + task_type: str): self.exa = exa self.batch_size = batch_size self.pipeline = pipeline self.base_model = base_model self.tokenizer = tokenizer - self.task_name = task_name + self.task_type = task_type self.device = None self.model_loader = None self.last_created_pipeline = None @@ -74,7 +74,7 @@ def create_model_loader(self): self.model_loader = LoadLocalModel(pipeline_factory=self.pipeline, base_model_factory=self.base_model, tokenizer_factory=self.tokenizer, - task_name=self.task_name, + task_type=self.task_type, device=self.device) def get_predictions_from_batch(self, batch_df: pd.DataFrame) -> pd.DataFrame: @@ -185,7 +185,7 @@ def check_cache(self, model_df: pd.DataFrame) -> None: model_name = model_df["model_name"].iloc[0] bucketfs_conn = model_df["bucketfs_conn"].iloc[0] sub_dir = model_df["sub_dir"].iloc[0] - current_model_specification = CurrentModelSpecification(model_name, self.task_name, bucketfs_conn, sub_dir) + current_model_specification = BucketFSModelSpecification(model_name, self.task_type, bucketfs_conn, sub_dir) if self.model_loader.current_model_specification != current_model_specification: bucketfs_location = \ diff --git a/exasol_transformers_extension/udfs/models/filling_mask_udf.py b/exasol_transformers_extension/udfs/models/filling_mask_udf.py index d7fb1170..30d7c761 100644 --- a/exasol_transformers_extension/udfs/models/filling_mask_udf.py +++ b/exasol_transformers_extension/udfs/models/filling_mask_udf.py @@ -14,7 +14,7 @@ def __init__(self, base_model=transformers.AutoModelForMaskedLM, tokenizer=transformers.AutoTokenizer): super().__init__(exa, batch_size, pipeline, base_model, - tokenizer, task_name='fill-mask') + tokenizer, task_type='fill-mask') self._mask_token = "" self._desired_fields_in_prediction = ["sequence", "score"] self.new_columns = ["filled_text", "score", "rank", "error_message"] diff --git a/exasol_transformers_extension/udfs/models/model_downloader_udf.py b/exasol_transformers_extension/udfs/models/model_downloader_udf.py index 3af0c883..a021be51 100644 --- 
a/exasol_transformers_extension/udfs/models/model_downloader_udf.py +++ b/exasol_transformers_extension/udfs/models/model_downloader_udf.py @@ -3,8 +3,8 @@ import transformers from exasol_transformers_extension.utils import bucketfs_operations -from exasol_transformers_extension.utils.current_model_specification import \ - CurrentModelSpecificationFactory +from exasol_transformers_extension.utils.bucketfs_model_specification import \ + BucketFSModelSpecificationFactory from exasol_transformers_extension.utils.model_factory_protocol import ModelFactoryProtocol from exasol_transformers_extension.utils.huggingface_hub_bucketfs_model_transfer_sp import \ HuggingFaceHubBucketFSModelTransferSPFactory @@ -22,12 +22,12 @@ class ModelDownloaderUDF: returns , """ - def __init__(self, #todo change docu! + def __init__(self, exa, tokenizer_factory: ModelFactoryProtocol = transformers.AutoTokenizer, huggingface_hub_bucketfs_model_transfer: HuggingFaceHubBucketFSModelTransferSPFactory = HuggingFaceHubBucketFSModelTransferSPFactory(), - current_model_specification_factory: CurrentModelSpecificationFactory = CurrentModelSpecificationFactory()): + current_model_specification_factory: BucketFSModelSpecificationFactory = BucketFSModelSpecificationFactory()): self._exa = exa self._tokenizer_factory = tokenizer_factory self._huggingface_hub_bucketfs_model_transfer = huggingface_hub_bucketfs_model_transfer diff --git a/exasol_transformers_extension/udfs/models/sequence_classification_single_text_udf.py b/exasol_transformers_extension/udfs/models/sequence_classification_single_text_udf.py index 3060d9ae..b5d6b640 100644 --- a/exasol_transformers_extension/udfs/models/sequence_classification_single_text_udf.py +++ b/exasol_transformers_extension/udfs/models/sequence_classification_single_text_udf.py @@ -13,7 +13,7 @@ def __init__(self, base_model=transformers.AutoModelForSequenceClassification, tokenizer=transformers.AutoTokenizer): super().__init__(exa, batch_size, pipeline, base_model, - tokenizer, task_name='text-classification') + tokenizer, task_type='text-classification') self.new_columns = ["label", "score", "error_message"] def extract_unique_param_based_dataframes( diff --git a/exasol_transformers_extension/udfs/models/sequence_classification_text_pair_udf.py b/exasol_transformers_extension/udfs/models/sequence_classification_text_pair_udf.py index 393e8dba..71df6e4d 100644 --- a/exasol_transformers_extension/udfs/models/sequence_classification_text_pair_udf.py +++ b/exasol_transformers_extension/udfs/models/sequence_classification_text_pair_udf.py @@ -13,7 +13,7 @@ def __init__(self, base_model=transformers.AutoModelForSequenceClassification, tokenizer=transformers.AutoTokenizer): super().__init__(exa, batch_size, pipeline, base_model, - tokenizer, task_name='text-classification') + tokenizer, task_type='text-classification') self.new_columns = ["label", "score", "error_message"] def extract_unique_param_based_dataframes( diff --git a/exasol_transformers_extension/udfs/models/text_generation_udf.py b/exasol_transformers_extension/udfs/models/text_generation_udf.py index 081f3122..5455a625 100644 --- a/exasol_transformers_extension/udfs/models/text_generation_udf.py +++ b/exasol_transformers_extension/udfs/models/text_generation_udf.py @@ -14,7 +14,7 @@ def __init__(self, base_model=transformers.AutoModelForCausalLM, tokenizer=transformers.AutoTokenizer): super().__init__(exa, batch_size, pipeline, base_model, - tokenizer, task_name='text-generation') + tokenizer, task_type='text-generation') 
self.new_columns = ["generated_text", "error_message"] def extract_unique_param_based_dataframes( diff --git a/exasol_transformers_extension/udfs/models/token_classification_udf.py b/exasol_transformers_extension/udfs/models/token_classification_udf.py index 1f2fa3a1..81c6c3d5 100644 --- a/exasol_transformers_extension/udfs/models/token_classification_udf.py +++ b/exasol_transformers_extension/udfs/models/token_classification_udf.py @@ -14,7 +14,7 @@ def __init__(self, base_model=transformers.AutoModelForTokenClassification, tokenizer=transformers.AutoTokenizer): super().__init__(exa, batch_size, pipeline, base_model, - tokenizer, task_name='token-classification') + tokenizer, task_type='token-classification') self._default_aggregation_strategy = 'simple' self._desired_fields_in_prediction = [ "start", "end", "word", "entity", "score"] diff --git a/exasol_transformers_extension/udfs/models/translation_udf.py b/exasol_transformers_extension/udfs/models/translation_udf.py index 4c0d9e04..36a5b470 100644 --- a/exasol_transformers_extension/udfs/models/translation_udf.py +++ b/exasol_transformers_extension/udfs/models/translation_udf.py @@ -14,7 +14,7 @@ def __init__(self, base_model=transformers.AutoModelForSeq2SeqLM, tokenizer=transformers.AutoTokenizer): super().__init__(exa, batch_size, pipeline, base_model, - tokenizer, task_name='translation') + tokenizer, task_type='translation') self._translation_prefix = "translate {src_lang} to {target_lang}: " self.new_columns = ["translation_text", "error_message"] diff --git a/exasol_transformers_extension/udfs/models/zero_shot_text_classification_udf.py b/exasol_transformers_extension/udfs/models/zero_shot_text_classification_udf.py index dba47a51..652a17eb 100644 --- a/exasol_transformers_extension/udfs/models/zero_shot_text_classification_udf.py +++ b/exasol_transformers_extension/udfs/models/zero_shot_text_classification_udf.py @@ -14,7 +14,7 @@ def __init__(self, base_model=transformers.AutoModelForSequenceClassification, tokenizer=transformers.AutoTokenizer): super().__init__(exa, batch_size, pipeline, base_model, - tokenizer, task_name='zero-shot-classification') + tokenizer, task_type='zero-shot-classification') self._desired_fields_in_prediction = ["labels", "scores"] self.new_columns = ["label", "score", "rank", "error_message"] diff --git a/exasol_transformers_extension/upload_model.py b/exasol_transformers_extension/upload_model.py index e708c09a..7746526a 100644 --- a/exasol_transformers_extension/upload_model.py +++ b/exasol_transformers_extension/upload_model.py @@ -8,7 +8,7 @@ SECRET_DISPLAY, SecretParams, secret_callback) from exasol_transformers_extension.utils import bucketfs_operations from exasol_transformers_extension.deployment import deployment_utils as utils -from exasol_transformers_extension.utils.current_model_specification import CurrentModelSpecification +from exasol_transformers_extension.utils.bucketfs_model_specification import BucketFSModelSpecification from exasol_transformers_extension.utils.huggingface_hub_bucketfs_model_transfer_sp import \ HuggingFaceHubBucketFSModelTransferSP @@ -16,12 +16,10 @@ @click.command() @click.option('--model-name', type=str, required=True, help="name of the model") -@click.option('--task_type', type=str, required=True) #todo change docu (needed to know where to safe model) +@click.option('--task_type', type=str, required=True) @click.option('--sub-dir', type=str, required=True, help="directory where the model is stored in the BucketFS") -@click.option('--token', type=str, 
help="Hugging Face hub token for private models") #todo chnage docu -@click.option('--local-model-path', type=click.Path(exists=True, file_okay=True), - required=True, help="local path where model is located") +@click.option('--token', type=str, default=None, help="Hugging Face hub token for private models") @click.option('--bucketfs-name', type=str) @click.option('--bucketfs-host', type=str) @click.option('--bucketfs-port', type=int) @@ -67,10 +65,10 @@ def main( """ Downloads model from Huggingface hub and the transfers model to database """ - # create CurrentModelSpecification for model to be loaded - current_model_specs = CurrentModelSpecification(model_name, task_type, "", Path(sub_dir)) + # create BucketFSModelSpecification for model to be loaded + current_model_spec = BucketFSModelSpecification(model_name, task_type, "", Path(sub_dir)) # upload the downloaded model files into bucketfs - upload_path = current_model_specs.get_bucketfs_model_save_path() + upload_path = current_model_spec.get_bucketfs_model_save_path() # create bucketfs location bucketfs_location = bucketfs_operations.create_bucketfs_location( @@ -89,22 +87,18 @@ def main( path_in_bucket=path_in_bucket, use_ssl_cert_validation=use_ssl_cert_validation) - # upload the downloaded model files into bucketfs - - #bucketfs_operations.upload_model_files_to_bucketfs( - # local_model_path, upload_path, bucketfs_location) - model_factory = current_model_specs.get_model_factory() + model_factory = current_model_spec.get_model_factory() downloader = HuggingFaceHubBucketFSModelTransferSP(bucketfs_location=bucketfs_location, - model_specification=current_model_specs, + model_specification=current_model_spec, bucketfs_model_path=upload_path, token=token) for model in [model_factory, transformers.AutoTokenizer]: downloader.download_from_huggingface_hub(model) # upload model files to BucketFS - model_tar_file_path = downloader.upload_to_bucketfs() - print("your model or tokenizer has been saved in the BucketFS at: " + str(model_tar_file_path)) + model_tar_file_path = downloader.upload_to_bucketfs() + print("your model or tokenizer has been saved in the BucketFS at: " + str(model_tar_file_path)) if __name__ == '__main__': diff --git a/exasol_transformers_extension/utils/current_model_specification.py b/exasol_transformers_extension/utils/bucketfs_model_specification.py similarity index 63% rename from exasol_transformers_extension/utils/current_model_specification.py rename to exasol_transformers_extension/utils/bucketfs_model_specification.py index d4aeb9af..8e8130fe 100644 --- a/exasol_transformers_extension/utils/current_model_specification.py +++ b/exasol_transformers_extension/utils/bucketfs_model_specification.py @@ -1,7 +1,7 @@ from exasol_transformers_extension.utils.model_specification import ModelSpecification from pathlib import PurePosixPath, Path -class CurrentModelSpecification(ModelSpecification): +class BucketFSModelSpecification(ModelSpecification): """ Class describing a model with additional information about the bucketFS connection and the subdir in the bucketfs the model can be found at. 
@@ -17,7 +17,7 @@ def __init__(self, def __eq__(self, other): """Overrides the default implementation""" - if isinstance(other, CurrentModelSpecification): + if isinstance(other, BucketFSModelSpecification): return (super().__eq__(other) and self.sub_dir == other.sub_dir and self.bucketfs_conn_name == other.bucketfs_conn_name) @@ -31,21 +31,20 @@ def get_bucketfs_model_save_path(self) -> Path: return Path(self.sub_dir, model_path_suffix) -class CurrentModelSpecificationFactory: +class BucketFSModelSpecificationFactory: def create(self, model_name: str, task_type: str, bucketfs_conn_name: str, sub_dir: Path): - return CurrentModelSpecification(model_name, task_type, bucketfs_conn_name, sub_dir) + return BucketFSModelSpecification(model_name, task_type, bucketfs_conn_name, sub_dir) -class CurrentModelSpecificationFromModelSpecs: - def transform(self, - model_specification: ModelSpecification, - bucketfs_conn_name: str, - sub_dir: Path): - return CurrentModelSpecification(model_name=model_specification.model_name, - task_type=model_specification.task_type, - bucketfs_conn_name=bucketfs_conn_name, - sub_dir=sub_dir) +def get_BucketFSModelSpecification_from_model_Specs( + model_specification: ModelSpecification, + bucketfs_conn_name: str, + sub_dir: Path): + return BucketFSModelSpecification(model_name=model_specification.model_name, + task_type=model_specification.task_type, + bucketfs_conn_name=bucketfs_conn_name, + sub_dir=sub_dir) diff --git a/exasol_transformers_extension/utils/bucketfs_operations.py b/exasol_transformers_extension/utils/bucketfs_operations.py index 1e6a4f4b..6ab2671b 100644 --- a/exasol_transformers_extension/utils/bucketfs_operations.py +++ b/exasol_transformers_extension/utils/bucketfs_operations.py @@ -115,6 +115,6 @@ def create_save_pretrained_model_path(_tmpdir_name, model_specification: ModelSp before it is uploaded to the bucketfs """ model_specific_path_suffix = model_specification.get_model_specific_path_suffix() - return Path(_tmpdir_name, "pretrained", model_specific_path_suffix) #todo move to modespecstring eventually? + return Path(_tmpdir_name, "pretrained", model_specific_path_suffix) diff --git a/exasol_transformers_extension/utils/load_local_model.py b/exasol_transformers_extension/utils/load_local_model.py index 211ed44b..6ccd9dba 100644 --- a/exasol_transformers_extension/utils/load_local_model.py +++ b/exasol_transformers_extension/utils/load_local_model.py @@ -3,7 +3,7 @@ from typing import Optional from pathlib import Path -from exasol_transformers_extension.utils.current_model_specification import CurrentModelSpecification +from exasol_transformers_extension.utils.bucketfs_model_specification import BucketFSModelSpecification from exasol_transformers_extension.utils.model_factory_protocol import ModelFactoryProtocol from exasol_transformers_extension.utils import bucketfs_operations from exasol_transformers_extension.utils.model_specification import ModelSpecification @@ -13,20 +13,20 @@ class LoadLocalModel: Class for loading locally saved models and tokenizers. Also stores information regarding the model and pipeline. 
:_pipeline_factory: a function to create a transformers pipeline - :task_name: name of the current task + :task_type: name of the current task :device: device to be used for pipeline creation, i.e "CPU" :_base_model_factory: a ModelFactoryProtocol for creating the loaded model :_tokenizer_factory: a ModelFactoryProtocol for creating the loaded tokenizer """ def __init__(self, pipeline_factory, - task_name: str, + task_type: str, device: str, base_model_factory: ModelFactoryProtocol, tokenizer_factory: ModelFactoryProtocol ): self.pipeline_factory = pipeline_factory - self.task_name = task_name + self.task_type = task_type self.device = device self._base_model_factory = base_model_factory self._tokenizer_factory = tokenizer_factory @@ -40,7 +40,7 @@ def current_model_specification(self): """Get the current current_model_specification.""" return self._current_model_specification - def set_current_model_specification(self, current_model_specification: CurrentModelSpecification): + def set_current_model_specification(self, current_model_specification: BucketFSModelSpecification): """Set the current_model_specification.""" self._current_model_specification = current_model_specification @@ -57,7 +57,7 @@ def load_models(self) -> transformers.pipelines.Pipeline: loaded_tokenizer = self._tokenizer_factory.from_pretrained(str(self._bucketfs_model_cache_dir)) last_created_pipeline = self.pipeline_factory( - self.task_name, + self.task_type, model=loaded_model, tokenizer=loaded_tokenizer, device=self.device, diff --git a/exasol_transformers_extension/utils/model_specification.py b/exasol_transformers_extension/utils/model_specification.py index a0eb4169..a1245abd 100644 --- a/exasol_transformers_extension/utils/model_specification.py +++ b/exasol_transformers_extension/utils/model_specification.py @@ -33,12 +33,6 @@ def _set_task_type_from_udf_name(self, text): task_type = text return task_type - def get_model_specs_for_download(self):#todo change usages? - """ - returns all attributes necessary for downloading the model from Huggingface. 
- """ - return self.model_name, self.task_type - def __eq__(self, other): """Overrides the default implementation""" if isinstance(other, ModelSpecification): @@ -47,7 +41,7 @@ def __eq__(self, other): return False def get_model_specific_path_suffix(self) -> PurePosixPath: - return PurePosixPath(self.model_name.replace(".", "_") + "_" + self.task_type) #model_name-version-task# todo change + return PurePosixPath(self.model_name.replace(".", "_") + "_" + self.task_type) #model_name-version-task# def get_model_factory(self): """ diff --git a/tests/fixtures/model_fixture.py b/tests/fixtures/model_fixture.py index 5f368486..ec9516a4 100644 --- a/tests/fixtures/model_fixture.py +++ b/tests/fixtures/model_fixture.py @@ -5,8 +5,8 @@ import exasol.bucketfs as bfs -from exasol_transformers_extension.utils.current_model_specification import CurrentModelSpecification, \ - CurrentModelSpecificationFromModelSpecs +from exasol_transformers_extension.utils.bucketfs_model_specification import BucketFSModelSpecification, \ + get_BucketFSModelSpecification_from_model_Specs from exasol_transformers_extension.utils.model_specification import ModelSpecification from tests.utils import postprocessing from tests.utils.parameters import model_params @@ -38,7 +38,7 @@ def download_model_to_path(model_specification: ModelSpecification, @contextmanager def upload_model(bucketfs_location: bfs.path.PathLike, - current_model_specification: CurrentModelSpecification, + current_model_specification: BucketFSModelSpecification, model_dir: Path) -> Path: model_path = current_model_specification.get_bucketfs_model_save_path() bucketfs_operations.upload_model_files_to_bucketfs( @@ -50,9 +50,9 @@ def upload_model(bucketfs_location: bfs.path.PathLike, def prepare_model_for_local_bucketfs(model_specification: ModelSpecification, tmpdir_factory): - current_model_specs = CurrentModelSpecificationFromModelSpecs().transform(model_specification, - "", - model_params.sub_dir) + current_model_specs = get_BucketFSModelSpecification_from_model_Specs(model_specification, + "", + model_params.sub_dir) tmpdir = tmpdir_factory.mktemp(current_model_specs.task_type) model_path_in_bucketfs = current_model_specs.get_bucketfs_model_save_path() @@ -122,14 +122,14 @@ def prepare_seq2seq_model_in_local_bucketfs(tmpdir_factory) -> PurePosixPath: @contextmanager def upload_model_to_bucketfs( model_specification: ModelSpecification, - download_tmpdir: Path, + local_model_save_path: Path, bucketfs_location: bfs.path.PathLike) -> str: - download_tmpdir = download_model_to_standard_local_save_path(model_specification, download_tmpdir) - current_model_specs = CurrentModelSpecificationFromModelSpecs().transform(model_specification, - "", - model_params.sub_dir) + local_model_save_path = download_model_to_standard_local_save_path(model_specification, local_model_save_path) + current_model_specs = get_BucketFSModelSpecification_from_model_Specs(model_specification, + "", + model_params.sub_dir) with upload_model( - bucketfs_location, current_model_specs, download_tmpdir) as model_path: + bucketfs_location, current_model_specs, local_model_save_path) as model_path: try: yield model_path finally: @@ -150,64 +150,64 @@ def upload_filling_mask_model_to_bucketfs( @pytest.fixture(scope="session") def upload_question_answering_model_to_bucketfs( bucketfs_location: bfs.path.PathLike, tmpdir_factory) -> PurePosixPath: - base_model_specs = model_params.q_a_model_specs - tmpdir = tmpdir_factory.mktemp(base_model_specs.task_type) + model_specs = 
model_params.q_a_model_specs + tmpdir = tmpdir_factory.mktemp(model_specs.task_type) with upload_model_to_bucketfs( - base_model_specs, tmpdir, bucketfs_location) as path: + model_specs, tmpdir, bucketfs_location) as path: yield path @pytest.fixture(scope="session") def upload_sequence_classification_model_to_bucketfs( bucketfs_location: bfs.path.PathLike, tmpdir_factory) -> PurePosixPath: - base_model_specs = model_params.sequence_class_model_specs - tmpdir = tmpdir_factory.mktemp(base_model_specs.task_type) + model_specs = model_params.sequence_class_model_specs + tmpdir = tmpdir_factory.mktemp(model_specs.task_type) with upload_model_to_bucketfs( - base_model_specs, tmpdir, bucketfs_location) as path: + model_specs, tmpdir, bucketfs_location) as path: yield path @pytest.fixture(scope="session") def upload_sequence_classification_pair_model_to_bucketfs( bucketfs_location: bfs.path.PathLike, tmpdir_factory) -> PurePosixPath: - base_model_specs = model_params.sequence_class_pair_model_specs - tmpdir = tmpdir_factory.mktemp(base_model_specs.task_type) + model_specs = model_params.sequence_class_pair_model_specs + tmpdir = tmpdir_factory.mktemp(model_specs.task_type) with upload_model_to_bucketfs( - base_model_specs, tmpdir, bucketfs_location) as path: + model_specs, tmpdir, bucketfs_location) as path: yield path @pytest.fixture(scope="session") def upload_text_generation_model_to_bucketfs( bucketfs_location: bfs.path.PathLike, tmpdir_factory) -> PurePosixPath: - base_model_specs = model_params.text_gen_model_specs - tmpdir = tmpdir_factory.mktemp(base_model_specs.task_type) + model_specs = model_params.text_gen_model_specs + tmpdir = tmpdir_factory.mktemp(model_specs.task_type) with upload_model_to_bucketfs( - base_model_specs, tmpdir, bucketfs_location) as path: + model_specs, tmpdir, bucketfs_location) as path: yield path @pytest.fixture(scope="session") def upload_token_classification_model_to_bucketfs( bucketfs_location: bfs.path.PathLike, tmpdir_factory) -> PurePosixPath: - base_model_specs = model_params.token_model_specs - tmpdir = tmpdir_factory.mktemp(base_model_specs.task_type) + model_specs = model_params.token_model_specs + tmpdir = tmpdir_factory.mktemp(model_specs.task_type) with upload_model_to_bucketfs( - base_model_specs, tmpdir, bucketfs_location) as path: + model_specs, tmpdir, bucketfs_location) as path: yield path @pytest.fixture(scope="session") def upload_translation_model_to_bucketfs( bucketfs_location: bfs.path.PathLike, tmpdir_factory) -> PurePosixPath: - base_model_specs = model_params.seq2seq_model_specs - tmpdir = tmpdir_factory.mktemp(base_model_specs.task_type) + model_specs = model_params.seq2seq_model_specs + tmpdir = tmpdir_factory.mktemp(model_specs.task_type) with upload_model_to_bucketfs( - base_model_specs, tmpdir, bucketfs_location) as path: + model_specs, tmpdir, bucketfs_location) as path: yield path @pytest.fixture(scope="session") def upload_zero_shot_classification_model_to_bucketfs( bucketfs_location: bfs.path.PathLike, tmpdir_factory) -> PurePosixPath: - base_model_specs = model_params.zero_shot_model_specs - tmpdir = tmpdir_factory.mktemp(base_model_specs.task_type) + model_specs = model_params.zero_shot_model_specs + tmpdir = tmpdir_factory.mktemp(model_specs.task_type) with upload_model_to_bucketfs( - base_model_specs, tmpdir, bucketfs_location) as path: + model_specs, tmpdir, bucketfs_location) as path: yield path diff --git a/tests/integration_tests/with_db/test_upload_model.py b/tests/integration_tests/with_db/test_upload_model.py index 
0c7da8f7..4cdec9f3 100644 --- a/tests/integration_tests/with_db/test_upload_model.py +++ b/tests/integration_tests/with_db/test_upload_model.py @@ -5,10 +5,12 @@ from pytest_itde import config import exasol.bucketfs as bfs -from exasol_transformers_extension import upload_model +from exasol_transformers_extension import upload_model as upload_model_cli +from exasol_transformers_extension.utils.bucketfs_model_specification import get_BucketFSModelSpecification_from_model_Specs from tests.integration_tests.with_db.udfs.python_rows_to_sql import python_rows_to_sql from tests.utils import postprocessing from tests.utils.parameters import bucketfs_params, model_params + from tests.fixtures.model_fixture import * from tests.fixtures.setup_database_fixture import * from tests.fixtures.language_container_fixture import * @@ -32,13 +34,12 @@ def test_model_upload(setup_database, pyexasol_connection, tmp_path: Path, model_specification = model_params.base_model_specs model_specification.task_type = "filling_mask" model_name = model_specification.model_name - current_model_specs = CurrentModelSpecificationFromModelSpecs().transform(model_specification, - "", Path(sub_dir)) + current_model_specs = get_BucketFSModelSpecification_from_model_Specs(model_specification, "", Path(sub_dir)) upload_path = current_model_specs.get_bucketfs_model_save_path() parsed_url = urlparse(bucketfs_config.url) host = parsed_url.netloc.split(":")[0] port = parsed_url.netloc.split(":")[1] - print("path in bucket: "+ bucketfs_params.path_in_bucket) + print("path in bucket: " + bucketfs_params.path_in_bucket) args_list = [ "--bucketfs-name", bucketfs_params.name, "--bucketfs-host", host, @@ -55,7 +56,7 @@ def test_model_upload(setup_database, pyexasol_connection, tmp_path: Path, try: runner = CliRunner() - result = runner.invoke(upload_model.main, args_list) + result = runner.invoke(upload_model_cli.main, args_list) print(result) assert result.exit_code == 0 bucketfs_upload_location = bucketfs_location / upload_path.with_suffix(".tar.gz") @@ -89,5 +90,4 @@ def test_model_upload(setup_database, pyexasol_connection, tmp_path: Path, result = pyexasol_connection.execute(query).fetchall() assert len(result) == 1 and result[0][-1] is None finally: - postprocessing.cleanup_buckets(bucketfs_location, sub_dir) -#todo path is not corretc after upload? 
\ No newline at end of file + postprocessing.cleanup_buckets(bucketfs_location, sub_dir) \ No newline at end of file diff --git a/tests/integration_tests/with_db/udfs/test_model_downloader_udf_script.py b/tests/integration_tests/with_db/udfs/test_model_downloader_udf_script.py index 8749ca33..0a9716cb 100644 --- a/tests/integration_tests/with_db/udfs/test_model_downloader_udf_script.py +++ b/tests/integration_tests/with_db/udfs/test_model_downloader_udf_script.py @@ -1,7 +1,7 @@ from pathlib import Path from exasol_transformers_extension.utils import bucketfs_operations -from exasol_transformers_extension.utils.current_model_specification import CurrentModelSpecificationFromModelSpecs +from exasol_transformers_extension.utils.bucketfs_model_specification import get_BucketFSModelSpecification_from_model_Specs from tests.utils import postprocessing from tests.utils.parameters import model_params from tests.utils.bucketfs_file_list import get_bucketfs_file_list @@ -19,8 +19,8 @@ def test_model_downloader_udf_script( for i in range(n_rows): sub_dir = SUB_DIR.format(id=i) sub_dirs.append(sub_dir) - current_model_specs = CurrentModelSpecificationFromModelSpecs().transform(model_params.tiny_model_specs, - bucketfs_conn_name, Path(sub_dir)) + current_model_specs = get_BucketFSModelSpecification_from_model_Specs(model_params.tiny_model_specs, + bucketfs_conn_name, Path(sub_dir)) model_paths.append(current_model_specs.get_bucketfs_model_save_path()) input_data.append(( current_model_specs.model_name, diff --git a/tests/integration_tests/with_db/udfs/test_question_answering_script.py b/tests/integration_tests/with_db/udfs/test_question_answering_script.py index 70b4640e..cc70d632 100644 --- a/tests/integration_tests/with_db/udfs/test_question_answering_script.py +++ b/tests/integration_tests/with_db/udfs/test_question_answering_script.py @@ -23,7 +23,7 @@ def test_question_answering_script( str(model_params.sub_dir), model_params.q_a_model_specs.model_name, question, - model_params.text_data, + 'The database software company Exasol is based in Nuremberg', top_k )) diff --git a/tests/integration_tests/with_db/udfs/test_sequence_classification_text_pair_script.py b/tests/integration_tests/with_db/udfs/test_sequence_classification_text_pair_script.py index c13198c1..2663aa86 100644 --- a/tests/integration_tests/with_db/udfs/test_sequence_classification_text_pair_script.py +++ b/tests/integration_tests/with_db/udfs/test_sequence_classification_text_pair_script.py @@ -21,7 +21,7 @@ def test_sequence_classification_text_pair_script( bucketfs_conn_name, str(model_params.sub_dir), model_params.sequence_class_pair_model_specs.model_name, - model_params.text_data, + 'The database software company Exasol is based in Nuremberg', 'The main Exasol office is located in Flensburg')) query = f"SELECT TE_SEQUENCE_CLASSIFICATION_TEXT_PAIR_UDF(" \ diff --git a/tests/integration_tests/with_db/udfs/test_token_classification_script.py b/tests/integration_tests/with_db/udfs/test_token_classification_script.py index 88c2500d..76d0bd87 100644 --- a/tests/integration_tests/with_db/udfs/test_token_classification_script.py +++ b/tests/integration_tests/with_db/udfs/test_token_classification_script.py @@ -20,7 +20,7 @@ def test_token_classification_script( bucketfs_conn_name, str(model_params.sub_dir), model_params.token_model_specs.model_name, - model_params.text_data, + 'The database software company Exasol is based in Nuremberg', aggregation_strategy )) diff --git 
a/tests/integration_tests/with_db/udfs/test_translation_script.py b/tests/integration_tests/with_db/udfs/test_translation_script.py index 34174f34..f562967a 100644 --- a/tests/integration_tests/with_db/udfs/test_translation_script.py +++ b/tests/integration_tests/with_db/udfs/test_translation_script.py @@ -16,7 +16,7 @@ def test_translation_script( bucketfs_conn_name, str(model_params.sub_dir), model_params.seq2seq_model_specs.model_name, - model_params.text_data, + 'The database software company Exasol is based in Nuremberg', src_lang, target_lang, max_length @@ -49,7 +49,7 @@ def test_translation_script( print(result) # lenient test for quality of results, will be replaced by deterministic test later results = [result[i][7] for i in range(len(result))] - acceptable_results = ["Die Firma Exasol hat ihren Sitz in Nürnberg"] + acceptable_results = ["Die Datenbanksoftware Exasol hat ihren Sitz in Nürnberg"] number_accepted_results = 0 def contains(string,list): diff --git a/tests/integration_tests/with_db/udfs/test_zero_shot_text_classification_script.py b/tests/integration_tests/with_db/udfs/test_zero_shot_text_classification_script.py index 6ef924e4..7707abae 100644 --- a/tests/integration_tests/with_db/udfs/test_zero_shot_text_classification_script.py +++ b/tests/integration_tests/with_db/udfs/test_zero_shot_text_classification_script.py @@ -22,7 +22,7 @@ def test_zero_shot_classification_single_text_script( bucketfs_conn_name, str(model_params.sub_dir), model_params.zero_shot_model_specs.model_name, - model_params.text_data, + 'The database software company Exasol is based in Nuremberg', candidate_labels )) diff --git a/tests/integration_tests/without_db/udfs/test_model_downloader_udf.py b/tests/integration_tests/without_db/udfs/test_model_downloader_udf.py index 2e525586..af774d61 100644 --- a/tests/integration_tests/without_db/udfs/test_model_downloader_udf.py +++ b/tests/integration_tests/without_db/udfs/test_model_downloader_udf.py @@ -7,7 +7,7 @@ from exasol_transformers_extension.udfs.models.model_downloader_udf import \ ModelDownloaderUDF from exasol_transformers_extension.utils import bucketfs_operations -from exasol_transformers_extension.utils.current_model_specification import CurrentModelSpecificationFromModelSpecs +from exasol_transformers_extension.utils.bucketfs_model_specification import get_BucketFSModelSpecification_from_model_Specs from tests.utils.parameters import model_params from tests.utils.mock_connections import ( create_mounted_bucketfs_connection, create_hf_token_connection) @@ -66,8 +66,8 @@ class TestEnvironmentSetup: def __init__(self, id: str, tmp_dir: Path, token_conn_name: str): self.bucketfs_conn_name = "bucketfs_connection" + id self.sub_dir = model_params.sub_dir + id - current_model_specs = CurrentModelSpecificationFromModelSpecs().transform(model_params.tiny_model_specs, - self.bucketfs_conn_name, Path(self.sub_dir)) + current_model_specs = get_BucketFSModelSpecification_from_model_Specs(model_params.tiny_model_specs, + self.bucketfs_conn_name, Path(self.sub_dir)) self.token_conn_name = token_conn_name self.ctx_data = { 'tiny_model': current_model_specs.model_name, diff --git a/tests/integration_tests/without_db/udfs/test_sequence_classification_text_pair_udf.py b/tests/integration_tests/without_db/udfs/test_sequence_classification_text_pair_udf.py index a75247f3..6c5d2410 100644 --- a/tests/integration_tests/without_db/udfs/test_sequence_classification_text_pair_udf.py +++ 
b/tests/integration_tests/without_db/udfs/test_sequence_classification_text_pair_udf.py @@ -11,6 +11,12 @@ from tests.utils.parameters import model_params from tests.utils.mock_connections import create_mounted_bucketfs_connection +from tests.fixtures.model_fixture import * +from tests.fixtures.setup_database_fixture import * +from tests.fixtures.language_container_fixture import * +from tests.fixtures.bucketfs_fixture import * +from tests.fixtures.database_connection_fixture import * + class ExaEnvironment: def __init__(self, connections: Dict[str, Connection] = None): @@ -90,7 +96,7 @@ def test_sequence_classification_text_pair_udf( grouped_by_inputs = result_df.groupby('first_text') n_unique_labels_per_input = grouped_by_inputs['label'].nunique().to_list() - n_labels = 2 + n_labels = 3 n_labels_per_input_expected = [n_labels] * n_rows result = Result(result_df) diff --git a/tests/integration_tests/without_db/utils/test_load_local_model.py b/tests/integration_tests/without_db/utils/test_load_local_model.py index 176e21aa..00ee4195 100644 --- a/tests/integration_tests/without_db/utils/test_load_local_model.py +++ b/tests/integration_tests/without_db/utils/test_load_local_model.py @@ -5,7 +5,7 @@ from transformers import AutoModel, AutoTokenizer, pipeline import tarfile -from exasol_transformers_extension.utils.current_model_specification import CurrentModelSpecification +from exasol_transformers_extension.utils.bucketfs_model_specification import BucketFSModelSpecification from exasol_transformers_extension.utils.load_local_model import LoadLocalModel from exasol_transformers_extension.utils.model_factory_protocol import ModelFactoryProtocol from exasol_transformers_extension.utils.huggingface_hub_bucketfs_model_transfer_sp import \ @@ -26,11 +26,11 @@ def __init__(self): self.token = "token" self.model_specification = model_params.tiny_model_specs - self.mock_current_model_specification: Union[CurrentModelSpecification, MagicMock] = create_autospec(CurrentModelSpecification) + self.mock_current_model_specification: Union[BucketFSModelSpecification, MagicMock] = create_autospec(BucketFSModelSpecification) test_pipeline = pipeline self.loader = LoadLocalModel( test_pipeline, - task_name="token-classification", + task_type="token-classification", device="cpu", base_model_factory=self.base_model_factory, tokenizer_factory=self.tokenizer_factory diff --git a/tests/unit_tests/udf_wrapper_params/filling_mask/error_not_cached_multiple_model_multiple_batch.py b/tests/unit_tests/udf_wrapper_params/filling_mask/error_not_cached_multiple_model_multiple_batch.py index bd5a07db..cf35b310 100644 --- a/tests/unit_tests/udf_wrapper_params/filling_mask/error_not_cached_multiple_model_multiple_batch.py +++ b/tests/unit_tests/udf_wrapper_params/filling_mask/error_not_cached_multiple_model_multiple_batch.py @@ -53,9 +53,9 @@ class ErrorNotCachedMultipleModelMultipleBatch: "bfs_conn2": Connection(address=f"file://{base_cache_dir2}") } mock_factory = MockFillingMaskFactory({ - PurePosixPath(base_cache_dir1, "sub_dir1", "model1"): + PurePosixPath(base_cache_dir1, "sub_dir1", "model1_fill-mask"): MockFillingMaskModel(sequence="text valid 1", score=0.1, rank=1), - PurePosixPath(base_cache_dir2, "sub_dir2", "model2"): + PurePosixPath(base_cache_dir2, "sub_dir2", "model2_fill-mask"): MockFillingMaskModel(sequence="text valid 2", score=0.2, rank=1) }) diff --git a/tests/unit_tests/udf_wrapper_params/filling_mask/error_not_cached_single_model_multiple_batch.py 
b/tests/unit_tests/udf_wrapper_params/filling_mask/error_not_cached_single_model_multiple_batch.py index 95fe1623..8a9741d0 100644 --- a/tests/unit_tests/udf_wrapper_params/filling_mask/error_not_cached_single_model_multiple_batch.py +++ b/tests/unit_tests/udf_wrapper_params/filling_mask/error_not_cached_single_model_multiple_batch.py @@ -48,7 +48,7 @@ class ErrorNotCachedSingleModelMultipleBatch: } mock_factory = MockFillingMaskFactory({ - PurePosixPath(base_cache_dir1, "sub_dir1", "model1"): + PurePosixPath(base_cache_dir1, "sub_dir1", "model1_fill-mask"): MockFillingMaskModel(sequence="text valid 1", score=0.1, rank=1) }) diff --git a/tests/unit_tests/udf_wrapper_params/filling_mask/error_on_prediction_single_model_multiple_batch.py b/tests/unit_tests/udf_wrapper_params/filling_mask/error_on_prediction_single_model_multiple_batch.py index dc4a50a0..715bd71a 100644 --- a/tests/unit_tests/udf_wrapper_params/filling_mask/error_on_prediction_single_model_multiple_batch.py +++ b/tests/unit_tests/udf_wrapper_params/filling_mask/error_on_prediction_single_model_multiple_batch.py @@ -47,7 +47,7 @@ class ErrorOnPredictionSingleModelMultipleBatch: } mock_factory = MockFillingMaskFactory({ - PurePosixPath(base_cache_dir1, "sub_dir1", "model1"): + PurePosixPath(base_cache_dir1, "sub_dir1", "model1_fill-mask"): MockFillingMaskModel(sequence="text valid 1", score=0.1, rank=1) }) diff --git a/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_bfsconn_single_subdir_single_model_multiple_batch.py b/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_bfsconn_single_subdir_single_model_multiple_batch.py index e3ba8d4a..0b0dd61e 100644 --- a/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_bfsconn_single_subdir_single_model_multiple_batch.py +++ b/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_bfsconn_single_subdir_single_model_multiple_batch.py @@ -52,9 +52,9 @@ class MultipleBucketFSConnSingleSubdirSingleModelNameMultipleBatch: "bfs_conn2": Connection(address=f"file://{base_cache_dir2}") } mock_factory = MockFillingMaskFactory({ - PurePosixPath(base_cache_dir1, "sub_dir1", "model1"): + PurePosixPath(base_cache_dir1, "sub_dir1", "model1_fill-mask"): MockFillingMaskModel(sequence="text valid 1", score=0.1, rank=1), - PurePosixPath(base_cache_dir2, "sub_dir1", "model1"): + PurePosixPath(base_cache_dir2, "sub_dir1", "model1_fill-mask"): MockFillingMaskModel(sequence="text valid 2", score=0.2, rank=1) }) diff --git a/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_bfsconn_single_subdir_single_model_single_batch.py b/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_bfsconn_single_subdir_single_model_single_batch.py index 1743857f..099a5147 100644 --- a/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_bfsconn_single_subdir_single_model_single_batch.py +++ b/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_bfsconn_single_subdir_single_model_single_batch.py @@ -52,9 +52,9 @@ class MultipleBucketFSConnSingleSubdirSingleModelNameSingleBatch: "bfs_conn2": Connection(address=f"file://{base_cache_dir2}") } mock_factory = MockFillingMaskFactory({ - PurePosixPath(base_cache_dir1, "sub_dir1", "model1"): + PurePosixPath(base_cache_dir1, "sub_dir1", "model1_fill-mask"): MockFillingMaskModel(sequence="text valid 1", score=0.1, rank=1), - PurePosixPath(base_cache_dir2, "sub_dir1", "model1"): + PurePosixPath(base_cache_dir2, "sub_dir1", "model1_fill-mask"): MockFillingMaskModel(sequence="text valid 2", score=0.2, rank=1) }) diff --git 
a/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_model_multiple_batch_complete.py b/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_model_multiple_batch_complete.py index f10d4443..04b32975 100644 --- a/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_model_multiple_batch_complete.py +++ b/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_model_multiple_batch_complete.py @@ -52,9 +52,9 @@ class MultipleModelMultipleBatchComplete: "bfs_conn2": Connection(address=f"file://{base_cache_dir2}") } mock_factory = MockFillingMaskFactory({ - PurePosixPath(base_cache_dir1, "sub_dir1", "model1"): + PurePosixPath(base_cache_dir1, "sub_dir1", "model1_fill-mask"): MockFillingMaskModel(sequence="text valid 1", score=0.1, rank=1), - PurePosixPath(base_cache_dir2, "sub_dir2", "model2"): + PurePosixPath(base_cache_dir2, "sub_dir2", "model2_fill-mask"): MockFillingMaskModel(sequence="text valid 2", score=0.2, rank=1) }) diff --git a/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_model_multiple_batch_incomplete.py b/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_model_multiple_batch_incomplete.py index f7af363b..f2ec5586 100644 --- a/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_model_multiple_batch_incomplete.py +++ b/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_model_multiple_batch_incomplete.py @@ -53,9 +53,9 @@ class MultipleModelMultipleBatchIncomplete: } mock_factory = MockFillingMaskFactory({ - PurePosixPath(base_cache_dir1, "sub_dir1", "model1"): + PurePosixPath(base_cache_dir1, "sub_dir1", "model1_fill-mask"): MockFillingMaskModel(sequence="text valid 1", score=0.1, rank=1), - PurePosixPath(base_cache_dir2, "sub_dir2", "model2"): + PurePosixPath(base_cache_dir2, "sub_dir2", "model2_fill-mask"): MockFillingMaskModel(sequence="text valid 2", score=0.2, rank=1) }) diff --git a/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_model_multiple_batch_multiple_models_per_batch.py b/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_model_multiple_batch_multiple_models_per_batch.py index 49fbe430..f658add1 100644 --- a/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_model_multiple_batch_multiple_models_per_batch.py +++ b/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_model_multiple_batch_multiple_models_per_batch.py @@ -64,13 +64,13 @@ class MultipleModelMultipleBatchMultipleModelsPerBatch: "bfs_conn4": Connection(address=f"file://{base_cache_dir4}") } mock_factory = MockFillingMaskFactory({ - PurePosixPath(base_cache_dir1, "sub_dir1", "model1"): + PurePosixPath(base_cache_dir1, "sub_dir1", "model1_fill-mask"): MockFillingMaskModel(sequence="text valid 1", score=0.1, rank=1), - PurePosixPath(base_cache_dir2, "sub_dir2", "model2"): + PurePosixPath(base_cache_dir2, "sub_dir2", "model2_fill-mask"): MockFillingMaskModel(sequence="text valid 2", score=0.2, rank=1), - PurePosixPath(base_cache_dir3, "sub_dir3", "model3"): + PurePosixPath(base_cache_dir3, "sub_dir3", "model3_fill-mask"): MockFillingMaskModel(sequence="text valid 3", score=0.3, rank=1), - PurePosixPath(base_cache_dir4, "sub_dir4", "model4"): + PurePosixPath(base_cache_dir4, "sub_dir4", "model4_fill-mask"): MockFillingMaskModel(sequence="text valid 4", score=0.4, rank=1) }) diff --git a/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_model_single_batch_complete.py b/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_model_single_batch_complete.py index 3087de66..8a99d6dd 100644 --- 
a/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_model_single_batch_complete.py +++ b/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_model_single_batch_complete.py @@ -52,9 +52,9 @@ class MultipleModelSingleBatchComplete: "bfs_conn2": Connection(address=f"file://{base_cache_dir2}") } mock_factory = MockFillingMaskFactory({ - PurePosixPath(base_cache_dir1, "sub_dir1", "model1"): + PurePosixPath(base_cache_dir1, "sub_dir1", "model1_fill-mask"): MockFillingMaskModel(sequence="text valid 1", score=0.1, rank=1), - PurePosixPath(base_cache_dir2,"sub_dir2", "model2"): + PurePosixPath(base_cache_dir2,"sub_dir2", "model2_fill-mask"): MockFillingMaskModel(sequence="text valid 2", score=0.2, rank=1) }) diff --git a/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_model_single_batch_incomplete.py b/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_model_single_batch_incomplete.py index b47f664b..b2ae8ffa 100644 --- a/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_model_single_batch_incomplete.py +++ b/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_model_single_batch_incomplete.py @@ -53,9 +53,9 @@ class MultipleModelSingleBatchIncomplete: "bfs_conn2": Connection(address=f"file://{base_cache_dir2}") } mock_factory = MockFillingMaskFactory({ - PurePosixPath(base_cache_dir1, "sub_dir1", "model1"): + PurePosixPath(base_cache_dir1, "sub_dir1", "model1_fill-mask"): MockFillingMaskModel(sequence="text valid 1", score=0.1, rank=1), - PurePosixPath(base_cache_dir2, "sub_dir2", "model2"): + PurePosixPath(base_cache_dir2, "sub_dir2", "model2_fill-mask"): MockFillingMaskModel(sequence="text valid 2", score=0.2, rank=1) }) diff --git a/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_topk_single_model_multiple_batch.py b/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_topk_single_model_multiple_batch.py index 86d3f87e..21fe9eb2 100644 --- a/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_topk_single_model_multiple_batch.py +++ b/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_topk_single_model_multiple_batch.py @@ -52,7 +52,7 @@ class MultipleTopkSingleModelNameMultipleBatch: } mock_factory = MockFillingMaskFactory({ - PurePosixPath(base_cache_dir1, "sub_dir1", "model1"): + PurePosixPath(base_cache_dir1, "sub_dir1", "model1_fill-mask"): MockFillingMaskModel(sequence="text valid 1", score=0.1, rank=1) }) diff --git a/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_topk_single_model_single_batch.py b/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_topk_single_model_single_batch.py index 4279d260..ad0bbd43 100644 --- a/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_topk_single_model_single_batch.py +++ b/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_topk_single_model_single_batch.py @@ -52,7 +52,7 @@ class MultipleTopkSingleModelNameSingleBatch: } mock_factory = MockFillingMaskFactory({ - PurePosixPath(base_cache_dir1, "sub_dir1", "model1"): + PurePosixPath(base_cache_dir1, "sub_dir1", "model1_fill-mask"): MockFillingMaskModel(sequence="text valid 1", score=0.1, rank=1) }) diff --git a/tests/unit_tests/udf_wrapper_params/filling_mask/single_bfsconn_multiple_subdir_single_model_multiple_batch.py b/tests/unit_tests/udf_wrapper_params/filling_mask/single_bfsconn_multiple_subdir_single_model_multiple_batch.py index 198b8e21..c13a909f 100644 --- a/tests/unit_tests/udf_wrapper_params/filling_mask/single_bfsconn_multiple_subdir_single_model_multiple_batch.py +++ 
b/tests/unit_tests/udf_wrapper_params/filling_mask/single_bfsconn_multiple_subdir_single_model_multiple_batch.py @@ -51,9 +51,9 @@ class SingleBucketFSConnMultipleSubdirSingleModelNameMultipleBatch: } mock_factory = MockFillingMaskFactory({ - PurePosixPath(base_cache_dir1, "sub_dir1", "model1"): + PurePosixPath(base_cache_dir1, "sub_dir1", "model1_fill-mask"): MockFillingMaskModel(sequence="text valid 1", score=0.1, rank=1), - PurePosixPath(base_cache_dir1, "sub_dir2", "model1"): + PurePosixPath(base_cache_dir1, "sub_dir2", "model1_fill-mask"): MockFillingMaskModel(sequence="text valid 2", score=0.2, rank=1) }) diff --git a/tests/unit_tests/udf_wrapper_params/filling_mask/single_bfsconn_multiple_subdir_single_model_single_batch.py b/tests/unit_tests/udf_wrapper_params/filling_mask/single_bfsconn_multiple_subdir_single_model_single_batch.py index e6080336..2ef50292 100644 --- a/tests/unit_tests/udf_wrapper_params/filling_mask/single_bfsconn_multiple_subdir_single_model_single_batch.py +++ b/tests/unit_tests/udf_wrapper_params/filling_mask/single_bfsconn_multiple_subdir_single_model_single_batch.py @@ -51,9 +51,9 @@ class SingleBucketFSConnMultipleSubdirSingleModelNameSingleBatch: } mock_factory = MockFillingMaskFactory({ - PurePosixPath(base_cache_dir1, "sub_dir1", "model1"): + PurePosixPath(base_cache_dir1, "sub_dir1", "model1_fill-mask"): MockFillingMaskModel(sequence="text valid 1", score=0.1, rank=1), - PurePosixPath(base_cache_dir1, "sub_dir2", "model1"): + PurePosixPath(base_cache_dir1, "sub_dir2", "model1_fill-mask"): MockFillingMaskModel(sequence="text valid 2", score=0.2, rank=1) }) diff --git a/tests/unit_tests/udf_wrapper_params/filling_mask/single_model_multiple_batch_complete.py b/tests/unit_tests/udf_wrapper_params/filling_mask/single_model_multiple_batch_complete.py index 5352f52a..d88a07d5 100644 --- a/tests/unit_tests/udf_wrapper_params/filling_mask/single_model_multiple_batch_complete.py +++ b/tests/unit_tests/udf_wrapper_params/filling_mask/single_model_multiple_batch_complete.py @@ -47,7 +47,7 @@ class SingleModelMultipleBatchComplete: } mock_factory = MockFillingMaskFactory({ - PurePosixPath(base_cache_dir1, "sub_dir1", "model1"): + PurePosixPath(base_cache_dir1, "sub_dir1", "model1_fill-mask"): MockFillingMaskModel(sequence="text valid 1", score=0.1, rank=1) }) diff --git a/tests/unit_tests/udf_wrapper_params/filling_mask/single_model_multiple_batch_incomplete.py b/tests/unit_tests/udf_wrapper_params/filling_mask/single_model_multiple_batch_incomplete.py index de72ea3a..09cf5c8b 100644 --- a/tests/unit_tests/udf_wrapper_params/filling_mask/single_model_multiple_batch_incomplete.py +++ b/tests/unit_tests/udf_wrapper_params/filling_mask/single_model_multiple_batch_incomplete.py @@ -47,7 +47,7 @@ class SingleModelMultipleBatchIncomplete: } mock_factory = MockFillingMaskFactory({ - PurePosixPath(base_cache_dir1, "sub_dir1", "model1"): + PurePosixPath(base_cache_dir1, "sub_dir1", "model1_fill-mask"): MockFillingMaskModel(sequence="text valid 1", score=0.1, rank=1) }) diff --git a/tests/unit_tests/udf_wrapper_params/filling_mask/single_model_single_batch_complete.py b/tests/unit_tests/udf_wrapper_params/filling_mask/single_model_single_batch_complete.py index a627df57..66cd1be3 100644 --- a/tests/unit_tests/udf_wrapper_params/filling_mask/single_model_single_batch_complete.py +++ b/tests/unit_tests/udf_wrapper_params/filling_mask/single_model_single_batch_complete.py @@ -47,7 +47,7 @@ class SingleModelSingleBatchComplete: } mock_factory = MockFillingMaskFactory({ - 
PurePosixPath(base_cache_dir1, "sub_dir1", "model1"): + PurePosixPath(base_cache_dir1, "sub_dir1", "model1_fill-mask"): MockFillingMaskModel(sequence="text valid 1", score=0.1, rank=1) }) diff --git a/tests/unit_tests/udf_wrapper_params/filling_mask/single_model_single_batch_incomplete.py b/tests/unit_tests/udf_wrapper_params/filling_mask/single_model_single_batch_incomplete.py index 7c91df61..d750cafa 100644 --- a/tests/unit_tests/udf_wrapper_params/filling_mask/single_model_single_batch_incomplete.py +++ b/tests/unit_tests/udf_wrapper_params/filling_mask/single_model_single_batch_incomplete.py @@ -47,7 +47,7 @@ class SingleModelSingleBatchIncomplete: } mock_factory = MockFillingMaskFactory({ - PurePosixPath(base_cache_dir1, "sub_dir1", "model1"): + PurePosixPath(base_cache_dir1, "sub_dir1", "model1_fill-mask"): MockFillingMaskModel(sequence="text valid 1", score=0.1, rank=1) }) diff --git a/tests/unit_tests/udf_wrapper_params/filling_mask/single_topk_multiple_model_multiple_batch.py b/tests/unit_tests/udf_wrapper_params/filling_mask/single_topk_multiple_model_multiple_batch.py index 9166249f..4c8400a0 100644 --- a/tests/unit_tests/udf_wrapper_params/filling_mask/single_topk_multiple_model_multiple_batch.py +++ b/tests/unit_tests/udf_wrapper_params/filling_mask/single_topk_multiple_model_multiple_batch.py @@ -51,9 +51,9 @@ class SingleTopkMultipleModelNameMultipleBatch: } mock_factory = MockFillingMaskFactory({ - PurePosixPath(base_cache_dir1, "sub_dir1", "model1"): + PurePosixPath(base_cache_dir1, "sub_dir1", "model1_fill-mask"): MockFillingMaskModel(sequence="text valid 1", score=0.1, rank=1), - PurePosixPath(base_cache_dir1, "sub_dir1", "model2"): + PurePosixPath(base_cache_dir1, "sub_dir1", "model2_fill-mask"): MockFillingMaskModel(sequence="text valid 2", score=0.2, rank=1) }) diff --git a/tests/unit_tests/udf_wrapper_params/filling_mask/single_topk_multiple_model_single_batch.py b/tests/unit_tests/udf_wrapper_params/filling_mask/single_topk_multiple_model_single_batch.py index 7d38b5dc..6e3a88cd 100644 --- a/tests/unit_tests/udf_wrapper_params/filling_mask/single_topk_multiple_model_single_batch.py +++ b/tests/unit_tests/udf_wrapper_params/filling_mask/single_topk_multiple_model_single_batch.py @@ -51,9 +51,9 @@ class SingleTopkMultipleModelNameSingleBatch: } mock_factory = MockFillingMaskFactory({ - PurePosixPath(base_cache_dir1, "sub_dir1", "model1"): + PurePosixPath(base_cache_dir1, "sub_dir1", "model1_fill-mask"): MockFillingMaskModel(sequence="text valid 1", score=0.1, rank=1), - PurePosixPath(base_cache_dir1, "sub_dir1", "model2"): + PurePosixPath(base_cache_dir1, "sub_dir1", "model2_fill-mask"): MockFillingMaskModel(sequence="text valid 2", score=0.2, rank=1) }) diff --git a/tests/unit_tests/udfs/test_model_downloader_udf.py b/tests/unit_tests/udfs/test_model_downloader_udf.py index e15fe25b..99fbc5fb 100644 --- a/tests/unit_tests/udfs/test_model_downloader_udf.py +++ b/tests/unit_tests/udfs/test_model_downloader_udf.py @@ -7,8 +7,8 @@ from exasol_udf_mock_python.connection import Connection from exasol_udf_mock_python.mock_meta_data import MockMetaData -from exasol_transformers_extension.utils.current_model_specification import CurrentModelSpecification, \ - CurrentModelSpecificationFactory +from exasol_transformers_extension.utils.bucketfs_model_specification import BucketFSModelSpecification, \ + BucketFSModelSpecificationFactory from tests.unit_tests.utils_for_udf_tests import create_mock_exa_environment, create_mock_udf_context from 
exasol_transformers_extension.udfs.models.model_downloader_udf import \ ModelDownloaderUDF @@ -18,7 +18,6 @@ from tests.utils.matchers import AnyOrder from tests.utils.mock_cast import mock_cast -#todo add tests? def create_mock_metadata() -> MockMetaData: def udf_wrapper(): pass @@ -68,15 +67,15 @@ def test_model_downloader(mock_create_loc, description, count, token_conn_name, bucketfs_connections = [Connection(address=f"file:///test{i}") for i in range(count)] bfs_conn_name = [f"bfs_conn_name_{i}" for i in bucketfs_connections] - mock_cmss = [create_autospec(CurrentModelSpecification, + mock_cmss = [create_autospec(BucketFSModelSpecification, model_name=base_model_names[i], task_type=task_type[i], sub_dir=Path(sub_directory_names[i]), - get_model_factory=CurrentModelSpecification.get_model_factory) for i in range(count)] + get_model_factory=BucketFSModelSpecification.get_model_factory) for i in range(count)] for i in range(count): mock_cast(mock_cmss[i].get_bucketfs_model_save_path).side_effect = [f'{sub_directory_names[i]}/{base_model_names[i]}'] - mock_current_model_specification_factory: Union[CurrentModelSpecificationFactory, MagicMock] = ( - create_autospec(CurrentModelSpecificationFactory)) + mock_current_model_specification_factory: Union[BucketFSModelSpecificationFactory, MagicMock] = ( + create_autospec(BucketFSModelSpecificationFactory)) mock_cast(mock_current_model_specification_factory.create).side_effect = mock_cmss input_data = [ diff --git a/tests/unit_tests/utils/test_load_local_model.py b/tests/unit_tests/utils/test_load_local_model.py index 1ef163a3..89a0c856 100644 --- a/tests/unit_tests/utils/test_load_local_model.py +++ b/tests/unit_tests/utils/test_load_local_model.py @@ -6,7 +6,7 @@ import transformers from exasol_transformers_extension.utils.bucketfs_operations import create_save_pretrained_model_path -from exasol_transformers_extension.utils.current_model_specification import CurrentModelSpecification +from exasol_transformers_extension.utils.bucketfs_model_specification import BucketFSModelSpecification from exasol_transformers_extension.utils.model_factory_protocol import ModelFactoryProtocol from exasol_transformers_extension.utils.load_local_model import LoadLocalModel from exasol_transformers_extension.utils.model_specification import ModelSpecification @@ -22,7 +22,7 @@ def __init__(self): self.token = "token" self.model_name = "model_name" self.model_task = "test_task" - self.mock_current_model_specification: Union[CurrentModelSpecification, MagicMock] = create_autospec(CurrentModelSpecification) + self.mock_current_model_specification: Union[BucketFSModelSpecification, MagicMock] = create_autospec(BucketFSModelSpecification) self.cache_dir = "test/Path" self.mock_pipeline = Mock() diff --git a/tests/utils/parameters.py b/tests/utils/parameters.py index 2edf15ae..2aef54b4 100644 --- a/tests/utils/parameters.py +++ b/tests/utils/parameters.py @@ -1,7 +1,7 @@ from dataclasses import dataclass from pathlib import Path -from exasol_transformers_extension.utils.current_model_specification import CurrentModelSpecification +from exasol_transformers_extension.utils.bucketfs_model_specification import BucketFSModelSpecification from exasol_transformers_extension.utils.model_specification import ModelSpecification @@ -15,7 +15,7 @@ class BucketFSParams: @dataclass(frozen=True) class ModelParams: - base_model_specs: ModelSpecification # this is used for other tests, task_type should be set per test + base_model_specs: ModelSpecification # this is used for other 
tests, task_name should be set per test seq2seq_model_specs: ModelSpecification # this model is used for testing translation_udf q_a_model_specs: ModelSpecification # this model is used for testing question answering text_gen_model_specs: ModelSpecification # used for text generation tests From f5c8c7c564cc6cd044b4462b3e9e9a657fb6de7c Mon Sep 17 00:00:00 2001 From: MarleneKress79789 Date: Wed, 31 Jul 2024 15:03:19 +0200 Subject: [PATCH 05/31] [CodeBuild] remove prints, changelog --- doc/changes/changes_2.0.0.md | 2 ++ tests/fixtures/model_fixture.py | 1 - tests/integration_tests/with_db/test_upload_model.py | 1 - .../with_db/udfs/test_prediction_with_downloader_udf.py | 1 - .../with_db/udfs/test_question_answering_script.py | 2 -- .../udfs/test_sequence_classification_text_pair_script.py | 3 --- .../with_db/udfs/test_text_generation_script.py | 2 -- .../with_db/udfs/test_token_classification_script.py | 1 - .../integration_tests/with_db/udfs/test_translation_script.py | 1 - 9 files changed, 2 insertions(+), 12 deletions(-) diff --git a/doc/changes/changes_2.0.0.md b/doc/changes/changes_2.0.0.md index 09ad262b..0e4e6b7d 100644 --- a/doc/changes/changes_2.0.0.md +++ b/doc/changes/changes_2.0.0.md @@ -13,6 +13,7 @@ Code name: ### Bugs - #237: Fixed reference to python-extension-common +- #245: Added task_type parameter to fix model saving and loading ### Documentation @@ -27,5 +28,6 @@ Code name: - #217: Refactored PredictionUDFs and LoadLocalModel so that LoadLocalModel constructs the bucketfs model file path - #230: Updated supported python version to >= Python 3.10 - #236: Moved to the PathLike bucketfs interface. +- #218: Changed upload_model_udf to load model from Huggingface ### Security diff --git a/tests/fixtures/model_fixture.py b/tests/fixtures/model_fixture.py index e63884e2..2f6bcb0d 100644 --- a/tests/fixtures/model_fixture.py +++ b/tests/fixtures/model_fixture.py @@ -61,7 +61,6 @@ def prepare_model_for_local_bucketfs(model_specification: ModelSpecification, model_path_in_bucketfs = current_model_specs.get_bucketfs_model_save_path() bucketfs_path_for_model = tmpdir / model_path_in_bucketfs - print(bucketfs_path_for_model) download_model_to_path(current_model_specs, bucketfs_path_for_model) return tmpdir diff --git a/tests/integration_tests/with_db/test_upload_model.py b/tests/integration_tests/with_db/test_upload_model.py index b72562ad..9524ec0e 100644 --- a/tests/integration_tests/with_db/test_upload_model.py +++ b/tests/integration_tests/with_db/test_upload_model.py @@ -52,7 +52,6 @@ def test_model_upload(upload_params, try: runner = CliRunner() result = runner.invoke(upload_model_cli.main, args_list) - print(result) assert result.exit_code == 0 time.sleep(20) bucketfs_upload_location = bucketfs_location / upload_path.with_suffix(".tar.gz") diff --git a/tests/integration_tests/with_db/udfs/test_prediction_with_downloader_udf.py b/tests/integration_tests/with_db/udfs/test_prediction_with_downloader_udf.py index 2b2dda9a..73d33f73 100644 --- a/tests/integration_tests/with_db/udfs/test_prediction_with_downloader_udf.py +++ b/tests/integration_tests/with_db/udfs/test_prediction_with_downloader_udf.py @@ -32,7 +32,6 @@ def test_prediction_with_downloader_udf( result = db_conn.execute(query).fetchall() time.sleep(10) - print(result) # execute the filling mask UDF text_data = "I <mask> you so much." 
diff --git a/tests/integration_tests/with_db/udfs/test_question_answering_script.py b/tests/integration_tests/with_db/udfs/test_question_answering_script.py index a0dc2f24..52e1989f 100644 --- a/tests/integration_tests/with_db/udfs/test_question_answering_script.py +++ b/tests/integration_tests/with_db/udfs/test_question_answering_script.py @@ -51,8 +51,6 @@ def test_question_answering_script( n_cols_result = len(input_data[0]) + (added_columns - removed_columns) assert len(result) == n_rows_result and len(result[0]) == n_cols_result - for i in range(5): - print(result[i]) results = [result[i][6] for i in range(len(result))] acceptable_results = ["Nuremberg", "Germany"] number_accepted_results = 0 diff --git a/tests/integration_tests/with_db/udfs/test_sequence_classification_text_pair_script.py b/tests/integration_tests/with_db/udfs/test_sequence_classification_text_pair_script.py index d154eaf2..928415a4 100644 --- a/tests/integration_tests/with_db/udfs/test_sequence_classification_text_pair_script.py +++ b/tests/integration_tests/with_db/udfs/test_sequence_classification_text_pair_script.py @@ -38,9 +38,6 @@ def test_sequence_classification_text_pair_script( # execute sequence classification UDF result = db_conn.execute(query).fetchall() - for i in range(10): - print(result[i]) - # assertions assert result[0][-1] is None added_columns = 3 # label,score,error_message diff --git a/tests/integration_tests/with_db/udfs/test_text_generation_script.py b/tests/integration_tests/with_db/udfs/test_text_generation_script.py index f1d0133b..1f0089c4 100644 --- a/tests/integration_tests/with_db/udfs/test_text_generation_script.py +++ b/tests/integration_tests/with_db/udfs/test_text_generation_script.py @@ -52,8 +52,6 @@ def test_text_generation_script( assert len(result) == n_rows_result and len(result[0]) == n_cols_result # lenient test for quality of results, will be replaced by deterministic test later - for i in range(5): - print(result[i]) results = [result[i][6] for i in range(len(result))] acceptable_results = ["software", "system", "solution", "tool"] number_accepted_results = 0 diff --git a/tests/integration_tests/with_db/udfs/test_token_classification_script.py b/tests/integration_tests/with_db/udfs/test_token_classification_script.py index d68a04cf..3ac674fe 100644 --- a/tests/integration_tests/with_db/udfs/test_token_classification_script.py +++ b/tests/integration_tests/with_db/udfs/test_token_classification_script.py @@ -47,7 +47,6 @@ def test_token_classification_script( # lenient test for quality of results, will be replaced by deterministic test later results = [[result[i][7], result[i][8]] for i in range(len(result))] - print(result) acceptable_result_sets = [["Exasol", "ORG"], ["Nuremberg", "LOC"]] number_accepted_results = 0 diff --git a/tests/integration_tests/with_db/udfs/test_translation_script.py b/tests/integration_tests/with_db/udfs/test_translation_script.py index b1f2b6f6..6fc60941 100644 --- a/tests/integration_tests/with_db/udfs/test_translation_script.py +++ b/tests/integration_tests/with_db/udfs/test_translation_script.py @@ -46,7 +46,6 @@ def test_translation_script( n_cols_result = len(input_data[0]) + (added_columns - removed_columns) assert len(result) == n_rows_result and len(result[0]) == n_cols_result - print(result) # lenient test for quality of results, will be replaced by deterministic test later results = [result[i][7] for i in range(len(result))] acceptable_results = ["Die Datenbanksoftware Exasol hat ihren Sitz in Nürnberg"] From 
34fea484df422b84c184fcfb5f3c41c836df52ed Mon Sep 17 00:00:00 2001 From: MarleneKress79789 Date: Thu, 1 Aug 2024 15:56:51 +0200 Subject: [PATCH 06/31] [CodeBuild] fixed run integration tests without saas, fixed import --- tests/fixtures/database_connection_fixture.py | 15 +++++++++------ .../with_db/test_upload_model.py | 4 +++- .../with_db/udfs/test_filling_mask_script.py | 2 +- .../with_db/udfs/test_text_generation_script.py | 1 + 4 files changed, 14 insertions(+), 8 deletions(-) diff --git a/tests/fixtures/database_connection_fixture.py b/tests/fixtures/database_connection_fixture.py index c6882cbd..36457205 100644 --- a/tests/fixtures/database_connection_fixture.py +++ b/tests/fixtures/database_connection_fixture.py @@ -32,18 +32,21 @@ def backend(request) -> bfs.path.StorageBackend: @pytest.fixture(scope="session") -def saas_url() -> str: - return _env("SAAS_HOST") +def saas_url(backend) -> str: + if backend == bfs.path.StorageBackend.saas: + return _env("SAAS_HOST") @pytest.fixture(scope="session") -def saas_account_id() -> str: - return _env("SAAS_ACCOUNT_ID") +def saas_account_id(backend) -> str: + if backend == bfs.path.StorageBackend.saas: + return _env("SAAS_ACCOUNT_ID") @pytest.fixture(scope="session") -def saas_token() -> str: - return _env("SAAS_PAT") +def saas_token(backend) -> str: + if backend == bfs.path.StorageBackend.saas: + return _env("SAAS_PAT") @pytest.fixture(scope="session") diff --git a/tests/integration_tests/with_db/test_upload_model.py b/tests/integration_tests/with_db/test_upload_model.py index 9524ec0e..bce08def 100644 --- a/tests/integration_tests/with_db/test_upload_model.py +++ b/tests/integration_tests/with_db/test_upload_model.py @@ -11,7 +11,7 @@ from tests.utils.parameters import bucketfs_params, model_params, get_arg_list from tests.fixtures.model_fixture import download_model_to_standard_local_save_path - +from tests.fixtures.script_deployment_fixture import * from tests.fixtures.model_fixture import * from tests.fixtures.setup_database_fixture import * from tests.fixtures.language_container_fixture import * @@ -51,7 +51,9 @@ def test_model_upload(upload_params, try: runner = CliRunner() + print(args_list) result = runner.invoke(upload_model_cli.main, args_list) + print(result) assert result.exit_code == 0 time.sleep(20) bucketfs_upload_location = bucketfs_location / upload_path.with_suffix(".tar.gz") diff --git a/tests/integration_tests/with_db/udfs/test_filling_mask_script.py b/tests/integration_tests/with_db/udfs/test_filling_mask_script.py index 91cdb84e..924d473f 100644 --- a/tests/integration_tests/with_db/udfs/test_filling_mask_script.py +++ b/tests/integration_tests/with_db/udfs/test_filling_mask_script.py @@ -2,7 +2,7 @@ from tests.fixtures.model_fixture import upload_filling_mask_model_to_bucketfs from tests.fixtures.bucketfs_fixture import bucketfs_location from tests.fixtures.database_connection_fixture import pyexasol_connection -from tests.fixtures.setup_database_fixture import setup_database, language_alias +from tests.fixtures.setup_database_fixture import setup_database from tests.fixtures.language_container_fixture import flavor_path, upload_slc, export_slc from tests.utils.parameters import model_params diff --git a/tests/integration_tests/with_db/udfs/test_text_generation_script.py b/tests/integration_tests/with_db/udfs/test_text_generation_script.py index 1f0089c4..bebf1915 100644 --- a/tests/integration_tests/with_db/udfs/test_text_generation_script.py +++ b/tests/integration_tests/with_db/udfs/test_text_generation_script.py @@ 
-5,6 +5,7 @@ #for debug from tests.fixtures.model_fixture import * from tests.fixtures.setup_database_fixture import * +from tests.fixtures.script_deployment_fixture import * from tests.fixtures.language_container_fixture import * from tests.fixtures.bucketfs_fixture import * from tests.fixtures.database_connection_fixture import * From 2674d8b05545da88df6fafd85915ca497e3e7834 Mon Sep 17 00:00:00 2001 From: MarleneKress79789 Date: Mon, 5 Aug 2024 13:52:08 +0200 Subject: [PATCH 07/31] [CodeBuild] fixed text replace error --- exasol_transformers_extension/upload_model.py | 8 ++++---- tests/integration_tests/with_db/test_upload_model.py | 4 +++- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/exasol_transformers_extension/upload_model.py b/exasol_transformers_extension/upload_model.py index d5a3caae..6c3d897b 100644 --- a/exasol_transformers_extension/upload_model.py +++ b/exasol_transformers_extension/upload_model.py @@ -16,7 +16,7 @@ @click.command() @click.option('--model-name', type=str, required=True, help="name of the model") -@click.option('--task_type', type=str, required=True) +@click.option('--task-type', type=str, required=True) @click.option('--sub-dir', type=str, required=True, help="directory where the model is stored in the BucketFS") @click.option('--token', type=str, default=None, help="Hugging Face hub token for private models") @@ -90,9 +90,9 @@ def main( model_factory = current_model_spec.get_model_factory() downloader = HuggingFaceHubBucketFSModelTransferSP(bucketfs_location=bucketfs_location, - model_specification=current_model_spec, - bucketfs_model_path=upload_path, - token=token) + model_specification=current_model_spec, + bucketfs_model_path=upload_path, + token=token) for model in [model_factory, transformers.AutoTokenizer]: downloader.download_from_huggingface_hub(model) diff --git a/tests/integration_tests/with_db/test_upload_model.py b/tests/integration_tests/with_db/test_upload_model.py index bce08def..65029313 100644 --- a/tests/integration_tests/with_db/test_upload_model.py +++ b/tests/integration_tests/with_db/test_upload_model.py @@ -5,7 +5,8 @@ import exasol.bucketfs as bfs from exasol_transformers_extension import upload_model as upload_model_cli -from exasol_transformers_extension.utils.bucketfs_model_specification import get_BucketFSModelSpecification_from_model_Specs +from exasol_transformers_extension.utils.bucketfs_model_specification import ( + get_BucketFSModelSpecification_from_model_Specs) from tests.integration_tests.with_db.udfs.python_rows_to_sql import python_rows_to_sql from tests.utils import postprocessing from tests.utils.parameters import bucketfs_params, model_params, get_arg_list @@ -18,6 +19,7 @@ from tests.fixtures.bucketfs_fixture import * from tests.fixtures.database_connection_fixture import * + def adapt_file_to_upload(path: PosixPath, download_path: PosixPath): if path.is_dir(): path = path / "not_empty" From 1ac208869174e10087ed69a23c8dd294be413d91 Mon Sep 17 00:00:00 2001 From: MarleneKress79789 Date: Mon, 5 Aug 2024 15:50:27 +0200 Subject: [PATCH 08/31] [CodeBuild] prepare release --- doc/changes/changes_2.0.0.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/doc/changes/changes_2.0.0.md b/doc/changes/changes_2.0.0.md index 0e4e6b7d..32938cba 100644 --- a/doc/changes/changes_2.0.0.md +++ b/doc/changes/changes_2.0.0.md @@ -1,9 +1,12 @@ -# Transformers Extension 2.0.0, t.b.d +# Transformers Extension 2.0.0, 2024-08-6 -Code name: +Code name: Python update and fixed model saving ## Summary 
+This release Fixes an error in the saving and loading of model metadata. It also adds Exasol Saas support and +updated the project to python 3.10 + ### Features From 10cc7d47f3d6ed492d9b2303f868ee8795d36dfc Mon Sep 17 00:00:00 2001 From: Torsten Kilias Date: Mon, 5 Aug 2024 16:16:06 +0200 Subject: [PATCH 09/31] Apply suggestions from code review [CodeBuild] --- doc/changes/changes_2.0.0.md | 2 +- doc/user_guide/user_guide.md | 2 +- exasol_transformers_extension/upload_model.py | 2 +- .../udfs/test_sequence_classification_single_text_udf.py | 1 - 4 files changed, 3 insertions(+), 4 deletions(-) diff --git a/doc/changes/changes_2.0.0.md b/doc/changes/changes_2.0.0.md index 32938cba..7f3f5125 100644 --- a/doc/changes/changes_2.0.0.md +++ b/doc/changes/changes_2.0.0.md @@ -4,7 +4,7 @@ Code name: Python update and fixed model saving ## Summary -This release Fixes an error in the saving and loading of model metadata. It also adds Exasol Saas support and +This release Fixes an error in saving and loading of the model metadata. It also adds Exasol Saas support and updated the project to python 3.10 diff --git a/doc/user_guide/user_guide.md b/doc/user_guide/user_guide.md index 08e2c08a..8bc67ea6 100644 --- a/doc/user_guide/user_guide.md +++ b/doc/user_guide/user_guide.md @@ -291,7 +291,7 @@ severely. Available task_types are the same as the names of our available UDFs, ### 2. Model Uploader Script You can invoke the python script as below which allows to download the transformer -models from The Hugging Face hub to the local filesystem, and then from there to the Bucketfs. +models from The Hugging Face hub to the local filesystem, and then from there to the BucketFS. #### List of options diff --git a/exasol_transformers_extension/upload_model.py b/exasol_transformers_extension/upload_model.py index 6c3d897b..a923a62b 100644 --- a/exasol_transformers_extension/upload_model.py +++ b/exasol_transformers_extension/upload_model.py @@ -98,7 +98,7 @@ def main( downloader.download_from_huggingface_hub(model) # upload model files to BucketFS model_tar_file_path = downloader.upload_to_bucketfs() - print("your model or tokenizer has been saved in the BucketFS at: " + str(model_tar_file_path)) + print("Your model or tokenizer has been saved in the BucketFS at: " + str(model_tar_file_path)) if __name__ == '__main__': diff --git a/tests/integration_tests/without_db/udfs/test_sequence_classification_single_text_udf.py b/tests/integration_tests/without_db/udfs/test_sequence_classification_single_text_udf.py index 1189902b..ddf0c754 100644 --- a/tests/integration_tests/without_db/udfs/test_sequence_classification_single_text_udf.py +++ b/tests/integration_tests/without_db/udfs/test_sequence_classification_single_text_udf.py @@ -11,7 +11,6 @@ from tests.utils.parameters import model_params from tests.utils.mock_connections import create_mounted_bucketfs_connection -# debug from tests.fixtures.model_fixture import * from tests.fixtures.setup_database_fixture import * from tests.fixtures.language_container_fixture import * From f756d0b6ff2b213b0c0f7ae52c6bf64991247521 Mon Sep 17 00:00:00 2001 From: Torsten Kilias Date: Mon, 5 Aug 2024 16:16:44 +0200 Subject: [PATCH 10/31] Apply suggestions from code review [CodeBuild] --- doc/changes/changes_2.0.0.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/changes/changes_2.0.0.md b/doc/changes/changes_2.0.0.md index 7f3f5125..31b88ea6 100644 --- a/doc/changes/changes_2.0.0.md +++ b/doc/changes/changes_2.0.0.md @@ -1,6 +1,6 @@ -# Transformers 
Extension 2.0.0, 2024-08-6 +# Transformers Extension 2.0.0, 2024-08-06 -Code name: Python update and fixed model saving +Code name: Fixed model saving, added SaaS support and update to Python 3.10 ## Summary From 8b174c7f6b4f2d83b65da24907d2fd5a4d2fd7ce Mon Sep 17 00:00:00 2001 From: MarleneKress79789 Date: Tue, 6 Aug 2024 11:11:57 +0200 Subject: [PATCH 11/31] [CodeBuild] fix saas db naming error --- tests/fixtures/database_connection_fixture.py | 2 +- .../udfs/test_model_downloader_udf.py | 18 +++++++++--------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/tests/fixtures/database_connection_fixture.py b/tests/fixtures/database_connection_fixture.py index 36457205..5b533c01 100644 --- a/tests/fixtures/database_connection_fixture.py +++ b/tests/fixtures/database_connection_fixture.py @@ -61,7 +61,7 @@ def saas_database_id(backend, saas_url, saas_account_id, saas_token) -> str: # Create a temporary database and waite till it becomes operational db = stack.enter_context(api_access.database( - name=timestamp_name('SME_CI'), + name=timestamp_name('TE_CI'), idle_time=timedelta(hours=12))) api_access.wait_until_running(db.id) yield db.id diff --git a/tests/unit_tests/udfs/test_model_downloader_udf.py b/tests/unit_tests/udfs/test_model_downloader_udf.py index 99fbc5fb..2b85c17c 100644 --- a/tests/unit_tests/udfs/test_model_downloader_udf.py +++ b/tests/unit_tests/udfs/test_model_downloader_udf.py @@ -67,16 +67,16 @@ def test_model_downloader(mock_create_loc, description, count, token_conn_name, bucketfs_connections = [Connection(address=f"file:///test{i}") for i in range(count)] bfs_conn_name = [f"bfs_conn_name_{i}" for i in bucketfs_connections] - mock_cmss = [create_autospec(BucketFSModelSpecification, - model_name=base_model_names[i], - task_type=task_type[i], - sub_dir=Path(sub_directory_names[i]), - get_model_factory=BucketFSModelSpecification.get_model_factory) for i in range(count)] + mock_bucketfs_model_specs = [create_autospec(BucketFSModelSpecification, + model_name=base_model_names[i], + task_type=task_type[i], + sub_dir=Path(sub_directory_names[i]), + get_model_factory=BucketFSModelSpecification.get_model_factory) for i in range(count)] for i in range(count): - mock_cast(mock_cmss[i].get_bucketfs_model_save_path).side_effect = [f'{sub_directory_names[i]}/{base_model_names[i]}'] + mock_cast(mock_bucketfs_model_specs[i].get_bucketfs_model_save_path).side_effect = [f'{sub_directory_names[i]}/{base_model_names[i]}'] mock_current_model_specification_factory: Union[BucketFSModelSpecificationFactory, MagicMock] = ( create_autospec(BucketFSModelSpecificationFactory)) - mock_cast(mock_current_model_specification_factory.create).side_effect = mock_cmss + mock_cast(mock_current_model_specification_factory.create).side_effect = mock_bucketfs_model_specs input_data = [ ( @@ -104,14 +104,14 @@ def test_model_downloader(mock_create_loc, description, count, token_conn_name, udf.run(mock_ctx) assert mock_cast(mock_model_downloader_factory.create).mock_calls == [ call(bucketfs_location=mock_bucketfs_locations[i], - model_specification=mock_cmss[i], + model_specification=mock_bucketfs_model_specs[i], model_path=f'{sub_directory_names[i]}/{base_model_names[i]}', token=expected_token) for i in range(count) ] for i in range(count): assert mock_cast(mock_model_downloaders[i].download_from_huggingface_hub).mock_calls == [ - call(mock_cmss[i].get_model_factory()), + call(mock_bucketfs_model_specs[i].get_model_factory()), call(mock_tokenizer_factory) ] assert call() in 
mock_cast(mock_model_downloaders[i].upload_to_bucketfs).mock_calls From c28f186d19448defd8e64bbc5519ae667296b1f9 Mon Sep 17 00:00:00 2001 From: Torsten Kilias Date: Tue, 6 Aug 2024 15:48:39 +0200 Subject: [PATCH 12/31] Use batch build for AWS CodeBuild to speed up tests against backends. [CodeBuild] --- buildspec.yml | 45 ++++++++----------- buildspec_onprem.yml | 26 +++++++++++ buildspec_saas.yml | 28 ++++++++++++ buildspec_without_db.yml | 25 +++++++++++ doc/changes/changes_2.0.0.md | 2 +- noxfile.py | 19 +++++++- tests/conftest.py | 15 +++++++ tests/fixtures/database_connection_fixture.py | 19 ++++---- 8 files changed, 141 insertions(+), 38 deletions(-) create mode 100644 buildspec_onprem.yml create mode 100644 buildspec_saas.yml create mode 100644 buildspec_without_db.yml diff --git a/buildspec.yml b/buildspec.yml index 6595e21a..4c42fad2 100644 --- a/buildspec.yml +++ b/buildspec.yml @@ -1,29 +1,20 @@ version: 0.2 -env: - shell: bash - secrets-manager: - DOCKER_USER: "Dockerhub:User" - DOCKER_PASSWORD: "Dockerhub:AccessToken" - SAAS_HOST: "ExasolSaaSDatabase:SAAS_HOST" - SAAS_ACCOUNT_ID: "ExasolSaaSDatabase:SAAS_ACCOUNT_ID" - SAAS_PAT: "ExasolSaaSDatabase:SAAS_PAT" - -phases: - install: - runtime-versions: - python: 3.10 - commands: - - curl -sSL https://install.python-poetry.org | POETRY_VERSION=1.4.2 python3 - - - export PATH=$PATH:$HOME/.local/bin - - poetry env use $(command -v "python3.10") - - poetry --version - - poetry install - - poetry build - pre_build: - commands: - - echo "$DOCKER_PASSWORD" | docker login --username "$DOCKER_USER" --password-stdin - build: - commands: - - poetry run nox -s start_database - - poetry run nox -s integration_tests +batch: + fast-fail: false + build-graph: + - identifier: without_db_tests + env: + compute-type: BUILD_GENERAL1_MEDIUM + privileged-mode: true + buildspec: ./buildspec_without_db.yaml + - identifier: saas_tests + env: + compute-type: BUILD_GENERAL1_MEDIUM + privileged-mode: true + buildspec: ./buildspec_saas.yaml + - identifier: onprem_tests + env: + compute-type: BUILD_GENERAL1_MEDIUM + privileged-mode: true + buildspec: ./buildspec_onprem.yaml diff --git a/buildspec_onprem.yml b/buildspec_onprem.yml new file mode 100644 index 00000000..90a96a92 --- /dev/null +++ b/buildspec_onprem.yml @@ -0,0 +1,26 @@ +version: 0.2 + +env: + shell: bash + secrets-manager: + DOCKER_USER: "Dockerhub:User" + DOCKER_PASSWORD: "Dockerhub:AccessToken" + +phases: + install: + runtime-versions: + python: 3.10 + commands: + - curl -sSL https://install.python-poetry.org | POETRY_VERSION=1.4.2 python3 - + - export PATH=$PATH:$HOME/.local/bin + - poetry env use $(command -v "python3.10") + - poetry --version + - poetry install + - poetry build + pre_build: + commands: + - echo "$DOCKER_PASSWORD" | docker login --username "$DOCKER_USER" --password-stdin + build: + commands: + - poetry run nox -s start_database + - poetry run nox -s onprem_integration_tests diff --git a/buildspec_saas.yml b/buildspec_saas.yml new file mode 100644 index 00000000..9c9e3451 --- /dev/null +++ b/buildspec_saas.yml @@ -0,0 +1,28 @@ +version: 0.2 + +env: + shell: bash + secrets-manager: + DOCKER_USER: "Dockerhub:User" + DOCKER_PASSWORD: "Dockerhub:AccessToken" + SAAS_HOST: "ExasolSaaSDatabase:SAAS_HOST" + SAAS_ACCOUNT_ID: "ExasolSaaSDatabase:SAAS_ACCOUNT_ID" + SAAS_PAT: "ExasolSaaSDatabase:SAAS_PAT" + +phases: + install: + runtime-versions: + python: 3.10 + commands: + - curl -sSL https://install.python-poetry.org | POETRY_VERSION=1.4.2 python3 - + - export 
PATH=$PATH:$HOME/.local/bin + - poetry env use $(command -v "python3.10") + - poetry --version + - poetry install + - poetry build + pre_build: + commands: + - echo "$DOCKER_PASSWORD" | docker login --username "$DOCKER_USER" --password-stdin + build: + commands: + - poetry run nox -s saas_integration_tests diff --git a/buildspec_without_db.yml b/buildspec_without_db.yml new file mode 100644 index 00000000..6f3a7782 --- /dev/null +++ b/buildspec_without_db.yml @@ -0,0 +1,25 @@ +version: 0.2 + +env: + shell: bash + secrets-manager: + DOCKER_USER: "Dockerhub:User" + DOCKER_PASSWORD: "Dockerhub:AccessToken" + +phases: + install: + runtime-versions: + python: 3.10 + commands: + - curl -sSL https://install.python-poetry.org | POETRY_VERSION=1.4.2 python3 - + - export PATH=$PATH:$HOME/.local/bin + - poetry env use $(command -v "python3.10") + - poetry --version + - poetry install + - poetry build + pre_build: + commands: + - echo "$DOCKER_PASSWORD" | docker login --username "$DOCKER_USER" --password-stdin + build: + commands: + - poetry run nox -s without_db_integration_tests diff --git a/doc/changes/changes_2.0.0.md b/doc/changes/changes_2.0.0.md index 31b88ea6..70f35458 100644 --- a/doc/changes/changes_2.0.0.md +++ b/doc/changes/changes_2.0.0.md @@ -1,4 +1,4 @@ -# Transformers Extension 2.0.0, 2024-08-06 +# Transformers Extension 2.0.0, 2024-08-07 Code name: Fixed model saving, added SaaS support and update to Python 3.10 diff --git a/noxfile.py b/noxfile.py index 2b7d644a..900fdad1 100644 --- a/noxfile.py +++ b/noxfile.py @@ -39,8 +39,25 @@ def unit_tests(session): def integration_tests(session): # We need to use a external database here, because the itde plugin doesn't provide all necessary options to # configure the database. See the start_database session. - session.run('pytest', '--itde-db-version=external', 'tests/integration_tests') + session.run('pytest', '-s', '--itde-db-version=external', 'tests/integration_tests') +@nox.session(python=False) +def saas_integration_tests(session): + # We need to use a external database here, because the itde plugin doesn't provide all necessary options to + # configure the database. See the start_database session. + session.run('pytest', '-s', '--backend=saas', 'tests/integration_tests/with_db') + +@nox.session(python=False) +def onprem_integration_tests(session): + # We need to use a external database here, because the itde plugin doesn't provide all necessary options to + # configure the database. See the start_database session. + session.run('pytest', '-s', '--backend=onprem', '--itde-db-version=external', 'tests/integration_tests/with_db') + +@nox.session(python=False) +def without_db_integration_tests(session): + # We need to use a external database here, because the itde plugin doesn't provide all necessary options to + # configure the database. See the start_database session. + session.run('pytest', '-s', '--itde-db-version=external', 'tests/integration_tests/without_db') @nox.session(python=False) def start_database(session): diff --git a/tests/conftest.py b/tests/conftest.py index 1432f627..ab546df3 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -7,3 +7,18 @@ "tests.fixtures.model_fixture", "tests.fixtures.script_deployment_fixture" ] + +_BACKEND_OPTION = '--backend' + +def pytest_addoption(parser): + parser.addoption( + _BACKEND_OPTION, + action="append", + default=[], + help=f"""List of test backends (onprem, saas). By default, the tests will be + run on both backends. 
To select only one of the backends add the + argument {_BACKEND_OPTION}= to the command line. Both + backends can be selected like ... {_BACKEND_OPTION}=onprem {_BACKEND_OPTION}=saas, + but this is the same as the default. + """, + ) diff --git a/tests/fixtures/database_connection_fixture.py b/tests/fixtures/database_connection_fixture.py index 5b533c01..3ae8a405 100644 --- a/tests/fixtures/database_connection_fixture.py +++ b/tests/fixtures/database_connection_fixture.py @@ -23,11 +23,16 @@ def _env(var: str) -> str: raise RuntimeError(f"Environment variable {var} is empty.") -@pytest.fixture(scope='session', params=[bfs.path.StorageBackend.onprem, bfs.path.StorageBackend.saas]) -def backend(request) -> bfs.path.StorageBackend: - # Here we are going to add - # pytest.skip() - # if there is an instruction to skip a particular backed in the command line or an envar. +_BACKEND_OPTION = '--backend' +_BACKEND_ONPREM = 'onprem' +_BACKEND_SAAS = 'saas' + + +@pytest.fixture(scope='session', params=[_BACKEND_ONPREM, _BACKEND_SAAS]) +def backend(request) -> str: + backend_options = request.config.getoption(_BACKEND_OPTION) + if backend_options and (request.param not in backend_options): + pytest.skip() return request.param @@ -51,7 +56,6 @@ def saas_token(backend) -> str: @pytest.fixture(scope="session") def saas_database_id(backend, saas_url, saas_account_id, saas_token) -> str: - if backend == bfs.path.StorageBackend.saas: with ExitStack() as stack: # Create and configure the SaaS client. @@ -73,7 +77,6 @@ def saas_database_id(backend, saas_url, saas_account_id, saas_token) -> str: def pyexasol_connection_onprem(backend, connection_factory, exasol_config: config.Exasol) -> pyexasol.ExaConnection | None: - if backend == bfs.path.StorageBackend.onprem: with connection_factory(exasol_config) as conn: yield conn @@ -87,7 +90,6 @@ def pyexasol_connection_saas(backend, saas_account_id, saas_database_id, saas_token) -> pyexasol.ExaConnection | None: - if backend == bfs.path.StorageBackend.saas: # Create a connection to the database. 
conn_params = get_connection_params(host=saas_url, @@ -107,7 +109,6 @@ def pyexasol_connection_saas(backend, def pyexasol_connection(backend, pyexasol_connection_onprem, pyexasol_connection_saas) -> pyexasol.ExaConnection: - if backend == bfs.path.StorageBackend.onprem: assert pyexasol_connection_onprem is not None yield pyexasol_connection_onprem From 7380563c15a4c485ff4051366e4fff87f3ed5b76 Mon Sep 17 00:00:00 2001 From: Torsten Kilias Date: Tue, 6 Aug 2024 15:52:57 +0200 Subject: [PATCH 13/31] [CodeBuild] From 9e22f439c3333d5fb601d6dcf2ace42481fc26d0 Mon Sep 17 00:00:00 2001 From: Torsten Kilias Date: Tue, 6 Aug 2024 15:55:05 +0200 Subject: [PATCH 14/31] [CodeBuild] From 727fead9065ffd2376269c5e4d93a2f9f88d8c38 Mon Sep 17 00:00:00 2001 From: Torsten Kilias Date: Tue, 6 Aug 2024 16:00:51 +0200 Subject: [PATCH 15/31] [CodeBuild] From 4f4c11ab7d737f596f406723a1a43e31dca68a1f Mon Sep 17 00:00:00 2001 From: Torsten Kilias Date: Tue, 6 Aug 2024 17:21:22 +0200 Subject: [PATCH 16/31] [CodeBuild] From 6df3989bca43dc3850cc0c7475d076d8fc376a57 Mon Sep 17 00:00:00 2001 From: Torsten Kilias Date: Tue, 6 Aug 2024 17:23:09 +0200 Subject: [PATCH 17/31] Fix buildspec.yml [CodeBuild] --- buildspec.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/buildspec.yml b/buildspec.yml index 4c42fad2..e67bbde6 100644 --- a/buildspec.yml +++ b/buildspec.yml @@ -7,14 +7,14 @@ batch: env: compute-type: BUILD_GENERAL1_MEDIUM privileged-mode: true - buildspec: ./buildspec_without_db.yaml + buildspec: ./buildspec_without_db.yml - identifier: saas_tests env: compute-type: BUILD_GENERAL1_MEDIUM privileged-mode: true - buildspec: ./buildspec_saas.yaml + buildspec: ./buildspec_saas.yml - identifier: onprem_tests env: compute-type: BUILD_GENERAL1_MEDIUM privileged-mode: true - buildspec: ./buildspec_onprem.yaml + buildspec: ./buildspec_onprem.yml From 3e3bd6d74e8ae47f50798cfd0f58558d4ea785e8 Mon Sep 17 00:00:00 2001 From: Torsten Kilias Date: Tue, 6 Aug 2024 18:03:37 +0200 Subject: [PATCH 18/31] Use correct backend strings for comparison in tests [CodeBuild] --- tests/fixtures/bucketfs_fixture.py | 5 +++-- tests/fixtures/database_connection_fixture.py | 18 +++++++++--------- tests/fixtures/language_container_fixture.py | 3 ++- tests/fixtures/script_deployment_fixture.py | 5 +++-- tests/fixtures/setup_database_fixture.py | 3 ++- .../deployment/test_scripts_deployer_cli.py | 3 ++- 6 files changed, 21 insertions(+), 16 deletions(-) diff --git a/tests/fixtures/bucketfs_fixture.py b/tests/fixtures/bucketfs_fixture.py index 2190c8c2..2db4bd9c 100644 --- a/tests/fixtures/bucketfs_fixture.py +++ b/tests/fixtures/bucketfs_fixture.py @@ -5,6 +5,7 @@ import exasol.bucketfs as bfs from exasol_transformers_extension.utils.bucketfs_operations import create_bucketfs_location +from tests.fixtures.database_connection_fixture import BACKEND_SAAS from tests.utils.parameters import bucketfs_params @@ -30,7 +31,7 @@ def bucketfs_location_saas(backend, saas_database_id, saas_token) -> bfs.path.PathLike | None: - if backend == bfs.path.StorageBackend.saas: + if backend == BACKEND_SAAS: return create_bucketfs_location( path_in_bucket=bucketfs_params.path_in_bucket, saas_url=saas_url, @@ -48,7 +49,7 @@ def bucketfs_location(backend, if backend == bfs.path.StorageBackend.onprem: assert bucketfs_location_onprem is not None return bucketfs_location_onprem - elif backend == bfs.path.StorageBackend.saas: + elif backend == BACKEND_SAAS: assert bucketfs_location_saas is not None return bucketfs_location_saas else: diff --git 
a/tests/fixtures/database_connection_fixture.py b/tests/fixtures/database_connection_fixture.py index 3ae8a405..3bda8870 100644 --- a/tests/fixtures/database_connection_fixture.py +++ b/tests/fixtures/database_connection_fixture.py @@ -24,11 +24,11 @@ def _env(var: str) -> str: _BACKEND_OPTION = '--backend' -_BACKEND_ONPREM = 'onprem' -_BACKEND_SAAS = 'saas' +BACKEND_ONPREM = 'onprem' +BACKEND_SAAS = 'saas' -@pytest.fixture(scope='session', params=[_BACKEND_ONPREM, _BACKEND_SAAS]) +@pytest.fixture(scope='session', params=[BACKEND_ONPREM, BACKEND_SAAS]) def backend(request) -> str: backend_options = request.config.getoption(_BACKEND_OPTION) if backend_options and (request.param not in backend_options): @@ -38,25 +38,25 @@ def backend(request) -> str: @pytest.fixture(scope="session") def saas_url(backend) -> str: - if backend == bfs.path.StorageBackend.saas: + if backend == BACKEND_SAAS: return _env("SAAS_HOST") @pytest.fixture(scope="session") def saas_account_id(backend) -> str: - if backend == bfs.path.StorageBackend.saas: + if backend == BACKEND_SAAS: return _env("SAAS_ACCOUNT_ID") @pytest.fixture(scope="session") def saas_token(backend) -> str: - if backend == bfs.path.StorageBackend.saas: + if backend == BACKEND_SAAS: return _env("SAAS_PAT") @pytest.fixture(scope="session") def saas_database_id(backend, saas_url, saas_account_id, saas_token) -> str: - if backend == bfs.path.StorageBackend.saas: + if backend == BACKEND_SAAS: with ExitStack() as stack: # Create and configure the SaaS client. client = create_saas_client(host=saas_url, pat=saas_token) @@ -90,7 +90,7 @@ def pyexasol_connection_saas(backend, saas_account_id, saas_database_id, saas_token) -> pyexasol.ExaConnection | None: - if backend == bfs.path.StorageBackend.saas: + if backend == BACKEND_SAAS: # Create a connection to the database. 
conn_params = get_connection_params(host=saas_url, account_id=saas_account_id, @@ -112,7 +112,7 @@ def pyexasol_connection(backend, if backend == bfs.path.StorageBackend.onprem: assert pyexasol_connection_onprem is not None yield pyexasol_connection_onprem - elif backend == bfs.path.StorageBackend.saas: + elif backend == BACKEND_SAAS: assert pyexasol_connection_saas is not None yield pyexasol_connection_saas else: diff --git a/tests/fixtures/language_container_fixture.py b/tests/fixtures/language_container_fixture.py index 9ac044b6..0f5c38b0 100644 --- a/tests/fixtures/language_container_fixture.py +++ b/tests/fixtures/language_container_fixture.py @@ -9,6 +9,7 @@ import exasol.bucketfs as bfs from exasol_transformers_extension.deployment import language_container +from tests.fixtures.database_connection_fixture import BACKEND_SAAS LANGUAGE_ALIAS = "PYTHON3_TE" CONTAINER_FILE_NAME = "exasol_transformers_extension_container.tar.gz" @@ -43,7 +44,7 @@ def upload_slc(backend, bucketfs_location, pyexasol_connection, export_slc: Expo wait_for_completion=True) # Let's see if this helps - if backend == bfs.path.StorageBackend.saas: + if backend == BACKEND_SAAS: time.sleep(300) diff --git a/tests/fixtures/script_deployment_fixture.py b/tests/fixtures/script_deployment_fixture.py index 83646120..db718d4d 100644 --- a/tests/fixtures/script_deployment_fixture.py +++ b/tests/fixtures/script_deployment_fixture.py @@ -6,6 +6,7 @@ from pytest_itde import config import exasol.bucketfs as bfs +from tests.fixtures.database_connection_fixture import BACKEND_SAAS from tests.utils.parameters import bucketfs_params @@ -50,7 +51,7 @@ def deploy_params(backend, deploy_params_saas) -> dict[str, Any]: if backend == bfs.path.StorageBackend.onprem: yield deploy_params_onprem - elif backend == bfs.path.StorageBackend.saas: + elif backend == BACKEND_SAAS: yield deploy_params_saas else: raise ValueError(f'No deploy_params fixture for the backend {backend}') @@ -62,7 +63,7 @@ def upload_params(backend, deploy_params_saas) -> dict[str, Any]: if backend == bfs.path.StorageBackend.onprem: yield upload_params_onprem - elif backend == bfs.path.StorageBackend.saas: + elif backend == BACKEND_SAAS: yield deploy_params_saas else: raise ValueError(f'No deploy_params fixture for the backend {backend}') diff --git a/tests/fixtures/setup_database_fixture.py b/tests/fixtures/setup_database_fixture.py index beb21124..f30aa558 100644 --- a/tests/fixtures/setup_database_fixture.py +++ b/tests/fixtures/setup_database_fixture.py @@ -11,6 +11,7 @@ from exasol_transformers_extension.deployment.scripts_deployer import \ ScriptsDeployer +from tests.fixtures.database_connection_fixture import BACKEND_SAAS from tests.utils.parameters import bucketfs_params from tests.fixtures.language_container_fixture import LANGUAGE_ALIAS @@ -96,7 +97,7 @@ def setup_database(backend: bfs.path.StorageBackend, _deploy_scripts(pyexasol_connection) if backend == bfs.path.StorageBackend.onprem: _create_bucketfs_connection_onprem(bucketfs_config, pyexasol_connection) - elif backend == bfs.path.StorageBackend.saas: + elif backend == BACKEND_SAAS: _create_bucketfs_connection_saas(saas_url, saas_account_id, saas_database_id, saas_token, pyexasol_connection) else: diff --git a/tests/integration_tests/with_db/deployment/test_scripts_deployer_cli.py b/tests/integration_tests/with_db/deployment/test_scripts_deployer_cli.py index e9170566..3c97d6fb 100644 --- a/tests/integration_tests/with_db/deployment/test_scripts_deployer_cli.py +++ 
b/tests/integration_tests/with_db/deployment/test_scripts_deployer_cli.py @@ -7,6 +7,7 @@ import exasol.bucketfs as bfs from exasol.python_extension_common.deployment.language_container_validator import temp_schema +from tests.fixtures.database_connection_fixture import BACKEND_SAAS from tests.fixtures.language_container_fixture import LANGUAGE_ALIAS from exasol_transformers_extension import deploy @@ -23,7 +24,7 @@ def test_scripts_deployer_cli(backend, args_list = get_arg_list(**deploy_params, schema=schema_name, language_alias=LANGUAGE_ALIAS) args_list.insert(0, "scripts") # We validate the server certificate in SaaS, but not in the Docker DB - if backend == bfs.path.StorageBackend.saas: + if backend == BACKEND_SAAS: args_list.append("--use-ssl-cert-validation") else: args_list.append("--no-use-ssl-cert-validation") From 0f51211fab7c6d5e15c5fef5346d428ceea9986d Mon Sep 17 00:00:00 2001 From: Torsten Kilias Date: Tue, 6 Aug 2024 18:18:29 +0200 Subject: [PATCH 19/31] Use correct backend strings for onprem for comparison in tests [CodeBuild] --- tests/fixtures/bucketfs_fixture.py | 6 +++--- tests/fixtures/database_connection_fixture.py | 4 ++-- tests/fixtures/script_deployment_fixture.py | 6 +++--- tests/fixtures/setup_database_fixture.py | 4 ++-- .../with_db/deployment/test_scripts_deployer.py | 3 ++- .../with_db/deployment/test_scripts_deployer_cli.py | 4 ++-- 6 files changed, 14 insertions(+), 13 deletions(-) diff --git a/tests/fixtures/bucketfs_fixture.py b/tests/fixtures/bucketfs_fixture.py index 2db4bd9c..165c8428 100644 --- a/tests/fixtures/bucketfs_fixture.py +++ b/tests/fixtures/bucketfs_fixture.py @@ -5,7 +5,7 @@ import exasol.bucketfs as bfs from exasol_transformers_extension.utils.bucketfs_operations import create_bucketfs_location -from tests.fixtures.database_connection_fixture import BACKEND_SAAS +from tests.fixtures.database_connection_fixture import BACKEND_SAAS, BACKEND_ONPREM from tests.utils.parameters import bucketfs_params @@ -13,7 +13,7 @@ def bucketfs_location_onprem(backend, itde: TestConfig) -> bfs.path.PathLike | None: - if backend == bfs.path.StorageBackend.onprem: + if backend == BACKEND_ONPREM: return create_bucketfs_location( path_in_bucket=bucketfs_params.path_in_bucket, bucketfs_name=bucketfs_params.name, @@ -46,7 +46,7 @@ def bucketfs_location(backend, bucketfs_location_onprem, bucketfs_location_saas) -> bfs.path.PathLike: - if backend == bfs.path.StorageBackend.onprem: + if backend == BACKEND_ONPREM: assert bucketfs_location_onprem is not None return bucketfs_location_onprem elif backend == BACKEND_SAAS: diff --git a/tests/fixtures/database_connection_fixture.py b/tests/fixtures/database_connection_fixture.py index 3bda8870..aee7c9de 100644 --- a/tests/fixtures/database_connection_fixture.py +++ b/tests/fixtures/database_connection_fixture.py @@ -77,7 +77,7 @@ def saas_database_id(backend, saas_url, saas_account_id, saas_token) -> str: def pyexasol_connection_onprem(backend, connection_factory, exasol_config: config.Exasol) -> pyexasol.ExaConnection | None: - if backend == bfs.path.StorageBackend.onprem: + if backend == BACKEND_ONPREM: with connection_factory(exasol_config) as conn: yield conn else: @@ -109,7 +109,7 @@ def pyexasol_connection_saas(backend, def pyexasol_connection(backend, pyexasol_connection_onprem, pyexasol_connection_saas) -> pyexasol.ExaConnection: - if backend == bfs.path.StorageBackend.onprem: + if backend == BACKEND_ONPREM: assert pyexasol_connection_onprem is not None yield pyexasol_connection_onprem elif backend == BACKEND_SAAS: 
diff --git a/tests/fixtures/script_deployment_fixture.py b/tests/fixtures/script_deployment_fixture.py index db718d4d..5e8be0ef 100644 --- a/tests/fixtures/script_deployment_fixture.py +++ b/tests/fixtures/script_deployment_fixture.py @@ -6,7 +6,7 @@ from pytest_itde import config import exasol.bucketfs as bfs -from tests.fixtures.database_connection_fixture import BACKEND_SAAS +from tests.fixtures.database_connection_fixture import BACKEND_SAAS, BACKEND_ONPREM from tests.utils.parameters import bucketfs_params @@ -49,7 +49,7 @@ def deploy_params_saas(saas_url, saas_account_id, saas_database_id, saas_token) def deploy_params(backend, deploy_params_onprem, deploy_params_saas) -> dict[str, Any]: - if backend == bfs.path.StorageBackend.onprem: + if backend == BACKEND_ONPREM: yield deploy_params_onprem elif backend == BACKEND_SAAS: yield deploy_params_saas @@ -61,7 +61,7 @@ def deploy_params(backend, def upload_params(backend, upload_params_onprem, deploy_params_saas) -> dict[str, Any]: - if backend == bfs.path.StorageBackend.onprem: + if backend == BACKEND_ONPREM: yield upload_params_onprem elif backend == BACKEND_SAAS: yield deploy_params_saas diff --git a/tests/fixtures/setup_database_fixture.py b/tests/fixtures/setup_database_fixture.py index f30aa558..a84b6954 100644 --- a/tests/fixtures/setup_database_fixture.py +++ b/tests/fixtures/setup_database_fixture.py @@ -11,7 +11,7 @@ from exasol_transformers_extension.deployment.scripts_deployer import \ ScriptsDeployer -from tests.fixtures.database_connection_fixture import BACKEND_SAAS +from tests.fixtures.database_connection_fixture import BACKEND_SAAS, BACKEND_ONPREM from tests.utils.parameters import bucketfs_params from tests.fixtures.language_container_fixture import LANGUAGE_ALIAS @@ -95,7 +95,7 @@ def setup_database(backend: bfs.path.StorageBackend, _create_schema(pyexasol_connection) _deploy_scripts(pyexasol_connection) - if backend == bfs.path.StorageBackend.onprem: + if backend == BACKEND_ONPREM: _create_bucketfs_connection_onprem(bucketfs_config, pyexasol_connection) elif backend == BACKEND_SAAS: _create_bucketfs_connection_saas(saas_url, saas_account_id, saas_database_id, saas_token, diff --git a/tests/integration_tests/with_db/deployment/test_scripts_deployer.py b/tests/integration_tests/with_db/deployment/test_scripts_deployer.py index e7138ca1..3e7209aa 100644 --- a/tests/integration_tests/with_db/deployment/test_scripts_deployer.py +++ b/tests/integration_tests/with_db/deployment/test_scripts_deployer.py @@ -9,6 +9,7 @@ from exasol_transformers_extension.deployment.scripts_deployer import \ ScriptsDeployer +from tests.fixtures.database_connection_fixture import BACKEND_ONPREM from tests.utils.db_queries import DBQueries from tests.fixtures.language_container_fixture import LANGUAGE_ALIAS @@ -36,7 +37,7 @@ def test_scripts_deployer_no_schema_creation_permission( exasol_config: config.Exasol, upload_slc): - if backend != bfs.path.StorageBackend.onprem: + if backend != BACKEND_ONPREM: pytest.skip(("We run this test only with the Docker-DB, " "since the script deployer doesn't use the DB user login and password in SaaS.")) diff --git a/tests/integration_tests/with_db/deployment/test_scripts_deployer_cli.py b/tests/integration_tests/with_db/deployment/test_scripts_deployer_cli.py index 3c97d6fb..a2783c0e 100644 --- a/tests/integration_tests/with_db/deployment/test_scripts_deployer_cli.py +++ b/tests/integration_tests/with_db/deployment/test_scripts_deployer_cli.py @@ -7,7 +7,7 @@ import exasol.bucketfs as bfs from 
exasol.python_extension_common.deployment.language_container_validator import temp_schema -from tests.fixtures.database_connection_fixture import BACKEND_SAAS +from tests.fixtures.database_connection_fixture import BACKEND_SAAS, BACKEND_ONPREM from tests.fixtures.language_container_fixture import LANGUAGE_ALIAS from exasol_transformers_extension import deploy @@ -41,7 +41,7 @@ def test_scripts_deployer_cli_with_encryption_verify(backend, deploy_params: dict[str, Any], pyexasol_connection: ExaConnection, upload_slc): - if backend != bfs.path.StorageBackend.onprem: + if backend != BACKEND_ONPREM: pytest.skip(("We run this test only with the Docker-DB " "because SaaS always verifies the SSL certificate")) From 3e07bcd055ff2b093bac856b60bae2852a0636ef Mon Sep 17 00:00:00 2001 From: Torsten Kilias Date: Tue, 6 Aug 2024 18:20:39 +0200 Subject: [PATCH 20/31] Add --setup-show to running integration tests nox sessions [CodeBuild] --- noxfile.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/noxfile.py b/noxfile.py index 900fdad1..1b5cb504 100644 --- a/noxfile.py +++ b/noxfile.py @@ -39,25 +39,25 @@ def unit_tests(session): def integration_tests(session): # We need to use a external database here, because the itde plugin doesn't provide all necessary options to # configure the database. See the start_database session. - session.run('pytest', '-s', '--itde-db-version=external', 'tests/integration_tests') + session.run('pytest', '--setup-show', '-s', '--itde-db-version=external', 'tests/integration_tests') @nox.session(python=False) def saas_integration_tests(session): # We need to use a external database here, because the itde plugin doesn't provide all necessary options to # configure the database. See the start_database session. - session.run('pytest', '-s', '--backend=saas', 'tests/integration_tests/with_db') + session.run('pytest', '--setup-show', '-s', '--backend=saas', 'tests/integration_tests/with_db') @nox.session(python=False) def onprem_integration_tests(session): # We need to use a external database here, because the itde plugin doesn't provide all necessary options to # configure the database. See the start_database session. - session.run('pytest', '-s', '--backend=onprem', '--itde-db-version=external', 'tests/integration_tests/with_db') + session.run('pytest', '--setup-show', '-s', '--backend=onprem', '--itde-db-version=external', 'tests/integration_tests/with_db') @nox.session(python=False) def without_db_integration_tests(session): # We need to use a external database here, because the itde plugin doesn't provide all necessary options to # configure the database. See the start_database session. 
- session.run('pytest', '-s', '--itde-db-version=external', 'tests/integration_tests/without_db') + session.run('pytest', '--setup-show', '-s', '--itde-db-version=external', 'tests/integration_tests/without_db') @nox.session(python=False) def start_database(session): From 99732c12cef59d0c3de78bdc947f1acfa3ff209c Mon Sep 17 00:00:00 2001 From: Torsten Kilias Date: Tue, 6 Aug 2024 22:10:49 +0200 Subject: [PATCH 21/31] [CodeBuild] From 836be3026216e706db9fed0c145dc6136ea8d644 Mon Sep 17 00:00:00 2001 From: Torsten Kilias Date: Tue, 6 Aug 2024 22:37:52 +0200 Subject: [PATCH 22/31] Build and export SLC before running SaaS integration tests to avoid waiting for the SLC build while the SaaS DB is already running [CodeBuild] --- buildspec_saas.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/buildspec_saas.yml b/buildspec_saas.yml index 9c9e3451..cf46a37c 100644 --- a/buildspec_saas.yml +++ b/buildspec_saas.yml @@ -25,4 +25,5 @@ phases: - echo "$DOCKER_PASSWORD" | docker login --username "$DOCKER_USER" --password-stdin build: commands: + - poetry run nox -s export_slc - poetry run nox -s saas_integration_tests From 501ba04132c8b09bbdbc14af6b854a4cd0286d4a Mon Sep 17 00:00:00 2001 From: Torsten Kilias Date: Tue, 6 Aug 2024 23:49:34 +0200 Subject: [PATCH 23/31] Use itde_config fixture instead itde fiture to avoid starting the itde without need and make db_conn a session fixture [CodeBuild] --- tests/fixtures/bucketfs_fixture.py | 8 ++++---- tests/fixtures/setup_database_fixture.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/fixtures/bucketfs_fixture.py b/tests/fixtures/bucketfs_fixture.py index 165c8428..8ebe8ecc 100644 --- a/tests/fixtures/bucketfs_fixture.py +++ b/tests/fixtures/bucketfs_fixture.py @@ -11,15 +11,15 @@ @pytest.fixture(scope="session") def bucketfs_location_onprem(backend, - itde: TestConfig) -> bfs.path.PathLike | None: + itde_config: TestConfig) -> bfs.path.PathLike | None: if backend == BACKEND_ONPREM: return create_bucketfs_location( path_in_bucket=bucketfs_params.path_in_bucket, bucketfs_name=bucketfs_params.name, - bucketfs_url=itde.bucketfs.url, - bucketfs_user=itde.bucketfs.username, - bucketfs_password=itde.bucketfs.password, + bucketfs_url=itde_config.bucketfs.url, + bucketfs_user=itde_config.bucketfs.username, + bucketfs_password=itde_config.bucketfs.password, bucket=bucketfs_params.bucket) return None diff --git a/tests/fixtures/setup_database_fixture.py b/tests/fixtures/setup_database_fixture.py index a84b6954..59c5be2a 100644 --- a/tests/fixtures/setup_database_fixture.py +++ b/tests/fixtures/setup_database_fixture.py @@ -106,7 +106,7 @@ def setup_database(backend: bfs.path.StorageBackend, return BUCKETFS_CONNECTION_NAME, SCHEMA_NAME -@pytest.fixture +@pytest.fixture(scope="session") def db_conn(setup_database, pyexasol_connection) -> pyexasol.ExaConnection: """ Per-test fixture that returns the same session-wide pyexasol connection, From 406c7ce6369cbef4b2d1790b59d90de2f04236a4 Mon Sep 17 00:00:00 2001 From: Torsten Kilias Date: Wed, 7 Aug 2024 00:37:04 +0200 Subject: [PATCH 24/31] Save SaaS Database id in pytest stash to not recreate a SaaS DB for each test. It seems to be a bug that a session scope fixture is called for every test. This might happen because backend is parameterized. 
[CodeBuild] --- tests/fixtures/database_connection_fixture.py | 34 ++++++++++++------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/tests/fixtures/database_connection_fixture.py b/tests/fixtures/database_connection_fixture.py index aee7c9de..a95667a0 100644 --- a/tests/fixtures/database_connection_fixture.py +++ b/tests/fixtures/database_connection_fixture.py @@ -7,6 +7,7 @@ import pyexasol import pytest import exasol.bucketfs as bfs +from _pytest.fixtures import FixtureRequest from exasol.saas.client.api_access import ( OpenApiAccess, create_saas_client, @@ -15,6 +16,8 @@ ) from pytest_itde import config +CURRENT_SAAS_DATABASE_ID = pytest.StashKey[str]() + def _env(var: str) -> str: result = os.environ.get(var) @@ -54,21 +57,26 @@ def saas_token(backend) -> str: return _env("SAAS_PAT") + @pytest.fixture(scope="session") -def saas_database_id(backend, saas_url, saas_account_id, saas_token) -> str: +def saas_database_id(request: FixtureRequest, backend, saas_url, saas_account_id, saas_token) -> str: if backend == BACKEND_SAAS: - with ExitStack() as stack: - # Create and configure the SaaS client. - client = create_saas_client(host=saas_url, pat=saas_token) - api_access = OpenApiAccess(client=client, account_id=saas_account_id) - stack.enter_context(api_access.allowed_ip()) - - # Create a temporary database and waite till it becomes operational - db = stack.enter_context(api_access.database( - name=timestamp_name('TE_CI'), - idle_time=timedelta(hours=12))) - api_access.wait_until_running(db.id) - yield db.id + if CURRENT_SAAS_DATABASE_ID in request.session.stash: + with ExitStack() as stack: + # Create and configure the SaaS client. + client = create_saas_client(host=saas_url, pat=saas_token) + api_access = OpenApiAccess(client=client, account_id=saas_account_id) + stack.enter_context(api_access.allowed_ip()) + + # Create a temporary database and waite till it becomes operational + db = stack.enter_context(api_access.database( + name=timestamp_name('TE_CI'), + idle_time=timedelta(hours=12))) + api_access.wait_until_running(db.id) + request.session.stash[CURRENT_SAAS_DATABASE_ID] = db.id + yield db.id + else: + yield request.session.stash[CURRENT_SAAS_DATABASE_ID] else: yield '' From 5d2d261574fdeae9f8c2ed81eee7d1f825c9e65f Mon Sep 17 00:00:00 2001 From: Torsten Kilias Date: Wed, 7 Aug 2024 00:48:18 +0200 Subject: [PATCH 25/31] Fix pytest stash usage [CodeBuild] --- tests/fixtures/database_connection_fixture.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/fixtures/database_connection_fixture.py b/tests/fixtures/database_connection_fixture.py index a95667a0..a734ade7 100644 --- a/tests/fixtures/database_connection_fixture.py +++ b/tests/fixtures/database_connection_fixture.py @@ -61,7 +61,7 @@ def saas_token(backend) -> str: @pytest.fixture(scope="session") def saas_database_id(request: FixtureRequest, backend, saas_url, saas_account_id, saas_token) -> str: if backend == BACKEND_SAAS: - if CURRENT_SAAS_DATABASE_ID in request.session.stash: + if CURRENT_SAAS_DATABASE_ID not in request.session.stash: with ExitStack() as stack: # Create and configure the SaaS client. 
client = create_saas_client(host=saas_url, pat=saas_token) From 8d06a406884f35794e51c27a742404e16c2ba7da Mon Sep 17 00:00:00 2001 From: Torsten Kilias Date: Wed, 7 Aug 2024 01:43:24 +0200 Subject: [PATCH 26/31] Use pytest stash to export and upload the slc only once [CodeBuild] --- tests/fixtures/language_container_fixture.py | 46 ++++++++++++-------- 1 file changed, 28 insertions(+), 18 deletions(-) diff --git a/tests/fixtures/language_container_fixture.py b/tests/fixtures/language_container_fixture.py index 0f5c38b0..4797267e 100644 --- a/tests/fixtures/language_container_fixture.py +++ b/tests/fixtures/language_container_fixture.py @@ -4,6 +4,7 @@ import time import pytest +from _pytest.fixtures import FixtureRequest from exasol_script_languages_container_tool.lib.tasks.export.export_info import ExportInfo from exasol.python_extension_common.deployment.language_container_deployer import LanguageContainerDeployer import exasol.bucketfs as bfs @@ -14,6 +15,9 @@ LANGUAGE_ALIAS = "PYTHON3_TE" CONTAINER_FILE_NAME = "exasol_transformers_extension_container.tar.gz" +SLC_EXPORT = pytest.StashKey[ExportInfo]() +SLC_UPLOADED = pytest.StashKey[bool]() + @pytest.fixture(scope="session") def flavor_path() -> Path: @@ -21,31 +25,37 @@ def flavor_path() -> Path: @pytest.fixture(scope="session") -def export_slc(flavor_path: Path) -> ExportInfo: - language_container.prepare_flavor(flavor_path=flavor_path) - export_result = language_container.export(flavor_path=flavor_path) - export_info = export_result.export_infos[str(flavor_path)]["release"] - return export_info +def export_slc(request: FixtureRequest, flavor_path: Path) -> ExportInfo: + if SLC_EXPORT not in request.session.stash: + language_container.prepare_flavor(flavor_path=flavor_path) + export_result = language_container.export(flavor_path=flavor_path) + export_info = export_result.export_infos[str(flavor_path)]["release"] + request.session.stash[SLC_EXPORT] = export_info + return request.session.stash[SLC_EXPORT] @pytest.fixture(scope="session") -def upload_slc(backend, bucketfs_location, pyexasol_connection, export_slc: ExportInfo) -> None: - cleanup_images() +def upload_slc(request: FixtureRequest, backend, bucketfs_location, pyexasol_connection, + export_slc: ExportInfo) -> None: + if SLC_UPLOADED not in request.session.stash: + cleanup_images() + + container_file_path = Path(export_slc.cache_file) - container_file_path = Path(export_slc.cache_file) + deployer = LanguageContainerDeployer(pyexasol_connection=pyexasol_connection, + language_alias=LANGUAGE_ALIAS, + bucketfs_path=bucketfs_location) - deployer = LanguageContainerDeployer(pyexasol_connection=pyexasol_connection, - language_alias=LANGUAGE_ALIAS, - bucketfs_path=bucketfs_location) + deployer.run(container_file=container_file_path, + bucket_file_path=CONTAINER_FILE_NAME, + allow_override=True, + wait_for_completion=True) - deployer.run(container_file=container_file_path, - bucket_file_path=CONTAINER_FILE_NAME, - allow_override=True, - wait_for_completion=True) + # Let's see if this helps + if backend == BACKEND_SAAS: + time.sleep(300) + request.session.stash[SLC_UPLOADED] = True - # Let's see if this helps - if backend == BACKEND_SAAS: - time.sleep(300) def cleanup_images(): From a12d8f8c663983a16b48a4d1323c1739a69aef56 Mon Sep 17 00:00:00 2001 From: Torsten Kilias Date: Wed, 7 Aug 2024 01:47:08 +0200 Subject: [PATCH 27/31] Fix bucketfs_fixture.py [CodeBuild] --- tests/fixtures/bucketfs_fixture.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git 
a/tests/fixtures/bucketfs_fixture.py b/tests/fixtures/bucketfs_fixture.py index 8ebe8ecc..316217b4 100644 --- a/tests/fixtures/bucketfs_fixture.py +++ b/tests/fixtures/bucketfs_fixture.py @@ -1,8 +1,8 @@ from __future__ import annotations -import pytest -from pytest_itde.config import TestConfig import exasol.bucketfs as bfs +import pytest +import pytest_itde from exasol_transformers_extension.utils.bucketfs_operations import create_bucketfs_location from tests.fixtures.database_connection_fixture import BACKEND_SAAS, BACKEND_ONPREM @@ -11,15 +11,14 @@ @pytest.fixture(scope="session") def bucketfs_location_onprem(backend, - itde_config: TestConfig) -> bfs.path.PathLike | None: - + bucketfs_config: pytest_itde.config.BucketFs) -> bfs.path.PathLike | None: if backend == BACKEND_ONPREM: return create_bucketfs_location( path_in_bucket=bucketfs_params.path_in_bucket, bucketfs_name=bucketfs_params.name, - bucketfs_url=itde_config.bucketfs.url, - bucketfs_user=itde_config.bucketfs.username, - bucketfs_password=itde_config.bucketfs.password, + bucketfs_url=bucketfs_config.url, + bucketfs_user=bucketfs_config.username, + bucketfs_password=bucketfs_config.password, bucket=bucketfs_params.bucket) return None @@ -30,7 +29,6 @@ def bucketfs_location_saas(backend, saas_account_id, saas_database_id, saas_token) -> bfs.path.PathLike | None: - if backend == BACKEND_SAAS: return create_bucketfs_location( path_in_bucket=bucketfs_params.path_in_bucket, @@ -45,7 +43,6 @@ def bucketfs_location_saas(backend, def bucketfs_location(backend, bucketfs_location_onprem, bucketfs_location_saas) -> bfs.path.PathLike: - if backend == BACKEND_ONPREM: assert bucketfs_location_onprem is not None return bucketfs_location_onprem From 3b1281a4c83a02795d4fc4078bac58ae00c0172b Mon Sep 17 00:00:00 2001 From: Torsten Kilias Date: Wed, 7 Aug 2024 02:38:12 +0200 Subject: [PATCH 28/31] Save in pytest stash for which backend we uploaded the slc [CodeBuild] --- tests/fixtures/language_container_fixture.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/fixtures/language_container_fixture.py b/tests/fixtures/language_container_fixture.py index 4797267e..60bad4ae 100644 --- a/tests/fixtures/language_container_fixture.py +++ b/tests/fixtures/language_container_fixture.py @@ -2,6 +2,7 @@ import subprocess from pathlib import Path import time +from typing import Dict import pytest from _pytest.fixtures import FixtureRequest @@ -16,7 +17,7 @@ CONTAINER_FILE_NAME = "exasol_transformers_extension_container.tar.gz" SLC_EXPORT = pytest.StashKey[ExportInfo]() -SLC_UPLOADED = pytest.StashKey[bool]() +SLC_UPLOADED = pytest.StashKey[Dict[str, bool]]() @pytest.fixture(scope="session") @@ -37,7 +38,7 @@ def export_slc(request: FixtureRequest, flavor_path: Path) -> ExportInfo: @pytest.fixture(scope="session") def upload_slc(request: FixtureRequest, backend, bucketfs_location, pyexasol_connection, export_slc: ExportInfo) -> None: - if SLC_UPLOADED not in request.session.stash: + if SLC_UPLOADED not in request.session.stash or backend not in request.session.stash[SLC_UPLOADED]: cleanup_images() container_file_path = Path(export_slc.cache_file) @@ -54,8 +55,7 @@ def upload_slc(request: FixtureRequest, backend, bucketfs_location, pyexasol_con # Let's see if this helps if backend == BACKEND_SAAS: time.sleep(300) - request.session.stash[SLC_UPLOADED] = True - + request.session.stash[SLC_UPLOADED][backend] = True def cleanup_images(): From 92905ed9540f89ad2377cdd6b2635b2c2e70ea57 Mon Sep 17 00:00:00 2001 From: Torsten Kilias 
Date: Wed, 7 Aug 2024 03:35:50 +0200
Subject: [PATCH 29/31] Fix upload_slc [CodeBuild]

---
 tests/fixtures/language_container_fixture.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/fixtures/language_container_fixture.py b/tests/fixtures/language_container_fixture.py
index 60bad4ae..5f49eda1 100644
--- a/tests/fixtures/language_container_fixture.py
+++ b/tests/fixtures/language_container_fixture.py
@@ -38,7 +38,9 @@ def export_slc(request: FixtureRequest, flavor_path: Path) -> ExportInfo:
 @pytest.fixture(scope="session")
 def upload_slc(request: FixtureRequest, backend, bucketfs_location, pyexasol_connection,
                export_slc: ExportInfo) -> None:
-    if SLC_UPLOADED not in request.session.stash or backend not in request.session.stash[SLC_UPLOADED]:
+    if SLC_UPLOADED not in request.session.stash:
+        request.session.stash[SLC_UPLOADED] = dict()
+    if backend not in request.session.stash[SLC_UPLOADED]:
         cleanup_images()

         container_file_path = Path(export_slc.cache_file)

From 9a3632185dc378cb95c08eadc45e17e65a6c937a Mon Sep 17 00:00:00 2001
From: Torsten Kilias
Date: Wed, 7 Aug 2024 07:38:24 +0200
Subject: [PATCH 30/31] Increase DB Mem Size for ITDE to hopefully stabilize onprem tests in CodeBuild [CodeBuild]

---
 noxfile.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/noxfile.py b/noxfile.py
index 1b5cb504..8cd86546 100644
--- a/noxfile.py
+++ b/noxfile.py
@@ -65,5 +65,5 @@ def start_database(session):
                 '--environment-name', 'test',
                 '--database-port-forward', '8888',
                 '--bucketfs-port-forward', '6666',
-                '--db-mem-size', '4GB',
+                '--db-mem-size', '8GB',
                 '--nameserver', '8.8.8.8')

From cef193243bd68485cad83d40f6dbfd3374d3d018 Mon Sep 17 00:00:00 2001
From: Torsten Kilias
Date: Wed, 7 Aug 2024 07:40:19 +0200
Subject: [PATCH 31/31] Increase VM Size for onprem tests in CodeBuild to hopefully stabilize them [CodeBuild]

---
 buildspec.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/buildspec.yml b/buildspec.yml
index e67bbde6..290779e3 100644
--- a/buildspec.yml
+++ b/buildspec.yml
@@ -15,6 +15,6 @@ batch:
       buildspec: ./buildspec_saas.yml
   - identifier: onprem_tests
     env:
-      compute-type: BUILD_GENERAL1_MEDIUM
+      compute-type: BUILD_GENERAL1_LARGE
       privileged-mode: true
      buildspec: ./buildspec_onprem.yml
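
Note on the fixture-caching pattern introduced in patches 24 to 29: because the session-scoped `backend` fixture is parameterized, pytest re-evaluates the downstream session fixtures per backend parameter, so expensive resources (the temporary SaaS database, the exported and uploaded script language container) are cached in `request.session.stash` under `pytest.StashKey` objects and keyed by backend name. The following is a minimal, self-contained sketch of that pattern only; the fixture names and the `_create_resource` helper are illustrative placeholders, not fixtures from the extension's test suite.

```python
# Minimal sketch (assumed names) of the session-stash caching pattern used in
# the test fixtures: create an expensive per-backend resource at most once per
# pytest session, even though the session fixture runs once per backend param.
from typing import Dict

import pytest
from _pytest.fixtures import FixtureRequest

# Stash keys are module-level singletons; the dict maps backend name -> resource.
EXPENSIVE_RESOURCE = pytest.StashKey[Dict[str, str]]()

BACKEND_ONPREM = "onprem"
BACKEND_SAAS = "saas"


@pytest.fixture(scope="session", params=[BACKEND_ONPREM, BACKEND_SAAS])
def backend(request) -> str:
    return request.param


def _create_resource(backend: str) -> str:
    # Placeholder for something expensive, e.g. starting a SaaS database or
    # exporting and uploading a script language container.
    return f"resource-for-{backend}"


@pytest.fixture(scope="session")
def expensive_resource(request: FixtureRequest, backend) -> str:
    # Initialise the per-backend cache on first use.
    if EXPENSIVE_RESOURCE not in request.session.stash:
        request.session.stash[EXPENSIVE_RESOURCE] = {}
    cache = request.session.stash[EXPENSIVE_RESOURCE]
    # Create the resource only once per backend for the whole test session.
    if backend not in cache:
        cache[backend] = _create_resource(backend)
    return cache[backend]
```

Keeping the cache on `request.session.stash` rather than in a plain module-level global ties the cached objects to the pytest session, which is why the patches above store the SaaS database id and the uploaded-SLC flag there instead of relying on fixture scoping alone.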