exasol · MarleneKress79789 · Aug 7, 2024 · Jun 26, 2024 · Jun 28, 2024 · Jul 18, 2024
diff --git a/exasol_transformers_extension/upload_model.py b/exasol_transformers_extension/upload_model.py
@@ -19,7 +19,7 @@
 @click.option('--task_type', type=str, required=True) #todo change docu (needed to know where to safe model)
 @click.option('--sub-dir', type=str, required=True,
               help="directory where the model is stored in the BucketFS")
-@click.option('--token', type=str, default=None, help="Hugging Face hub token for private models") #todo chnage docu
+@click.option('--token', type=str, help="Hugging Face hub token for private models") #todo change docu
 @click.option('--local-model-path', type=click.Path(exists=True, file_okay=True),
               required=True, help="local path where model is located")
 @click.option('--bucketfs-name', type=str)
@@ -50,7 +50,6 @@ def main(
         task_type: str,
         sub_dir: str,
         token: str | None,
-        #local_model_path: str,
         bucketfs_name: str,
         bucketfs_host: str,
         bucketfs_port: int,

diff --git a/exasol_transformers_extension/utils/model_specification.py b/exasol_transformers_extension/utils/model_specification.py
@@ -9,9 +9,9 @@ class ModelSpecification:
     def __init__(self, model_name: str, task_type: str):
         # task_type, model_version
         self.model_name = model_name
-        self.task_type = task_type
+        self.task_type = self._set_task_type_from_udf_name(task_type)
 
-    def set_task_type_from_udf_name(self, text):
+    def _set_task_type_from_udf_name(self, text):
         """
         switches user input(matching udf name) to transformers task types
         """
@@ -31,7 +31,7 @@ def set_task_type_from_udf_name(self, text):
             task_type = "zero-shot-classification"
         else:
             task_type = text
-        self.task_type = task_type
+        return task_type
 
     def get_model_specs_for_download(self):#todo change usages?
         """
@@ -47,7 +47,7 @@ def __eq__(self, other):
         return False
 
     def get_model_specific_path_suffix(self) -> PurePosixPath:
-        return PurePosixPath(self.model_name + "_" + self.task_type) #model_name-version-task
+        return PurePosixPath(self.model_name.replace(".", "_") + "_" + self.task_type) #model_name-version-task# todo change
 
     def get_model_factory(self):
         """
@@ -57,15 +57,15 @@ def get_model_factory(self):
         if model_task_type == "fill-mask":
             model_factory = transformers.AutoModelForMaskedLM
         elif model_task_type == "translation":
-            model_factory = transformers.T5Model #todo correct? se to seq in translation udf
+            model_factory = transformers.AutoModelForSeq2SeqLM
         elif model_task_type == "zero-shot-classification":
             model_factory = transformers.AutoModelForSequenceClassification
         elif model_task_type == "text-classification":
             model_factory = transformers.AutoModelForSequenceClassification
         elif model_task_type == "question-answering":
             model_factory = transformers.AutoModelForQuestionAnswering
-        #elif model_task_type == "text-generation":
-        #    model_factory = transformers.AutoModelFor
+        elif model_task_type == "text-generation":
+            model_factory = transformers.AutoModelForCausalLM
         elif model_task_type == "token-classification":
             model_factory = transformers.AutoModelForTokenClassification
         else:

diff --git a/tests/fixtures/model_fixture.py b/tests/fixtures/model_fixture.py
@@ -53,10 +53,12 @@ def prepare_model_for_local_bucketfs(model_specification: ModelSpecification,
     current_model_specs = CurrentModelSpecificationFromModelSpecs().transform(model_specification,
                                                                               "",
                                                                               model_params.sub_dir)
-    tmpdir = tmpdir_factory.mktemp(current_model_specs.get_model_specific_path_suffix())
+
+    tmpdir = tmpdir_factory.mktemp(current_model_specs.task_type)
     model_path_in_bucketfs = current_model_specs.get_bucketfs_model_save_path()
 
     bucketfs_path_for_model = tmpdir / model_path_in_bucketfs
+    print(bucketfs_path_for_model)
     download_model_to_path(current_model_specs, bucketfs_path_for_model)
     return tmpdir
 
@@ -69,43 +71,43 @@ def prepare_filling_mask_model_for_local_bucketfs(tmpdir_factory) -> PurePosixPa
 
 @pytest.fixture(scope="session")
 def prepare_question_answering_model_for_local_bucketfs(tmpdir_factory) -> PurePosixPath:
-    model_specification = model_params.base_model_specs
-    model_specification.task_type = "question-answering"
+    model_specification = model_params.q_a_model_specs
     bucketfs_path = prepare_model_for_local_bucketfs(model_specification, tmpdir_factory)
     yield bucketfs_path
 
 @pytest.fixture(scope="session")
 def prepare_sequence_classification_model_for_local_bucketfs(tmpdir_factory) -> PurePosixPath:
-    model_specification = model_params.base_model_specs
-    model_specification.task_type = "text-classification"
+    model_specification = model_params.sequence_class_model_specs
+    bucketfs_path = prepare_model_for_local_bucketfs(model_specification, tmpdir_factory)
+    yield bucketfs_path
+
+@pytest.fixture(scope="session")
+def prepare_sequence_classification_pair_model_for_local_bucketfs(tmpdir_factory) -> PurePosixPath:
+    model_specification = model_params.sequence_class_pair_model_specs
     bucketfs_path = prepare_model_for_local_bucketfs(model_specification, tmpdir_factory)
     yield bucketfs_path
 
 @pytest.fixture(scope="session")
 def prepare_text_generation_model_for_local_bucketfs(tmpdir_factory) -> PurePosixPath:
-    model_specification = model_params.base_model_specs
-    model_specification.task_type = "text-generation"
+    model_specification = model_params.text_gen_model_specs
     bucketfs_path = prepare_model_for_local_bucketfs(model_specification, tmpdir_factory)
     yield bucketfs_path
 
 @pytest.fixture(scope="session")
 def prepare_token_classification_model_for_local_bucketfs(tmpdir_factory) -> PurePosixPath:
-    model_specification = model_params.base_model_specs
-    model_specification.task_type = "token-classification"
+    model_specification = model_params.token_model_specs
     bucketfs_path = prepare_model_for_local_bucketfs(model_specification, tmpdir_factory)
     yield bucketfs_path
 
 @pytest.fixture(scope="session")
 def prepare_translation_model_for_local_bucketfs(tmpdir_factory) -> PurePosixPath:
-    model_specification = model_params.base_model_specs
-    model_specification.task_type = "translation"
+    model_specification = model_params.seq2seq_model_specs
     bucketfs_path = prepare_model_for_local_bucketfs(model_specification, tmpdir_factory)
     yield bucketfs_path
 
 @pytest.fixture(scope="session")
 def prepare_zero_shot_classification_model_for_local_bucketfs(tmpdir_factory) -> PurePosixPath:
-    model_specification = model_params.base_model_specs
-    model_specification.task_type = "zero-shot-classification"
+    model_specification = model_params.zero_shot_model_specs
     bucketfs_path = prepare_model_for_local_bucketfs(model_specification, tmpdir_factory)
     yield bucketfs_path
 
@@ -139,7 +141,7 @@ def upload_filling_mask_model_to_bucketfs(
         bucketfs_location: bfs.path.PathLike, tmpdir_factory) -> PurePosixPath:
     base_model_specs = model_params.base_model_specs
     base_model_specs.task_type = "fill-mask"
-    tmpdir = tmpdir_factory.mktemp(base_model_specs.get_model_specific_path_suffix())
+    tmpdir = tmpdir_factory.mktemp(base_model_specs.task_type)
     with upload_model_to_bucketfs(
             base_model_specs, tmpdir, bucketfs_location) as path:
         yield path
@@ -148,59 +150,62 @@ def upload_filling_mask_model_to_bucketfs(
 @pytest.fixture(scope="session")
 def upload_question_answering_model_to_bucketfs(
         bucketfs_location: bfs.path.PathLike, tmpdir_factory) -> PurePosixPath:
-    base_model_specs = model_params.base_model_specs
-    base_model_specs.task_type = "question-answering"
-    tmpdir = tmpdir_factory.mktemp(base_model_specs.get_model_specific_path_suffix())
+    base_model_specs = model_params.q_a_model_specs
+    tmpdir = tmpdir_factory.mktemp(base_model_specs.task_type)
     with upload_model_to_bucketfs(
             base_model_specs, tmpdir, bucketfs_location) as path:
         yield path
 
 @pytest.fixture(scope="session")
 def upload_sequence_classification_model_to_bucketfs(
         bucketfs_location: bfs.path.PathLike, tmpdir_factory) -> PurePosixPath:
-    base_model_specs = model_params.base_model_specs
-    base_model_specs.task_type = "text-classification"
-    tmpdir = tmpdir_factory.mktemp(base_model_specs.get_model_specific_path_suffix())
+    base_model_specs = model_params.sequence_class_model_specs
+    tmpdir = tmpdir_factory.mktemp(base_model_specs.task_type)
+    with upload_model_to_bucketfs(
+            base_model_specs, tmpdir, bucketfs_location) as path:
+        yield path
+
+@pytest.fixture(scope="session")
+def upload_sequence_classification_pair_model_to_bucketfs(
+        bucketfs_location: bfs.path.PathLike, tmpdir_factory) -> PurePosixPath:
+    base_model_specs = model_params.sequence_class_pair_model_specs
+    tmpdir = tmpdir_factory.mktemp(base_model_specs.task_type)
     with upload_model_to_bucketfs(
             base_model_specs, tmpdir, bucketfs_location) as path:
         yield path
 
 @pytest.fixture(scope="session")
 def upload_text_generation_model_to_bucketfs(
         bucketfs_location: bfs.path.PathLike, tmpdir_factory) -> PurePosixPath:
-    base_model_specs = model_params.base_model_specs
-    base_model_specs.task_type = "text-generation"
-    tmpdir = tmpdir_factory.mktemp(base_model_specs.get_model_specific_path_suffix())
+    base_model_specs = model_params.text_gen_model_specs
+    tmpdir = tmpdir_factory.mktemp(base_model_specs.task_type)
     with upload_model_to_bucketfs(
             base_model_specs, tmpdir, bucketfs_location) as path:
         yield path
 
 @pytest.fixture(scope="session")
 def upload_token_classification_model_to_bucketfs(
         bucketfs_location: bfs.path.PathLike, tmpdir_factory) -> PurePosixPath:
-    base_model_specs = model_params.base_model_specs
-    base_model_specs.task_type = "token-classification"
-    tmpdir = tmpdir_factory.mktemp(base_model_specs.get_model_specific_path_suffix())
+    base_model_specs = model_params.token_model_specs
+    tmpdir = tmpdir_factory.mktemp(base_model_specs.task_type)
     with upload_model_to_bucketfs(
             base_model_specs, tmpdir, bucketfs_location) as path:
         yield path
 
 @pytest.fixture(scope="session")
 def upload_translation_model_to_bucketfs(
         bucketfs_location: bfs.path.PathLike, tmpdir_factory) -> PurePosixPath:
-    base_model_specs = model_params.base_model_specs
-    base_model_specs.task_type = "translation"
-    tmpdir = tmpdir_factory.mktemp(base_model_specs.get_model_specific_path_suffix())
+    base_model_specs = model_params.seq2seq_model_specs
+    tmpdir = tmpdir_factory.mktemp(base_model_specs.task_type)
     with upload_model_to_bucketfs(
             base_model_specs, tmpdir, bucketfs_location) as path:
         yield path
 
 @pytest.fixture(scope="session")
 def upload_zero_shot_classification_model_to_bucketfs(
         bucketfs_location: bfs.path.PathLike, tmpdir_factory) -> PurePosixPath:
-    base_model_specs = model_params.base_model_specs
-    base_model_specs.task_type = "zero-shot-classification"
-    tmpdir = tmpdir_factory.mktemp(base_model_specs.get_model_specific_path_suffix())
+    base_model_specs = model_params.zero_shot_model_specs
+    tmpdir = tmpdir_factory.mktemp(base_model_specs.task_type)
     with upload_model_to_bucketfs(
             base_model_specs, tmpdir, bucketfs_location) as path:
         yield path
@@ -211,7 +216,7 @@ def upload_zero_shot_classification_model_to_bucketfs(
 def upload_seq2seq_model_to_bucketfs(
         bucketfs_location: bfs.path.PathLike, tmpdir_factory) -> PurePosixPath:
     model_specification = model_params.seq2seq_model_specs
-    tmpdir = tmpdir_factory.mktemp(model_specification.get_model_specific_path_suffix())
+    tmpdir = tmpdir_factory.mktemp(model_specification.task_type)
     with upload_model_to_bucketfs(
             model_specification, tmpdir, bucketfs_location) as path:
         yield path
diff --git a/tests/integration_tests/with_db/test_upload_model.py b/tests/integration_tests/with_db/test_upload_model.py
@@ -6,10 +6,6 @@
 import exasol.bucketfs as bfs
 
 from exasol_transformers_extension import upload_model
-from exasol_transformers_extension.utils import bucketfs_operations
-from exasol_transformers_extension.utils.current_model_specification import CurrentModelSpecification, \
-    CurrentModelSpecificationFromModelSpecs
-from exasol_transformers_extension.utils.model_specification import ModelSpecification
 from tests.integration_tests.with_db.udfs.python_rows_to_sql import python_rows_to_sql
 from tests.utils import postprocessing
 from tests.utils.parameters import bucketfs_params, model_params

diff --git a/tests/integration_tests/with_db/udfs/test_prediction_with_downloader_udf.py b/tests/integration_tests/with_db/udfs/test_prediction_with_downloader_udf.py
@@ -30,8 +30,9 @@ def test_prediction_with_downloader_udf(
             t(model_name, task_type, sub_dir, bucketfs_conn_name, token_conn_name));
             """
 
-        pyexasol_connection.execute(query).fetchall()
+        result = pyexasol_connection.execute(query).fetchall()
         time.sleep(10)
+        print(result)
 
         # execute the filling mask UDF
         text_data = "I <mask> you so much."

diff --git a/tests/integration_tests/with_db/udfs/test_question_answering_script.py b/tests/integration_tests/with_db/udfs/test_question_answering_script.py
@@ -12,7 +12,7 @@
 def test_question_answering_script(
         setup_database, pyexasol_connection, upload_question_answering_model_to_bucketfs):
     bucketfs_conn_name, schema_name = setup_database
-    question = "How many syllables are in the word Syllable?"
+    question = "Where is Exasol based?"
     n_rows = 100
     top_k = 1
     input_data = []
@@ -21,9 +21,9 @@ def test_question_answering_script(
             '',
             bucketfs_conn_name,
             str(model_params.sub_dir),
-            model_params.base_model_specs.model_name,
+            model_params.q_a_model_specs.model_name,
             question,
-            ' '.join((model_params.text_data, str(i))),
+            model_params.text_data,
             top_k
         ))
 
@@ -53,7 +53,7 @@ def test_question_answering_script(
     for i in range(5):
         print(result[i])
     results = [result[i][6] for i in range(len(result))]
-    acceptable_results = ["three", "3", "want", "need"]
+    acceptable_results = ["Nuremberg", "Germany"]
     number_accepted_results = 0
 
     def contains(string, list):

diff --git a/tests/integration_tests/with_db/udfs/test_sequence_classification_single_text_script.py b/tests/integration_tests/with_db/udfs/test_sequence_classification_single_text_script.py
@@ -2,20 +2,26 @@
 from tests.integration_tests.with_db.udfs.python_rows_to_sql import python_rows_to_sql
 from tests.utils.parameters import model_params
 
+#debug
+from tests.fixtures.model_fixture import *
+from tests.fixtures.setup_database_fixture import *
+from tests.fixtures.language_container_fixture import *
+from tests.fixtures.bucketfs_fixture import *
+from tests.fixtures.database_connection_fixture import *
 
 def test_sequence_classification_single_text_script(
         setup_database, pyexasol_connection, upload_sequence_classification_model_to_bucketfs):
     bucketfs_conn_name, schema_name = setup_database
-    n_labels = 2
+    n_labels = 3 # negative, neutral, positive
     n_rows = 100
     input_data = []
     for i in range(n_rows):
         input_data.append((
             '',
             bucketfs_conn_name,
             str(model_params.sub_dir),
-            model_params.base_model_specs.model_name,
-            model_params.text_data))
+            model_params.sequence_class_model_specs.model_name,
+            "I am so happy to be working on the Transformers Extension."))
 
     query = f"SELECT TE_SEQUENCE_CLASSIFICATION_SINGLE_TEXT_UDF(" \
             f"t.device_id, " \
@@ -38,18 +44,13 @@ def test_sequence_classification_single_text_script(
     n_cols_result = len(input_data[0]) + (added_columns - removed_columns)
     assert len(result) == n_rows_result and len(result[0]) == n_cols_result
 
-    for i in range(10):
-        print(result[i])
-
     # lenient test for quality of results, will be replaced by deterministic test later
-    results = [result[i][5] for i in range(len(result))]
-    acceptable_results = ["love", "miss", "want", "need"]
-    number_accepted_results = 0
-
-    def contains(string,list):
-        return any(map(lambda x: x in string, list))
 
-    for i in range(len(results)):
-        if contains(results[i], acceptable_results):
+    number_accepted_results = 0
+    for i in range(len(result)):
+        if (result[i][4] == "positive" and
+                result[i][5] > 0.8): #check if confidence reasonably high
+            number_accepted_results += 1
+        elif result[i][5] < 0.2:
             number_accepted_results += 1
-    assert number_accepted_results > n_rows_result/2
+    assert number_accepted_results > n_rows_result / 1.5