Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added task_type parameter for correct model loading #245

Merged
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
5bbb23b
started adding task_type
MarleneKress79789 Jun 26, 2024
ea94780
started adding quality control tests
MarleneKress79789 Jun 28, 2024
27b6fb0
fixed integration tests, added separate models for tasks
MarleneKress79789 Jul 18, 2024
61df528
cleanup and renamings, added docu for new parameters
MarleneKress79789 Jul 31, 2024
f8d5721
Merge branch 'main' into bug/add_task_type_parameter_for_correct_mode…
MarleneKress79789 Jul 31, 2024
f5c8c7c
[CodeBuild] remove prints, changelog
MarleneKress79789 Jul 31, 2024
34fea48
[CodeBuild] fixed run integration tests without saas, fixed import
MarleneKress79789 Aug 1, 2024
2674d8b
[CodeBuild] fixed text replace error
MarleneKress79789 Aug 5, 2024
1ac2088
[CodeBuild] prepare release
MarleneKress79789 Aug 5, 2024
10cc7d4
Apply suggestions from code review [CodeBuild]
tkilias Aug 5, 2024
f756d0b
Apply suggestions from code review [CodeBuild]
tkilias Aug 5, 2024
8b174c7
[CodeBuild] fix saas db naming error
MarleneKress79789 Aug 6, 2024
c28f186
Use batch build for AWS CodeBuild to speed up tests against backends.…
tkilias Aug 6, 2024
7380563
[CodeBuild]
tkilias Aug 6, 2024
9e22f43
[CodeBuild]
tkilias Aug 6, 2024
727fead
[CodeBuild]
tkilias Aug 6, 2024
4f4c11a
[CodeBuild]
tkilias Aug 6, 2024
6df3989
Fix buildspec.yml [CodeBuild]
tkilias Aug 6, 2024
3e3bd6d
Use correct backend strings for comparison in tests [CodeBuild]
tkilias Aug 6, 2024
0f51211
Use correct backend strings for onprem for comparison in tests [CodeB…
tkilias Aug 6, 2024
3e07bcd
Add --setup-show to running integration tests nox sessions [CodeBuild]
tkilias Aug 6, 2024
99732c1
[CodeBuild]
tkilias Aug 6, 2024
836be30
Build and export SLC before running SaaS integration tests to avoid w…
tkilias Aug 6, 2024
501ba04
Use itde_config fixture instead of itde fixture to avoid starting the itd…
tkilias Aug 6, 2024
406c7ce
Save SaaS Database id in pytest stash to not recreate a SaaS DB for e…
tkilias Aug 6, 2024
5d2d261
Fix pytest stash usage [CodeBuild]
tkilias Aug 6, 2024
8d06a40
Use pytest stash to export and upload the slc only once [CodeBuild]
tkilias Aug 6, 2024
a12d8f8
Fix bucketfs_fixture.py [CodeBuild]
tkilias Aug 6, 2024
3b1281a
Save in pytest stash for which backend we uploaded the slc [CodeBuild]
tkilias Aug 7, 2024
92905ed
Fix upload_slc [CodeBuild]
tkilias Aug 7, 2024
9a36321
Increase DB Mem Size for ITDE to hopefully stabilize onprem tests in …
tkilias Aug 7, 2024
cef1932
Increase VM Size for onprem tests in CodeBuild to hopefully stabilize…
tkilias Aug 7, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions exasol_transformers_extension/upload_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
@click.option('--task_type', type=str, required=True) #todo change docu (needed to know where to safe model)
@click.option('--sub-dir', type=str, required=True,
help="directory where the model is stored in the BucketFS")
@click.option('--token', type=str, default=None, help="Hugging Face hub token for private models") #todo chnage docu
@click.option('--token', type=str, help="Hugging Face hub token for private models") #todo chnage docu
MarleneKress79789 marked this conversation as resolved.
Show resolved Hide resolved
@click.option('--local-model-path', type=click.Path(exists=True, file_okay=True),
required=True, help="local path where model is located")
@click.option('--bucketfs-name', type=str)
Expand Down Expand Up @@ -50,7 +50,6 @@ def main(
task_type: str,
sub_dir: str,
token: str | None,
#local_model_path: str,
bucketfs_name: str,
bucketfs_host: str,
bucketfs_port: int,
Expand Down
14 changes: 7 additions & 7 deletions exasol_transformers_extension/utils/model_specification.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,9 @@ class ModelSpecification:
def __init__(self, model_name: str, task_type: str):
# task_type, model_version
self.model_name = model_name
self.task_type = task_type
self.task_type = self._set_task_type_from_udf_name(task_type)

def set_task_type_from_udf_name(self, text):
def _set_task_type_from_udf_name(self, text):
"""
switches user input(matching udf name) to transformers task types
"""
Expand All @@ -31,7 +31,7 @@ def set_task_type_from_udf_name(self, text):
task_type = "zero-shot-classification"
else:
task_type = text
self.task_type = task_type
return task_type

def get_model_specs_for_download(self):#todo change usages?
"""
Expand All @@ -47,7 +47,7 @@ def __eq__(self, other):
return False

def get_model_specific_path_suffix(self) -> PurePosixPath:
return PurePosixPath(self.model_name + "_" + self.task_type) #model_name-version-task
return PurePosixPath(self.model_name.replace(".", "_") + "_" + self.task_type) #model_name-version-task# todo change

def get_model_factory(self):
"""
Expand All @@ -57,15 +57,15 @@ def get_model_factory(self):
if model_task_type == "fill-mask":
model_factory = transformers.AutoModelForMaskedLM
elif model_task_type == "translation":
model_factory = transformers.T5Model #todo correct? se to seq in translation udf
model_factory = transformers.AutoModelForSeq2SeqLM
elif model_task_type == "zero-shot-classification":
model_factory = transformers.AutoModelForSequenceClassification
elif model_task_type == "text-classification":
model_factory = transformers.AutoModelForSequenceClassification
elif model_task_type == "question-answering":
model_factory = transformers.AutoModelForQuestionAnswering
#elif model_task_type == "text-generation":
# model_factory = transformers.AutoModelFor
elif model_task_type == "text-generation":
model_factory = transformers.AutoModelForCausalLM
elif model_task_type == "token-classification":
model_factory = transformers.AutoModelForTokenClassification
else:
Expand Down
71 changes: 38 additions & 33 deletions tests/fixtures/model_fixture.py
MarleneKress79789 marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -53,10 +53,12 @@ def prepare_model_for_local_bucketfs(model_specification: ModelSpecification,
current_model_specs = CurrentModelSpecificationFromModelSpecs().transform(model_specification,
"",
model_params.sub_dir)
tmpdir = tmpdir_factory.mktemp(current_model_specs.get_model_specific_path_suffix())

tmpdir = tmpdir_factory.mktemp(current_model_specs.task_type)
model_path_in_bucketfs = current_model_specs.get_bucketfs_model_save_path()

bucketfs_path_for_model = tmpdir / model_path_in_bucketfs
print(bucketfs_path_for_model)
download_model_to_path(current_model_specs, bucketfs_path_for_model)
return tmpdir

Expand All @@ -69,43 +71,43 @@ def prepare_filling_mask_model_for_local_bucketfs(tmpdir_factory) -> PurePosixPa

@pytest.fixture(scope="session")
def prepare_question_answering_model_for_local_bucketfs(tmpdir_factory) -> PurePosixPath:
model_specification = model_params.base_model_specs
model_specification.task_type = "question-answering"
model_specification = model_params.q_a_model_specs
bucketfs_path = prepare_model_for_local_bucketfs(model_specification, tmpdir_factory)
yield bucketfs_path

@pytest.fixture(scope="session")
def prepare_sequence_classification_model_for_local_bucketfs(tmpdir_factory) -> PurePosixPath:
model_specification = model_params.base_model_specs
model_specification.task_type = "text-classification"
model_specification = model_params.sequence_class_model_specs
bucketfs_path = prepare_model_for_local_bucketfs(model_specification, tmpdir_factory)
yield bucketfs_path

@pytest.fixture(scope="session")
def prepare_sequence_classification_pair_model_for_local_bucketfs(tmpdir_factory) -> PurePosixPath:
model_specification = model_params.sequence_class_pair_model_specs
bucketfs_path = prepare_model_for_local_bucketfs(model_specification, tmpdir_factory)
yield bucketfs_path

@pytest.fixture(scope="session")
def prepare_text_generation_model_for_local_bucketfs(tmpdir_factory) -> PurePosixPath:
model_specification = model_params.base_model_specs
model_specification.task_type = "text-generation"
model_specification = model_params.text_gen_model_specs
bucketfs_path = prepare_model_for_local_bucketfs(model_specification, tmpdir_factory)
yield bucketfs_path

@pytest.fixture(scope="session")
def prepare_token_classification_model_for_local_bucketfs(tmpdir_factory) -> PurePosixPath:
model_specification = model_params.base_model_specs
model_specification.task_type = "token-classification"
model_specification = model_params.token_model_specs
bucketfs_path = prepare_model_for_local_bucketfs(model_specification, tmpdir_factory)
yield bucketfs_path

@pytest.fixture(scope="session")
def prepare_translation_model_for_local_bucketfs(tmpdir_factory) -> PurePosixPath:
model_specification = model_params.base_model_specs
model_specification.task_type = "translation"
model_specification = model_params.seq2seq_model_specs
bucketfs_path = prepare_model_for_local_bucketfs(model_specification, tmpdir_factory)
yield bucketfs_path

@pytest.fixture(scope="session")
def prepare_zero_shot_classification_model_for_local_bucketfs(tmpdir_factory) -> PurePosixPath:
model_specification = model_params.base_model_specs
model_specification.task_type = "zero-shot-classification"
model_specification = model_params.zero_shot_model_specs
bucketfs_path = prepare_model_for_local_bucketfs(model_specification, tmpdir_factory)
yield bucketfs_path

Expand Down Expand Up @@ -139,7 +141,7 @@ def upload_filling_mask_model_to_bucketfs(
bucketfs_location: bfs.path.PathLike, tmpdir_factory) -> PurePosixPath:
base_model_specs = model_params.base_model_specs
base_model_specs.task_type = "fill-mask"
tmpdir = tmpdir_factory.mktemp(base_model_specs.get_model_specific_path_suffix())
tmpdir = tmpdir_factory.mktemp(base_model_specs.task_type)
with upload_model_to_bucketfs(
base_model_specs, tmpdir, bucketfs_location) as path:
yield path
Expand All @@ -148,59 +150,62 @@ def upload_filling_mask_model_to_bucketfs(
@pytest.fixture(scope="session")
def upload_question_answering_model_to_bucketfs(
bucketfs_location: bfs.path.PathLike, tmpdir_factory) -> PurePosixPath:
base_model_specs = model_params.base_model_specs
base_model_specs.task_type = "question-answering"
tmpdir = tmpdir_factory.mktemp(base_model_specs.get_model_specific_path_suffix())
base_model_specs = model_params.q_a_model_specs
tmpdir = tmpdir_factory.mktemp(base_model_specs.task_type)
with upload_model_to_bucketfs(
base_model_specs, tmpdir, bucketfs_location) as path:
yield path

@pytest.fixture(scope="session")
def upload_sequence_classification_model_to_bucketfs(
bucketfs_location: bfs.path.PathLike, tmpdir_factory) -> PurePosixPath:
base_model_specs = model_params.base_model_specs
base_model_specs.task_type = "text-classification"
tmpdir = tmpdir_factory.mktemp(base_model_specs.get_model_specific_path_suffix())
base_model_specs = model_params.sequence_class_model_specs
tmpdir = tmpdir_factory.mktemp(base_model_specs.task_type)
with upload_model_to_bucketfs(
base_model_specs, tmpdir, bucketfs_location) as path:
yield path

@pytest.fixture(scope="session")
def upload_sequence_classification_pair_model_to_bucketfs(
bucketfs_location: bfs.path.PathLike, tmpdir_factory) -> PurePosixPath:
base_model_specs = model_params.sequence_class_pair_model_specs
tmpdir = tmpdir_factory.mktemp(base_model_specs.task_type)
with upload_model_to_bucketfs(
base_model_specs, tmpdir, bucketfs_location) as path:
yield path

@pytest.fixture(scope="session")
def upload_text_generation_model_to_bucketfs(
bucketfs_location: bfs.path.PathLike, tmpdir_factory) -> PurePosixPath:
base_model_specs = model_params.base_model_specs
base_model_specs.task_type = "text-generation"
tmpdir = tmpdir_factory.mktemp(base_model_specs.get_model_specific_path_suffix())
base_model_specs = model_params.text_gen_model_specs
tmpdir = tmpdir_factory.mktemp(base_model_specs.task_type)
with upload_model_to_bucketfs(
base_model_specs, tmpdir, bucketfs_location) as path:
yield path

@pytest.fixture(scope="session")
def upload_token_classification_model_to_bucketfs(
bucketfs_location: bfs.path.PathLike, tmpdir_factory) -> PurePosixPath:
base_model_specs = model_params.base_model_specs
base_model_specs.task_type = "token-classification"
tmpdir = tmpdir_factory.mktemp(base_model_specs.get_model_specific_path_suffix())
base_model_specs = model_params.token_model_specs
tmpdir = tmpdir_factory.mktemp(base_model_specs.task_type)
with upload_model_to_bucketfs(
base_model_specs, tmpdir, bucketfs_location) as path:
yield path

@pytest.fixture(scope="session")
def upload_translation_model_to_bucketfs(
bucketfs_location: bfs.path.PathLike, tmpdir_factory) -> PurePosixPath:
base_model_specs = model_params.base_model_specs
base_model_specs.task_type = "translation"
tmpdir = tmpdir_factory.mktemp(base_model_specs.get_model_specific_path_suffix())
base_model_specs = model_params.seq2seq_model_specs
tmpdir = tmpdir_factory.mktemp(base_model_specs.task_type)
with upload_model_to_bucketfs(
base_model_specs, tmpdir, bucketfs_location) as path:
yield path

@pytest.fixture(scope="session")
def upload_zero_shot_classification_model_to_bucketfs(
bucketfs_location: bfs.path.PathLike, tmpdir_factory) -> PurePosixPath:
base_model_specs = model_params.base_model_specs
base_model_specs.task_type = "zero-shot-classification"
tmpdir = tmpdir_factory.mktemp(base_model_specs.get_model_specific_path_suffix())
base_model_specs = model_params.zero_shot_model_specs
tmpdir = tmpdir_factory.mktemp(base_model_specs.task_type)
with upload_model_to_bucketfs(
base_model_specs, tmpdir, bucketfs_location) as path:
yield path
Expand All @@ -211,7 +216,7 @@ def upload_zero_shot_classification_model_to_bucketfs(
def upload_seq2seq_model_to_bucketfs(
bucketfs_location: bfs.path.PathLike, tmpdir_factory) -> PurePosixPath:
model_specification = model_params.seq2seq_model_specs
tmpdir = tmpdir_factory.mktemp(model_specification.get_model_specific_path_suffix())
tmpdir = tmpdir_factory.mktemp(model_specification.task_type)
with upload_model_to_bucketfs(
model_specification, tmpdir, bucketfs_location) as path:
yield path
4 changes: 0 additions & 4 deletions tests/integration_tests/with_db/test_upload_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,6 @@
import exasol.bucketfs as bfs

from exasol_transformers_extension import upload_model
from exasol_transformers_extension.utils import bucketfs_operations
from exasol_transformers_extension.utils.current_model_specification import CurrentModelSpecification, \
CurrentModelSpecificationFromModelSpecs
from exasol_transformers_extension.utils.model_specification import ModelSpecification
from tests.integration_tests.with_db.udfs.python_rows_to_sql import python_rows_to_sql
from tests.utils import postprocessing
from tests.utils.parameters import bucketfs_params, model_params
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,9 @@ def test_prediction_with_downloader_udf(
t(model_name, task_type, sub_dir, bucketfs_conn_name, token_conn_name));
"""

pyexasol_connection.execute(query).fetchall()
result = pyexasol_connection.execute(query).fetchall()
time.sleep(10)
print(result)

# execute the filling mask UDF
text_data = "I <mask> you so much."
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
def test_question_answering_script(
setup_database, pyexasol_connection, upload_question_answering_model_to_bucketfs):
bucketfs_conn_name, schema_name = setup_database
question = "How many syllables are in the word Syllable?"
question = "Where is Exasol based?"
n_rows = 100
top_k = 1
input_data = []
Expand All @@ -21,9 +21,9 @@ def test_question_answering_script(
'',
bucketfs_conn_name,
str(model_params.sub_dir),
model_params.base_model_specs.model_name,
model_params.q_a_model_specs.model_name,
question,
' '.join((model_params.text_data, str(i))),
model_params.text_data,
top_k
))

Expand Down Expand Up @@ -53,7 +53,7 @@ def test_question_answering_script(
for i in range(5):
print(result[i])
results = [result[i][6] for i in range(len(result))]
acceptable_results = ["three", "3", "want", "need"]
acceptable_results = ["Nuremberg", "Germany"]
number_accepted_results = 0

def contains(string, list):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,26 @@
from tests.integration_tests.with_db.udfs.python_rows_to_sql import python_rows_to_sql
from tests.utils.parameters import model_params

#debug
from tests.fixtures.model_fixture import *
from tests.fixtures.setup_database_fixture import *
from tests.fixtures.language_container_fixture import *
from tests.fixtures.bucketfs_fixture import *
from tests.fixtures.database_connection_fixture import *

def test_sequence_classification_single_text_script(
setup_database, pyexasol_connection, upload_sequence_classification_model_to_bucketfs):
bucketfs_conn_name, schema_name = setup_database
n_labels = 2
n_labels = 3 # negative, neutral, positive
n_rows = 100
input_data = []
for i in range(n_rows):
input_data.append((
'',
bucketfs_conn_name,
str(model_params.sub_dir),
model_params.base_model_specs.model_name,
model_params.text_data))
model_params.sequence_class_model_specs.model_name,
"I am so happy to be working on the Transformers Extension."))

query = f"SELECT TE_SEQUENCE_CLASSIFICATION_SINGLE_TEXT_UDF(" \
f"t.device_id, " \
Expand All @@ -38,18 +44,13 @@ def test_sequence_classification_single_text_script(
n_cols_result = len(input_data[0]) + (added_columns - removed_columns)
assert len(result) == n_rows_result and len(result[0]) == n_cols_result

for i in range(10):
print(result[i])

# lenient test for quality of results, will be replaced by deterministic test later
results = [result[i][5] for i in range(len(result))]
acceptable_results = ["love", "miss", "want", "need"]
number_accepted_results = 0

def contains(string,list):
return any(map(lambda x: x in string, list))

for i in range(len(results)):
if contains(results[i], acceptable_results):
number_accepted_results = 0
for i in range(len(result)):
if (result[i][4] == "positive" and
result[i][5] > 0.8): #check if confidence resonably high
number_accepted_results += 1
elif result[i][5] < 0.2:
number_accepted_results += 1
assert number_accepted_results > n_rows_result/2
assert number_accepted_results > n_rows_result / 1.5
Loading
Loading