Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding Language Validation Test #257

Merged
Merged
Show file tree
Hide file tree
Changes from 32 commits
Commits
Show all changes
46 commits
Select commit Hold shift + click to select a range
8fc4b9a
test to validate languages
NIXBLACK11 Oct 31, 2023
9a3228b
test to validate languages
NIXBLACK11 Oct 31, 2023
ad9a588
Delete flores directory
NIXBLACK11 Oct 31, 2023
7f32d7a
Update validate_models.py
NIXBLACK11 Oct 31, 2023
ff3254b
Update validate_models.py
NIXBLACK11 Oct 31, 2023
cb2d91a
Update validate_models.py
NIXBLACK11 Oct 31, 2023
f4e84d2
Update validate_models.py
NIXBLACK11 Oct 31, 2023
109eac2
Update .gitignore
NIXBLACK11 Oct 31, 2023
2236fe0
added pytest to validate_models.py
NIXBLACK11 Nov 1, 2023
472657b
Update validate_models.py
NIXBLACK11 Nov 1, 2023
c744030
Update validate_models.py
NIXBLACK11 Nov 1, 2023
c71aec7
Update validate_models.py using mock downloader
NIXBLACK11 Nov 4, 2023
c816d79
Update validate_models.py
NIXBLACK11 Nov 6, 2023
31aa252
Update validate_models.py
NIXBLACK11 Nov 6, 2023
c34279d
Update validate_models.py
NIXBLACK11 Nov 6, 2023
8b25a3d
Update validate_models.py
NIXBLACK11 Nov 6, 2023
302d068
Update validate_models.py
NIXBLACK11 Nov 7, 2023
73f873f
Update download_models.py according to 1.
NIXBLACK11 Nov 7, 2023
5e04a2a
Update download_models.py
NIXBLACK11 Nov 7, 2023
e3552a7
Update download_models.py
NIXBLACK11 Nov 7, 2023
1d74246
Update download_models.py
NIXBLACK11 Nov 7, 2023
1bddd81
Update validate_models.py
NIXBLACK11 Nov 8, 2023
e4f3fd0
Update models.py
NIXBLACK11 Nov 8, 2023
03284a2
Update laser_tokenizer.py
NIXBLACK11 Nov 8, 2023
43f4d1a
Update download_models.py
NIXBLACK11 Nov 8, 2023
6ef54c2
Update validate_models.py
NIXBLACK11 Nov 8, 2023
89c9dde
Update validate_models.py
NIXBLACK11 Nov 8, 2023
d883ee0
Added slow and fast tests to validate_models.py
NIXBLACK11 Nov 9, 2023
e1e22a3
Update validate_models.py
NIXBLACK11 Nov 9, 2023
a8f4135
Update validate_models.py
NIXBLACK11 Nov 9, 2023
4cd83e8
Create test_validate_models.py
NIXBLACK11 Nov 9, 2023
e0be04f
Rename test_validate_models.py to test_models_initialization.py
NIXBLACK11 Nov 9, 2023
9ec012f
Update test_models_initialization.py
NIXBLACK11 Nov 9, 2023
fbbc6fc
Update test_models_initialization.py
NIXBLACK11 Nov 9, 2023
99ebbfd
Update download_models.py
NIXBLACK11 Nov 9, 2023
6356c4d
Update test_models_initialization.py
NIXBLACK11 Nov 9, 2023
eac3674
Update test_models_initialization.py
NIXBLACK11 Nov 9, 2023
d3935f9
Update download_models.py
NIXBLACK11 Nov 9, 2023
18c1657
Update validate_models.py
NIXBLACK11 Nov 14, 2023
c26e775
Update validate_models.py
NIXBLACK11 Nov 14, 2023
023eab2
Update validate_models.py
NIXBLACK11 Nov 14, 2023
3944556
Update validate_models.py
NIXBLACK11 Nov 14, 2023
0a4d983
Update validate_models.py
NIXBLACK11 Nov 14, 2023
e5823d6
Update validate_models.py
NIXBLACK11 Nov 14, 2023
92345be
Update validate_models.py
NIXBLACK11 Nov 14, 2023
87a08e9
Update validate_models.py
NIXBLACK11 Nov 14, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion laser_encoders/download_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,14 @@ def download_laser2(self):
self.download("laser2.cvocab")

def download_laser3(self, lang: str, spm: bool = False):
lang = self.get_language_code(LASER3_LANGUAGE, lang)
result = self.get_language_code(LASER3_LANGUAGE, lang)

if isinstance(result, list):
raise ValueError(
f"There are script-specific models available for {lang}. Please choose one from the following: {result}"
)

lang = result
self.download(f"laser3-{lang}.v1.pt")
if spm:
if lang in SPM_LANGUAGE:
Expand Down
4 changes: 3 additions & 1 deletion laser_encoders/laser_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,12 +153,14 @@ def initialize_tokenizer(lang: str = None, model_dir: str = None, laser: str = N
f"Unsupported laser model: {laser}. Choose either laser2 or laser3."
)
else:
if lang in LASER3_LANGUAGE or lang in LASER2_LANGUAGE:
if lang in LASER3_LANGUAGE:
lang = downloader.get_language_code(LASER3_LANGUAGE, lang)
if lang in SPM_LANGUAGE:
filename = f"laser3-{lang}.v1.spm"
else:
filename = "laser2.spm"
elif lang in LASER2_LANGUAGE:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

good catch!

filename = "laser2.spm"
else:
raise ValueError(
f"Unsupported language name: {lang}. Please specify a supported language name."
Expand Down
2 changes: 1 addition & 1 deletion laser_encoders/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -350,8 +350,8 @@ def initialize_encoder(
f"Unsupported laser model: {laser}. Choose either laser2 or laser3."
)
else:
lang = downloader.get_language_code(LASER3_LANGUAGE, lang)
if lang in LASER3_LANGUAGE:
lang = downloader.get_language_code(LASER3_LANGUAGE, lang)
downloader.download_laser3(lang=lang, spm=spm)
file_path = f"laser3-{lang}.v1"
elif lang in LASER2_LANGUAGE:
Expand Down
58 changes: 58 additions & 0 deletions laser_encoders/test_models_initialization.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import os
import tempfile

import pytest

from laser_encoders.download_models import LaserModelDownloader
from laser_encoders.language_list import LASER2_LANGUAGE, LASER3_LANGUAGE
from laser_encoders.laser_tokenizer import initialize_tokenizer
from laser_encoders.models import initialize_encoder


def test_validate_achnese_models_and_tokenize_laser3(lang="acehnese"):
    """Smoke-test the LASER3 pipeline for a single language.

    Downloads the LASER3 model for ``lang`` into a temporary directory,
    builds the encoder and the tokenizer from that directory and tokenizes
    one sample sentence. The test passes as long as none of these steps
    raises; no output value is checked.
    """
    with tempfile.TemporaryDirectory() as model_dir:
        print(f"Created temporary directory for {lang}", model_dir)

        LaserModelDownloader(model_dir=model_dir).download_laser3(lang)

        # Only successful construction is being validated here.
        initialize_encoder(lang, model_dir=model_dir)
        laser_tokenizer = initialize_tokenizer(lang, model_dir=model_dir)

        # Test tokenization with a sample sentence
        laser_tokenizer.tokenize("This is a sample sentence.")

    print(f"{lang} model validated successfully")


def test_validate_english_models_and_tokenize_laser2(lang="english"):
    """Smoke-test the LASER2 pipeline for a single language.

    Downloads the LASER2 files into a temporary directory, builds the
    encoder and the tokenizer for ``lang`` from that directory and tokenizes
    one sample sentence. The test passes as long as none of these steps
    raises; no output value is checked.
    """
    with tempfile.TemporaryDirectory() as model_dir:
        print(f"Created temporary directory for {lang}", model_dir)

        LaserModelDownloader(model_dir=model_dir).download_laser2()

        # Only successful construction is being validated here.
        initialize_encoder(lang, model_dir=model_dir)
        laser_tokenizer = initialize_tokenizer(lang, model_dir=model_dir)

        # Test tokenization with a sample sentence
        laser_tokenizer.tokenize("This is a sample sentence.")

    print(f"{lang} model validated successfully")


def test_validate_kashmiri_models_and_tokenize_laser3(lang="kas"):
    """Smoke-test every script-specific LASER3 model listed for ``lang``.

    ``LASER3_LANGUAGE[lang]`` is iterated as a collection of language codes;
    for each code the model is downloaded into a temporary directory, the
    encoder and tokenizer are initialised and one sample sentence is
    tokenized. The test passes as long as none of these steps raises.
    """
    with tempfile.TemporaryDirectory() as model_dir:
        print(f"Created temporary directory for {lang}", model_dir)

        model_downloader = LaserModelDownloader(model_dir=model_dir)

        for script_code in LASER3_LANGUAGE[lang]:
            model_downloader.download_laser3(script_code)

            # Only successful construction is being validated here.
            initialize_encoder(script_code, model_dir=model_dir)
            laser_tokenizer = initialize_tokenizer(script_code, model_dir=model_dir)

            # Test tokenization with a sample sentence
            laser_tokenizer.tokenize("This is a sample sentence.")

    print(f"{lang} model validated successfully")
122 changes: 122 additions & 0 deletions laser_encoders/validate_models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
import os
import tempfile

import pytest

from laser_encoders.download_models import LaserModelDownloader
from laser_encoders.language_list import LASER2_LANGUAGE, LASER3_LANGUAGE
from laser_encoders.laser_tokenizer import initialize_tokenizer
from laser_encoders.models import initialize_encoder


@pytest.mark.slow
@pytest.mark.parametrize("lang", LASER3_LANGUAGE)
def test_validate_language_models_and_tokenize_laser3(lang):
    """Validate download, encoder and tokenizer setup for every LASER3 language.

    Languages that have several script-specific models ("kashmiri"/"kas",
    "central kanuri"/"knc") are expected to be rejected by
    ``download_laser3`` with a ``ValueError`` asking the caller to pick one
    of the script-specific codes. Every other language is downloaded into a
    temporary directory, after which the encoder and tokenizer are built and
    one sample sentence is tokenized.
    """
    with tempfile.TemporaryDirectory() as tmp_dir:
        print(f"Created temporary directory for {lang}", tmp_dir)

        downloader = LaserModelDownloader(model_dir=tmp_dir)
        if lang in ["kashmiri", "kas", "central kanuri", "knc"]:
            # Match on the actual error text: str(exc) is the message and
            # never contains the exception class name "ValueError".
            with pytest.raises(ValueError, match="script-specific models"):
                downloader.download_laser3(lang)
            print(f"{lang} language model raised a ValueError as expected.")
        else:
            downloader.download_laser3(lang)

            # Only successful construction is being validated here.
            initialize_encoder(lang, model_dir=tmp_dir)
            tokenizer = initialize_tokenizer(lang, model_dir=tmp_dir)

            # Test tokenization with a sample sentence
            tokenizer.tokenize("This is a sample sentence.")

            print(f"{lang} model validated successfully")


@pytest.mark.slow
@pytest.mark.parametrize("lang", LASER2_LANGUAGE)
def test_validate_language_models_and_tokenize_laser2(lang):
    """Validate encoder and tokenizer setup for one LASER2 language.

    Downloads the LASER2 files into a temporary directory, builds the
    encoder and the tokenizer for ``lang`` from that directory and tokenizes
    one sample sentence. The test passes as long as none of these steps
    raises; no output value is checked.
    """
    with tempfile.TemporaryDirectory() as model_dir:
        print(f"Created temporary directory for {lang}", model_dir)

        LaserModelDownloader(model_dir=model_dir).download_laser2()

        # Only successful construction is being validated here.
        initialize_encoder(lang, model_dir=model_dir)
        laser_tokenizer = initialize_tokenizer(lang, model_dir=model_dir)

        # Test tokenization with a sample sentence
        laser_tokenizer.tokenize("This is a sample sentence.")

    print(f"{lang} model validated successfully")


class MockLaserModelDownloader:
def __init__(self, model_dir):
    # Directory in which download_laser2/download_laser3 look for model files.
    self.model_dir = model_dir

def get_language_code(self, language_list: dict, lang: str) -> str:
    """Look up the language code registered for ``lang``.

    Args:
        language_list: Mapping from language name to its code, or to a
            tuple of codes when several options exist for the language.
        lang: Language name to resolve.

    Returns:
        The value stored in ``language_list`` for ``lang``.

    Raises:
        ValueError: If ``lang`` is not a key of ``language_list``, or if
            its entry is a tuple of several options.
    """
    # Keep the try body down to the one lookup that can raise KeyError.
    try:
        lang_3_4 = language_list[lang]
    except KeyError as err:
        raise ValueError(
            f"language name: {lang} not found in language list. Specify a supported language name"
        ) from err

    if isinstance(lang_3_4, tuple):
        options = ", ".join(f"'{opt}'" for opt in lang_3_4)
        # Report the requested language name, not the tuple of options.
        raise ValueError(
            f"Language '{lang}' has multiple options: {options}. Please specify using --lang."
        )
    return lang_3_4

def download_laser3(self, lang):
Copy link
Contributor

@heffernankevin heffernankevin Nov 14, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IIUC @avidale's suggestion for the mock downloader was just to check if the language codes exist? (and then have a real downloader for a couple of languages like you have in test_models_initialization.py?). Maybe I misunderstood this comment: #257 (comment).

For example, you could parameterise it with the LASER3 langs, but the func download_laser3 inside the mock downloader just checks if the language code exists instead of actually downloading it.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(resolved in chat)

lang = self.get_language_code(LASER3_LANGUAGE, lang)
file_path = os.path.join(self.model_dir, f"laser3-{lang}.v1.pt")
if os.path.exists(file_path):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you can simply say:

return os.path.exists(file_path)

Copy link
Contributor Author

@NIXBLACK11 NIXBLACK11 Nov 14, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@heffernankevin It should return the opposite of this, because the function returns True when there is an error and False when there is no error.

Copy link
Contributor

@heffernankevin heffernankevin Nov 14, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To make it more readable, I would suggest instead maybe raising an error if the lang code doesn't exist and then check it using something like:

try:
  download_laser3(lang)
except:
  [...]

return False
else:
return True

def download_laser2(self):
files = ["laser2.pt", "laser2.spm", "laser2.cvocab"]
for file_name in files:
file_path = os.path.join(self.model_dir, file_name)
if os.path.exists(file_path):
return False
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This will return early and won't check the other two files (laser2.spm and laser2.cvocab)?

else:
return True


# Directory passed as model_dir to the mock-downloader tests below and to
# initialize_encoder/initialize_tokenizer. The path is hard-coded.
CACHE_DIR = "/home/user/.cache/models" # Change this to the desired cache directory

# This uses the mock downloader
@pytest.mark.slow
@pytest.mark.parametrize("lang", LASER3_LANGUAGE)
def test_validate_language_models_and_tokenize_mock_laser3(lang):
    """Validate a LASER3 language against model files already in ``CACHE_DIR``.

    Nothing is downloaded: ``MockLaserModelDownloader.download_laser3`` only
    reports whether the model file for ``lang`` is present. When it is
    missing the test is skipped; otherwise the encoder and tokenizer are
    built from ``CACHE_DIR`` and one sample sentence is tokenized.
    """
    downloader = MockLaserModelDownloader(model_dir=CACHE_DIR)

    # The mock returns True when the model file is missing from CACHE_DIR.
    if downloader.download_laser3(lang):
        # pytest has no `error` attribute; skip is what the message describes.
        pytest.skip(f"Skipping test for {lang} language.")

    # Only successful construction is being validated here.
    initialize_encoder(lang, model_dir=CACHE_DIR)
    tokenizer = initialize_tokenizer(lang, model_dir=CACHE_DIR)

    tokenizer.tokenize("This is a sample sentence.")

    print(f"{lang} model validated successfully")


# This uses the mock downloader
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This doesn't use the mock downloader? (L112)

@pytest.mark.slow
@pytest.mark.parametrize("lang", LASER2_LANGUAGE)
def test_validate_language_models_and_tokenize_mock_laser2(lang):
    """Validate a LASER2 language against model files already in ``CACHE_DIR``.

    Uses ``MockLaserModelDownloader`` so that nothing is downloaded; the
    mock only reports whether LASER2 files are present. When it reports
    them missing the test is skipped; otherwise the encoder and tokenizer
    are built from ``CACHE_DIR`` and one sample sentence is tokenized.
    """
    # Use the mock, as the test name says, rather than the real downloader.
    downloader = MockLaserModelDownloader(model_dir=CACHE_DIR)

    # The mock returns True when a required file is missing from CACHE_DIR.
    if downloader.download_laser2():
        # pytest has no `error` attribute; skip when the files are missing.
        pytest.skip(f"Skipping test for {lang} language.")

    # Only successful construction is being validated here.
    initialize_encoder(lang, model_dir=CACHE_DIR)
    tokenizer = initialize_tokenizer(lang, model_dir=CACHE_DIR)

    tokenizer.tokenize("This is a sample sentence.")

    print(f"{lang} model validated successfully")
Loading