Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding Language Validation Test #257

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
46 commits
Select commit Hold shift + click to select a range
8fc4b9a
test to validate languages
NIXBLACK11 Oct 31, 2023
9a3228b
test to validate languages
NIXBLACK11 Oct 31, 2023
ad9a588
Delete flores directory
NIXBLACK11 Oct 31, 2023
7f32d7a
Update validate_models.py
NIXBLACK11 Oct 31, 2023
ff3254b
Update validate_models.py
NIXBLACK11 Oct 31, 2023
cb2d91a
Update validate_models.py
NIXBLACK11 Oct 31, 2023
f4e84d2
Update validate_models.py
NIXBLACK11 Oct 31, 2023
109eac2
Update .gitignore
NIXBLACK11 Oct 31, 2023
2236fe0
added pytest to validate_models.py
NIXBLACK11 Nov 1, 2023
472657b
Update validate_models.py
NIXBLACK11 Nov 1, 2023
c744030
Update validate_models.py
NIXBLACK11 Nov 1, 2023
c71aec7
Update validate_models.py using mock downloader
NIXBLACK11 Nov 4, 2023
c816d79
Update validate_models.py
NIXBLACK11 Nov 6, 2023
31aa252
Update validate_models.py
NIXBLACK11 Nov 6, 2023
c34279d
Update validate_models.py
NIXBLACK11 Nov 6, 2023
8b25a3d
Update validate_models.py
NIXBLACK11 Nov 6, 2023
302d068
Update validate_models.py
NIXBLACK11 Nov 7, 2023
73f873f
Update download_models.py according to 1.
NIXBLACK11 Nov 7, 2023
5e04a2a
Update download_models.py
NIXBLACK11 Nov 7, 2023
e3552a7
Update download_models.py
NIXBLACK11 Nov 7, 2023
1d74246
Update download_models.py
NIXBLACK11 Nov 7, 2023
1bddd81
Update validate_models.py
NIXBLACK11 Nov 8, 2023
e4f3fd0
Update models.py
NIXBLACK11 Nov 8, 2023
03284a2
Update laser_tokenizer.py
NIXBLACK11 Nov 8, 2023
43f4d1a
Update download_models.py
NIXBLACK11 Nov 8, 2023
6ef54c2
Update validate_models.py
NIXBLACK11 Nov 8, 2023
89c9dde
Update validate_models.py
NIXBLACK11 Nov 8, 2023
d883ee0
Added slow and fast tests to validate_models.py
NIXBLACK11 Nov 9, 2023
e1e22a3
Update validate_models.py
NIXBLACK11 Nov 9, 2023
a8f4135
Update validate_models.py
NIXBLACK11 Nov 9, 2023
4cd83e8
Create test_validate_models.py
NIXBLACK11 Nov 9, 2023
e0be04f
Rename test_validate_models.py to test_models_initialization.py
NIXBLACK11 Nov 9, 2023
9ec012f
Update test_models_initialization.py
NIXBLACK11 Nov 9, 2023
fbbc6fc
Update test_models_initialization.py
NIXBLACK11 Nov 9, 2023
99ebbfd
Update download_models.py
NIXBLACK11 Nov 9, 2023
6356c4d
Update test_models_initialization.py
NIXBLACK11 Nov 9, 2023
eac3674
Update test_models_initialization.py
NIXBLACK11 Nov 9, 2023
d3935f9
Update download_models.py
NIXBLACK11 Nov 9, 2023
18c1657
Update validate_models.py
NIXBLACK11 Nov 14, 2023
c26e775
Update validate_models.py
NIXBLACK11 Nov 14, 2023
023eab2
Update validate_models.py
NIXBLACK11 Nov 14, 2023
3944556
Update validate_models.py
NIXBLACK11 Nov 14, 2023
0a4d983
Update validate_models.py
NIXBLACK11 Nov 14, 2023
e5823d6
Update validate_models.py
NIXBLACK11 Nov 14, 2023
92345be
Update validate_models.py
NIXBLACK11 Nov 14, 2023
87a08e9
Update validate_models.py
NIXBLACK11 Nov 14, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 10 additions & 3 deletions laser_encoders/download_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,10 +71,10 @@ def download(self, filename: str):
def get_language_code(self, language_list: dict, lang: str) -> str:
try:
lang_3_4 = language_list[lang]
if isinstance(lang_3_4, tuple):
if isinstance(lang_3_4, list):
options = ", ".join(f"'{opt}'" for opt in lang_3_4)
raise ValueError(
f"Language '{lang_3_4}' has multiple options: {options}. Please specify using --lang."
f"Language '{lang}' has multiple options: {options}. Please specify using the 'lang' argument."
)
return lang_3_4
except KeyError:
Expand All @@ -88,7 +88,14 @@ def download_laser2(self):
self.download("laser2.cvocab")

def download_laser3(self, lang: str, spm: bool = False):
lang = self.get_language_code(LASER3_LANGUAGE, lang)
result = self.get_language_code(LASER3_LANGUAGE, lang)

if isinstance(result, list):
raise ValueError(
f"There are script-specific models available for {lang}. Please choose one from the following: {result}"
)

lang = result
self.download(f"laser3-{lang}.v1.pt")
if spm:
if lang in SPM_LANGUAGE:
Expand Down
4 changes: 3 additions & 1 deletion laser_encoders/laser_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,12 +153,14 @@ def initialize_tokenizer(lang: str = None, model_dir: str = None, laser: str = N
f"Unsupported laser model: {laser}. Choose either laser2 or laser3."
)
else:
if lang in LASER3_LANGUAGE or lang in LASER2_LANGUAGE:
if lang in LASER3_LANGUAGE:
lang = downloader.get_language_code(LASER3_LANGUAGE, lang)
if lang in SPM_LANGUAGE:
filename = f"laser3-{lang}.v1.spm"
else:
filename = "laser2.spm"
elif lang in LASER2_LANGUAGE:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

good catch!

filename = "laser2.spm"
else:
raise ValueError(
f"Unsupported language name: {lang}. Please specify a supported language name."
Expand Down
2 changes: 1 addition & 1 deletion laser_encoders/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -350,8 +350,8 @@ def initialize_encoder(
f"Unsupported laser model: {laser}. Choose either laser2 or laser3."
)
else:
lang = downloader.get_language_code(LASER3_LANGUAGE, lang)
if lang in LASER3_LANGUAGE:
lang = downloader.get_language_code(LASER3_LANGUAGE, lang)
downloader.download_laser3(lang=lang, spm=spm)
file_path = f"laser3-{lang}.v1"
elif lang in LASER2_LANGUAGE:
Expand Down
57 changes: 57 additions & 0 deletions laser_encoders/test_models_initialization.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
import os
import tempfile

import pytest

from laser_encoders.download_models import LaserModelDownloader
from laser_encoders.language_list import LASER2_LANGUAGE, LASER3_LANGUAGE
from laser_encoders.laser_tokenizer import initialize_tokenizer
from laser_encoders.models import initialize_encoder


def test_validate_achnese_models_and_tokenize_laser3(lang="acehnese"):
    """Smoke-test the LASER3 download, encoder and tokenizer for ``lang``.

    All files are written to a temporary directory that is removed when the
    ``with`` block exits. The test makes no assertions of its own; it passes
    as long as none of the steps raises.
    """
    with tempfile.TemporaryDirectory() as scratch_dir:
        print(f"Created temporary directory for {lang}", scratch_dir)

        # Download the LASER3 files for this language into the scratch directory.
        LaserModelDownloader(model_dir=scratch_dir).download_laser3(lang)

        encoder = initialize_encoder(lang, model_dir=scratch_dir)
        tokenizer = initialize_tokenizer(lang, model_dir=scratch_dir)

        # Run the tokenizer once on a fixed sentence to prove it is usable.
        sample = "This is a sample sentence."
        tokenized = tokenizer.tokenize(sample)

        print(f"{lang} model validated successfully")


def test_validate_english_models_and_tokenize_laser2(lang="english"):
    """Smoke-test the LASER2 download, encoder and tokenizer for ``lang``.

    All files are written to a temporary directory that is removed when the
    ``with`` block exits. The test makes no assertions of its own; it passes
    as long as none of the steps raises.
    """
    with tempfile.TemporaryDirectory() as scratch_dir:
        print(f"Created temporary directory for {lang}", scratch_dir)

        # LASER2 is fetched without a language argument.
        LaserModelDownloader(model_dir=scratch_dir).download_laser2()

        encoder = initialize_encoder(lang, model_dir=scratch_dir)
        tokenizer = initialize_tokenizer(lang, model_dir=scratch_dir)

        # Run the tokenizer once on a fixed sentence to prove it is usable.
        sample = "This is a sample sentence."
        tokenized = tokenizer.tokenize(sample)

        print(f"{lang} model validated successfully")


def test_validate_kashmiri_models_and_tokenize_laser3(lang="kas"):
    """Check that the downloader rejects the language code ``kas``.

    ``download_laser3`` is expected to raise ``ValueError`` for this code, so
    nothing is downloaded and no encoder or tokenizer can be built from the
    temporary directory. The encoder/tokenizer steps that used to follow the
    failing call have been removed: placed after a call that must raise, they
    either never ran or made the test itself fail.
    """
    with tempfile.TemporaryDirectory() as tmp_dir:
        print(f"Created temporary directory for {lang}", tmp_dir)

        downloader = LaserModelDownloader(model_dir=tmp_dir)
        # Keep only the raising call inside the context manager so the test
        # pins down exactly which statement is expected to fail.
        with pytest.raises(ValueError):
            downloader.download_laser3(lang)

        print(f"{lang} language model raised a ValueError as expected.")
108 changes: 108 additions & 0 deletions laser_encoders/validate_models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
import os
import tempfile

import pytest

from laser_encoders.download_models import LaserModelDownloader
from laser_encoders.language_list import LASER2_LANGUAGE, LASER3_LANGUAGE
from laser_encoders.laser_tokenizer import initialize_tokenizer
from laser_encoders.models import initialize_encoder


@pytest.mark.slow
@pytest.mark.parametrize("lang", LASER3_LANGUAGE)
def test_validate_language_models_and_tokenize_laser3(lang):
    """Validate the LASER3 model, encoder and tokenizer for one language.

    Parametrized over every entry of ``LASER3_LANGUAGE``. For the listed
    ambiguous names/codes the downloader is expected to raise ``ValueError``;
    for every other language the model is downloaded into a temporary
    directory and the encoder and tokenizer are initialized from it.
    """
    # Entries for which a ValueError is expected instead of a download.
    expected_to_raise = ("kashmiri", "kas", "central kanuri", "knc")

    with tempfile.TemporaryDirectory() as tmp_dir:
        print(f"Created temporary directory for {lang}", tmp_dir)

        downloader = LaserModelDownloader(model_dir=tmp_dir)
        if lang in expected_to_raise:
            # pytest.raises already verifies the exception type. The previous
            # extra check looked for the text "ValueError" inside the exception
            # *message*, which does not contain the class name.
            with pytest.raises(ValueError):
                downloader.download_laser3(lang)
            print(f"{lang} language model raised a ValueError as expected.")
        else:
            downloader.download_laser3(lang)
            encoder = initialize_encoder(lang, model_dir=tmp_dir)
            tokenizer = initialize_tokenizer(lang, model_dir=tmp_dir)

            # Test tokenization with a sample sentence
            tokenized = tokenizer.tokenize("This is a sample sentence.")

        print(f"{lang} model validated successfully")


@pytest.mark.slow
@pytest.mark.parametrize("lang", LASER2_LANGUAGE)
def test_validate_language_models_and_tokenize_laser2(lang):
    """Validate the LASER2 model, encoder and tokenizer for one language.

    Parametrized over every entry of ``LASER2_LANGUAGE``. The LASER2 files are
    downloaded into a temporary directory, then the encoder and tokenizer are
    initialized from that directory. The test passes if nothing raises.
    """
    with tempfile.TemporaryDirectory() as scratch_dir:
        print(f"Created temporary directory for {lang}", scratch_dir)

        # LASER2 is fetched without a language argument.
        LaserModelDownloader(model_dir=scratch_dir).download_laser2()

        encoder = initialize_encoder(lang, model_dir=scratch_dir)
        tokenizer = initialize_tokenizer(lang, model_dir=scratch_dir)

        # Run the tokenizer once on a fixed sentence to prove it is usable.
        sample = "This is a sample sentence."
        tokenized = tokenizer.tokenize(sample)

        print(f"{lang} model validated successfully")


class MockLaserModelDownloader(LaserModelDownloader):
def __init__(self, model_dir):
    # Only records the directory to inspect; the parent class initializer is
    # not called here.
    self.model_dir = model_dir

def download_laser3(self, lang):
Copy link
Contributor

@heffernankevin heffernankevin Nov 14, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IIUC @avidale's suggestion for the mock downloader was just to check if the language codes exist? (and then have a real downloader for a couple of languages like you have in test_models_initialization.py?). Maybe I misunderstood this comment: #257 (comment).

For example, you could parameterise it with the LASER3 langs, but the func download_laser3 inside the mock downloader just checks if the language code exists instead of actually downloading it.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(resolved in chat)

lang = self.get_language_code(LASER3_LANGUAGE, lang)
file_path = os.path.join(self.model_dir, f"laser3-{lang}.v1.pt")
if not os.path.exists(file_path):
raise FileNotFoundError(f"Could not find {file_path}.")

def download_laser2(self):
    """Verify that the LASER2 files already exist in ``self.model_dir``.

    Nothing is downloaded. The files are checked in a fixed order and the
    first one that is missing is reported.

    Raises:
        FileNotFoundError: if any of the three LASER2 files is absent.
    """
    required = ("laser2.pt", "laser2.spm", "laser2.cvocab")
    candidates = (os.path.join(self.model_dir, name) for name in required)
    # Lazily scan in order so the error names the first missing file.
    missing = next((path for path in candidates if not os.path.exists(path)), None)
    if missing is not None:
        raise FileNotFoundError(f"Could not find {missing}.")


# Directory holding already-downloaded LASER models for the mock-downloader
# tests. Set the LASER_MODEL_CACHE_DIR environment variable to use a different
# location; the previous hard-coded path is kept as the default.
CACHE_DIR = os.environ.get("LASER_MODEL_CACHE_DIR", "/home/user/.cache/models")

# This uses the mock downloader
@pytest.mark.slow
@pytest.mark.parametrize("lang", LASER3_LANGUAGE)
def test_validate_language_models_and_tokenize_mock_laser3(lang):
    """Validate a cached LASER3 model for one language without downloading.

    Parametrized over every entry of ``LASER3_LANGUAGE``. The mock downloader
    only checks that the model file is present under ``CACHE_DIR``; the
    encoder and tokenizer are then initialized from that directory.
    """
    downloader = MockLaserModelDownloader(model_dir=CACHE_DIR)

    # NOTE(review): the mock resolves the language via get_language_code,
    # which appears to raise ValueError for ambiguous entries such as "kas";
    # that error is not handled here - confirm whether it should be expected.
    try:
        downloader.download_laser3(lang)
    except FileNotFoundError as e:
        # pytest has no `error` attribute, so the old `raise pytest.error(...)`
        # died with AttributeError. pytest.fail raises the failure itself.
        pytest.fail(str(e))

    encoder = initialize_encoder(lang, model_dir=CACHE_DIR)
    tokenizer = initialize_tokenizer(lang, model_dir=CACHE_DIR)

    tokenized = tokenizer.tokenize("This is a sample sentence.")

    print(f"{lang} model validated successfully")


# This uses the mock downloader
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This doesn't use the mock downloader? (L112)

@pytest.mark.slow
@pytest.mark.parametrize("lang", LASER2_LANGUAGE)
def test_validate_language_models_and_tokenize_mock_laser2(lang):
    """Validate the cached LASER2 model for one language without downloading.

    Parametrized over every entry of ``LASER2_LANGUAGE``. The mock downloader
    only checks that the LASER2 files are present under ``CACHE_DIR``; the
    encoder and tokenizer are then initialized from that directory.
    """
    downloader = MockLaserModelDownloader(model_dir=CACHE_DIR)

    try:
        downloader.download_laser2()
    except FileNotFoundError as e:
        # pytest has no `error` attribute, so the old `raise pytest.error(...)`
        # died with AttributeError. pytest.fail raises the failure itself.
        pytest.fail(str(e))

    encoder = initialize_encoder(lang, model_dir=CACHE_DIR)
    tokenizer = initialize_tokenizer(lang, model_dir=CACHE_DIR)

    tokenized = tokenizer.tokenize("This is a sample sentence.")

    print(f"{lang} model validated successfully")
Loading