diff --git a/.github/workflows/cicd.yml b/.github/workflows/cicd.yml
index 8172bcc2..e044ec95 100644
--- a/.github/workflows/cicd.yml
+++ b/.github/workflows/cicd.yml
@@ -85,7 +85,7 @@ jobs:
           fi
           # For Python 3.10:
           if [[ ${{ matrix.python-version }} == '3.10' ]]; then
-            poetry install -E "fasttext spacy";
+            poetry install -E "fasttext spacy estnltk";
             # download the small English pretrained spaCy model needed by spacy analyzer
             poetry run python -m spacy download en_core_web_sm --upgrade-strategy only-if-needed
           fi
diff --git a/annif/analyzer/__init__.py b/annif/analyzer/__init__.py
index 81f52511..fcd57baf 100644
--- a/annif/analyzer/__init__.py
+++ b/annif/analyzer/__init__.py
@@ -8,7 +8,7 @@
 import annif
 from annif.util import parse_args
 
-from . import simple, simplemma, snowball, spacy, voikko
+from . import estnltk, simple, simplemma, snowball, spacy, voikko
 
 if TYPE_CHECKING:
     from annif.analyzer.analyzer import Analyzer
@@ -42,3 +42,4 @@ def get_analyzer(analyzerspec: str) -> Analyzer:
 register_analyzer(simplemma.SimplemmaAnalyzer)
 register_analyzer(voikko.VoikkoAnalyzer)
 register_analyzer(spacy.SpacyAnalyzer)
+register_analyzer(estnltk.EstNLTKAnalyzer)
diff --git a/annif/analyzer/estnltk.py b/annif/analyzer/estnltk.py
new file mode 100644
index 00000000..9c2f38be
--- /dev/null
+++ b/annif/analyzer/estnltk.py
@@ -0,0 +1,31 @@
+"""EstNLTK analyzer for Annif which uses EstNLTK for lemmatization"""
+
+from __future__ import annotations
+
+import importlib
+
+from . import analyzer
+
+
+class EstNLTKAnalyzer(analyzer.Analyzer):
+    name = "estnltk"
+
+    @staticmethod
+    def is_available() -> bool:
+        # return True iff EstNLTK is installed
+        return importlib.util.find_spec("estnltk") is not None
+
+    def __init__(self, param: str, **kwargs) -> None:
+        self.param = param
+        super().__init__(**kwargs)
+
+    def tokenize_words(self, text: str, filter: bool = True) -> list[str]:
+        import estnltk
+
+        txt = estnltk.Text(text.strip())
+        txt.tag_layer()
+        return [
+            lemma
+            for lemma in [lemmas[0] for lemmas in txt.lemma]
+            if (not filter or self.is_valid_token(lemma))
+        ]
diff --git a/pyproject.toml b/pyproject.toml
index de8410e6..924ae9ba 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -51,6 +51,7 @@
 huggingface-hub = "~0.25.1"
 fasttext-wheel = { version = "0.9.2", optional = true }
 voikko = { version = "0.5.*", optional = true }
+estnltk = { version = "1.7.3", optional = true }
 tensorflow-cpu = { version = "~2.17.0", optional = true }
 lmdb = { version = "~1.5.1", optional = true }
 omikuji = { version = "0.5.*", optional = true }
@@ -73,6 +74,7 @@ schemathesis = "3.*.*"
 [tool.poetry.extras]
 fasttext = ["fasttext-wheel"]
 voikko = ["voikko"]
+estnltk = ["estnltk"]
 nn = ["tensorflow-cpu", "lmdb"]
 omikuji = ["omikuji"]
 yake = ["yake"]
diff --git a/tests/test_analyzer_estnltk.py b/tests/test_analyzer_estnltk.py
new file mode 100644
index 00000000..3892b422
--- /dev/null
+++ b/tests/test_analyzer_estnltk.py
@@ -0,0 +1,53 @@
+"""Unit tests for EstNLTK analyzer in Annif"""
+
+import pytest
+
+import annif.analyzer
+import annif.analyzer.estnltk
+
+pytestmark = pytest.mark.skipif(
+    not annif.analyzer.estnltk.EstNLTKAnalyzer.is_available(),
+    reason="EstNLTK is required",
+)
+
+
+def test_estnltk_tokenize_words():
+    analyzer = annif.analyzer.get_analyzer("estnltk")
+    words = analyzer.tokenize_words(
+        """
+        Aga kõik juhtus iseenesest. Ka köögis oli kõik endine.
+        """
+    )
+    assert words == [
+        "aga",
+        "kõik",
+        "juhtuma",
+        "iseenesest",
+        "köök",
+        "olema",
+        "kõik",
+        "endine",
+    ]
+
+
+def test_estnltk_tokenize_words_no_filter():
+    analyzer = annif.analyzer.get_analyzer("estnltk")
+    words = analyzer.tokenize_words(
+        """
+        Aga kõik juhtus iseenesest. Ka köögis oli kõik endine.
+        """,
+        filter=False,
+    )
+    assert words == [
+        "aga",
+        "kõik",
+        "juhtuma",
+        "iseenesest",
+        ".",
+        "ka",
+        "köök",
+        "olema",
+        "kõik",
+        "endine",
+        ".",
+    ]
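A minimal usage sketch, not part of the patch above: it assumes Annif is installed with the estnltk extra (e.g. poetry install -E estnltk) and only uses the registry API exercised by the unit tests; the expected output is taken from the first test case.

# Illustrative check of the new analyzer; assumes EstNLTK 1.7.3 is available.
import annif.analyzer

# importing annif.analyzer registers EstNLTKAnalyzer, so it can be looked up by name
analyzer = annif.analyzer.get_analyzer("estnltk")

# tokenize_words lemmatizes with EstNLTK; by default it drops tokens rejected by is_valid_token
print(analyzer.tokenize_words("Aga kõik juhtus iseenesest."))
# expected, per the unit test above: ['aga', 'kõik', 'juhtuma', 'iseenesest']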