NatLibFi · osma · Nov 12, 2024 · Nov 12, 2024 · Nov 12, 2024 · Nov 12, 2024
diff --git a/.github/workflows/cicd.yml b/.github/workflows/cicd.yml
@@ -85,7 +85,7 @@ jobs:
         fi
         # For Python 3.10:
         if [[ ${{ matrix.python-version }} == '3.10' ]]; then
-          poetry install -E "fasttext spacy";
+          poetry install -E "fasttext spacy estnltk";
           # download the small English pretrained spaCy model needed by spacy analyzer
           poetry run python -m spacy download en_core_web_sm --upgrade-strategy only-if-needed
         fi

diff --git a/annif/analyzer/__init__.py b/annif/analyzer/__init__.py
@@ -8,7 +8,7 @@
 import annif
 from annif.util import parse_args
 
-from . import simple, simplemma, snowball, spacy, voikko
+from . import estnltk, simple, simplemma, snowball, spacy, voikko
 
 if TYPE_CHECKING:
     from annif.analyzer.analyzer import Analyzer
@@ -42,3 +42,4 @@ def get_analyzer(analyzerspec: str) -> Analyzer:
 register_analyzer(simplemma.SimplemmaAnalyzer)
 register_analyzer(voikko.VoikkoAnalyzer)
 register_analyzer(spacy.SpacyAnalyzer)
+register_analyzer(estnltk.EstNLTKAnalyzer)
diff --git a/annif/analyzer/estnltk.py b/annif/analyzer/estnltk.py
@@ -0,0 +1,45 @@
+"""EstNLTK analyzer for Annif which uses EstNLTK for lemmatization"""
+
+from __future__ import annotations
+
+import importlib.util
+
+from . import analyzer
+
+
+class EstNLTKAnalyzer(analyzer.Analyzer):
+    """Analyzer that lemmatizes text with the optional EstNLTK library."""
+
+    name = "estnltk"
+
+    @staticmethod
+    def is_available() -> bool:
+        """Return True iff the EstNLTK package can be found for import."""
+        # importlib.util is a submodule: it must be imported explicitly, a bare
+        # "import importlib" does not guarantee that the attribute exists.
+        return importlib.util.find_spec("estnltk") is not None
+
+    def __init__(self, param: str, **kwargs) -> None:
+        """Store the analyzer parameter and initialize the base analyzer.
+
+        param is kept on the instance as-is; the remaining keyword arguments
+        are forwarded to the base class constructor.
+        """
+        self.param = param
+        super().__init__(**kwargs)
+
+    def tokenize_words(self, text: str, filter: bool = True) -> list[str]:
+        """Return the lemmas of the words found in text.
+
+        When filter is true, only lemmas accepted by is_valid_token are kept.
+        """
+        # imported lazily so this module can be loaded without EstNLTK installed
+        import estnltk
+
+        txt = estnltk.Text(text.strip())
+        txt.tag_layer()
+        # each entry of txt.lemma is indexable (presumably candidate lemmas);
+        # only the first element is used
+        lemmas = (candidates[0] for candidates in txt.lemma)
+        return [
+            lemma for lemma in lemmas if not filter or self.is_valid_token(lemma)
+        ]
diff --git a/pyproject.toml b/pyproject.toml
@@ -51,6 +51,7 @@ huggingface-hub = "~0.25.1"
 
 fasttext-wheel = { version = "0.9.2", optional = true }
 voikko = { version = "0.5.*", optional = true }
+estnltk = { version = "1.7.3", optional = true }
 tensorflow-cpu = { version = "~2.17.0", optional = true }
 lmdb = { version = "~1.5.1", optional = true }
 omikuji = { version = "0.5.*", optional = true }
@@ -73,6 +74,7 @@ schemathesis = "3.*.*"
 [tool.poetry.extras]
 fasttext = ["fasttext-wheel"]
 voikko = ["voikko"]
+estnltk = ["estnltk"]
 nn = ["tensorflow-cpu", "lmdb"]
 omikuji = ["omikuji"]
 yake = ["yake"]

diff --git a/tests/test_analyzer_estnltk.py b/tests/test_analyzer_estnltk.py
@@ -0,0 +1,51 @@
+"""Unit tests for EstNLTK analyzer in Annif"""
+
+import pytest
+
+import annif.analyzer
+import annif.analyzer.estnltk
+
+# every test in this module is skipped unless the analyzer reports EstNLTK available
+_ESTNLTK_MISSING = not annif.analyzer.estnltk.EstNLTKAnalyzer.is_available()
+pytestmark = pytest.mark.skipif(_ESTNLTK_MISSING, reason="EstNLTK is required")
+
+
+def test_estnltk_tokenize_words():
+    """Lemmatize an Estonian sample using the default filter setting."""
+    sample = """
+        Aga kõik juhtus iseenesest. Ka köögis oli kõik endine.
+        """
+    expected = [
+        "aga",
+        "kõik",
+        "juhtuma",
+        "iseenesest",
+        "köök",
+        "olema",
+        "kõik",
+        "endine",
+    ]
+    lemmatizer = annif.analyzer.get_analyzer("estnltk")
+    assert lemmatizer.tokenize_words(sample) == expected
+
+
+def test_estnltk_tokenize_words_no_filter():
+    """Lemmatize an Estonian sample with token filtering switched off."""
+    sample = """
+        Aga kõik juhtus iseenesest. Ka köögis oli kõik endine.
+        """
+    expected = [
+        "aga",
+        "kõik",
+        "juhtuma",
+        "iseenesest",
+        ".",
+        "ka",
+        "köök",
+        "olema",
+        "kõik",
+        "endine",
+        ".",
+    ]
+    lemmatizer = annif.analyzer.get_analyzer("estnltk")
+    assert lemmatizer.tokenize_words(sample, filter=False) == expected