fix: better handling hyphen and apostrophe

openfoodfacts · Oct 23, 2024 · 1270c3f · 1270c3f
1 parent 792d848
commit 1270c3f
Show file tree

Hide file tree

Showing 6 changed files with 244 additions and 9 deletions.
diff --git a/app/indexing.py b/app/indexing.py
@@ -19,7 +19,8 @@
 from app.utils import load_class_object_from_string
 from app.utils.analyzers import (
     get_autocomplete_analyzer,
-    get_taxonomy_analyzer,
+    get_taxonomy_indexing_analyzer,
+    get_taxonomy_search_analyzer,
     number_of_fields,
 )
 
@@ -64,10 +65,10 @@ def generate_dsl_field(
             raise ValueError("Taxonomy field must have a taxonomy_name set in config")
         sub_fields = {
             lang: dsl_field.Text(
-                # we must use keyword analyzer as we really map synonyms to a keyword
-                analyzer="keyword",
+                # we almost use keyword analyzer as we really map synonyms to a keyword
+                analyzer=get_taxonomy_indexing_analyzer(field.taxonomy_name, lang),
                 # but on query we need to fold and match with synonyms
-                search_analyzer=get_taxonomy_analyzer(
+                search_analyzer=get_taxonomy_search_analyzer(
                     field.taxonomy_name, lang, with_synonyms=True
                 ),
             )

diff --git a/app/utils/analyzers.py b/app/utils/analyzers.py
@@ -1,6 +1,8 @@
 """Defines some analyzers for the Elasticsearch fields."""
 
-from elasticsearch_dsl import Mapping, analyzer, token_filter
+from typing import Optional
+
+from elasticsearch_dsl import Mapping, analyzer, char_filter, token_filter
 
 from app._types import JSONType
 
@@ -21,6 +23,49 @@
 }
 
 
+# TODO: this could be provided by the taxonomy / per language
+STOP_WORDS = {
+    "ar": "_arabic_",
+    "hy": "_armenian_",
+    "eu": "_basque_",
+    "bn": "_bengali_",
+    # "pt_BR": _brazilian_
+    "bg": "_bulgarian_",
+    "ca": "_catalan_",
+    "ja": "_cjk_",
+    "zh": "_cjk_",
+    "ko": "_cjk_",
+    "cs": "_czech_",
+    "da": "_danish_",
+    "nl": "_dutch_",
+    "en": "_english_",
+    "et": "_estonian_",
+    "fi": "_finnish_",
+    "fr": "_french_",
+    "gl": "_galician_",
+    "de": "_german_",
+    "el": "_greek_",
+    "hi": "_hindi_",
+    "hu": "_hungarian_",
+    "id": "_indonesian_",
+    "ga": "_irish_",
+    "it": "_italian_",
+    "lv": "_latvian_",
+    "lt": "_lithuanian_",
+    "no": "_norwegian_",
+    "fa": "_persian_",
+    "pt": "_portuguese_",
+    "ro": "_romanian_",
+    "ru": "_russian_",
+    "sr": "_serbian_",
+    # "": "_sorani_",
+    "es": "_spanish_",
+    "sv": "_swedish_",
+    "th": "_thai_",
+    "tr": "_turkish_ ",
+}
+
+
 def get_taxonomy_synonym_filter(taxonomy: str, lang: str) -> token_filter:
     """Return the synonym filter to use for the taxonomized field analyzer"""
     return token_filter(
@@ -31,23 +76,73 @@ def get_taxonomy_synonym_filter(taxonomy: str, lang: str) -> token_filter:
     )
 
 
-def get_taxonomy_analyzer(taxonomy: str, lang: str, with_synonyms: bool) -> analyzer:
+def get_taxonomy_stop_words_filter(taxonomy: str, lang: str) -> Optional[token_filter]:
+    """Return the stop words filter to use for the taxonomized field analyzer
+
+    IMPORTANT: de-activated for now !
+    If we want to handle them, we have to remove them in synonyms, so we need the list.
+    """
+    stop_words = STOP_WORDS.get(lang)
+    # deactivate for now
+    if False and stop_words:
+        return token_filter(
+            f"taxonomy_stop_words_{lang}",
+            type="stop",
+            stopwords=stop_words,
+            remove_trailing=True,
+        )
+    return None
+
+
+# Mapping char filter shared by the taxonomy indexing analyzer
+# and the taxonomy search analyzer, so that both sides
+# rewrite hyphens and apostrophes the same way.
+TAXONOMIES_CHAR_FILTER = char_filter(
+    "taxonomies_char_filter",
+    type="mapping",
+    mappings=[
+        # hyphen to underscore
+        "- => _",
+        # escape quotes (straight and typographic apostrophes alike),
+        # so that ES cuts words on them
+        # NOTE(review): presumably the inserted backslash is what makes
+        # the tokenizer split the word - confirm against the tokenizer used
+        r"' => \\'",
+        r"’ => \\'",
+    ],
+)
+
+
+def get_taxonomy_indexing_analyzer(taxonomy: str, lang: str) -> analyzer:
+    """We want to index taxonomies terms as keywords (as we only store the id),
+    but with a specific tweak: transform hyphens into underscores,
+    """
+    # does not really depends on taxonomy and lang
+    return analyzer(
+        "taxonomy_indexing",
+        tokenizer="keyword",
+        char_filter=[TAXONOMIES_CHAR_FILTER],
+    )
+
+
+def get_taxonomy_search_analyzer(
+    taxonomy: str, lang: str, with_synonyms: bool
+) -> analyzer:
     """Return the search analyzer to use for the taxonomized field
 
     :param taxonomy: the taxonomy name
     :param lang: the language code
     :param with_synonyms: whether to add the synonym filter
     """
+    # token filters are applied in this order: lowercase, stop words (if any),
+    # language normalizer (asciifolding by default), then synonyms (if asked)
     filters: list[str | token_filter] = [
         "lowercase",
-        SPECIAL_NORMALIZERS.get(lang, "asciifolding"),
     ]
+    stop_words = get_taxonomy_stop_words_filter(taxonomy, lang)
+    if stop_words:
+        filters.append(stop_words)
+    filters.append(SPECIAL_NORMALIZERS.get(lang, "asciifolding"))
     if with_synonyms:
         filters.append(
             get_taxonomy_synonym_filter(taxonomy, lang),
         )
     return analyzer(
         f"search_{taxonomy}_{lang}",
+        # the char filter is where we replace hyphen with underscore
+        # (and escape quotes), as done by the indexing analyzer
+        char_filter=[TAXONOMIES_CHAR_FILTER],
         tokenizer="standard",
         filter=filters,
     )

diff --git a/docs/users/explain-taxonomies.md b/docs/users/explain-taxonomies.md
@@ -50,3 +50,16 @@ You can also use the [autocompletion API](../ref-openapi/#operation/taxonomy_aut
 
 If you defined taxonomies,
 you must import them using the [import-taxonomies command](../devs/ref-python/cli.html#python3-m-app-import-taxonomies).
+
+
+## Technical details on taxonomy fields
+
+A taxonomy field is stored in Elasticsearch as an object.
+For each language it has a specific field, but in this field we just store the taxonomy entry id (e.g. for organic, we always store `en:organic`). The analyzer is almost the `keyword` analyzer, which means the value won't be tokenized (it is not exactly `keyword`, as we also transform hyphens to underscores).
+
+Note that the value of this field must be considered a unique token by elasticsearch standard tokenizer.
+So you should only use letters, numbers, colons and the underscore.
+As an exception, we allow the hyphen character, transforming it to "_" before tokenization.
+
+But those fields have a specific *search analyzer*, so that when you enter a search query,
+the query text is tokenized using the standard tokenizer, then lower cased, and we then look for synonyms in the taxonomy.
diff --git a/tests/int/conftest.py b/tests/int/conftest.py
@@ -27,6 +27,7 @@ def test_off_config():
 
 @pytest.fixture(scope="session")
 def index_config(test_off_config):
+    """Fixture that returns the IndexConfig corresponding to test_off"""
     return test_off_config.get_index_config("test_off")[1]
 
 

diff --git a/tests/int/test_analyze.py b/tests/int/test_analyze.py
@@ -0,0 +1,114 @@
+"""Some tests on analyzer
+
+Those are placed as integration test because we want to test against Elasticsearch
+from the analyzers built by search-a-licious
+
+For explanations on what we are testing here,
+see https://openfoodfacts.github.io/search-a-licious/users/explain-taxonomies
+"""
+
+import pytest
+
+from app.utils.analyzers import (
+    get_taxonomy_indexing_analyzer,
+    get_taxonomy_search_analyzer,
+)
+
+
+def _tokens(result):
+    return [part["token"] for part in result["tokens"]]
+
+
+def test_taxonomy_indexing_analyzer(es_connection, data_ingester):
+    # create the index, with synonyms
+    data_ingester([])
+    index_en = get_taxonomy_indexing_analyzer("labels", "en").to_dict()
+    index_fr = get_taxonomy_indexing_analyzer("labels", "fr").to_dict()
+    # no change for simple entries
+    result = es_connection.indices.analyze(
+        index="test_off",
+        analyzer=index_en,
+        text="en:organic",
+    )
+    assert _tokens(result) == ["en:organic"]
+
+    # the hyphen is replaced by underscore
+    result = es_connection.indices.analyze(
+        index="test_off",
+        analyzer=index_en,
+        text="en:organic-farming_2",
+    )
+    assert _tokens(result) == ["en:organic_farming_2"]
+    # whatever the language
+    result = es_connection.indices.analyze(
+        index="test_off",
+        analyzer=index_fr,
+        text="en:organic-farming_2",
+    )
+    assert _tokens(result) == ["en:organic_farming_2"]
+
+
+def test_taxonomy_search_analyzer(es_connection, data_ingester):
+    # create the index, with synonyms
+    data_ingester([])
+    search_en = get_taxonomy_search_analyzer("labels", "en", True).to_dict()
+    search_fr = get_taxonomy_search_analyzer("labels", "fr", True).to_dict()
+    # bare term is not changed, but hyphen is replaced by underscore
+    for analyzer in [search_en, search_fr]:
+        result = es_connection.indices.analyze(
+            index="test_off",
+            analyzer=analyzer,
+            text="en:organic-farming_2",
+        )
+        assert _tokens(result) == ["en:organic_farming_2"]
+
+    # synonym is replaced by the synonym
+    result = es_connection.indices.analyze(
+        index="test_off",
+        analyzer=search_en,
+        text="organically grown plants",
+    )
+    assert "en:organic" in _tokens(result)
+    # with hyphen to underscore
+    result = es_connection.indices.analyze(
+        index="test_off",
+        analyzer=search_en,
+        text="european leaf",
+    )
+    assert _tokens(result) == ["en:eu_organic"]
+    # french synonyms
+    result = es_connection.indices.analyze(
+        index="test_off",
+        analyzer=search_fr,
+        text="feuille bio",
+    )
+    assert _tokens(result) == ["en:eu_organic"]
+    # quote handling
+    result = es_connection.indices.analyze(
+        index="test_off",
+        analyzer=search_fr,
+        text="l'agriculture",
+    )
+    assert _tokens(result) == ["l", "agriculture"]
+    result = es_connection.indices.analyze(
+        index="test_off",
+        analyzer=search_fr,
+        text="issue de l'agriculture biologique",
+    )
+    assert _tokens(result) == ["en:organic"]
+
+
+@pytest.mark.xfail(reason="No stop words support yet")
+def test_taxonomy_search_analyzer_stopwords(es_connection, data_ingester):
+    # create the index, with synonyms
+    data_ingester([])
+    search_fr = get_taxonomy_search_analyzer("labels", "fr", True).to_dict()
+
+    # simple stop words taken into account
+    result = es_connection.indices.analyze(
+        index="test_off",
+        analyzer=search_fr,
+        # en ignored as well as "de l'" in target synonym
+        text="issue en agriculture biologique",
+    )
+    assert _tokens(result) == ["en:eu_organic"]
diff --git a/tests/int/test_search.py b/tests/int/test_search.py
@@ -157,6 +157,7 @@ def test_search_sort_by_created_t(req_type, sample_data, test_client):
 
 ALL_CODES = [s["code"] for s in search_sample()]
 ORGANIC_CODES = ["3012345670001", "3012345670002", "3012345670005"]
+NO_LACTOSE_CODES = ["3012345670001", "3012345670003"]
 BROWN_SUGAR_CODES = ["3012345670005", "3012345670006"]
 
 
@@ -186,11 +187,12 @@ def xfail_param(*args):
         ),
         # as phrase
         ({"q": '"organically grown"'}, ORGANIC_CODES),
-        # Note: we need this double escape for simple quote, I'm not sure why…
         (
-            {"q": '"issu de l\\\'agriculture biologique"', "langs": ["fr"]},
+            {"q": '"issu de l\'agriculture biologique"', "langs": ["fr"]},
             ORGANIC_CODES,
         ),
+        # handling of '-'
+        ({"q": 'labels:"en:no-lactose"', "langs": ["fr"]}, NO_LACTOSE_CODES),
         # synonyms on label field
         ({"q": 'labels:"organically grown"'}, ORGANIC_CODES),
         # search a field
@@ -200,6 +202,15 @@ def xfail_param(*args):
         ({"q": 'product_name:"Sucre roux"', "langs": ["fr"]}, BROWN_SUGAR_CODES),
         # search in multiple fields
         ({"q": '"brown sugar" organic'}, ["3012345670005"]),
+        # search can use main language as fallback
+        ({"q": "Lactose", "langs": ["fr", "main"]}, ["3012345670003"]),
+        ({"q": "product_name:Lactose", "langs": ["fr", "main"]}, ["3012345670003"]),
+        (
+            {"q": '"No Lactose Granulated Sugar"', "langs": ["fr", "main"]},
+            ["3012345670003"],
+        ),
+        # without main fallback, no result
+        ({"q": "Lactose", "langs": ["fr"]}, []),
     ],
 )
 def test_search_full_text(req_type, req, codes, sample_data, test_client):