diff --git a/app/indexing.py b/app/indexing.py
index 2f8d9aa2..d2da59cd 100644
--- a/app/indexing.py
+++ b/app/indexing.py
@@ -19,7 +19,8 @@
 from app.utils import load_class_object_from_string
 from app.utils.analyzers import (
     get_autocomplete_analyzer,
-    get_taxonomy_analyzer,
+    get_taxonomy_indexing_analyzer,
+    get_taxonomy_search_analyzer,
     number_of_fields,
 )
@@ -64,10 +65,10 @@ def generate_dsl_field(
         raise ValueError("Taxonomy field must have a taxonomy_name set in config")
     sub_fields = {
         lang: dsl_field.Text(
-            # we must use keyword analyzer as we really map synonyms to a keyword
-            analyzer="keyword",
+            # we almost use the keyword analyzer, as we really map synonyms to a keyword
+            analyzer=get_taxonomy_indexing_analyzer(field.taxonomy_name, lang),
             # but on query we need to fold and match with synonyms
-            search_analyzer=get_taxonomy_analyzer(
+            search_analyzer=get_taxonomy_search_analyzer(
                 field.taxonomy_name, lang, with_synonyms=True
             ),
         )
diff --git a/app/utils/analyzers.py b/app/utils/analyzers.py
index 7037fda4..b25e2423 100644
--- a/app/utils/analyzers.py
+++ b/app/utils/analyzers.py
@@ -1,6 +1,8 @@
 """Defines some analyzers for the elesaticsearch fields."""
 
-from elasticsearch_dsl import Mapping, analyzer, token_filter
+from typing import Optional
+
+from elasticsearch_dsl import Mapping, analyzer, char_filter, token_filter
 
 from app._types import JSONType
@@ -21,6 +23,49 @@
 }
 
 
+# TODO: this could be provided by the taxonomy / per language
+STOP_WORDS = {
+    "ar": "_arabic_",
+    "hy": "_armenian_",
+    "eu": "_basque_",
+    "bn": "_bengali_",
+    # "pt_BR": _brazilian_
+    "bg": "_bulgarian_",
+    "ca": "_catalan_",
+    "ja": "_cjk_",
+    "zh": "_cjk_",
+    "ko": "_cjk_",
+    "cs": "_czech_",
+    "da": "_danish_",
+    "nl": "_dutch_",
+    "en": "_english_",
+    "et": "_estonian_",
+    "fi": "_finnish_",
+    "fr": "_french_",
+    "gl": "_galician_",
+    "de": "_german_",
+    "el": "_greek_",
+    "hi": "_hindi_",
+    "hu": "_hungarian_",
+    "id": "_indonesian_",
+    "ga": "_irish_",
+    "it": "_italian_",
+    "lv": "_latvian_",
+    "lt": "_lithuanian_",
+    "no": "_norwegian_",
+    "fa": "_persian_",
+    "pt": "_portuguese_",
+    "ro": "_romanian_",
+    "ru": "_russian_",
+    "sr": "_serbian_",
+    # "": "_sorani_",
+    "es": "_spanish_",
+    "sv": "_swedish_",
+    "th": "_thai_",
+    "tr": "_turkish_",
+}
+
+
 def get_taxonomy_synonym_filter(taxonomy: str, lang: str) -> token_filter:
     """Return the synonym filter to use for the taxonomized field analyzer"""
     return token_filter(
@@ -31,23 +76,73 @@ def get_taxonomy_synonym_filter(taxonomy: str, lang: str) -> token_filter:
     )
 
 
-def get_taxonomy_analyzer(taxonomy: str, lang: str, with_synonyms: bool) -> analyzer:
+def get_taxonomy_stop_words_filter(taxonomy: str, lang: str) -> Optional[token_filter]:
+    """Return the stop words filter to use for the taxonomized field analyzer
+
+    IMPORTANT: deactivated for now!
+    If we want to handle stop words, we also have to remove them from the synonyms, so we need this list.
+ """ + stop_words = STOP_WORDS.get(lang) + # deactivate for now + if False and stop_words: + return token_filter( + f"taxonomy_stop_words_{lang}", + type="stop", + stopwords=stop_words, + remove_trailing=True, + ) + return None + + +TAXONOMIES_CHAR_FILTER = char_filter( + "taxonomies_char_filter", + type="mapping", + mappings=[ + # hyphen to underscore + "- => _", + # and escape quotes, so that ES cut words on them + r"' => \\'", + r"’ => \\'", + ], +) + + +def get_taxonomy_indexing_analyzer(taxonomy: str, lang: str) -> analyzer: + """We want to index taxonomies terms as keywords (as we only store the id), + but with a specific tweak: transform hyphens into underscores, + """ + # does not really depends on taxonomy and lang + return analyzer( + "taxonomy_indexing", + tokenizer="keyword", + char_filter=[TAXONOMIES_CHAR_FILTER], + ) + + +def get_taxonomy_search_analyzer( + taxonomy: str, lang: str, with_synonyms: bool +) -> analyzer: """Return the search analyzer to use for the taxonomized field :param taxonomy: the taxonomy name :param lang: the language code :param with_synonyms: whether to add the synonym filter """ + # we replace hyphen with underscore filters: list[str | token_filter] = [ "lowercase", - SPECIAL_NORMALIZERS.get(lang, "asciifolding"), ] + stop_words = get_taxonomy_stop_words_filter(taxonomy, lang) + if stop_words: + filters.append(stop_words) + filters.append(SPECIAL_NORMALIZERS.get(lang, "asciifolding")) if with_synonyms: filters.append( get_taxonomy_synonym_filter(taxonomy, lang), ) return analyzer( f"search_{taxonomy}_{lang}", + char_filter=[TAXONOMIES_CHAR_FILTER], tokenizer="standard", filter=filters, ) diff --git a/docs/users/explain-taxonomies.md b/docs/users/explain-taxonomies.md index 168118ef..9c90b80f 100644 --- a/docs/users/explain-taxonomies.md +++ b/docs/users/explain-taxonomies.md @@ -50,3 +50,16 @@ You can also use the [autocompletion API](../ref-openapi/#operation/taxonomy_aut If you defined taxonomies, you must import them using the [import-taxonomies command](../devs/ref-python/cli.html#python3-m-app-import-taxonomies). + + +## Technical details on taxonomy fields + +A taxonomy field is stored in Elasticsearch as an object. +For each language it has a specific field, but in this field we just store the taxonomy entry id (eg. for organic, we always store `en:organic`). The analyzer is almost set to `keyword` which means it won't be tokenized (but it is not completely true, as we also transform hyphen to underscore). + +Note that the value of this field must be considered a unique token by elasticsearch standard tokenizer. +So you should only use letters, numbers, columns and the underscore. +As an exception, we allow the hyphen character, transforming it to "_" before tokenization. + +But those field have a specific *search analyzer*, so that when you enter a search query, +The query text is tokenized using standard analyzer, then lower cased, and we then look for synonyms in the taxonomy. 
\ No newline at end of file
diff --git a/tests/int/conftest.py b/tests/int/conftest.py
index bbef9693..3b83c69e 100644
--- a/tests/int/conftest.py
+++ b/tests/int/conftest.py
@@ -27,6 +27,7 @@ def test_off_config():
 
 @pytest.fixture(scope="session")
 def index_config(test_off_config):
+    """Fixture that returns the IndexConfig corresponding to test_off"""
     return test_off_config.get_index_config("test_off")[1]
 
 
diff --git a/tests/int/test_analyze.py b/tests/int/test_analyze.py
new file mode 100644
index 00000000..4de81d45
--- /dev/null
+++ b/tests/int/test_analyze.py
@@ -0,0 +1,114 @@
+"""Some tests on analyzers
+
+These are placed as integration tests because we want to test the analyzers
+built by search-a-licious against Elasticsearch
+
+For explanations of what we are testing here,
+see https://openfoodfacts.github.io/search-a-licious/users/explain-taxonomies
+"""
+
+import pytest
+
+from app.utils.analyzers import (
+    get_taxonomy_indexing_analyzer,
+    get_taxonomy_search_analyzer,
+)
+
+
+def _tokens(result):
+    return [part["token"] for part in result["tokens"]]
+
+
+def test_taxonomy_indexing_analyzer(es_connection, data_ingester):
+    # create the index, with synonyms
+    data_ingester([])
+    index_en = get_taxonomy_indexing_analyzer("labels", "en").to_dict()
+    index_fr = get_taxonomy_indexing_analyzer("labels", "fr").to_dict()
+    # no change for simple entries
+    result = es_connection.indices.analyze(
+        index="test_off",
+        analyzer=index_en,
+        text="en:organic",
+    )
+    assert _tokens(result) == ["en:organic"]
+
+    # the hyphen is replaced by an underscore
+    result = es_connection.indices.analyze(
+        index="test_off",
+        analyzer=index_en,
+        text="en:organic-farming_2",
+    )
+    assert _tokens(result) == ["en:organic_farming_2"]
+    # whatever the language
+    result = es_connection.indices.analyze(
+        index="test_off",
+        analyzer=index_fr,
+        text="en:organic-farming_2",
+    )
+    assert _tokens(result) == ["en:organic_farming_2"]
+
+
+def test_taxonomy_search_analyzer(es_connection, data_ingester):
+    # create the index, with synonyms
+    data_ingester([])
+    search_en = get_taxonomy_search_analyzer("labels", "en", True).to_dict()
+    search_fr = get_taxonomy_search_analyzer("labels", "fr", True).to_dict()
+    # a bare term is not changed, but the hyphen is replaced by an underscore
+    for analyzer in [search_en, search_fr]:
+        result = es_connection.indices.analyze(
+            index="test_off",
+            analyzer=analyzer,
+            text="en:organic-farming_2",
+        )
+        assert _tokens(result) == ["en:organic_farming_2"]
+
+    # a synonym is replaced by the taxonomy entry id
+    result = es_connection.indices.analyze(
+        index="test_off",
+        analyzer=search_en,
+        text="organically grown plants",
+    )
+    assert "en:organic" in _tokens(result)
+    # with the hyphen replaced by an underscore
+    result = es_connection.indices.analyze(
+        index="test_off",
+        analyzer=search_en,
+        text="european leaf",
+    )
+    assert _tokens(result) == ["en:eu_organic"]
+    # French synonyms
+    result = es_connection.indices.analyze(
+        index="test_off",
+        analyzer=search_fr,
+        text="feuille bio",
+    )
+    assert _tokens(result) == ["en:eu_organic"]
+    # quote handling
+    result = es_connection.indices.analyze(
+        index="test_off",
+        analyzer=search_fr,
+        text="l'agriculture",
+    )
+    assert _tokens(result) == ["l", "agriculture"]
+    result = es_connection.indices.analyze(
+        index="test_off",
+        analyzer=search_fr,
+        text="issue de l'agriculture biologique",
+    )
+    assert _tokens(result) == ["en:organic"]
+
+
+@pytest.mark.xfail(reason="No stop words support yet")
+def test_taxonomy_search_analyzer_stopwords(es_connection, data_ingester):
+    # create the index, with synonyms
+    data_ingester([])
+    search_fr = get_taxonomy_search_analyzer("labels", "fr", True).to_dict()
+
+    # simple stop words are taken into account
+    result = es_connection.indices.analyze(
+        index="test_off",
+        analyzer=search_fr,
+        # "en" is ignored, as well as "de l'" in the target synonym
+        text="issue en agriculture biologique",
+    )
+    assert _tokens(result) == ["en:organic"]
diff --git a/tests/int/test_search.py b/tests/int/test_search.py
index 9fb60036..92ee7460 100644
--- a/tests/int/test_search.py
+++ b/tests/int/test_search.py
@@ -157,6 +157,7 @@ def test_search_sort_by_created_t(req_type, sample_data, test_client):
 
 ALL_CODES = [s["code"] for s in search_sample()]
 ORGANIC_CODES = ["3012345670001", "3012345670002", "3012345670005"]
+NO_LACTOSE_CODES = ["3012345670001", "3012345670003"]
 BROWN_SUGAR_CODES = ["3012345670005", "3012345670006"]
 
@@ -186,11 +187,12 @@ def xfail_param(*args):
         ),
         # as phrase
         ({"q": '"organically grown"'}, ORGANIC_CODES),
-        # Note: we need this double escape for simple quote, I'm not sure why…
         (
-            {"q": '"issu de l\\\'agriculture biologique"', "langs": ["fr"]},
+            {"q": '"issu de l\'agriculture biologique"', "langs": ["fr"]},
            ORGANIC_CODES,
        ),
+        # handling of '-'
+        ({"q": 'labels:"en:no-lactose"', "langs": ["fr"]}, NO_LACTOSE_CODES),
         # synonyms on label field
         ({"q": 'labels:"organically grown"'}, ORGANIC_CODES),
         # search a field
@@ -200,6 +202,15 @@ def xfail_param(*args):
         ({"q": 'product_name:"Sucre roux"', "langs": ["fr"]}, BROWN_SUGAR_CODES),
         # search in multiple fields
         ({"q": '"brown sugar" organic'}, ["3012345670005"]),
+        # search can use the main language as a fallback
+        ({"q": "Lactose", "langs": ["fr", "main"]}, ["3012345670003"]),
+        ({"q": "product_name:Lactose", "langs": ["fr", "main"]}, ["3012345670003"]),
+        (
+            {"q": '"No Lactose Granulated Sugar"', "langs": ["fr", "main"]},
+            ["3012345670003"],
+        ),
+        # without the main fallback, no result
+        ({"q": "Lactose", "langs": ["fr"]}, []),
     ],
 )
 def test_search_full_text(req_type, req, codes, sample_data, test_client):