fix: better handling hyphen and apostrophe
alexgarel committed Oct 23, 2024
1 parent 792d848 commit 1270c3f
Showing 6 changed files with 244 additions and 9 deletions.
9 changes: 5 additions & 4 deletions app/indexing.py
@@ -19,7 +19,8 @@
from app.utils import load_class_object_from_string
from app.utils.analyzers import (
get_autocomplete_analyzer,
get_taxonomy_analyzer,
get_taxonomy_indexing_analyzer,
get_taxonomy_search_analyzer,
number_of_fields,
)

@@ -64,10 +65,10 @@ def generate_dsl_field(
raise ValueError("Taxonomy field must have a taxonomy_name set in config")
sub_fields = {
lang: dsl_field.Text(
# we must use keyword analyzer as we really map synonyms to a keyword
analyzer="keyword",
# we almost use a keyword analyzer, as we really map synonyms to a keyword
analyzer=get_taxonomy_indexing_analyzer(field.taxonomy_name, lang),
# but on query we need to fold and match with synonyms
search_analyzer=get_taxonomy_analyzer(
search_analyzer=get_taxonomy_search_analyzer(
field.taxonomy_name, lang, with_synonyms=True
),
)
101 changes: 98 additions & 3 deletions app/utils/analyzers.py
@@ -1,6 +1,8 @@
"""Defines some analyzers for the elesaticsearch fields."""

from elasticsearch_dsl import Mapping, analyzer, token_filter
from typing import Optional

from elasticsearch_dsl import Mapping, analyzer, char_filter, token_filter

from app._types import JSONType

@@ -21,6 +23,49 @@
}


# TODO: this could be provided by the taxonomy / per language
STOP_WORDS = {
"ar": "_arabic_",
"hy": "_armenian_",
"eu": "_basque_",
"bn": "_bengali_",
# "pt_BR": _brazilian_
"bg": "_bulgarian_",
"ca": "_catalan_",
"ja": "_cjk_",
"zh": "_cjk_",
"ko": "_cjk_",
"cs": "_czech_",
"da": "_danish_",
"nl": "_dutch_",
"en": "_english_",
"et": "_estonian_",
"fi": "_finnish_",
"fr": "_french_",
"gl": "_galician_",
"de": "_german_",
"el": "_greek_",
"hi": "_hindi_",
"hu": "_hungarian_",
"id": "_indonesian_",
"ga": "_irish_",
"it": "_italian_",
"lv": "_latvian_",
"lt": "_lithuanian_",
"no": "_norwegian_",
"fa": "_persian_",
"pt": "_portuguese_",
"ro": "_romanian_",
"ru": "_russian_",
"sr": "_serbian_",
# "": "_sorani_",
"es": "_spanish_",
"sv": "_swedish_",
"th": "_thai_",
"tr": "_turkish_ ",
}


def get_taxonomy_synonym_filter(taxonomy: str, lang: str) -> token_filter:
"""Return the synonym filter to use for the taxonomized field analyzer"""
return token_filter(
@@ -31,23 +76,73 @@ def get_taxonomy_synonym_filter(taxonomy: str, lang: str) -> token_filter:
)


def get_taxonomy_analyzer(taxonomy: str, lang: str, with_synonyms: bool) -> analyzer:
def get_taxonomy_stop_words_filter(taxonomy: str, lang: str) -> Optional[token_filter]:
"""Return the stop words filter to use for the taxonomized field analyzer
IMPORTANT: de-activated for now !
If we want to handle them, we have to remove them in synonyms, so we need the list.
"""
stop_words = STOP_WORDS.get(lang)
# deactivate for now
if False and stop_words:
return token_filter(
f"taxonomy_stop_words_{lang}",
type="stop",
stopwords=stop_words,
remove_trailing=True,
)
return None


TAXONOMIES_CHAR_FILTER = char_filter(
"taxonomies_char_filter",
type="mapping",
mappings=[
# hyphen to underscore
"- => _",
# and escape quotes, so that ES cuts words on them
r"' => \\'",
r"’ => \\'",
],
)


def get_taxonomy_indexing_analyzer(taxonomy: str, lang: str) -> analyzer:
"""We want to index taxonomies terms as keywords (as we only store the id),
but with a specific tweak: transform hyphens into underscores,
"""
# does not really depend on taxonomy and lang
return analyzer(
"taxonomy_indexing",
tokenizer="keyword",
char_filter=[TAXONOMIES_CHAR_FILTER],
)


def get_taxonomy_search_analyzer(
taxonomy: str, lang: str, with_synonyms: bool
) -> analyzer:
"""Return the search analyzer to use for the taxonomized field
:param taxonomy: the taxonomy name
:param lang: the language code
:param with_synonyms: whether to add the synonym filter
"""
# we replace hyphen with underscore
filters: list[str | token_filter] = [
"lowercase",
SPECIAL_NORMALIZERS.get(lang, "asciifolding"),
]
stop_words = get_taxonomy_stop_words_filter(taxonomy, lang)
if stop_words:
filters.append(stop_words)
filters.append(SPECIAL_NORMALIZERS.get(lang, "asciifolding"))
if with_synonyms:
filters.append(
get_taxonomy_synonym_filter(taxonomy, lang),
)
return analyzer(
f"search_{taxonomy}_{lang}",
char_filter=[TAXONOMIES_CHAR_FILTER],
tokenizer="standard",
filter=filters,
)
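As a quick illustration (not part of the commit), the new `TAXONOMIES_CHAR_FILTER` can be tried directly with Elasticsearch's ad-hoc `_analyze` API. This is a minimal sketch assuming a local Elasticsearch instance and the elasticsearch-py 8.x client; the mappings are copied from the code above, and the expected tokens mirror the integration tests added below.

```python
from elasticsearch import Elasticsearch

es = Elasticsearch("http://localhost:9200")  # assumed local instance

# Same mapping as TAXONOMIES_CHAR_FILTER: hyphen becomes underscore,
# apostrophes are escaped so that the tokenizer splits words on them.
taxonomies_char_filter = {
    "type": "mapping",
    "mappings": ["- => _", r"' => \\'", r"’ => \\'"],
}

# keyword tokenizer (indexing side): the id stays a single token, hyphen replaced
result = es.indices.analyze(
    tokenizer="keyword", char_filter=[taxonomies_char_filter], text="en:no-lactose"
)
print([t["token"] for t in result["tokens"]])  # expected: ["en:no_lactose"]

# standard tokenizer (search side): the escaped apostrophe now splits the words
result = es.indices.analyze(
    tokenizer="standard", char_filter=[taxonomies_char_filter], text="l'agriculture"
)
print([t["token"] for t in result["tokens"]])  # expected: ["l", "agriculture"]
```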
13 changes: 13 additions & 0 deletions docs/users/explain-taxonomies.md
@@ -50,3 +50,16 @@ You can also use the [autocompletion API](../ref-openapi/#operation/taxonomy_aut

If you defined taxonomies,
you must import them using the [import-taxonomies command](../devs/ref-python/cli.html#python3-m-app-import-taxonomies).


## Technical details on taxonomy fields

A taxonomy field is stored in Elasticsearch as an object.
It has a specific sub-field for each language, but in this field we only store the taxonomy entry id (e.g. for organic, we always store `en:organic`). The analyzer is close to `keyword`, which means the value is not tokenized (this is not completely true, as we also transform hyphens into underscores).

Note that the value of this field must be seen as a single token by the Elasticsearch standard tokenizer.
So you should only use letters, numbers, colons and the underscore.
As an exception, we allow the hyphen character, transforming it to "_" before tokenization.

Those fields also have a specific *search analyzer*, so that when you enter a search query,
the query text is tokenized using the standard analyzer, lowercased, and we then look for synonyms in the taxonomy.
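
As a hedged sketch (not part of the committed documentation), the two analyzers can be exercised through the `_analyze` API in the same way the new integration tests do. This assumes an index created by search-a-licious with these analyzers registered (here the test index `test_off`) and the `labels` taxonomy used in the test fixtures.

```python
from elasticsearch import Elasticsearch

from app.utils.analyzers import (
    get_taxonomy_indexing_analyzer,
    get_taxonomy_search_analyzer,
)

es = Elasticsearch("http://localhost:9200")  # assumed local instance

# indexing analyzer: close to keyword, but hyphens become underscores
indexing = get_taxonomy_indexing_analyzer("labels", "en").to_dict()
result = es.indices.analyze(
    index="test_off", analyzer=indexing, text="en:organic-farming_2"
)
print([t["token"] for t in result["tokens"]])  # expected: ["en:organic_farming_2"]

# search analyzer: standard tokenization, lowercase, folding, then taxonomy synonyms
search = get_taxonomy_search_analyzer("labels", "en", with_synonyms=True).to_dict()
result = es.indices.analyze(
    index="test_off", analyzer=search, text="organically grown plants"
)
print([t["token"] for t in result["tokens"]])  # "en:organic" should be among the tokens
```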
1 change: 1 addition & 0 deletions tests/int/conftest.py
@@ -27,6 +27,7 @@ def test_off_config():

@pytest.fixture(scope="session")
def index_config(test_off_config):
"""Fiytures that return the IndexConfig corresponding to test_off"""
return test_off_config.get_index_config("test_off")[1]


114 changes: 114 additions & 0 deletions tests/int/test_analyze.py
@@ -0,0 +1,114 @@
"""Some tests on analyzer
Those are placed as integration test because we want to test against Elasticsearch
from the analyzers built by search-a-licious
For explanations on what we our testing here,
see https://openfoodfacts.github.io/search-a-licious/users/explain-taxonomies
"""

import pytest

from app.utils.analyzers import (
get_taxonomy_indexing_analyzer,
get_taxonomy_search_analyzer,
)


def _tokens(result):
return [part["token"] for part in result["tokens"]]


def test_taxonomy_indexing_analyzer(es_connection, data_ingester):
# create the index, with synonyms
data_ingester([])
index_en = get_taxonomy_indexing_analyzer("labels", "en").to_dict()
index_fr = get_taxonomy_indexing_analyzer("labels", "fr").to_dict()
# no change for simple entries
result = es_connection.indices.analyze(
index="test_off",
analyzer=index_en,
text="en:organic",
)
assert _tokens(result) == ["en:organic"]

# the hyphen is replaced by underscore
result = es_connection.indices.analyze(
index="test_off",
analyzer=index_en,
text="en:organic-farming_2",
)
assert _tokens(result) == ["en:organic_farming_2"]
# whatever the language
result = es_connection.indices.analyze(
index="test_off",
analyzer=index_fr,
text="en:organic-farming_2",
)
assert _tokens(result) == ["en:organic_farming_2"]


def test_taxonomy_search_analyzer(es_connection, data_ingester):
# create the index, with synonyms
data_ingester([])
search_en = get_taxonomy_search_analyzer("labels", "en", True).to_dict()
search_fr = get_taxonomy_search_analyzer("labels", "fr", True).to_dict()
# bare term is not changed, but hyphen is replaced by underscore
for analyzer in [search_en, search_fr]:
result = es_connection.indices.analyze(
index="test_off",
analyzer=analyzer,
text="en:organic-farming_2",
)
assert _tokens(result) == ["en:organic_farming_2"]

# a synonym is replaced by the taxonomy entry id
result = es_connection.indices.analyze(
index="test_off",
analyzer=search_en,
text="organically grown plants",
)
assert "en:organic" in _tokens(result)
# with hyphen to underscore
result = es_connection.indices.analyze(
index="test_off",
analyzer=search_en,
text="european leaf",
)
assert _tokens(result) == ["en:eu_organic"]
# french synonyms
result = es_connection.indices.analyze(
index="test_off",
analyzer=search_fr,
text="feuille bio",
)
assert _tokens(result) == ["en:eu_organic"]
# quote handling
result = es_connection.indices.analyze(
index="test_off",
analyzer=search_fr,
text="l'agriculture",
)
assert _tokens(result) == ["l", "agriculture"]
result = es_connection.indices.analyze(
index="test_off",
analyzer=search_fr,
text="issue de l'agriculture biologique",
)
assert _tokens(result) == ["en:organic"]


@pytest.mark.xfail(reason="No stop words support yet")
def test_taxonomy_search_analyzer_stopwords(es_connection, data_ingester):
# create the index, with synonyms
data_ingester([])
search_fr = get_taxonomy_search_analyzer("labels", "fr", True).to_dict()

# simple stop words taken into account
result = es_connection.indices.analyze(
index="test_off",
analyzer=search_fr,
# en ignored as well as "de l'" in target synonym
text="issue en agriculture biologique",
)
assert _tokens(result) == ["en:eu_organic"]
15 changes: 13 additions & 2 deletions tests/int/test_search.py
@@ -157,6 +157,7 @@ def test_search_sort_by_created_t(req_type, sample_data, test_client):

ALL_CODES = [s["code"] for s in search_sample()]
ORGANIC_CODES = ["3012345670001", "3012345670002", "3012345670005"]
NO_LACTOSE_CODES = ["3012345670001", "3012345670003"]
BROWN_SUGAR_CODES = ["3012345670005", "3012345670006"]


@@ -186,11 +187,12 @@ def xfail_param(*args):
),
# as phrase
({"q": '"organically grown"'}, ORGANIC_CODES),
# Note: we need this double escape for simple quote, I'm not sure why…
(
{"q": '"issu de l\\\'agriculture biologique"', "langs": ["fr"]},
{"q": '"issu de l\'agriculture biologique"', "langs": ["fr"]},
ORGANIC_CODES,
),
# handling of '-'
({"q": 'labels:"en:no-lactose"', "langs": ["fr"]}, NO_LACTOSE_CODES),
# synonyms on label field
({"q": 'labels:"organically grown"'}, ORGANIC_CODES),
# search a field
@@ -200,6 +202,15 @@
({"q": 'product_name:"Sucre roux"', "langs": ["fr"]}, BROWN_SUGAR_CODES),
# search in multiple fields
({"q": '"brown sugar" organic'}, ["3012345670005"]),
# search can use main language as fallback
({"q": "Lactose", "langs": ["fr", "main"]}, ["3012345670003"]),
({"q": "product_name:Lactose", "langs": ["fr", "main"]}, ["3012345670003"]),
(
{"q": '"No Lactose Granulated Sugar"', "langs": ["fr", "main"]},
["3012345670003"],
),
# without main fallback, no result
({"q": "Lactose", "langs": ["fr"]}, []),
],
)
def test_search_full_text(req_type, req, codes, sample_data, test_client):
