From 7eb77f2372c5f245400222d026a54406a3828325 Mon Sep 17 00:00:00 2001 From: James Addison <55152140+jayaddison@users.noreply.github.com> Date: Mon, 8 Jul 2024 12:34:11 +0100 Subject: [PATCH] HTML Search: omit anchor reference from document titles in the search index. (#12047) --- CHANGES.rst | 3 ++ sphinx/environment/__init__.py | 4 +-- sphinx/search/__init__.py | 20 +++++++---- tests/js/fixtures/multiterm/searchindex.js | 2 +- tests/js/fixtures/partial/searchindex.js | 2 +- tests/js/searchtools.js | 9 ----- tests/test_search.py | 40 ++++++++++++++++++---- 7 files changed, 54 insertions(+), 26 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index c36233573a3..cf70dee2f38 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -89,6 +89,9 @@ Bugs fixed * #12494: Fix invalid genindex.html file produced with translated docs (regression in 7.1.0). Patch by Nicolas Peugnet. +* #11961: Omit anchor references from document title entries in the search index, + removing duplication of search results. + Patch by James Addison. Testing ------- diff --git a/sphinx/environment/__init__.py b/sphinx/environment/__init__.py index bd652fe51d2..0aecc62855c 100644 --- a/sphinx/environment/__init__.py +++ b/sphinx/environment/__init__.py @@ -253,7 +253,7 @@ def __init__(self, app: Sphinx) -> None: # search index data # docname -> title - self._search_index_titles: dict[str, str] = {} + self._search_index_titles: dict[str, str | None] = {} # docname -> filename self._search_index_filenames: dict[str, str] = {} # stemmed words -> set(docname) @@ -261,7 +261,7 @@ def __init__(self, app: Sphinx) -> None: # stemmed words in titles -> set(docname) self._search_index_title_mapping: dict[str, set[str]] = {} # docname -> all titles in document - self._search_index_all_titles: dict[str, list[tuple[str, str]]] = {} + self._search_index_all_titles: dict[str, list[tuple[str, str | None]]] = {} # docname -> list(index entry) self._search_index_index_entries: dict[str, list[tuple[str, str, str]]] = {} # objtype -> index diff --git a/sphinx/search/__init__.py b/sphinx/search/__init__.py index 2638f92ffb4..ec194ef6e96 100644 --- a/sphinx/search/__init__.py +++ b/sphinx/search/__init__.py @@ -198,7 +198,7 @@ def _is_meta_keywords( @dataclasses.dataclass class WordStore: words: list[str] = dataclasses.field(default_factory=list) - titles: list[tuple[str, str]] = dataclasses.field(default_factory=list) + titles: list[tuple[str, str | None]] = dataclasses.field(default_factory=list) title_words: list[str] = dataclasses.field(default_factory=list) @@ -253,7 +253,7 @@ class IndexBuilder: def __init__(self, env: BuildEnvironment, lang: str, options: dict[str, str], scoring: str) -> None: self.env = env # docname -> title - self._titles: dict[str, str] = env._search_index_titles + self._titles: dict[str, str | None] = env._search_index_titles # docname -> filename self._filenames: dict[str, str] = env._search_index_filenames # stemmed words -> set(docname) @@ -261,7 +261,7 @@ def __init__(self, env: BuildEnvironment, lang: str, options: dict[str, str], sc # stemmed words in titles -> set(docname) self._title_mapping: dict[str, set[str]] = env._search_index_title_mapping # docname -> all titles in document - self._all_titles: dict[str, list[tuple[str, str]]] = env._search_index_all_titles + self._all_titles: dict[str, list[tuple[str, str | None]]] = env._search_index_all_titles # docname -> list(index entry) self._index_entries: dict[str, list[tuple[str, str, str]]] = env._search_index_index_entries # objtype -> index @@ -369,6 +369,13 @@ def get_objects(self, fn2index: dict[str, int] return rv def get_terms(self, fn2index: dict[str, int]) -> tuple[dict[str, list[int] | int], dict[str, list[int] | int]]: + """ + Return a mapping of document and title terms to their corresponding sorted document IDs. + + When a term is only found within a single document, then the value for that term will be + an integer value. When a term is found within multiple documents, the value will be a list + of integers. + """ rvs: tuple[dict[str, list[int] | int], dict[str, list[int] | int]] = ({}, {}) for rv, mapping in zip(rvs, (self._mapping, self._title_mapping)): for k, v in mapping.items(): @@ -391,7 +398,7 @@ def freeze(self) -> dict[str, Any]: objtypes = {v: k[0] + ':' + k[1] for (k, v) in self._objtypes.items()} objnames = self._objnames - alltitles: dict[str, list[tuple[int, str]]] = {} + alltitles: dict[str, list[tuple[int, str | None]]] = {} for docname, titlelist in sorted(self._all_titles.items()): for title, titleid in titlelist: alltitles.setdefault(title, []).append((fn2index[docname], titleid)) @@ -502,9 +509,10 @@ def _visit_nodes(node): elif isinstance(node, nodes.Text): word_store.words.extend(split(node.astext())) elif isinstance(node, nodes.title): - title = node.astext() + title, is_main_title = node.astext(), len(word_store.titles) == 0 ids = node.parent['ids'] - word_store.titles.append((title, ids[0] if ids else None)) + title_node_id = None if is_main_title else ids[0] if ids else None + word_store.titles.append((title, title_node_id)) word_store.title_words.extend(split(title)) for child in node.children: _visit_nodes(child) diff --git a/tests/js/fixtures/multiterm/searchindex.js b/tests/js/fixtures/multiterm/searchindex.js index b791df93d11..096b97eb7a3 100644 --- a/tests/js/fixtures/multiterm/searchindex.js +++ b/tests/js/fixtures/multiterm/searchindex.js @@ -1 +1 @@ -Search.setIndex({"alltitles": {"Main Page": [[0, "main-page"]]}, "docnames": ["index"], "envversion": {"sphinx": 61, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2}, "filenames": ["index.rst"], "indexentries": {}, "objects": {}, "objnames": {}, "objtypes": {}, "terms": {"At": 0, "adjac": 0, "all": 0, "an": 0, "appear": 0, "applic": 0, "ar": 0, "built": 0, "can": 0, "check": 0, "contain": 0, "do": 0, "document": 0, "doesn": 0, "each": 0, "fixtur": 0, "format": 0, "function": 0, "futur": 0, "html": 0, "i": 0, "includ": 0, "match": 0, "messag": 0, "multipl": 0, "multiterm": 0, "order": 0, "other": 0, "output": 0, "perform": 0, "perhap": 0, "phrase": 0, "project": 0, "queri": 0, "requir": 0, "same": 0, "search": 0, "successfulli": 0, "support": 0, "t": 0, "term": 0, "test": 0, "thi": 0, "time": 0, "us": 0, "when": 0, "write": 0}, "titles": ["Main Page"], "titleterms": {"main": 0, "page": 0}}) \ No newline at end of file +Search.setIndex({"alltitles": {"Main Page": [[0, null]]}, "docnames": ["index"], "envversion": {"sphinx": 61, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2}, "filenames": ["index.rst"], "indexentries": {}, "objects": {}, "objnames": {}, "objtypes": {}, "terms": {"At": 0, "adjac": 0, "all": 0, "an": 0, "appear": 0, "applic": 0, "ar": 0, "built": 0, "can": 0, "check": 0, "contain": 0, "do": 0, "document": 0, "doesn": 0, "each": 0, "fixtur": 0, "format": 0, "function": 0, "futur": 0, "html": 0, "i": 0, "includ": 0, "match": 0, "messag": 0, "multipl": 0, "multiterm": 0, "order": 0, "other": 0, "output": 0, "perform": 0, "perhap": 0, "phrase": 0, "project": 0, "queri": 0, "requir": 0, "same": 0, "search": 0, "successfulli": 0, "support": 0, "t": 0, "term": 0, "test": 0, "thi": 0, "time": 0, "us": 0, "when": 0, "write": 0}, "titles": ["Main Page"], "titleterms": {"main": 0, "page": 0}}) \ No newline at end of file diff --git a/tests/js/fixtures/partial/searchindex.js b/tests/js/fixtures/partial/searchindex.js index 6ccfbd6d07e..6d9206e0988 100644 --- a/tests/js/fixtures/partial/searchindex.js +++ b/tests/js/fixtures/partial/searchindex.js @@ -1 +1 @@ -Search.setIndex({"alltitles": {"sphinx_utils module": [[0, "sphinx-utils-module"]]}, "docnames": ["index"], "envversion": {"sphinx": 61, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2}, "filenames": ["index.rst"], "indexentries": {}, "objects": {}, "objnames": {}, "objtypes": {}, "terms": {"also": 0, "ar": 0, "built": 0, "confirm": 0, "document": 0, "function": 0, "html": 0, "i": 0, "includ": 0, "input": 0, "javascript": 0, "known": 0, "match": 0, "partial": 0, "possibl": 0, "prefix": 0, "project": 0, "provid": 0, "restructuredtext": 0, "sampl": 0, "search": 0, "should": 0, "thi": 0, "titl": 0, "us": 0, "when": 0}, "titles": ["sphinx_utils module"], "titleterms": {"modul": 0, "sphinx_util": 0}}) \ No newline at end of file +Search.setIndex({"alltitles": {"sphinx_utils module": [[0, null]]}, "docnames": ["index"], "envversion": {"sphinx": 61, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2}, "filenames": ["index.rst"], "indexentries": {}, "objects": {}, "objnames": {}, "objtypes": {}, "terms": {"also": 0, "ar": 0, "built": 0, "confirm": 0, "document": 0, "function": 0, "html": 0, "i": 0, "includ": 0, "input": 0, "javascript": 0, "known": 0, "match": 0, "partial": 0, "possibl": 0, "prefix": 0, "project": 0, "provid": 0, "restructuredtext": 0, "sampl": 0, "search": 0, "should": 0, "thi": 0, "titl": 0, "us": 0, "when": 0}, "titles": ["sphinx_utils module"], "titleterms": {"modul": 0, "sphinx_util": 0}}) \ No newline at end of file diff --git a/tests/js/searchtools.js b/tests/js/searchtools.js index 5e97572fb3e..d020e40d904 100644 --- a/tests/js/searchtools.js +++ b/tests/js/searchtools.js @@ -70,21 +70,12 @@ describe('Basic html theme search', function() { searchParameters = Search._parseQuery('main page'); - // fixme: duplicate result due to https://github.com/sphinx-doc/sphinx/issues/11961 hits = [ [ 'index', 'Main Page', '', null, - 15, - 'index.rst' - ], - [ - 'index', - 'Main Page', - '#main-page', - null, 100, 'index.rst' ] diff --git a/tests/test_search.py b/tests/test_search.py index 3b3413db8d9..a2b01c17b6b 100644 --- a/tests/test_search.py +++ b/tests/test_search.py @@ -71,6 +71,9 @@ def is_registered_term(index, keyword): .. test that comments are not indexed: boson +another_title +============= + test that non-comments are indexed: fermion ''' @@ -168,6 +171,10 @@ def test_IndexBuilder(): 'docname2_1': 'title2_1', 'docname2_2': 'title2_2'} assert index._filenames == {'docname1_1': 'filename1_1', 'docname1_2': 'filename1_2', 'docname2_1': 'filename2_1', 'docname2_2': 'filename2_2'} + # note: element iteration order (sort order) is important when the index + # is frozen (serialized) during build -- however, the _mapping-related + # dictionaries below may be iterated in arbitrary order by Python at + # runtime. assert index._mapping == { 'ar': {'docname1_1', 'docname1_2', 'docname2_1', 'docname2_2'}, 'fermion': {'docname1_1', 'docname1_2', 'docname2_1', 'docname2_2'}, @@ -176,7 +183,10 @@ def test_IndexBuilder(): 'index': {'docname1_1', 'docname1_2', 'docname2_1', 'docname2_2'}, 'test': {'docname1_1', 'docname1_2', 'docname2_1', 'docname2_2'}, } - assert index._title_mapping == {'section_titl': {'docname1_1', 'docname1_2', 'docname2_1', 'docname2_2'}} + assert index._title_mapping == { + 'another_titl': {'docname1_1', 'docname1_2', 'docname2_1', 'docname2_2'}, + 'section_titl': {'docname1_1', 'docname1_2', 'docname2_1', 'docname2_2'}, + } assert index._objtypes == {} assert index._objnames == {} @@ -196,8 +206,14 @@ def test_IndexBuilder(): 'non': [0, 1, 2, 3], 'test': [0, 1, 2, 3]}, 'titles': ('title1_1', 'title1_2', 'title2_1', 'title2_2'), - 'titleterms': {'section_titl': [0, 1, 2, 3]}, - 'alltitles': {'section_title': [(0, 'section-title'), (1, 'section-title'), (2, 'section-title'), (3, 'section-title')]}, + 'titleterms': { + 'another_titl': [0, 1, 2, 3], + 'section_titl': [0, 1, 2, 3], + }, + 'alltitles': { + 'another_title': [(0, 'another-title'), (1, 'another-title'), (2, 'another-title'), (3, 'another-title')], + 'section_title': [(0, None), (1, None), (2, None), (3, None)], + }, 'indexentries': {}, } assert index._objtypes == {('dummy1', 'objtype1'): 0, ('dummy2', 'objtype1'): 1} @@ -238,7 +254,10 @@ def test_IndexBuilder(): 'index': {'docname1_2', 'docname2_2'}, 'test': {'docname1_2', 'docname2_2'}, } - assert index._title_mapping == {'section_titl': {'docname1_2', 'docname2_2'}} + assert index._title_mapping == { + 'another_titl': {'docname1_2', 'docname2_2'}, + 'section_titl': {'docname1_2', 'docname2_2'}, + } assert index._objtypes == {('dummy1', 'objtype1'): 0, ('dummy2', 'objtype1'): 1} assert index._objnames == {0: ('dummy1', 'objtype1', 'objtype1'), 1: ('dummy2', 'objtype1', 'objtype1')} @@ -257,8 +276,14 @@ def test_IndexBuilder(): 'non': [0, 1], 'test': [0, 1]}, 'titles': ('title1_2', 'title2_2'), - 'titleterms': {'section_titl': [0, 1]}, - 'alltitles': {'section_title': [(0, 'section-title'), (1, 'section-title')]}, + 'titleterms': { + 'another_titl': [0, 1], + 'section_titl': [0, 1], + }, + 'alltitles': { + 'another_title': [(0, 'another-title'), (1, 'another-title')], + 'section_title': [(0, None), (1, None)], + }, 'indexentries': {}, } assert index._objtypes == {('dummy1', 'objtype1'): 0, ('dummy2', 'objtype1'): 1} @@ -347,7 +372,8 @@ def assert_is_sorted(item, path: str): assert_is_sorted(value, f'{path}.{key}') elif isinstance(item, list): if not is_title_tuple_type(item) and path not in lists_not_to_sort: - assert item == sorted(item), f'{err_path} is not sorted' + # sort nulls last; http://stackoverflow.com/questions/19868767/ + assert item == sorted(item, key=lambda x: (x is None, x)), f'{err_path} is not sorted' for i, child in enumerate(item): assert_is_sorted(child, f'{path}[{i}]')