HTML Search: omit anchor reference from document titles in the search…

… index. (#12047)
sphinx-doc · Jul 8, 2024 · 7eb77f2 · 7eb77f2
1 parent 082f13f
commit 7eb77f2
Show file tree

Hide file tree

Showing 7 changed files with 54 additions and 26 deletions.
diff --git a/CHANGES.rst b/CHANGES.rst
@@ -89,6 +89,9 @@ Bugs fixed
 * #12494: Fix invalid genindex.html file produced with translated docs
   (regression in 7.1.0).
   Patch by Nicolas Peugnet.
+* #11961: Omit anchor references from document title entries in the search index,
+  removing duplication of search results.
+  Patch by James Addison.
 
 Testing
 -------

diff --git a/sphinx/environment/__init__.py b/sphinx/environment/__init__.py
@@ -253,15 +253,15 @@ def __init__(self, app: Sphinx) -> None:
         # search index data
 
         # docname -> title
-        self._search_index_titles: dict[str, str] = {}
+        self._search_index_titles: dict[str, str | None] = {}
         # docname -> filename
         self._search_index_filenames: dict[str, str] = {}
         # stemmed words -> set(docname)
         self._search_index_mapping: dict[str, set[str]] = {}
         # stemmed words in titles -> set(docname)
         self._search_index_title_mapping: dict[str, set[str]] = {}
         # docname -> all titles in document
-        self._search_index_all_titles: dict[str, list[tuple[str, str]]] = {}
+        self._search_index_all_titles: dict[str, list[tuple[str, str | None]]] = {}
         # docname -> list(index entry)
         self._search_index_index_entries: dict[str, list[tuple[str, str, str]]] = {}
         # objtype -> index

diff --git a/sphinx/search/__init__.py b/sphinx/search/__init__.py
@@ -198,7 +198,7 @@ def _is_meta_keywords(
 @dataclasses.dataclass
 class WordStore:
     words: list[str] = dataclasses.field(default_factory=list)
-    titles: list[tuple[str, str]] = dataclasses.field(default_factory=list)
+    titles: list[tuple[str, str | None]] = dataclasses.field(default_factory=list)
     title_words: list[str] = dataclasses.field(default_factory=list)
 
 
@@ -253,15 +253,15 @@ class IndexBuilder:
     def __init__(self, env: BuildEnvironment, lang: str, options: dict[str, str], scoring: str) -> None:
         self.env = env
         # docname -> title
-        self._titles: dict[str, str] = env._search_index_titles
+        self._titles: dict[str, str | None] = env._search_index_titles
         # docname -> filename
         self._filenames: dict[str, str] = env._search_index_filenames
         # stemmed words -> set(docname)
         self._mapping: dict[str, set[str]] = env._search_index_mapping
         # stemmed words in titles -> set(docname)
         self._title_mapping: dict[str, set[str]] = env._search_index_title_mapping
         # docname -> all titles in document
-        self._all_titles: dict[str, list[tuple[str, str]]] = env._search_index_all_titles
+        self._all_titles: dict[str, list[tuple[str, str | None]]] = env._search_index_all_titles
         # docname -> list(index entry)
         self._index_entries: dict[str, list[tuple[str, str, str]]] = env._search_index_index_entries
         # objtype -> index
@@ -369,6 +369,13 @@ def get_objects(self, fn2index: dict[str, int]
         return rv
 
     def get_terms(self, fn2index: dict[str, int]) -> tuple[dict[str, list[int] | int], dict[str, list[int] | int]]:
+        """
+        Return a mapping of document and title terms to their corresponding sorted document IDs.
+
+        When a term is only found within a single document, then the value for that term will be
+        an integer value.  When a term is found within multiple documents, the value will be a list
+        of integers.
+        """
         rvs: tuple[dict[str, list[int] | int], dict[str, list[int] | int]] = ({}, {})
         for rv, mapping in zip(rvs, (self._mapping, self._title_mapping)):
             for k, v in mapping.items():
@@ -391,7 +398,7 @@ def freeze(self) -> dict[str, Any]:
         objtypes = {v: k[0] + ':' + k[1] for (k, v) in self._objtypes.items()}
         objnames = self._objnames
 
-        alltitles: dict[str, list[tuple[int, str]]] = {}
+        alltitles: dict[str, list[tuple[int, str | None]]] = {}
         for docname, titlelist in sorted(self._all_titles.items()):
             for title, titleid in titlelist:
                 alltitles.setdefault(title, []).append((fn2index[docname], titleid))
@@ -502,9 +509,10 @@ def _visit_nodes(node):
             elif isinstance(node, nodes.Text):
                 word_store.words.extend(split(node.astext()))
             elif isinstance(node, nodes.title):
-                title = node.astext()
+                title, is_main_title = node.astext(), len(word_store.titles) == 0
                 ids = node.parent['ids']
-                word_store.titles.append((title, ids[0] if ids else None))
+                title_node_id = None if is_main_title else ids[0] if ids else None
+                word_store.titles.append((title, title_node_id))
                 word_store.title_words.extend(split(title))
             for child in node.children:
                 _visit_nodes(child)

diff --git a/tests/js/fixtures/multiterm/searchindex.js b/tests/js/fixtures/multiterm/searchindex.js
diff --git a/tests/js/fixtures/partial/searchindex.js b/tests/js/fixtures/partial/searchindex.js
diff --git a/tests/js/searchtools.js b/tests/js/searchtools.js
@@ -70,21 +70,12 @@ describe('Basic html theme search', function() {
 
       searchParameters = Search._parseQuery('main page');
 
-      // fixme: duplicate result due to https://github.com/sphinx-doc/sphinx/issues/11961
       hits = [
         [
           'index',
           'Main Page',
           '',
           null,
-          15,
-          'index.rst'
-        ],
-        [
-          'index',
-          'Main Page',
-          '#main-page',
-          null,
           100,
           'index.rst'
         ]

diff --git a/tests/test_search.py b/tests/test_search.py
@@ -71,6 +71,9 @@ def is_registered_term(index, keyword):
 
 .. test that comments are not indexed: boson
 
+another_title
+=============
+
 test that non-comments are indexed: fermion
 '''
 
@@ -168,6 +171,10 @@ def test_IndexBuilder():
                              'docname2_1': 'title2_1', 'docname2_2': 'title2_2'}
     assert index._filenames == {'docname1_1': 'filename1_1', 'docname1_2': 'filename1_2',
                                 'docname2_1': 'filename2_1', 'docname2_2': 'filename2_2'}
+    # note: element iteration order (sort order) is important when the index
+    # is frozen (serialized) during build -- however, the _mapping-related
+    # dictionaries below may be iterated in arbitrary order by Python at
+    # runtime.
     assert index._mapping == {
         'ar': {'docname1_1', 'docname1_2', 'docname2_1', 'docname2_2'},
         'fermion': {'docname1_1', 'docname1_2', 'docname2_1', 'docname2_2'},
@@ -176,7 +183,10 @@ def test_IndexBuilder():
         'index': {'docname1_1', 'docname1_2', 'docname2_1', 'docname2_2'},
         'test': {'docname1_1', 'docname1_2', 'docname2_1', 'docname2_2'},
     }
-    assert index._title_mapping == {'section_titl': {'docname1_1', 'docname1_2', 'docname2_1', 'docname2_2'}}
+    assert index._title_mapping == {
+        'another_titl': {'docname1_1', 'docname1_2', 'docname2_1', 'docname2_2'},
+        'section_titl': {'docname1_1', 'docname1_2', 'docname2_1', 'docname2_2'},
+    }
     assert index._objtypes == {}
     assert index._objnames == {}
 
@@ -196,8 +206,14 @@ def test_IndexBuilder():
                   'non': [0, 1, 2, 3],
                   'test': [0, 1, 2, 3]},
         'titles': ('title1_1', 'title1_2', 'title2_1', 'title2_2'),
-        'titleterms': {'section_titl': [0, 1, 2, 3]},
-        'alltitles': {'section_title': [(0, 'section-title'), (1, 'section-title'), (2, 'section-title'), (3, 'section-title')]},
+        'titleterms': {
+            'another_titl': [0, 1, 2, 3],
+            'section_titl': [0, 1, 2, 3],
+        },
+        'alltitles': {
+            'another_title': [(0, 'another-title'), (1, 'another-title'), (2, 'another-title'), (3, 'another-title')],
+            'section_title': [(0, None), (1, None), (2, None), (3, None)],
+        },
         'indexentries': {},
     }
     assert index._objtypes == {('dummy1', 'objtype1'): 0, ('dummy2', 'objtype1'): 1}
@@ -238,7 +254,10 @@ def test_IndexBuilder():
         'index': {'docname1_2', 'docname2_2'},
         'test': {'docname1_2', 'docname2_2'},
     }
-    assert index._title_mapping == {'section_titl': {'docname1_2', 'docname2_2'}}
+    assert index._title_mapping == {
+        'another_titl': {'docname1_2', 'docname2_2'},
+        'section_titl': {'docname1_2', 'docname2_2'},
+    }
     assert index._objtypes == {('dummy1', 'objtype1'): 0, ('dummy2', 'objtype1'): 1}
     assert index._objnames == {0: ('dummy1', 'objtype1', 'objtype1'), 1: ('dummy2', 'objtype1', 'objtype1')}
 
@@ -257,8 +276,14 @@ def test_IndexBuilder():
                   'non': [0, 1],
                   'test': [0, 1]},
         'titles': ('title1_2', 'title2_2'),
-        'titleterms': {'section_titl': [0, 1]},
-        'alltitles': {'section_title': [(0, 'section-title'), (1, 'section-title')]},
+        'titleterms': {
+            'another_titl': [0, 1],
+            'section_titl': [0, 1],
+        },
+        'alltitles': {
+            'another_title': [(0, 'another-title'), (1, 'another-title')],
+            'section_title': [(0, None), (1, None)],
+        },
         'indexentries': {},
     }
     assert index._objtypes == {('dummy1', 'objtype1'): 0, ('dummy2', 'objtype1'): 1}
@@ -347,7 +372,8 @@ def assert_is_sorted(item, path: str):
             assert_is_sorted(value, f'{path}.{key}')
     elif isinstance(item, list):
         if not is_title_tuple_type(item) and path not in lists_not_to_sort:
-            assert item == sorted(item), f'{err_path} is not sorted'
+            # sort nulls last; http://stackoverflow.com/questions/19868767/
+            assert item == sorted(item, key=lambda x: (x is None, x)), f'{err_path} is not sorted'
         for i, child in enumerate(item):
             assert_is_sorted(child, f'{path}[{i}]')