Skip to content

Commit

Permalink
HTML Search: omit anchor reference from document titles in the search index. (#12047)
Browse files Browse the repository at this point in the history
  • Loading branch information
jayaddison authored Jul 8, 2024
1 parent 082f13f commit 7eb77f2
Show file tree
Hide file tree
Showing 7 changed files with 54 additions and 26 deletions.
3 changes: 3 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,9 @@ Bugs fixed
* #12494: Fix invalid genindex.html file produced with translated docs
(regression in 7.1.0).
Patch by Nicolas Peugnet.
* #11961: Omit anchor references from document title entries in the search index,
removing duplication of search results.
Patch by James Addison.

Testing
-------
Expand Down
4 changes: 2 additions & 2 deletions sphinx/environment/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,15 +253,15 @@ def __init__(self, app: Sphinx) -> None:
# search index data

# docname -> title
self._search_index_titles: dict[str, str] = {}
self._search_index_titles: dict[str, str | None] = {}
# docname -> filename
self._search_index_filenames: dict[str, str] = {}
# stemmed words -> set(docname)
self._search_index_mapping: dict[str, set[str]] = {}
# stemmed words in titles -> set(docname)
self._search_index_title_mapping: dict[str, set[str]] = {}
# docname -> all titles in document
self._search_index_all_titles: dict[str, list[tuple[str, str]]] = {}
self._search_index_all_titles: dict[str, list[tuple[str, str | None]]] = {}
# docname -> list(index entry)
self._search_index_index_entries: dict[str, list[tuple[str, str, str]]] = {}
# objtype -> index
Expand Down
20 changes: 14 additions & 6 deletions sphinx/search/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,7 @@ def _is_meta_keywords(
@dataclasses.dataclass
class WordStore:
words: list[str] = dataclasses.field(default_factory=list)
titles: list[tuple[str, str]] = dataclasses.field(default_factory=list)
titles: list[tuple[str, str | None]] = dataclasses.field(default_factory=list)
title_words: list[str] = dataclasses.field(default_factory=list)


Expand Down Expand Up @@ -253,15 +253,15 @@ class IndexBuilder:
def __init__(self, env: BuildEnvironment, lang: str, options: dict[str, str], scoring: str) -> None:
self.env = env
# docname -> title
self._titles: dict[str, str] = env._search_index_titles
self._titles: dict[str, str | None] = env._search_index_titles
# docname -> filename
self._filenames: dict[str, str] = env._search_index_filenames
# stemmed words -> set(docname)
self._mapping: dict[str, set[str]] = env._search_index_mapping
# stemmed words in titles -> set(docname)
self._title_mapping: dict[str, set[str]] = env._search_index_title_mapping
# docname -> all titles in document
self._all_titles: dict[str, list[tuple[str, str]]] = env._search_index_all_titles
self._all_titles: dict[str, list[tuple[str, str | None]]] = env._search_index_all_titles
# docname -> list(index entry)
self._index_entries: dict[str, list[tuple[str, str, str]]] = env._search_index_index_entries
# objtype -> index
Expand Down Expand Up @@ -369,6 +369,13 @@ def get_objects(self, fn2index: dict[str, int]
return rv

def get_terms(self, fn2index: dict[str, int]) -> tuple[dict[str, list[int] | int], dict[str, list[int] | int]]:
"""
Return a mapping of document and title terms to their corresponding sorted document IDs.
When a term is only found within a single document, then the value for that term will be
an integer value. When a term is found within multiple documents, the value will be a list
of integers.
"""
rvs: tuple[dict[str, list[int] | int], dict[str, list[int] | int]] = ({}, {})
for rv, mapping in zip(rvs, (self._mapping, self._title_mapping)):
for k, v in mapping.items():
Expand All @@ -391,7 +398,7 @@ def freeze(self) -> dict[str, Any]:
objtypes = {v: k[0] + ':' + k[1] for (k, v) in self._objtypes.items()}
objnames = self._objnames

alltitles: dict[str, list[tuple[int, str]]] = {}
alltitles: dict[str, list[tuple[int, str | None]]] = {}
for docname, titlelist in sorted(self._all_titles.items()):
for title, titleid in titlelist:
alltitles.setdefault(title, []).append((fn2index[docname], titleid))
Expand Down Expand Up @@ -502,9 +509,10 @@ def _visit_nodes(node):
elif isinstance(node, nodes.Text):
word_store.words.extend(split(node.astext()))
elif isinstance(node, nodes.title):
title = node.astext()
title, is_main_title = node.astext(), len(word_store.titles) == 0
ids = node.parent['ids']
word_store.titles.append((title, ids[0] if ids else None))
title_node_id = None if is_main_title else ids[0] if ids else None
word_store.titles.append((title, title_node_id))
word_store.title_words.extend(split(title))
for child in node.children:
_visit_nodes(child)
Expand Down
2 changes: 1 addition & 1 deletion tests/js/fixtures/multiterm/searchindex.js

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion tests/js/fixtures/partial/searchindex.js

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 0 additions & 9 deletions tests/js/searchtools.js
Original file line number Diff line number Diff line change
Expand Up @@ -70,21 +70,12 @@ describe('Basic html theme search', function() {

searchParameters = Search._parseQuery('main page');

// fixme: duplicate result due to https://github.com/sphinx-doc/sphinx/issues/11961
hits = [
[
'index',
'Main Page',
'',
null,
15,
'index.rst'
],
[
'index',
'Main Page',
'#main-page',
null,
100,
'index.rst'
]
Expand Down
40 changes: 33 additions & 7 deletions tests/test_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,9 @@ def is_registered_term(index, keyword):
.. test that comments are not indexed: boson
another_title
=============
test that non-comments are indexed: fermion
'''

Expand Down Expand Up @@ -168,6 +171,10 @@ def test_IndexBuilder():
'docname2_1': 'title2_1', 'docname2_2': 'title2_2'}
assert index._filenames == {'docname1_1': 'filename1_1', 'docname1_2': 'filename1_2',
'docname2_1': 'filename2_1', 'docname2_2': 'filename2_2'}
# note: element iteration order (sort order) is important when the index
# is frozen (serialized) during build -- however, the _mapping-related
# dictionaries below may be iterated in arbitrary order by Python at
# runtime.
assert index._mapping == {
'ar': {'docname1_1', 'docname1_2', 'docname2_1', 'docname2_2'},
'fermion': {'docname1_1', 'docname1_2', 'docname2_1', 'docname2_2'},
Expand All @@ -176,7 +183,10 @@ def test_IndexBuilder():
'index': {'docname1_1', 'docname1_2', 'docname2_1', 'docname2_2'},
'test': {'docname1_1', 'docname1_2', 'docname2_1', 'docname2_2'},
}
assert index._title_mapping == {'section_titl': {'docname1_1', 'docname1_2', 'docname2_1', 'docname2_2'}}
assert index._title_mapping == {
'another_titl': {'docname1_1', 'docname1_2', 'docname2_1', 'docname2_2'},
'section_titl': {'docname1_1', 'docname1_2', 'docname2_1', 'docname2_2'},
}
assert index._objtypes == {}
assert index._objnames == {}

Expand All @@ -196,8 +206,14 @@ def test_IndexBuilder():
'non': [0, 1, 2, 3],
'test': [0, 1, 2, 3]},
'titles': ('title1_1', 'title1_2', 'title2_1', 'title2_2'),
'titleterms': {'section_titl': [0, 1, 2, 3]},
'alltitles': {'section_title': [(0, 'section-title'), (1, 'section-title'), (2, 'section-title'), (3, 'section-title')]},
'titleterms': {
'another_titl': [0, 1, 2, 3],
'section_titl': [0, 1, 2, 3],
},
'alltitles': {
'another_title': [(0, 'another-title'), (1, 'another-title'), (2, 'another-title'), (3, 'another-title')],
'section_title': [(0, None), (1, None), (2, None), (3, None)],
},
'indexentries': {},
}
assert index._objtypes == {('dummy1', 'objtype1'): 0, ('dummy2', 'objtype1'): 1}
Expand Down Expand Up @@ -238,7 +254,10 @@ def test_IndexBuilder():
'index': {'docname1_2', 'docname2_2'},
'test': {'docname1_2', 'docname2_2'},
}
assert index._title_mapping == {'section_titl': {'docname1_2', 'docname2_2'}}
assert index._title_mapping == {
'another_titl': {'docname1_2', 'docname2_2'},
'section_titl': {'docname1_2', 'docname2_2'},
}
assert index._objtypes == {('dummy1', 'objtype1'): 0, ('dummy2', 'objtype1'): 1}
assert index._objnames == {0: ('dummy1', 'objtype1', 'objtype1'), 1: ('dummy2', 'objtype1', 'objtype1')}

Expand All @@ -257,8 +276,14 @@ def test_IndexBuilder():
'non': [0, 1],
'test': [0, 1]},
'titles': ('title1_2', 'title2_2'),
'titleterms': {'section_titl': [0, 1]},
'alltitles': {'section_title': [(0, 'section-title'), (1, 'section-title')]},
'titleterms': {
'another_titl': [0, 1],
'section_titl': [0, 1],
},
'alltitles': {
'another_title': [(0, 'another-title'), (1, 'another-title')],
'section_title': [(0, None), (1, None)],
},
'indexentries': {},
}
assert index._objtypes == {('dummy1', 'objtype1'): 0, ('dummy2', 'objtype1'): 1}
Expand Down Expand Up @@ -347,7 +372,8 @@ def assert_is_sorted(item, path: str):
assert_is_sorted(value, f'{path}.{key}')
elif isinstance(item, list):
if not is_title_tuple_type(item) and path not in lists_not_to_sort:
assert item == sorted(item), f'{err_path} is not sorted'
# sort nulls last; http://stackoverflow.com/questions/19868767/
assert item == sorted(item, key=lambda x: (x is None, x)), f'{err_path} is not sorted'
for i, child in enumerate(item):
assert_is_sorted(child, f'{path}[{i}]')

Expand Down

0 comments on commit 7eb77f2

Please sign in to comment.