diff --git a/docs/source/howto/index.rst b/docs/source/howto/index.rst
index 59033e36..4ef7baaa 100644
--- a/docs/source/howto/index.rst
+++ b/docs/source/howto/index.rst
@@ -10,4 +10,5 @@ How-to guides help you to solve specific problems with pdfminer.six.
images
acro_forms
+ toc_target_page
character_properties
diff --git a/docs/source/howto/toc_target_page.rst b/docs/source/howto/toc_target_page.rst
new file mode 100644
index 00000000..f56c291d
--- /dev/null
+++ b/docs/source/howto/toc_target_page.rst
@@ -0,0 +1,235 @@
+.. _toc_target_page:
+
+How to resolve the target page of ToC entries
+*********************************************
+
+pdfminer.six allows to access the Table of Contents (or "Outlines" as called in
+the PDF internal structure) of a document through the method
+:meth:`PDFDocument.get_outlines`.
+
+A minimal example would be:
+
+.. code-block:: python
+
+ from pathlib import Path
+ from pdfminer.pdfparser import PDFParser, PDFSyntaxError
+ from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
+
+
+ file_name = Path("...")
+
+ with open(file_name, "rb") as fp:
+ try:
+ parser = PDFParser(fp)
+ document = PDFDocument(parser)
+ outlines = document.get_outlines()
+ for (level, title, dest, a, se) in outlines:
+ ... # do something
+ except PDFNoOutlines:
+ print("No outlines found.")
+ except PDFSyntaxError:
+ print("Corrupted PDF or non-PDF file.")
+ finally:
+ parser.close()
+
+But what do the different fields of each outline entry mean? To answer this
+question we can refer to the section *12.3.3 Document Outline* of the
+`PDF Reference `__:
+
+* **Level** (:obj:`int`): This is, unsurprisingly, the level at which the entry
+ is. Entries at the top level will have level ``1``. Entries nested within
+ those ones (i.e., their children), will have level ``2``, and so on.
+* **Title** (:obj:`str`): Again, quite self-explanatory, this field contains the
+ name of the entry. For example: "1. Introduction".
+* **Dest** (:obj:`Union[list, bytes]`, `optional`): This
+ is where things start to get interesting. First thing to mention is that if a
+ **Dest** entry is present, the **A** entry shall not be present. Both of them
+ allow to specify the object the entry targets (this could be a page or any
+ other object). Destinations can be specified in multiple ways. In order to not
+ paraphrase what the
+ `PDF Reference `__ states, we refer
+ the reader to the chapter *12.3.2 Destinations* for more information on the
+ topic.
+* **A** (:obj:`pdfminer.pdftypes.PDFObjRef`, `optional`): Alternatively to using
+ a destination, the target of an entry can also be specified as an action.
+ Again, actions can get somewhat complicated, so we refer the reader to the
+ chapter *12.6 Actions* of the reference.
+* **SE** (:obj:`pdfminer.pdftypes.PDFObjRef`, `optional`): This field contains
+ the structure element the entry points at. More information about structure
+ elements can be found in the chapter *14.7.2 Structure Hierarchy*. It is worth
+ mentioning that most PDFs will not include this field, using **Dest** or **A**
+ instead, or if they do, they might still include a destination (**Dest**) to
+ keep compatibility with PDF versions previous to 1.3.
+
+Unfortunately, pdfminer.six doesn't expose the page number that each of the
+entries targets. However, once we know what each of the fields above mean, we
+can implement a ToC-entry page number resolver ourselves:
+
+.. code-block:: python
+
+ from enum import Enum, auto
+ from pathlib import Path
+ from typing import Any, Optional
+ from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
+ from pdfminer.pdfpage import PDFPage, LITERAL_PAGE
+ from pdfminer.pdfparser import PDFParser, PDFSyntaxError
+ from pdfminer.pdftypes import PDFObjRef
+
+
+ class PDFRefType(Enum):
+ """PDF reference type."""
+
+ PDF_OBJ_REF = auto()
+ DICTIONARY = auto()
+ LIST = auto()
+ NAMED_REF = auto()
+ UNK = auto() # fallback
+
+
+ class RefPageNumberResolver:
+ """PDF Reference to page number resolver.
+
+ .. note::
+
+ Remote Go-To Actions (see 12.6.4.3 in
+ `https://www.adobe.com/go/pdfreference/`__)
+ are out of the scope of this resolver.
+
+ Attributes:
+ document (:obj:`pdfminer.pdfdocument.PDFDocument`):
+ The document that contains the references.
+ objid_to_pagenum (:obj:`dict[int, int]`):
+ Mapping from an object id to the number of the page that contains
+ that object.
+ """
+
+ def __init__(self, document: PDFDocument):
+ self.document = document
+ # obj_id -> page_number
+ self.objid_to_pagenum: dict[int, int] = {
+ page.pageid: page_num
+ for page_num, page in enumerate(PDFPage.create_pages(document), 1)
+ }
+
+ @classmethod
+ def get_ref_type(cls, ref: Any) -> PDFRefType:
+ """Get the type of a PDF reference."""
+ if isinstance(ref, PDFObjRef):
+ return PDFRefType.PDF_OBJ_REF
+ elif isinstance(ref, dict) and "D" in ref:
+ return PDFRefType.DICTIONARY
+ elif isinstance(ref, list) and any(isinstance(e, PDFObjRef) for e in ref):
+ return PDFRefType.LIST
+ elif isinstance(ref, bytes):
+ return PDFRefType.NAMED_REF
+ else:
+ return PDFRefType.UNK
+
+ @classmethod
+ def is_ref_page(cls, ref: Any) -> bool:
+ """Check whether a reference is of type '/Page'.
+
+ Args:
+ ref (:obj:`Any`):
+ The PDF reference.
+
+ Returns:
+ :obj:`bool`: :obj:`True` if the reference references
+ a page, :obj:`False` otherwise.
+ """
+ return isinstance(ref, dict) and "Type" in ref and ref["Type"] is LITERAL_PAGE
+
+ def resolve(self, ref: Any) -> Optional[int]:
+ """Resolve a PDF reference to a page number recursively.
+
+ Args:
+ ref (:obj:`Any`):
+ The PDF reference.
+
+ Returns:
+ :obj:`Optional[int]`: The page number or :obj:`None`
+ if the reference could not be resolved (e.g., remote Go-To
+ Actions or malformed references).
+ """
+ ref_type = self.get_ref_type(ref)
+
+ if ref_type is PDFRefType.PDF_OBJ_REF and self.is_ref_page(ref.resolve()):
+ return self.objid_to_pagenum.get(ref.objid)
+ elif ref_type is PDFRefType.PDF_OBJ_REF:
+ return self.resolve(ref.resolve())
+
+ if ref_type is PDFRefType.DICTIONARY:
+ return self.resolve(ref["D"])
+
+ if ref_type is PDFRefType.LIST:
+ # Get the PDFObjRef in the list (usually first element).
+ return self.resolve(next(filter(lambda e: isinstance(e, PDFObjRef), ref)))
+
+ if ref_type is PDFRefType.NAMED_REF:
+ return self.resolve(self.document.get_dest(ref))
+
+ return None # PDFRefType.UNK
+
+The class :class:`PDFRefType` is just a helper to categorize the type of
+reference we are dealing with. Due to the fact that a reference can point to
+another reference, in some cases we will have to recursively call
+:meth:`RefPageNumberResolver.resolve` until we finally reach a page object.
+Then, we can get the page number by accessing the dictionary
+:attr:`RefPageNumberResolver.objid_to_pagenum`, which maps the page object id to
+the page number.
+
+Using this page number resolver, we can for example print the Table of Contents
+of a document in a human-readable format with the following code:
+
+.. code-block:: python
+
+ def print_outlines(file: str) -> dict[int, int]:
+ """Pretty print the outlines (ToC) of a PDF document."""
+ with open(file, "rb") as fp:
+ try:
+ parser = PDFParser(fp)
+ document = PDFDocument(parser)
+
+ ref_pagenum_resolver = RefPageNumberResolver(document)
+
+ outlines = list(document.get_outlines())
+ if not outlines:
+ print("No outlines found.")
+ for (level, title, dest, a, se) in outlines:
+ if dest:
+ page_num = ref_pagenum_resolver.resolve(dest)
+ elif a:
+ page_num = ref_pagenum_resolver.resolve(a)
+ elif se:
+ page_num = ref_pagenum_resolver.resolve(se)
+ else:
+ page_num = None
+
+ # Calculate leading spaces and filling dots for formatting.
+ leading_spaces = (level-1) * 4
+ fill_dots = 80 - len(title) - leading_spaces
+
+ print(
+ f"{' ' * leading_spaces}"
+ f"{title}",
+ f"{'.' * fill_dots}",
+ f"{page_num:>3}"
+ )
+ except PDFNoOutlines:
+ print("No outlines found.")
+ except PDFSyntaxError:
+ print("Corrupted PDF or non-PDF file.")
+ finally:
+ try:
+ parser.close()
+ except NameError:
+ pass # nothing to do
+
+
+ def main():
+ file_name = Path("...")
+ print_outlines(file_name)
+
+
+ if __name__ == "__main__":
+ main()