From 123351f5fbad5a14de270fdda8cd2eeabf0bf046 Mon Sep 17 00:00:00 2001
From: Michael Skarlinski <mskarlinski@futurehouse.org>
Date: Tue, 12 Mar 2024 14:09:40 -0700
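Subject: [PATCH] Add overload type signatures for read_doc

Add typing overloads so that the return type of read_doc is narrowed by
the parsed_text_only / include_metadata flags. This lets the
"type: ignore" comments on its call sites in docs.py be removed.

Note: parsed_text_only and include_metadata now come before chunk_chars,
overlap and force_pypdf in the signature, so callers that passed the
latter three positionally must switch to keyword arguments.
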
---
 paperqa/docs.py    |  8 +++----
 paperqa/readers.py | 54 +++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 55 insertions(+), 7 deletions(-)

diff --git a/paperqa/docs.py b/paperqa/docs.py
index 5cf36e4a0..cfdab5ce2 100644
--- a/paperqa/docs.py
+++ b/paperqa/docs.py
@@ -353,7 +353,7 @@ async def aadd(
             texts = read_doc(path, fake_doc, chunk_chars=chunk_chars, overlap=100)
             if len(texts) == 0:
                 raise ValueError(f"Could not read document {path}. Is it empty?")
-            chain_result = await cite_chain({"text": texts[0].text}, None)  # type: ignore[union-attr]
+            chain_result = await cite_chain({"text": texts[0].text}, None)
             citation = chain_result.text
             if (
                 len(citation) < 3  # noqa: PLR2004
@@ -385,13 +385,13 @@ async def aadd(
         # loose check to see if document was loaded
         if (
             len(texts) == 0
-            or len(texts[0].text) < 10  # type: ignore[union-attr] # noqa: PLR2004
-            or (not disable_check and not maybe_is_text(texts[0].text))  # type: ignore[union-attr]
+            or len(texts[0].text) < 10  # noqa: PLR2004
+            or (not disable_check and not maybe_is_text(texts[0].text))
         ):
             raise ValueError(
                 f"This does not look like a text document: {path}. Path disable_check to ignore this error."
             )
-        if await self.aadd_texts(texts, doc):  # type: ignore[arg-type]
+        if await self.aadd_texts(texts, doc):
             return docname
         return None
 
diff --git a/paperqa/readers.py b/paperqa/readers.py
index 2d56fda99..b4ad4eed6 100644
--- a/paperqa/readers.py
+++ b/paperqa/readers.py
@@ -2,7 +2,7 @@
 
 from math import ceil
 from pathlib import Path
-from typing import List  # noqa: F401
+from typing import List, Literal, overload  # noqa: F401
 
 import html2text
 import tiktoken
@@ -206,14 +206,62 @@ def chunk_code_text(
     return texts
 
 
+@overload  # include_metadata=True by keyword, parsed_text_only defaulted
+def read_doc(
+    path: Path,
+    doc: Doc,
+    *,
+    include_metadata: Literal[True],
+    chunk_chars: int = ...,
+    overlap: int = ...,
+    force_pypdf: bool = ...,
+) -> tuple[list[Text], ParsedMetadata]: ...
+
+
+@overload  # both flags False or omitted -> list of Text chunks
+def read_doc(
+    path: Path,
+    doc: Doc,
+    parsed_text_only: Literal[False] = ...,
+    include_metadata: Literal[False] = ...,
+    chunk_chars: int = ...,
+    overlap: int = ...,
+    force_pypdf: bool = ...,
+) -> list[Text]: ...
+
+
+@overload  # parsed_text_only=True -> ParsedText regardless of include_metadata
+def read_doc(
+    path: Path,
+    doc: Doc,
+    parsed_text_only: Literal[True],
+    include_metadata: bool = ...,
+    chunk_chars: int = ...,
+    overlap: int = ...,
+    force_pypdf: bool = ...,
+) -> ParsedText: ...
+
+
+@overload  # parsed_text_only=False, include_metadata=True -> (texts, metadata)
+def read_doc(
+    path: Path,
+    doc: Doc,
+    parsed_text_only: Literal[False],
+    include_metadata: Literal[True],
+    chunk_chars: int = ...,
+    overlap: int = ...,
+    force_pypdf: bool = ...,
+) -> tuple[list[Text], ParsedMetadata]: ...
+
+
 def read_doc(  # noqa: PLR0912
     path: Path,
     doc: Doc,
+    parsed_text_only: bool = False,
+    include_metadata: bool = False,
     chunk_chars: int = 3000,
     overlap: int = 100,
     force_pypdf: bool = False,
-    parsed_text_only: bool = False,
-    include_metadata: bool = False,
 ) -> list[Text] | ParsedText | tuple[list[Text], ParsedMetadata]:
     """Parse a document and split into chunks.