Add overload type signatures for read_doc

Future-House · Mar 12, 2024 · commit 123351f
1 parent f780f2c
commit 123351f
Show file tree

Hide file tree

Showing 2 changed files with 55 additions and 7 deletions.
diff --git a/paperqa/docs.py b/paperqa/docs.py
@@ -353,7 +353,7 @@ async def aadd(
             texts = read_doc(path, fake_doc, chunk_chars=chunk_chars, overlap=100)
             if len(texts) == 0:
                 raise ValueError(f"Could not read document {path}. Is it empty?")
-            chain_result = await cite_chain({"text": texts[0].text}, None)  # type: ignore[union-attr]
+            chain_result = await cite_chain({"text": texts[0].text}, None)
             citation = chain_result.text
             if (
                 len(citation) < 3  # noqa: PLR2004
@@ -385,13 +385,13 @@ async def aadd(
         # loose check to see if document was loaded
         if (
             len(texts) == 0
-            or len(texts[0].text) < 10  # type: ignore[union-attr] # noqa: PLR2004
-            or (not disable_check and not maybe_is_text(texts[0].text))  # type: ignore[union-attr]
+            or len(texts[0].text) < 10  # noqa: PLR2004
+            or (not disable_check and not maybe_is_text(texts[0].text))
         ):
             raise ValueError(
                 f"This does not look like a text document: {path}. Path disable_check to ignore this error."
             )
-        if await self.aadd_texts(texts, doc):  # type: ignore[arg-type]
+        if await self.aadd_texts(texts, doc):
             return docname
         return None
 

diff --git a/paperqa/readers.py b/paperqa/readers.py
@@ -2,7 +2,7 @@
 
 from math import ceil
 from pathlib import Path
-from typing import List  # noqa: F401
+from typing import List, Literal, overload  # noqa: F401
 
 import html2text
 import tiktoken
@@ -206,14 +206,62 @@ def chunk_code_text(
     return texts
 
 
+@overload
+def read_doc(
+    path: Path,
+    doc: Doc,
+    parsed_text_only: Literal[False],
+    include_metadata: Literal[False],
+    chunk_chars: int = ...,
+    overlap: int = ...,
+    force_pypdf: bool = ...,
+) -> list[Text]: ...
+
+
+@overload
+def read_doc(
+    path: Path,
+    doc: Doc,
+    parsed_text_only: Literal[False] = ...,
+    include_metadata: Literal[False] = ...,
+    chunk_chars: int = ...,
+    overlap: int = ...,
+    force_pypdf: bool = ...,
+) -> list[Text]: ...
+
+
+@overload
+def read_doc(
+    path: Path,
+    doc: Doc,
+    parsed_text_only: Literal[True],
+    include_metadata: bool = ...,
+    chunk_chars: int = ...,
+    overlap: int = ...,
+    force_pypdf: bool = ...,
+) -> ParsedText: ...
+
+
+@overload
+def read_doc(
+    path: Path,
+    doc: Doc,
+    parsed_text_only: Literal[False],
+    include_metadata: Literal[True],
+    chunk_chars: int = ...,
+    overlap: int = ...,
+    force_pypdf: bool = ...,
+) -> tuple[list[Text], ParsedMetadata]: ...
+
+
 def read_doc(  # noqa: PLR0912
     path: Path,
     doc: Doc,
+    parsed_text_only: bool = False,
+    include_metadata: bool = False,
     chunk_chars: int = 3000,
     overlap: int = 100,
     force_pypdf: bool = False,
-    parsed_text_only: bool = False,
-    include_metadata: bool = False,
 ) -> list[Text] | ParsedText | tuple[list[Text], ParsedMetadata]:
     """Parse a document and split into chunks.