From 123351f5fbad5a14de270fdda8cd2eeabf0bf046 Mon Sep 17 00:00:00 2001 From: Michael Skarlinski Date: Tue, 12 Mar 2024 14:09:40 -0700 Subject: [PATCH] add overload type signature for read_doc --- paperqa/docs.py | 8 +++---- paperqa/readers.py | 54 +++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 55 insertions(+), 7 deletions(-) diff --git a/paperqa/docs.py b/paperqa/docs.py index 5cf36e4a0..cfdab5ce2 100644 --- a/paperqa/docs.py +++ b/paperqa/docs.py @@ -353,7 +353,7 @@ async def aadd( texts = read_doc(path, fake_doc, chunk_chars=chunk_chars, overlap=100) if len(texts) == 0: raise ValueError(f"Could not read document {path}. Is it empty?") - chain_result = await cite_chain({"text": texts[0].text}, None) # type: ignore[union-attr] + chain_result = await cite_chain({"text": texts[0].text}, None) citation = chain_result.text if ( len(citation) < 3 # noqa: PLR2004 @@ -385,13 +385,13 @@ async def aadd( # loose check to see if document was loaded if ( len(texts) == 0 - or len(texts[0].text) < 10 # type: ignore[union-attr] # noqa: PLR2004 - or (not disable_check and not maybe_is_text(texts[0].text)) # type: ignore[union-attr] + or len(texts[0].text) < 10 # noqa: PLR2004 + or (not disable_check and not maybe_is_text(texts[0].text)) ): raise ValueError( f"This does not look like a text document: {path}. Path disable_check to ignore this error." ) - if await self.aadd_texts(texts, doc): # type: ignore[arg-type] + if await self.aadd_texts(texts, doc): return docname return None diff --git a/paperqa/readers.py b/paperqa/readers.py index 2d56fda99..b4ad4eed6 100644 --- a/paperqa/readers.py +++ b/paperqa/readers.py @@ -2,7 +2,7 @@ from math import ceil from pathlib import Path -from typing import List # noqa: F401 +from typing import List, Literal, overload # noqa: F401 import html2text import tiktoken @@ -206,14 +206,62 @@ def chunk_code_text( return texts +@overload +def read_doc( + path: Path, + doc: Doc, + parsed_text_only: Literal[False], + include_metadata: Literal[False], + chunk_chars: int = ..., + overlap: int = ..., + force_pypdf: bool = ..., +) -> list[Text]: ... + + +@overload +def read_doc( + path: Path, + doc: Doc, + parsed_text_only: Literal[False] = ..., + include_metadata: Literal[False] = ..., + chunk_chars: int = ..., + overlap: int = ..., + force_pypdf: bool = ..., +) -> list[Text]: ... + + +@overload +def read_doc( + path: Path, + doc: Doc, + parsed_text_only: Literal[True], + include_metadata: bool = ..., + chunk_chars: int = ..., + overlap: int = ..., + force_pypdf: bool = ..., +) -> ParsedText: ... + + +@overload +def read_doc( + path: Path, + doc: Doc, + parsed_text_only: Literal[False], + include_metadata: Literal[True], + chunk_chars: int = ..., + overlap: int = ..., + force_pypdf: bool = ..., +) -> tuple[list[Text], ParsedMetadata]: ... + + def read_doc( # noqa: PLR0912 path: Path, doc: Doc, + parsed_text_only: bool = False, + include_metadata: bool = False, chunk_chars: int = 3000, overlap: int = 100, force_pypdf: bool = False, - parsed_text_only: bool = False, - include_metadata: bool = False, ) -> list[Text] | ParsedText | tuple[list[Text], ParsedMetadata]: """Parse a document and split into chunks.