Skip to content

Commit

Permalink
add overload type signature for read_doc
Browse files Browse the repository at this point in the history
  • Loading branch information
Michael Skarlinski committed Mar 12, 2024
1 parent f780f2c commit 123351f
Show file tree
Hide file tree
Showing 2 changed files with 55 additions and 7 deletions.
8 changes: 4 additions & 4 deletions paperqa/docs.py
Original file line number Diff line number Diff line change
Expand Up @@ -353,7 +353,7 @@ async def aadd(
texts = read_doc(path, fake_doc, chunk_chars=chunk_chars, overlap=100)
if len(texts) == 0:
raise ValueError(f"Could not read document {path}. Is it empty?")
chain_result = await cite_chain({"text": texts[0].text}, None) # type: ignore[union-attr]
chain_result = await cite_chain({"text": texts[0].text}, None)
citation = chain_result.text
if (
len(citation) < 3 # noqa: PLR2004
Expand Down Expand Up @@ -385,13 +385,13 @@ async def aadd(
# loose check to see if document was loaded
if (
len(texts) == 0
or len(texts[0].text) < 10 # type: ignore[union-attr] # noqa: PLR2004
or (not disable_check and not maybe_is_text(texts[0].text)) # type: ignore[union-attr]
or len(texts[0].text) < 10 # noqa: PLR2004
or (not disable_check and not maybe_is_text(texts[0].text))
):
raise ValueError(
f"This does not look like a text document: {path}. Path disable_check to ignore this error."
)
if await self.aadd_texts(texts, doc): # type: ignore[arg-type]
if await self.aadd_texts(texts, doc):
return docname
return None

Expand Down
54 changes: 51 additions & 3 deletions paperqa/readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from math import ceil
from pathlib import Path
from typing import List # noqa: F401
from typing import List, Literal, overload # noqa: F401

import html2text
import tiktoken
Expand Down Expand Up @@ -206,14 +206,62 @@ def chunk_code_text(
return texts


# Typing overload: both mode flags are passed explicitly as ``False``
# (they have no default in this variant), so the call is typed as
# returning ``list[Text]``.
@overload
def read_doc(
    path: Path,
    doc: Doc,
    parsed_text_only: Literal[False],
    include_metadata: Literal[False],
    chunk_chars: int = ...,
    overlap: int = ...,
    force_pypdf: bool = ...,
) -> list[Text]:
    ...


# Typing overload: both mode flags may be omitted (or given as ``False``);
# the call is then typed as returning ``list[Text]``.
@overload
def read_doc(
    path: Path,
    doc: Doc,
    parsed_text_only: Literal[False] = ...,
    include_metadata: Literal[False] = ...,
    chunk_chars: int = ...,
    overlap: int = ...,
    force_pypdf: bool = ...,
) -> list[Text]:
    ...


# Typing overload: ``parsed_text_only=True`` types the call as returning
# ``ParsedText``; ``include_metadata`` accepts any ``bool`` in this variant
# and does not affect the declared return type.
@overload
def read_doc(
    path: Path,
    doc: Doc,
    parsed_text_only: Literal[True],
    include_metadata: bool = ...,
    chunk_chars: int = ...,
    overlap: int = ...,
    force_pypdf: bool = ...,
) -> ParsedText:
    ...


# Typing overload: ``parsed_text_only=False`` together with
# ``include_metadata=True`` types the call as returning the pair
# ``(list[Text], ParsedMetadata)``.
# NOTE(review): neither flag has a default in this variant, so a call that
# passes only ``include_metadata=True`` does not appear to match any of the
# overloads declared here -- confirm whether a keyword-only variant is wanted.
@overload
def read_doc(
    path: Path,
    doc: Doc,
    parsed_text_only: Literal[False],
    include_metadata: Literal[True],
    chunk_chars: int = ...,
    overlap: int = ...,
    force_pypdf: bool = ...,
) -> tuple[list[Text], ParsedMetadata]:
    ...


def read_doc( # noqa: PLR0912
path: Path,
doc: Doc,
parsed_text_only: bool = False,
include_metadata: bool = False,
chunk_chars: int = 3000,
overlap: int = 100,
force_pypdf: bool = False,
parsed_text_only: bool = False,
include_metadata: bool = False,
) -> list[Text] | ParsedText | tuple[list[Text], ParsedMetadata]:
"""Parse a document and split into chunks.
Expand Down

0 comments on commit 123351f

Please sign in to comment.