From 59dca4b45a1324224a3413e58faf1bd6c6c4c66a Mon Sep 17 00:00:00 2001 From: Andrew White Date: Sun, 5 Nov 2023 19:39:45 -0800 Subject: [PATCH 1/4] Added stripping --- paperqa/docs.py | 5 +++ paperqa/utils.py | 8 +++++ paperqa/version.py | 2 +- tests/test_paperqa.py | 76 ++++++++++++++++++++++++++++++++++++++++++- 4 files changed, 89 insertions(+), 2 deletions(-) diff --git a/paperqa/docs.py b/paperqa/docs.py index 7580bf09d..704c72b90 100644 --- a/paperqa/docs.py +++ b/paperqa/docs.py @@ -35,6 +35,7 @@ maybe_is_text, md5sum, name_in_text, + strip_citations, ) @@ -59,6 +60,8 @@ class Docs(BaseModel, arbitrary_types_allowed=True, smart_union=True): memory: bool = False memory_model: Optional[BaseChatMemory] = None jit_texts_index: bool = False + # This is used to strip indirect citations that come up from the summary llm + strip_citations: bool = True # TODO: Not sure how to get this to work # while also passing mypy checks @@ -505,6 +508,8 @@ async def process(match): raise e if "not applicable" in context.lower() or "not relevant" in context.lower(): return None + if self.strip_citations: + context = strip_citations(context) c = Context( context=context, text=Text( diff --git a/paperqa/utils.py b/paperqa/utils.py index d7da8ee75..dd28e01b1 100644 --- a/paperqa/utils.py +++ b/paperqa/utils.py @@ -97,3 +97,11 @@ def get_llm_name(llm: BaseLanguageModel) -> str: return llm.model_name # type: ignore except AttributeError: return llm.model # type: ignore + + +def strip_citations(text: str) -> str: + # Combined regex for identifying citations (see unit tests for examples) + citation_regex = r"\b[\w\-]+\set\sal\.\s\([0-9]{4}\)|\((?:[^\)]*?[a-zA-Z][^\)]*?[0-9]{4}[^\)]*?)\)" + # Remove the citations from the text + text = re.sub(citation_regex, "", text, flags=re.MULTILINE) + return text diff --git a/paperqa/version.py b/paperqa/version.py index e7e98ee6f..d1a7f1e0d 100644 --- a/paperqa/version.py +++ b/paperqa/version.py @@ -1 +1 @@ -__version__ = "3.11.2" +__version__ = "3.12.0" diff --git a/tests/test_paperqa.py b/tests/test_paperqa.py index 6a69f8b57..ae7f4e073 100644 --- a/tests/test_paperqa.py +++ b/tests/test_paperqa.py @@ -15,7 +15,13 @@ from paperqa.chains import get_score from paperqa.readers import read_doc from paperqa.types import Doc -from paperqa.utils import maybe_is_html, maybe_is_text, name_in_text, strings_similarity +from paperqa.utils import ( + maybe_is_html, + maybe_is_text, + name_in_text, + strings_similarity, + strip_citations, +) class TestHandler(AsyncCallbackHandler): @@ -23,6 +29,74 @@ async def on_llm_new_token(self, token: str, **kwargs: Any) -> None: print(token) +# Assume strip_citations is imported or defined in this file. + + +def test_single_author(): + text = "This was first proposed by (Smith 1999)." + assert strip_citations(text) == "This was first proposed by ." + + +def test_multiple_authors(): + text = "Recent studies (Smith et al. 1999) show that this is true." + assert strip_citations(text) == "Recent studies show that this is true." + + +def test_multiple_citations(): + text = "As discussed by several authors (Smith et al. 1999; Johnson 2001; Lee et al. 2003)." + assert strip_citations(text) == "As discussed by several authors ." + + +def test_citations_with_pages(): + text = "This is shown in (Smith et al. 1999, p. 150)." + assert strip_citations(text) == "This is shown in ." + + +def test_citations_without_space(): + text = "Findings by(Smith et al. 1999)were significant." + assert strip_citations(text) == "Findings bywere significant." + + +def test_citations_with_commas(): + text = "The method was adopted by (Smith, 1999, 2001; Johnson, 2002)." + assert strip_citations(text) == "The method was adopted by ." + + +def test_citations_with_text(): + text = "This was noted (see Smith, 1999, for a review)." + assert strip_citations(text) == "This was noted ." + + +def test_no_citations(): + text = "There are no references in this text." + assert strip_citations(text) == "There are no references in this text." + + +def test_malformed_citations(): + text = "This is a malformed citation (Smith 199)." + assert strip_citations(text) == "This is a malformed citation (Smith 199)." + + +def test_edge_case_citations(): + text = "Edge cases like (Smith et al.1999) should be handled." + assert strip_citations(text) == "Edge cases like should be handled." + + +def test_citations_with_special_characters(): + text = "Some names have dashes (O'Neil et al. 2000; Smith-Jones 1998)." + assert strip_citations(text) == "Some names have dashes ." + + +def test_citations_with_nonstandard_chars(): + text = ( + "In non-English languages, citations might look different (Müller et al. 1999)." + ) + assert ( + strip_citations(text) + == "In non-English languages, citations might look different ." + ) + + def test_ablations(): tests_dir = os.path.dirname(os.path.abspath(__file__)) doc_path = os.path.join(tests_dir, "paper.pdf") From aa027f9c5861dd884e51efa85f25f455128b54dc Mon Sep 17 00:00:00 2001 From: Andrew White Date: Mon, 6 Nov 2023 08:52:05 -0800 Subject: [PATCH 2/4] Update paperqa/docs.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Jakub Lála <68380659+jakublala@users.noreply.github.com> --- paperqa/docs.py | 1 + 1 file changed, 1 insertion(+) diff --git a/paperqa/docs.py b/paperqa/docs.py index 704c72b90..0ae458a1c 100644 --- a/paperqa/docs.py +++ b/paperqa/docs.py @@ -509,6 +509,7 @@ async def process(match): if "not applicable" in context.lower() or "not relevant" in context.lower(): return None if self.strip_citations: + # remove citations that collide with our grounded citations (for the answer LLM) context = strip_citations(context) c = Context( context=context, From 5345c7d5fc3a4c55dc324437b9d916a619e23938 Mon Sep 17 00:00:00 2001 From: Andrew White Date: Mon, 6 Nov 2023 08:58:09 -0800 Subject: [PATCH 3/4] Fixde pre-commit --- paperqa/docs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paperqa/docs.py b/paperqa/docs.py index 0ae458a1c..cdc064d25 100644 --- a/paperqa/docs.py +++ b/paperqa/docs.py @@ -509,7 +509,7 @@ async def process(match): if "not applicable" in context.lower() or "not relevant" in context.lower(): return None if self.strip_citations: - # remove citations that collide with our grounded citations (for the answer LLM) + # remove citations that collide with our grounded citations (for the answer LLM) context = strip_citations(context) c = Context( context=context, From 75c5bc0681b17c163b920edbd50559e263e52e7d Mon Sep 17 00:00:00 2001 From: Andrew White Date: Mon, 6 Nov 2023 10:23:57 -0800 Subject: [PATCH 4/4] Update setup.py --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index c277abaa3..d2463365d 100644 --- a/setup.py +++ b/setup.py @@ -20,7 +20,7 @@ "pypdf", "pydantic<2", "langchain>=0.0.303", - "openai >= 0.27.8", + "openai <1", "faiss-cpu", "PyCryptodome", "html2text",