Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added option to strip indirect citations #203

Merged
merged 4 commits into from
Nov 6, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions paperqa/docs.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
maybe_is_text,
md5sum,
name_in_text,
strip_citations,
)


Expand All @@ -59,6 +60,8 @@ class Docs(BaseModel, arbitrary_types_allowed=True, smart_union=True):
memory: bool = False
memory_model: Optional[BaseChatMemory] = None
jit_texts_index: bool = False
# When True, strip indirect citations that the summary LLM includes in its output
strip_citations: bool = True

# TODO: Not sure how to get this to work
# while also passing mypy checks
Expand Down Expand Up @@ -505,6 +508,9 @@ async def process(match):
raise e
if "not applicable" in context.lower() or "not relevant" in context.lower():
return None
if self.strip_citations:
whitead marked this conversation as resolved.
Show resolved Hide resolved
# remove citations that collide with our grounded citations (for the answer LLM)
context = strip_citations(context)
c = Context(
context=context,
text=Text(
Expand Down
8 changes: 8 additions & 0 deletions paperqa/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,3 +97,11 @@ def get_llm_name(llm: BaseLanguageModel) -> str:
return llm.model_name # type: ignore
except AttributeError:
return llm.model # type: ignore


# Matches author-year citations in two forms (see unit tests for examples):
#   1. narrative:     "Smith et al. (1999)"
#   2. parenthetical: "(Smith 1999)", "(see Smith, 1999, for a review)", ...
#      i.e. a parenthesised span containing an ASCII letter followed somewhere
#      by a four-digit year.
# The lookarounds pin the year to exactly four digits, so longer numbers such
# as "(n = 12000)" are not mistaken for a citation and deleted.
_CITATION_REGEX = re.compile(
    r"\b[\w\-]+\set\sal\.\s\([0-9]{4}\)"
    r"|\((?:[^\)]*?[a-zA-Z][^\)]*?(?<![0-9])[0-9]{4}(?![0-9])[^\)]*?)\)"
)


def strip_citations(text: str) -> str:
    """Return *text* with author-year style citations removed.

    Only the citation itself is deleted. Whitespace around it is left
    untouched, so removing a citation from the middle of a sentence leaves
    the spaces on both sides in place (two consecutive spaces).

    Args:
        text: The text to clean.

    Returns:
        The text with every matched citation replaced by the empty string.
        Text without citations is returned unchanged.
    """
    return _CITATION_REGEX.sub("", text)
2 changes: 1 addition & 1 deletion paperqa/version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "3.11.2"
__version__ = "3.12.0"
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
"pypdf",
"pydantic<2",
"langchain>=0.0.303",
"openai >= 0.27.8",
"openai <1",
"faiss-cpu",
"PyCryptodome",
"html2text",
Expand Down
76 changes: 75 additions & 1 deletion tests/test_paperqa.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,88 @@
from paperqa.chains import get_score
from paperqa.readers import read_doc
from paperqa.types import Doc
from paperqa.utils import maybe_is_html, maybe_is_text, name_in_text, strings_similarity
from paperqa.utils import (
maybe_is_html,
maybe_is_text,
name_in_text,
strings_similarity,
strip_citations,
)


class TestHandler(AsyncCallbackHandler):
    """Async callback handler that prints every new LLM token."""

    async def on_llm_new_token(self, token: str, **kwargs: Any) -> None:
        """Print *token*; any extra keyword arguments are ignored."""
        print(token)


# Unit tests for strip_citations (imported from paperqa.utils above).


def test_single_author():
    """A lone parenthesised author-year citation is removed."""
    sentence = "This was first proposed by (Smith 1999)."
    expected = "This was first proposed by ."
    assert strip_citations(sentence) == expected


def test_multiple_authors():
    """An "et al." citation in the middle of a sentence is removed."""
    text = "Recent studies (Smith et al. 1999) show that this is true."
    # Only the parenthesised citation is deleted; the spaces on either side
    # of it both remain, hence the two consecutive spaces in the expected text.
    assert strip_citations(text) == "Recent studies  show that this is true."


def test_multiple_citations():
    """Several semicolon-separated citations in one parenthesis go together."""
    sentence = "As discussed by several authors (Smith et al. 1999; Johnson 2001; Lee et al. 2003)."
    expected = "As discussed by several authors ."
    assert strip_citations(sentence) == expected


def test_citations_with_pages():
    """A citation carrying a page reference is removed as a whole."""
    sentence = "This is shown in (Smith et al. 1999, p. 150)."
    expected = "This is shown in ."
    assert strip_citations(sentence) == expected


def test_citations_without_space():
    """A citation glued to the neighbouring words is still removed."""
    sentence = "Findings by(Smith et al. 1999)were significant."
    expected = "Findings bywere significant."
    assert strip_citations(sentence) == expected


def test_citations_with_commas():
    """Comma-separated authors and multiple years are handled."""
    sentence = "The method was adopted by (Smith, 1999, 2001; Johnson, 2002)."
    expected = "The method was adopted by ."
    assert strip_citations(sentence) == expected


def test_citations_with_text():
    """Extra words inside the citation parenthesis are removed with it."""
    sentence = "This was noted (see Smith, 1999, for a review)."
    expected = "This was noted ."
    assert strip_citations(sentence) == expected


def test_no_citations():
    """Text without any citation comes back unchanged."""
    sentence = "There are no references in this text."
    assert strip_citations(sentence) == sentence


def test_malformed_citations():
    """A parenthesis with a three-digit "year" is not treated as a citation."""
    sentence = "This is a malformed citation (Smith 199)."
    assert strip_citations(sentence) == sentence


def test_edge_case_citations():
    """A citation with no space between "al." and the year is still removed."""
    text = "Edge cases like (Smith et al.1999) should be handled."
    # Only the parenthesised citation is deleted; the spaces on either side
    # of it both remain, hence the two consecutive spaces in the expected text.
    assert strip_citations(text) == "Edge cases like  should be handled."


def test_citations_with_special_characters():
    """Author names containing apostrophes or hyphens are handled."""
    sentence = "Some names have dashes (O'Neil et al. 2000; Smith-Jones 1998)."
    expected = "Some names have dashes ."
    assert strip_citations(sentence) == expected


def test_citations_with_nonstandard_chars():
    """Author names containing non-ASCII letters are handled."""
    sentence = (
        "In non-English languages, citations might look different (Müller et al. 1999)."
    )
    expected = "In non-English languages, citations might look different ."
    assert strip_citations(sentence) == expected


def test_ablations():
tests_dir = os.path.dirname(os.path.abspath(__file__))
doc_path = os.path.join(tests_dir, "paper.pdf")
Expand Down
Loading