Skip to content

Commit

Permalink
fix(extraction): pass complete pdf content (#55)
Browse files Browse the repository at this point in the history
* fix(extraction): pass complete pdf content

* fix(preview): pass pdf content correctly

* fix: remove redundant condition
  • Loading branch information
ArslanSaleem authored Feb 20, 2025
1 parent dd1365b commit 802f6db
Show file tree
Hide file tree
Showing 5 changed files with 57 additions and 51 deletions.
9 changes: 7 additions & 2 deletions backend/app/api/v1/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,12 @@ async def extract(
asset_content = project_repository.get_asset_content(db, asset_id=asset.id)

if asset_content:
asset_content = "\n".join(asset_content.content["content"])

asset_content = (
"\n".join(item["text"] for item in asset_content.content.get("content", []) if "text" in item)
if asset_content.content
else None
)

data = extract_data(
api_token=api_key.key,
Expand All @@ -62,7 +67,7 @@ async def extract(
return {
"status": "success",
"message": "Data extracted successfully from the file.",
"data": data["fields"],
"data": data.fields,
}

except HTTPException:
Expand Down
79 changes: 40 additions & 39 deletions backend/app/processing/process_queue.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,45 +210,46 @@ def wrapper(*args, **kwargs):
@handle_exceptions
def extract_process(api_key, process, process_step, asset_content):
pdf_content = ""
vectorstore = ChromaDB(f"panda-etl-{process.project_id}", similarity_threshold=3)
if (
("multiple_fields" not in process.details or not process.details["multiple_fields"])
and asset_content.content
and asset_content.content.get("word_count", 0) > 500
):
for field in process.details["fields"]:
relevant_docs = vectorstore.get_relevant_docs(
field["key"],
where={
"$and": [
{"asset_id": process_step.asset.id},
{"project_id": process.project_id},
]
},
k=5,
)

for index, metadata in enumerate(relevant_docs["metadatas"][0]):
segment_data = [relevant_docs["documents"][0][index]]
if metadata.get("previous_sentence_id", -1) != -1:
prev_sentence = vectorstore.get_relevant_docs_by_id(
ids=[metadata["previous_sentence_id"]]
)
if prev_sentence["documents"] and len(prev_sentence["documents"][0]) > 0:
segment_data = [prev_sentence["documents"][0]] + segment_data
else:
logger.warning("Previous sentence document is empty.")

if metadata.get("next_sentence_id", -1) != -1:
next_sentence = vectorstore.get_relevant_docs_by_id(
ids=[metadata["next_sentence_id"]]
)
if next_sentence["documents"] and len(next_sentence["documents"][0]) > 0:
segment_data.append(next_sentence["documents"][0])
else:
logger.warning("Next sentence document is empty.")

pdf_content += "\n" + " ".join(segment_data)
# TODO: Vector store PDF content fetching is temporarily disabled until it is fixed; re-enable the commented-out block below once resolved.
# vectorstore = ChromaDB(f"panda-etl-{process.project_id}", similarity_threshold=3)
# if (
# ("multiple_fields" not in process.details or not process.details["multiple_fields"])
# and asset_content.content
# and asset_content.content.get("word_count", 0) > 500
# ):
# for field in process.details["fields"]:
# relevant_docs = vectorstore.get_relevant_docs(
# field["key"],
# where={
# "$and": [
# {"asset_id": process_step.asset.id},
# {"project_id": process.project_id},
# ]
# },
# k=5,
# )

# for index, metadata in enumerate(relevant_docs["metadatas"][0]):
# segment_data = [relevant_docs["documents"][0][index]]
# if metadata.get("previous_sentence_id", -1) != -1:
# prev_sentence = vectorstore.get_relevant_docs_by_id(
# ids=[metadata["previous_sentence_id"]]
# )
# if prev_sentence["documents"] and len(prev_sentence["documents"][0]) > 0:
# segment_data = [prev_sentence["documents"][0]] + segment_data
# else:
# logger.warning("Previous sentence document is empty.")

# if metadata.get("next_sentence_id", -1) != -1:
# next_sentence = vectorstore.get_relevant_docs_by_id(
# ids=[metadata["next_sentence_id"]]
# )
# if next_sentence["documents"] and len(next_sentence["documents"][0]) > 0:
# segment_data.append(next_sentence["documents"][0])
# else:
# logger.warning("Next sentence document is empty.")

# pdf_content += "\n" + " ".join(segment_data)

if not pdf_content:
pdf_content = (
Expand Down
15 changes: 6 additions & 9 deletions backend/tests/extract/test_extract_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,18 +52,15 @@ def test_extract_success(
mock_db,
):
"""Test successful extraction of fields"""
mock_asset = MagicMock(id=1, project_id=1, path="/path/to/file.pdf")
mock_asset = MagicMock(id=1, project_id=1, path=None)
mock_get_asset.return_value = mock_asset
mock_get_user_api_key.return_value = MagicMock(key="fake_api_key")
mock_get_asset_content.return_value = MagicMock(
content={"content": ["Page 1 content", "Page 2 content"]}
)
mock_extract_data.return_value = {
"fields": {
"extracted_field1": "value1",
"extracted_field2": "value2",
}
}
mock_extract_data.return_value = MagicMock(
fields={"extracted_field1": "value1", "extracted_field2": "value2"}
)

response = client.post("/v1/extract/1", json=extract_fields.dict())

Expand All @@ -81,12 +78,12 @@ def test_extract_success(
api_token="fake_api_key",
fields=extract_fields.dict(),
file_path=None,
pdf_content="Page 1 content\nPage 2 content",
pdf_content=None,
)


@patch("app.repositories.project_repository.get_asset")
def test_extract_asset_permission_error(mock_get_asset, extract_fields, mock_db):
def test_extract_asset_permission_error( mock_get_asset, extract_fields, mock_db):
"""Test extraction with asset permission error"""
mock_asset = MagicMock(id=1, project_id=2)
mock_get_asset.return_value = mock_asset
Expand Down
1 change: 0 additions & 1 deletion backend/tests/processing/test_process_queue.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,6 @@ def test_extract_process(mock_chroma, mock_extract_data):
assert result["fields"] == [{"field1": "value1"}]
assert result["context"] == [[{'name': 'ESG_Reporting_Assurance', 'sources': ['Assurance'], 'page_numbers': None}]]
mock_extract_data.assert_called_once()
mock_chroma_instance.get_relevant_docs.assert_called()

def test_update_process_step_status():
mock_db = Mock()
Expand Down
4 changes: 4 additions & 0 deletions frontend/src/components/ExtractionForm.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import { ExtractionField } from "@/interfaces/extract";
import AddFieldsAIDrawer from "./AddFieldsAIDrawer";
import { Card } from "@/components/ui/Card";
import Switch from "./ui/Switch";
import toast from "react-hot-toast";

const FIELD_TYPES = ["text", "number", "date", "list"] as const;

Expand Down Expand Up @@ -130,6 +131,9 @@ export default function ExtractionForm({
try {
await onSubmit(fields);
} catch (error) {
toast.error(
"The file isn't ready for preview yet, preprocessing is in progress."
);
console.error("Error submitting form:", error);
} finally {
setIsLoading(false);
Expand Down

0 comments on commit 802f6db

Please sign in to comment.