Skip to content

Commit

Permalink
feat: added pdf meta data endpoint (#31)
Browse files Browse the repository at this point in the history
* fix: move metrics to appinfo tag

* feat: added pdf meta data endpoint #28 

* feat: added pdf meta data api tests #28
  • Loading branch information
rueedlinger authored Jun 3, 2024
1 parent ef1ab1c commit cbb3654
Show file tree
Hide file tree
Showing 8 changed files with 201 additions and 3 deletions.
1 change: 1 addition & 0 deletions requirements.in
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ ghostscript
# needed by camelot-py
opencv-python
PyPDF2
pikepdf
pytest
pytest-cov
locust
Expand Down
10 changes: 10 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@ coverage[toml]==7.5.3
# via pytest-cov
cryptography==42.0.7
# via pdfminer-six
deprecated==1.2.14
# via pikepdf
dnspython==2.6.1
# via email-validator
email-validator==2.1.1
Expand Down Expand Up @@ -103,6 +105,8 @@ jinja2==3.1.4
# flask
locust==2.28.0
# via -r requirements.in
lxml==5.2.2
# via pikepdf
markdown-it-py==3.0.0
# via rich
markupsafe==2.1.5
Expand Down Expand Up @@ -130,6 +134,7 @@ packaging==24.0
# via
# black
# gunicorn
# pikepdf
# pytesseract
# pytest
pandas==2.2.2
Expand All @@ -140,9 +145,12 @@ pdf2image==1.17.0
# via -r requirements.in
pdfminer-six==20231228
# via camelot-py
pikepdf==9.0.0
# via -r requirements.in
pillow==10.3.0
# via
# pdf2image
# pikepdf
# pytesseract
platformdirs==4.2.2
# via black
Expand Down Expand Up @@ -241,6 +249,8 @@ werkzeug==3.0.3
# flask
# flask-login
# locust
wrapt==1.16.0
# via deprecated
zope-event==5.0
# via gevent
zope-interface==6.4.post2
Expand Down
22 changes: 20 additions & 2 deletions teal/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,9 @@
LibreOfficePdfProfile,
HealthCheck,
ValidatePdfProfile,
PdfMetaDataReport,
)
from teal.pdf import PdfDataExtractor
from teal.pdf import PdfDataExtractor, PdfMetaDataExtractor
from teal.pdfa import PdfAValidator, PdfAConverter

app = FastAPI()
Expand Down Expand Up @@ -142,6 +143,23 @@ async def extract_table_from_pdf(
)


if is_feature_enabled("TEAL_FEATURE_PDF_META"):
    logger.info("feature PDF meta data is enabled")

    # POST /pdf/meta: reads the uploaded file fully into memory and hands the
    # bytes plus the client-supplied file name to PdfMetaDataExtractor. The
    # extractor returns either a PdfMetaDataReport or a JSON error response.
    #
    # NOTE(review): the handler name below looks copy-pasted from a text
    # extraction endpoint. It is kept unchanged here because FastAPI derives
    # the OpenAPI operationId from it; consider renaming it (e.g. to
    # extract_meta_data_from_pdf) together with any generated clients.
    @app.post(
        "/pdf/meta",
        summary="Extract meta data from a PDF",
        response_model=PdfMetaDataReport,
        tags=["pdf"],
    )
    async def extract_text_from_pdf(
        file: UploadFile,
    ) -> Any:
        logger.debug(f"extract meta data from pdf file='{file.filename}'")
        pdf = PdfMetaDataExtractor()
        return pdf.extract_meta_data(data=await file.read(), filename=file.filename)


if is_feature_enabled("TEAL_FEATURE_PDFA_CONVERT"):
logger.info("feature PDF/A convert is enabled")

Expand Down Expand Up @@ -274,6 +292,6 @@ def custom_openapi():
excluded_handlers=["/app/*", "/docs/*", "/openapi.json"]
)
instrumentator.instrument(app).expose(
app, endpoint="/app/metrics", include_in_schema=True
app, endpoint="/app/metrics", include_in_schema=True, tags=["appinfo"]
)
instrumentator.add(metrics.requests())
10 changes: 10 additions & 0 deletions teal/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,16 @@ class PdfAReport(BaseModel):
details: dict = {}


# Response model for the PDF meta data endpoint ("/pdf/meta").
# Comments are used instead of a class docstring on purpose: a docstring on a
# pydantic model would show up as the schema description in the generated API
# documentation.
class PdfMetaDataReport(BaseModel):
    # Name of the uploaded file as passed in by the caller.
    fileName: str
    # Size of the uploaded payload in bytes.
    fileSize: int
    # PDF version string, e.g. "1.6".
    pdfVersion: str
    # PDF/A conformance claimed by the document (e.g. "2B"), or None when the
    # document makes no such claim. The field is nullable but has no default,
    # so it must always be supplied.
    pdfaClaim: str | None
    # Number of pages in the document.
    pages: int
    # Entries of the document info dictionary (values are stringified by the
    # extractor).
    docInfo: dict = {}
    # Entries read from the document's XMP metadata.
    xmp: dict = {}


class OcrPdfAProfile(str, Enum):
PDFA_1B = "pdfa-1b"
PDFA_2B = "pdfa-2b"
Expand Down
43 changes: 42 additions & 1 deletion teal/pdf.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
import io
import json
import logging
import os
import tempfile

import camelot.io as camelot
import pikepdf
import pypdfium2 as pdfium
import pytesseract
from pdf2image import convert_from_bytes
Expand All @@ -14,7 +16,7 @@
make_tesseract_lang_param,
parse_page_ranges,
)
from teal.model import TextExtract, TableExtract
from teal.model import TextExtract, TableExtract, PdfMetaDataReport

_logger = logging.getLogger("teal.pdf")

Expand Down Expand Up @@ -124,3 +126,42 @@ def extract_table(
f.close()

return extracts


class PdfMetaDataExtractor:
    """Build a ``PdfMetaDataReport`` from the raw bytes of a PDF file."""

    def __init__(self):
        # Only files whose extension appears in this list are processed.
        self.supported_file_extensions = [".pdf"]

    def extract_meta_data(
        self,
        data: bytes,
        filename: str,
    ) -> PdfMetaDataReport | JSONResponse:
        """Read version, page count, document info and XMP data from a PDF.

        Args:
            data: Complete content of the PDF file.
            filename: Name of the file; its extension decides whether the
                file is accepted and it is echoed back in the report.

        Returns:
            A ``PdfMetaDataReport`` on success, or the JSON error response
            created with status 400 when the file extension is not supported.
        """
        file_ext = os.path.splitext(filename)[1]
        if file_ext not in self.supported_file_extensions:
            return create_json_err_response(
                400, f"file extension '{file_ext}' is not supported ({filename})."
            )

        # Open the document in a ``with`` block so the pikepdf handle is
        # released even when reading the meta data or validation fails.
        # Everything that touches ``pdf`` is evaluated before the block exits.
        with pikepdf.open(io.BytesIO(data)) as pdf:
            meta = pdf.open_metadata()
            xmp = {key: meta.get(key) for key in meta}

            # docinfo values are pikepdf objects; stringify them so the
            # report only contains plain values.
            doc_info = {key: str(value) for key, value in pdf.docinfo.items()}

            # An empty status means the document does not claim PDF/A.
            pdfa_status = meta.pdfa_status

            return PdfMetaDataReport.model_validate(
                {
                    "fileName": filename,
                    "fileSize": len(data),
                    "pdfVersion": pdf.pdf_version,
                    "pdfaClaim": None if pdfa_status == "" else str(pdfa_status),
                    "pages": len(pdf.pages),
                    "docInfo": doc_info,
                    "xmp": xmp,
                }
            )
70 changes: 70 additions & 0 deletions tests/test_api_libreoffice_convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,51 @@ def test_libreoffice_convert_docx_default():
assert response.status_code == 200
assert len(response.json()) == 3

response = client.post(url="/pdf/meta", files={"file": pdf_file})
assert response.status_code == 200
assert response.json()["pdfVersion"] == "1.6"
assert response.json()["pdfaClaim"] is None


def test_libreoffice_convert_docx_pdf15():
    """Convert a DOCX with the pdf-1.5 profile and verify the resulting PDF."""
    client = TestClient(api.app, raise_server_exceptions=False)
    with open(get_path("data/doc/word_document.docx"), "rb") as f:
        response = client.post(
            url="/libreoffice/convert?profile=pdf-1.5", files={"file": f}
        )
        assert response.status_code == 200
        with tempfile.NamedTemporaryFile(suffix=".pdf") as tmp:
            tmp.write(response.content)
            # The file is reopened by name below; flush so the second handle
            # sees the complete PDF and not only what left the write buffer.
            tmp.flush()
            with open(tmp.name, "rb") as pdf_file:
                response = client.post(url="/pdf/text", files={"file": pdf_file})
                assert response.status_code == 200
                assert len(response.json()) == 3

                # The previous upload consumed the handle; rewind explicitly
                # instead of relying on the HTTP client to do it.
                pdf_file.seek(0)
                response = client.post(url="/pdf/meta", files={"file": pdf_file})
                assert response.status_code == 200
                assert response.json()["pdfVersion"] == "1.5"
                assert response.json()["pdfaClaim"] is None


def test_libreoffice_convert_docx_pdf16():
    """Convert a DOCX with the pdf-1.6 profile and verify the resulting PDF."""
    client = TestClient(api.app, raise_server_exceptions=False)
    with open(get_path("data/doc/word_document.docx"), "rb") as f:
        response = client.post(
            url="/libreoffice/convert?profile=pdf-1.6", files={"file": f}
        )
        assert response.status_code == 200
        with tempfile.NamedTemporaryFile(suffix=".pdf") as tmp:
            tmp.write(response.content)
            # The file is reopened by name below; flush so the second handle
            # sees the complete PDF and not only what left the write buffer.
            tmp.flush()
            with open(tmp.name, "rb") as pdf_file:
                response = client.post(url="/pdf/text", files={"file": pdf_file})
                assert response.status_code == 200
                assert len(response.json()) == 3

                # The previous upload consumed the handle; rewind explicitly
                # instead of relying on the HTTP client to do it.
                pdf_file.seek(0)
                response = client.post(url="/pdf/meta", files={"file": pdf_file})
                assert response.status_code == 200
                assert response.json()["pdfVersion"] == "1.6"
                assert response.json()["pdfaClaim"] is None


def test_libreoffice_convert_txt_default():
client = TestClient(api.app, raise_server_exceptions=False)
Expand All @@ -32,6 +77,11 @@ def test_libreoffice_convert_txt_default():
assert response.status_code == 200
assert len(response.json()) == 1

response = client.post(url="/pdf/meta", files={"file": pdf_file})
assert response.status_code == 200
assert response.json()["pdfVersion"] == "1.6"
assert response.json()["pdfaClaim"] is None


def test_libreoffice_convert_pdf_default():
client = TestClient(api.app, raise_server_exceptions=False)
Expand All @@ -45,6 +95,11 @@ def test_libreoffice_convert_pdf_default():
assert response.status_code == 200
assert len(response.json()) == 2

response = client.post(url="/pdf/meta", files={"file": pdf_file})
assert response.status_code == 200
assert response.json()["pdfVersion"] == "1.6"
assert response.json()["pdfaClaim"] is None


def test_libreoffice_convert_with_selection():
client = TestClient(api.app, raise_server_exceptions=False)
Expand Down Expand Up @@ -87,6 +142,11 @@ def test_libreoffice_convert_pdfa1():
assert response.json()["compliant"] is True
assert response.json()["profile"] == "PDF/A-1A"

response = client.post(url="/pdf/meta", files={"file": pdf_file})
assert response.status_code == 200
assert response.json()["pdfVersion"] == "1.4"
assert response.json()["pdfaClaim"] == "1A"


def test_libreoffice_convert_pdfa2():
client = TestClient(api.app, raise_server_exceptions=False)
Expand All @@ -103,6 +163,11 @@ def test_libreoffice_convert_pdfa2():
assert response.json()["compliant"] is True
assert response.json()["profile"] == "PDF/A-2B"

response = client.post(url="/pdf/meta", files={"file": pdf_file})
assert response.status_code == 200
assert response.json()["pdfVersion"] == "1.6"
assert response.json()["pdfaClaim"] == "2B"


def test_libreoffice_convert_pdfa3():
client = TestClient(api.app, raise_server_exceptions=False)
Expand All @@ -119,6 +184,11 @@ def test_libreoffice_convert_pdfa3():
assert response.json()["compliant"] is True
assert response.json()["profile"] == "PDF/A-3B"

response = client.post(url="/pdf/meta", files={"file": pdf_file})
assert response.status_code == 200
assert response.json()["pdfVersion"] == "1.6"
assert response.json()["pdfaClaim"] == "3B"


def test_libreoffice_convert_wrong_file_type():
client = TestClient(api.app, raise_server_exceptions=False)
Expand Down
28 changes: 28 additions & 0 deletions tests/test_api_pdf_meta.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
from starlette.testclient import TestClient

from teal import api
from tests import get_path


def test_pdf_extract_meta():
    """The meta endpoint reports the basic facts of a one-page digital PDF."""
    client = TestClient(api.app, raise_server_exceptions=False)
    sample_path = get_path("data/digital_pdf/document_one_page.pdf")
    with open(sample_path, "rb") as pdf_file:
        response = client.post(url="/pdf/meta", files={"file": pdf_file})

    assert response.status_code == 200

    # Decode the body once and check every reported field against it.
    report = response.json()
    assert report["fileName"] == "document_one_page.pdf"
    assert report["fileSize"] > 100
    assert report["pdfVersion"] == "1.3"
    assert report["pages"] == 1
    assert report["pdfaClaim"] is None
    assert report["docInfo"] is not None
    assert report["xmp"] is not None


def test_pdf_extract_meta_with_wrong_file_ending():
    """The meta endpoint rejects a non-PDF upload with a 400 error message."""
    client = TestClient(api.app, raise_server_exceptions=False)
    with open(get_path("data/doc/word_document.docx"), "rb") as f:
        # Target the meta endpoint itself; posting to "/pdf/text" would only
        # re-test the text extraction endpoint's validation.
        response = client.post(url="/pdf/meta", files={"file": f})
        assert response.status_code == 400
        assert response.json() == {
            "message": "file extension '.docx' is not supported (word_document.docx)."
        }
20 changes: 20 additions & 0 deletions tests/test_api_pdfa_convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,11 @@ def test_pdfa_convert_digital_pdf():
assert response.status_code == 200
assert len(response.json()) == 2

response = client.post(url="/pdf/meta", files={"file": pdfa_file})
assert response.status_code == 200
assert response.json()["pdfVersion"] == "1.4"
assert response.json()["pdfaClaim"] == "1B"


def test_pdfa_convert_scanned_document_with_ocr():
client = TestClient(api.app, raise_server_exceptions=False)
Expand All @@ -41,6 +46,11 @@ def test_pdfa_convert_scanned_document_with_ocr():
assert response.status_code == 200
assert len(response.json()) == 10

response = client.post(url="/pdf/meta", files={"file": pdfa_file})
assert response.status_code == 200
assert response.json()["pdfVersion"] == "1.4"
assert response.json()["pdfaClaim"] == "1B"


def test_pdfa_convert_digital_pdf_with_lang():
client = TestClient(api.app, raise_server_exceptions=False)
Expand Down Expand Up @@ -159,6 +169,11 @@ def test_pdfa_convert_to_pdfa2():
assert response.json()["compliant"] is True
assert response.json()["profile"] == "PDF/A-2B"

response = client.post(url="/pdf/meta", files={"file": pdfa_file})
assert response.status_code == 200
assert response.json()["pdfVersion"] == "1.7"
assert response.json()["pdfaClaim"] == "2B"


def test_pdfa_convert_to_pdfa3():
client = TestClient(api.app, raise_server_exceptions=False)
Expand All @@ -173,6 +188,11 @@ def test_pdfa_convert_to_pdfa3():
assert response.json()["compliant"] is True
assert response.json()["profile"] == "PDF/A-3B"

response = client.post(url="/pdf/meta", files={"file": pdfa_file})
assert response.status_code == 200
assert response.json()["pdfVersion"] == "1.6"
assert response.json()["pdfaClaim"] == "3B"


def test_pdfa_convert_wrong_file_type():
client = TestClient(api.app, raise_server_exceptions=False)
Expand Down

0 comments on commit cbb3654

Please sign in to comment.