Skip to content

Commit

Permalink
refactoring: added query parameter check and renamed pdfa parameter t…
Browse files Browse the repository at this point in the history
…o profile
  • Loading branch information
rueedlinger committed Jun 7, 2024
1 parent ff56eed commit d3d5b17
Show file tree
Hide file tree
Showing 12 changed files with 144 additions and 80 deletions.
2 changes: 1 addition & 1 deletion docs/getting_started.md
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ The languages correspond to the Tesseract language codes.

```bash
curl -X 'POST' --output pdfa.pdf \
'http://127.0.0.1:8000/pdfa//convert?languages=enf&?pdfa=pdfa-3' \
'http://127.0.0.1:8000/pdfa/convert?languages=eng&profile=pdfa-3' \
-F 'file=@../tests/data/digital_pdf/loadtest.pdf'
```

Expand Down
16 changes: 15 additions & 1 deletion docs/user_guide.md
Original file line number Diff line number Diff line change
Expand Up @@ -108,4 +108,18 @@ Currently, there are the following feature flags:
- TEAL_FEATURE_LIBREOFFICE_CONVERT
- TEAL_FEATURE_APP_HEALTH
- TEAL_FEATURE_APP_METRICS
- TEAL_FEATURE_APP_INFO
- TEAL_FEATURE_APP_INFO

## Health & Monitoring

Teal provides several key endpoints for monitoring and information purposes. Below is a summary of each
endpoint:

| Endpoint | Description |
|----------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| `/app/health` | This endpoint returns the health status of the application. It can be used to verify that the application is running properly and is capable of handling requests. |
| `/app/metrics` | This endpoint provides Prometheus metrics for the application. It can be used for gathering performance data and monitoring the application's usage statistics. |
| `/app/info` | This endpoint returns general information about the application, such as version number, build details, and other relevant metadata. |

These endpoints are essential for maintaining the operational integrity and performance of the application, allowing for
effective monitoring and troubleshooting.
31 changes: 18 additions & 13 deletions teal/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,15 @@
from starlette.responses import FileResponse

from teal.core import (
create_json_err_response_from_exception,
is_feature_enabled,
get_version,
get_tesseract_languages,
get_app_info,
)
from teal.http import (
create_json_err_response_from_exception,
CheckUnknownQueryParamsRouter,
)
from teal.libreoffice import LibreOfficeAdapter
from teal.model import (
TextExtract,
Expand All @@ -32,6 +35,8 @@
from teal.pdfa import PdfAValidator, PdfAConverter

app = FastAPI()
app.router.route_class = CheckUnknownQueryParamsRouter

logger = logging.getLogger("teal.api")
if "TEAL_LOG_CONF" in os.environ:
log_conf_file = os.environ["TEAL_LOG_CONF"]
Expand Down Expand Up @@ -91,7 +96,7 @@ async def unicorn_exception_handler(request: Request, ex: Exception):
)
async def extract_text_from_pdf(
file: UploadFile,
pages: str = Query(None),
pages: str = Query(default=None),
) -> Any:
logger.debug(f"extract text from pdf file='{file.filename}', pages='{pages}'")
pdf = PdfDataExtractor()
Expand All @@ -112,7 +117,7 @@ async def extract_text_from_pdf(
async def extract_text_with_ocr_from_pdf(
file: UploadFile,
languages: List[str] = Query([]),
pages: str = Query(None),
pages: str = Query(default=None),
) -> Any:
logger.debug(
f"extract text with ocr from pdf file='{file.filename}', languages='{languages}', pages='{pages}'"
Expand All @@ -137,7 +142,7 @@ async def extract_text_with_ocr_from_pdf(
)
async def extract_table_from_pdf(
file: UploadFile,
pages: str = Query(None),
pages: str = Query(default=None),
) -> Any:
logger.debug(f"extract table from pdf file='{file.filename}', pages='{pages}'")
pdf = PdfDataExtractor()
Expand Down Expand Up @@ -174,20 +179,20 @@ async def extract_text_from_pdf(
)
async def convert_pdf_to_pdfa_with_ocr(
file: UploadFile,
languages: List[str] = Query([]),
pdfa: OcrPdfAProfile = Query(OcrPdfAProfile.PDFA_1B),
ocr: OcrMode = Query(OcrMode.SKIP_TEXT),
pages: str = Query(None),
languages: List[str] = Query(default=None),
profile: OcrPdfAProfile = Query(default=None),
ocr: OcrMode = Query(default=None),
pages: str = Query(default=None),
) -> Any:
logger.debug(
f"extract table from pdf file='{file.filename}', languages='{languages}, pdfa='{pdfa}', ocr={ocr}, pages='{pages}'"
f"extract table from pdf file='{file.filename}', languages='{languages}, profile='{profile}', ocr={ocr}, pages='{pages}'"
)
pdf = PdfAConverter()
return pdf.convert_pdfa(
data=await file.read(),
filename=file.filename,
langs=languages,
pdfa=pdfa,
pdfa_profile=profile,
ocr_mode=ocr,
page_ranges=pages,
)
Expand All @@ -204,7 +209,7 @@ async def convert_pdf_to_pdfa_with_ocr(
)
async def validate_pdfa(
file: UploadFile,
profile: ValidatePdfProfile = Query(None),
profile: ValidatePdfProfile = Query(default=None),
) -> Any:
logger.debug(
f"extract table from pdf file='{file.filename}', profile='{profile}'"
Expand All @@ -226,8 +231,8 @@ async def validate_pdfa(
)
async def convert_libreoffice_docs_to_pdf(
file: UploadFile,
profile: LibreOfficePdfProfile = Query(None),
pages: str = Query(None),
profile: LibreOfficePdfProfile = Query(default=None),
pages: str = Query(default=None),
) -> Any:
logger.debug(
f"libreoffice convert to pdf file='{file.filename}', profile={profile}, pages='{pages}'"
Expand Down
35 changes: 3 additions & 32 deletions teal/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,43 +10,12 @@
import pikepdf
import pypdfium2
import pytesseract
from starlette.background import BackgroundTask
from starlette.responses import JSONResponse

from teal.model import AppInfo

# get root logger
_logger = logging.getLogger("teal.core")


def create_json_response(content, background: BackgroundTask = None):
return JSONResponse(content=content, background=background)


def create_json_err_response_from_exception(
ex: Exception, background: BackgroundTask = None
):
return JSONResponse(
status_code=500,
content={
"message": f"{ex}",
},
background=background,
)


def create_json_err_response(
code: int, message: str, background: BackgroundTask = None
):
return JSONResponse(
status_code=code,
content={
"message": message,
},
background=background,
)


def is_feature_enabled(feature_flag) -> bool:
if feature_flag not in os.environ:
return True
Expand Down Expand Up @@ -139,7 +108,9 @@ def get_app_info() -> AppInfo:
return AppInfo.model_validate({"version": get_version(), "details": details})


def make_tesseract_lang_param(langs: list[str]) -> str | None:
def make_tesseract_lang_param(langs: list[str] | None) -> str | None:
if langs is None:
return None
if len(langs) == 0:
return None
if len(langs) == 1 and langs[0] == "":
Expand Down
62 changes: 62 additions & 0 deletions teal/http.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import logging
from typing import Callable

from fastapi import HTTPException, Request, Response
from fastapi.routing import APIRoute
from starlette.background import BackgroundTask
from starlette.responses import JSONResponse

_logger = logging.getLogger("teal.http")


class CheckUnknownQueryParamsRouter(APIRoute):
    """Route class that rejects requests carrying undeclared query parameters.

    Every query parameter present on the incoming request is compared with the
    query parameters declared for the route; if at least one is not declared,
    the request is answered with HTTP 400 instead of being passed on to the
    endpoint.
    """

    def get_route_handler(self) -> Callable:
        """Wrap the default route handler with an unknown-query-parameter check.

        Returns:
            An async callable taking the ``Request``. It raises
            ``HTTPException`` with status 400 when the request contains a
            query parameter the route does not declare, and otherwise
            delegates to the handler built by ``APIRoute``.
        """
        original_route_handler = super().get_route_handler()
        # Clients send the declared alias (when there is one), not the Python
        # argument name, so prefer the alias and fall back to the name.
        # NOTE(review): only ``self.dependant.query_params`` is consulted;
        # confirm whether parameters declared in nested dependencies need to
        # be accepted as well.
        known_params = [
            getattr(param, "alias", None) or param.name
            for param in self.dependant.query_params
        ]

        async def custom_route_handler(request: Request) -> Response:
            unknown_params = [
                qp for qp in request.query_params if qp not in known_params
            ]

            # Check the list itself rather than ``any(...)``: ``any`` looks at
            # the truthiness of each name, so an empty parameter name (as in
            # "?=bar") would be collected here yet never rejected.
            if unknown_params:
                message = f"Unknown request parameters: {unknown_params}, supported parameters are {known_params}"
                _logger.debug(message)
                raise HTTPException(status_code=400, detail=message)

            return await original_route_handler(request)

        return custom_route_handler


def create_json_response(content, background: BackgroundTask = None):
    """Wrap ``content`` in a ``JSONResponse``.

    Args:
        content: Payload handed to ``JSONResponse`` as its content.
        background: Optional background task forwarded to the response.

    Returns:
        The constructed ``JSONResponse``.
    """
    response = JSONResponse(content=content, background=background)
    return response


def create_json_err_response_from_exception(
    ex: Exception, background: BackgroundTask = None
):
    """Build a status-500 ``JSONResponse`` describing ``ex``.

    Args:
        ex: Exception whose formatted text becomes the ``"message"`` field.
        background: Optional background task forwarded to the response.

    Returns:
        A ``JSONResponse`` with status code 500 and a ``{"message": ...}`` body.
    """
    payload = {"message": f"{ex}"}
    return JSONResponse(status_code=500, content=payload, background=background)


def create_json_err_response(
    code: int, message: str, background: BackgroundTask = None
):
    """Build an error ``JSONResponse`` with the given status code and message.

    Args:
        code: HTTP status code set on the response.
        message: Text placed in the ``"message"`` field of the JSON body.
        background: Optional background task forwarded to the response.

    Returns:
        A ``JSONResponse`` with status ``code`` and a ``{"message": ...}`` body.
    """
    body = {"message": message}
    error_response = JSONResponse(
        status_code=code,
        content=body,
        background=background,
    )
    return error_response
6 changes: 3 additions & 3 deletions teal/libreoffice.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,11 @@
from starlette.responses import FileResponse, JSONResponse

from teal.core import (
create_json_err_response,
cleanup_tmp_dir,
parse_page_ranges,
to_page_range,
)
from teal.http import create_json_err_response
from teal.model import LibreOfficePdfProfile

_logger = logging.getLogger("teal.libreoffice")
Expand Down Expand Up @@ -157,8 +157,8 @@ def convert_to_pdf(
self,
data: bytes,
filename: str,
pdf_profile: LibreOfficePdfProfile = None,
page_ranges: str = None,
pdf_profile: LibreOfficePdfProfile,
page_ranges: str,
) -> FileResponse | JSONResponse:

file_ext = os.path.splitext(filename)[1]
Expand Down
10 changes: 5 additions & 5 deletions teal/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,10 @@
from starlette.responses import JSONResponse

from teal.core import (
create_json_err_response,
make_tesseract_lang_param,
parse_page_ranges,
)
from teal.http import create_json_err_response
from teal.model import TextExtract, TableExtract, PdfMetaDataReport

_logger = logging.getLogger("teal.pdf")
Expand All @@ -29,7 +29,7 @@ def extract_text(
self,
data: bytes,
filename: str,
page_ranges: str = None,
page_ranges: str,
) -> list[TextExtract] | JSONResponse:
file_ext = os.path.splitext(filename)[1]
if file_ext not in self.supported_file_extensions:
Expand Down Expand Up @@ -58,8 +58,8 @@ def extract_text_with_ocr(
self,
data: bytes,
filename: str,
langs: list[str] = [],
page_ranges: str = None,
langs: list[str],
page_ranges: str,
) -> list[TextExtract] | JSONResponse:
file_ext = os.path.splitext(filename)[1]
if file_ext not in self.supported_file_extensions:
Expand Down Expand Up @@ -89,7 +89,7 @@ def extract_table(
self,
data: bytes,
filename: str,
page_ranges: str = None,
page_ranges: str,
) -> list[TableExtract] | JSONResponse:
file_ext = os.path.splitext(filename)[1]
if file_ext not in self.supported_file_extensions:
Expand Down
17 changes: 8 additions & 9 deletions teal/pdfa.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,11 @@
from starlette.responses import JSONResponse, FileResponse

from teal.core import (
create_json_err_response,
create_json_response,
cleanup_tmp_dir,
make_tesseract_lang_param,
parse_page_ranges,
)
from teal.http import create_json_err_response, create_json_response
from teal.model import PdfAReport, OcrPdfAProfile, ValidatePdfProfile, OcrMode

_logger = logging.getLogger("teal.pdfa")
Expand All @@ -31,10 +30,10 @@ def convert_pdfa(
self,
data: bytes,
filename: str,
langs: list[str] = [],
pdfa: OcrPdfAProfile = None,
ocr_mode: OcrMode = OcrMode.SKIP_TEXT,
page_ranges: str = None,
langs: list[str],
pdfa_profile: OcrPdfAProfile,
ocr_mode: OcrMode,
page_ranges: str,
) -> FileResponse | JSONResponse:
file_ext = os.path.splitext(filename)[1]
if file_ext not in self.supported_file_extensions:
Expand Down Expand Up @@ -71,13 +70,13 @@ def convert_pdfa(
if languages is None:
languages = "eng"

if pdfa is None:
pdfa = OcrPdfAProfile.PDFA_1B
if pdfa_profile is None:
pdfa_profile = OcrPdfAProfile.PDFA_1B

if ocr_mode is None:
ocr_mode = OcrMode.SKIP_TEXT

cmd_convert_pdf = f'{self.ocrmypdf_cmd} -l {languages} {ocr_mode.to_parameter()} --output-type {pdfa.to_ocrmypdf_profile()} "{tmp_file_in_path}" "{tmp_file_out_path}"'
cmd_convert_pdf = f'{self.ocrmypdf_cmd} -l {languages} {ocr_mode.to_parameter()} --output-type {pdfa_profile.to_ocrmypdf_profile()} "{tmp_file_in_path}" "{tmp_file_out_path}"'

_logger.debug(f"running cmd: {cmd_convert_pdf}")
result = subprocess.run(
Expand Down
9 changes: 9 additions & 0 deletions tests/test_api_app_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,12 @@ def test_info():
client = TestClient(api.app, raise_server_exceptions=False)
response = client.get(url="/app/info")
assert response.status_code == 200


def test_not_supported_query_parameter():
client = TestClient(api.app, raise_server_exceptions=False)
response = client.get(url="/app/info?foo")
assert response.status_code == 400

response = client.get(url="/app/info?foo=bar")
assert response.status_code == 400
6 changes: 3 additions & 3 deletions tests/test_api_pdfa_convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,7 @@ def test_pdfa_convert_with_pages_range_and_selection():
def test_pdfa_convert_to_pdfa1():
client = TestClient(api.app, raise_server_exceptions=False)
with open(get_path("data/digital_pdf/document_two_pages.pdf"), "rb") as f:
response = client.post(url="/pdfa/convert?pdfa=pdfa-1b", files={"file": f})
response = client.post(url="/pdfa/convert?profile=pdfa-1b", files={"file": f})
assert response.status_code == 200
with tempfile.NamedTemporaryFile(suffix=".pdf") as tmp:
tmp.write(response.content)
Expand All @@ -159,7 +159,7 @@ def test_pdfa_convert_to_pdfa1():
def test_pdfa_convert_to_pdfa2():
client = TestClient(api.app, raise_server_exceptions=False)
with open(get_path("data/digital_pdf/document_two_pages.pdf"), "rb") as f:
response = client.post(url="/pdfa/convert?pdfa=pdfa-2b", files={"file": f})
response = client.post(url="/pdfa/convert?profile=pdfa-2b", files={"file": f})
assert response.status_code == 200
with tempfile.NamedTemporaryFile(suffix=".pdf") as tmp:
tmp.write(response.content)
Expand All @@ -178,7 +178,7 @@ def test_pdfa_convert_to_pdfa2():
def test_pdfa_convert_to_pdfa3():
client = TestClient(api.app, raise_server_exceptions=False)
with open(get_path("data/digital_pdf/document_two_pages.pdf"), "rb") as f:
response = client.post(url="/pdfa/convert?pdfa=pdfa-3b", files={"file": f})
response = client.post(url="/pdfa/convert?profile=pdfa-3b", files={"file": f})
assert response.status_code == 200
with tempfile.NamedTemporaryFile(suffix=".pdf") as tmp:
tmp.write(response.content)
Expand Down
Loading

0 comments on commit d3d5b17

Please sign in to comment.