From 5c23bb2a18905bd661cb3f6052d92525448b8252 Mon Sep 17 00:00:00 2001 From: Aki Ariga Date: Sun, 19 Nov 2023 17:52:26 -0800 Subject: [PATCH 1/2] Set encoding on SubprocessTabula initialization --- tabula/backend.py | 4 ++++ tabula/io.py | 4 ---- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tabula/backend.py b/tabula/backend.py index f43e160..6388bf2 100644 --- a/tabula/backend.py +++ b/tabula/backend.py @@ -89,6 +89,10 @@ def __init__( ) ) + if encoding == "utf-8": + if not any("file.encoding" in opt for opt in java_options): + java_options += ["-Dfile.encoding=UTF8"] + self.java_options = java_options self.encoding = encoding diff --git a/tabula/io.py b/tabula/io.py index ece7403..c4bab9e 100644 --- a/tabula/io.py +++ b/tabula/io.py @@ -391,10 +391,6 @@ def read_pdf( if not any("java.awt.headless" in opt for opt in java_options): java_options += ["-Djava.awt.headless=true"] - if encoding == "utf-8": - if not any("file.encoding" in opt for opt in java_options): - java_options += ["-Dfile.encoding=UTF8"] - path, temporary = localize_file(input_path, user_agent, use_raw_url=use_raw_url) if not os.path.exists(path): From 782793de7ce01520423d60baf3dae041480e006c Mon Sep 17 00:00:00 2001 From: Aki Ariga Date: Sun, 19 Nov 2023 17:52:26 -0800 Subject: [PATCH 2/2] Set encoding on SubprocessTabula initialization --- tabula/backend.py | 4 ---- tabula/io.py | 31 +++++++++++++------------------ 2 files changed, 13 insertions(+), 22 deletions(-) diff --git a/tabula/backend.py b/tabula/backend.py index 6388bf2..f43e160 100644 --- a/tabula/backend.py +++ b/tabula/backend.py @@ -89,10 +89,6 @@ def __init__( ) ) - if encoding == "utf-8": - if not any("file.encoding" in opt for opt in java_options): - java_options += ["-Dfile.encoding=UTF8"] - self.java_options = java_options self.encoding = encoding diff --git a/tabula/io.py b/tabula/io.py index c4bab9e..2ad7328 100644 --- a/tabula/io.py +++ b/tabula/io.py @@ -44,8 +44,8 @@ def _run( - java_options: List[str], options: TabulaOption, + java_options: Optional[List[str]] = None, path: Optional[str] = None, encoding: str = "utf-8", force_subprocess: bool = False, @@ -62,6 +62,8 @@ def _run( "-Dorg.apache.commons.logging.Log=org.apache.commons.logging.impl.NoOpLog", } + java_options = _build_java_options(java_options, encoding) + global _tabula_vm if force_subprocess: _tabula_vm = SubprocessTabula( @@ -381,16 +383,6 @@ def read_pdf( multiple_tables=multiple_tables, ) - if java_options is None: - java_options = [] - elif isinstance(java_options, str): - java_options = shlex.split(java_options) - - # to prevent tabula-py from stealing focus on every call on mac - if platform.system() == "Darwin": - if not any("java.awt.headless" in opt for opt in java_options): - java_options += ["-Djava.awt.headless=true"] - path, temporary = localize_file(input_path, user_agent, use_raw_url=use_raw_url) if not os.path.exists(path): @@ -401,8 +393,8 @@ def read_pdf( try: output = _run( - java_options, tabula_options, + java_options, path, encoding=encoding, force_subprocess=force_subprocess, @@ -823,7 +815,6 @@ def convert_into( output_path=output_path, options=options, ) - java_options = _build_java_options(java_options) path, temporary = localize_file(input_path) @@ -834,7 +825,7 @@ def convert_into( raise ValueError(f"{path} is empty. Check the file, or download it manually.") try: - _run(java_options, tabula_options, path, force_subprocess=force_subprocess) + _run(tabula_options, java_options, path, force_subprocess=force_subprocess) finally: if temporary: os.unlink(path) @@ -944,8 +935,6 @@ def convert_into_by_batch( format = _extract_format_for_conversion(output_format) - java_options = _build_java_options(java_options) - tabula_options = TabulaOption( pages=pages, guess=guess, @@ -963,10 +952,12 @@ def convert_into_by_batch( options=options, ) - _run(java_options, tabula_options, force_subprocess=force_subprocess) + _run(tabula_options, java_options, force_subprocess=force_subprocess) -def _build_java_options(_java_options: Optional[List[str]] = None) -> List[str]: +def _build_java_options( + _java_options: Optional[List[str]] = None, encoding: str = "utf-8" +) -> List[str]: if _java_options is None: _java_options = [] elif isinstance(_java_options, str): @@ -978,6 +969,10 @@ def _build_java_options(_java_options: Optional[List[str]] = None) -> List[str]: if not any(filter(r.find, _java_options)): # type: ignore _java_options = _java_options + ["-Djava.awt.headless=true"] + if encoding == "utf-8": + if not any("file.encoding" in opt for opt in _java_options): + _java_options += ["-Dfile.encoding=UTF8"] + return _java_options