From 72d9234ce07c9038535c4414ba092c40363fb548 Mon Sep 17 00:00:00 2001
From: Aki Ariga
Date: Sun, 10 Mar 2024 11:11:38 -0700
Subject: [PATCH] Update encoding every time SubprocessTabula is initialized

---
 tabula/backend.py            | 25 +++++++++++++++++++++++++
 tabula/io.py                 |  4 ++++
 tests/test_read_pdf_table.py |  3 +++
 3 files changed, 32 insertions(+)

diff --git a/tabula/backend.py b/tabula/backend.py
index f43e160..0231d01 100644
--- a/tabula/backend.py
+++ b/tabula/backend.py
@@ -92,6 +92,31 @@ def __init__(
         self.java_options = java_options
         self.encoding = encoding
 
+    def update_encoding(
+        self, encoding: str, java_options: List[str], silent: Optional[bool]
+    ) -> None:
+        """Refresh the encoding and JVM options used for later subprocess calls.
+
+        Args:
+            encoding: Encoding to store on this instance.
+            java_options: Java options to store; copied so the caller's list
+                is not modified when the silent flags are appended.
+            silent: When truthy, append the options that turn off SLF4J and
+                commons-logging output.
+        """
+        self.encoding = encoding
+        # Copy first: extending the caller's list in place would leak the
+        # logging flags back to the caller and pile up on repeated calls.
+        self.java_options = list(java_options)
+        if silent:
+            self.java_options.extend(
+                (
+                    "-Dorg.slf4j.simpleLogger.defaultLogLevel=off",
+                    "-Dorg.apache.commons.logging.Log"
+                    "=org.apache.commons.logging.impl.NoOpLog",
+                )
+            )
+
     def call_tabula_java(
         self, options: TabulaOption, path: Optional[str] = None
     ) -> str:
diff --git a/tabula/io.py b/tabula/io.py
index 2ad7328..bb9de86 100644
--- a/tabula/io.py
+++ b/tabula/io.py
@@ -76,6 +76,10 @@ def _run(
             _tabula_vm = SubprocessTabula(
                 java_options=java_options, silent=options.silent, encoding=encoding
             )
+    elif isinstance(_tabula_vm, SubprocessTabula):
+        _tabula_vm.update_encoding(
+            encoding=encoding, java_options=java_options, silent=options.silent
+        )
     elif set(java_options) - IGNORED_JAVA_OPTIONS:
         logger.warning("java_options is ignored until rebooting the Python process.")
 
diff --git a/tests/test_read_pdf_table.py b/tests/test_read_pdf_table.py
index 1ad38ad..e0510a3 100644
--- a/tests/test_read_pdf_table.py
+++ b/tests/test_read_pdf_table.py
@@ -41,6 +41,9 @@ def test_read_pdf_with_force_subprocess(self):
         self.assertTrue(len(df), 1)
         self.assertTrue(isinstance(df[0], pd.DataFrame))
         self.assertTrue(df[0].equals(pd.read_csv(self.expected_csv1)))
+        self.assertEqual(tabula.io._tabula_vm.encoding, "utf-8")
+        tabula.read_pdf(self.pdf_path, stream=True, encoding="cp932")
+        self.assertEqual(tabula.io._tabula_vm.encoding, "cp932")
 
     def test_read_pdf_into_json(self):
         expected_json = "tests/resources/data_1.json"