From d80c97174ecf16189bc7bf064a7be25c0681bd0b Mon Sep 17 00:00:00 2001
From: Enzo Di Tizio
Date: Tue, 27 Jun 2023 17:00:12 -0300
Subject: [PATCH] adding encoding options for pdftotext

---
Review notes (this section is not part of the commit message):
 * 'shell_encoding' is forwarded to pdftotext's -enc option only when it
   holds a non-empty value, so shell_encoding=None no longer puts None
   into the argument list.
 * The post-image blob id on the "index" line below was not recomputed
   for this revision of the hunk.

 textract/parsers/pdf_parser.py | 25 +++++++++++++++++++++----
 1 file changed, 21 insertions(+), 4 deletions(-)

diff --git a/textract/parsers/pdf_parser.py b/textract/parsers/pdf_parser.py
index 9fe74e13..cab8e2df 100644
--- a/textract/parsers/pdf_parser.py
+++ b/textract/parsers/pdf_parser.py
@@ -37,10 +37,27 @@ def extract(self, filename, method='', **kwargs):
 
     def extract_pdftotext(self, filename, **kwargs):
-        """Extract text from pdfs using the pdftotext command line utility."""
+        """Extract text from pdfs using the pdftotext command line utility.
+
+        Recognised keyword arguments:
+
+        layout -- when this key is present (whatever its value), pass
+            ``-layout`` to pdftotext.
+        shell_encoding -- when set to a non-empty value, pass it to
+            pdftotext through the ``-enc`` option.
+        """
+        args = ['pdftotext']
+
         if 'layout' in kwargs:
-            args = ['pdftotext', '-layout', filename, '-']
-        else:
-            args = ['pdftotext', filename, '-']
+            args.append('-layout')
+
+        # Skip None/'' so an option that was forwarded but left unset never
+        # reaches the argument vector.
+        shell_encoding = kwargs.get('shell_encoding')
+        if shell_encoding:
+            args += ['-enc', shell_encoding]
+
+        # '-' as the output file makes pdftotext write the text to stdout.
+        args += [filename, '-']
         stdout, _ = self.run(args)
         return stdout
 