diff --git a/tests/test_pdf.py b/tests/test_pdf.py
index 9c0bca67..80310cc8 100644
--- a/tests/test_pdf.py
+++ b/tests/test_pdf.py
@@ -25,6 +25,27 @@ def test_tesseract_cli(self):
             method='tesseract',
         )
 
+    def test_tesseract_fallback_cli(self):
+        """Confirm fallback to tesseract extraction if no text is found"""
+        d = self.get_extension_directory()
+        self.compare_cli_output(
+            os.path.join(d, "ocr_text.pdf"),
+            expected_filename=os.path.join(d, "ocr_text.txt"),
+        )
+
+    def test_large_pdf(self):
+        """Make sure extraction does not hang (issue #33)"""
+
+        # download the file
+        filename = os.path.join(self.get_extension_directory(), "large.pdf")
+        self.download_file(
+            "https://openknowledge.worldbank.org/bitstream/handle/10986/16091/9780821399378.pdf",
+            filename,
+        )
+
+        # make sure textract can successfully run
+        self.assertSuccessfulTextract(filename)
+
     def test_two_column(self):
         """Preserve two column layout in extraction"""
         filename = os.path.join(self.get_extension_directory(), 'two_column.pdf')
diff --git a/textract/parsers/pdf_parser.py b/textract/parsers/pdf_parser.py
index 9fe74e13..effabaed 100644
--- a/textract/parsers/pdf_parser.py
+++ b/textract/parsers/pdf_parser.py
@@ -18,22 +18,36 @@ class Parser(ShellParser):
     def extract(self, filename, method='', **kwargs):
+        """Extract text from a pdf, using tesseract when no text is found.
+
+        ``method`` may be ``''`` (pdftotext, or pdfminer when pdftotext
+        is not installed), ``'pdftotext'``, ``'pdfminer'`` or
+        ``'tesseract'``; any other value raises ``UnknownMethod``.
+        """
         if method == '' or method == 'pdftotext':
             try:
-                return self.extract_pdftotext(filename, **kwargs)
+                text = self.extract_pdftotext(filename, **kwargs)
             except ShellError as ex:
                 # If pdftotext isn't installed and the pdftotext method
                 # wasn't specified, then gracefully fallback to using
                 # pdfminer instead.
                 if method == '' and ex.is_not_installed():
-                    return self.extract_pdfminer(filename, **kwargs)
+                    text = self.extract_pdfminer(filename, **kwargs)
                 else:
                     raise ex

         elif method == 'pdfminer':
-            return self.extract_pdfminer(filename, **kwargs)
+            text = self.extract_pdfminer(filename, **kwargs)
         elif method == 'tesseract':
+            # tesseract was requested explicitly, so return its result
+            # as-is; running it a second time on empty output gains nothing.
             return self.extract_tesseract(filename, **kwargs)
         else:
             raise UnknownMethod(method)

+        # The chosen extractor returned nothing but whitespace, so fall
+        # back to extracting the text with tesseract.
+        if text.strip():
+            return text
+        return self.extract_tesseract(filename, **kwargs)
+
     def extract_pdftotext(self, filename, **kwargs):
         """Extract text from pdfs using the pdftotext command line utility."""