diff --git a/tests/test_exceptions.py b/tests/test_exceptions.py
index 417ac7d0..fdccc643 100644
--- a/tests/test_exceptions.py
+++ b/tests/test_exceptions.py
@@ -55,3 +55,21 @@ def test_shell_parser_run(self):
             self.assertTrue(e.is_not_installed())
         else:
             self.assertTrue(False, "Expected ShellError")
+
+    def test_missing_module_python(self):
+        """A module that can not be imported raises MissingModuleError"""
+        import sys
+        import textract
+        from textract.exceptions import MissingModuleError
+        filename = self.get_temp_filename()
+        # a None entry in sys.modules makes any later ``import os`` fail
+        # with ImportError, which simulates a dependency that is missing
+        original_os = sys.modules['os']
+        sys.modules['os'] = None
+        try:
+            with self.assertRaises(MissingModuleError):
+                textract.process(filename)
+        finally:
+            # always undo the patch so later tests still see a working os
+            sys.modules['os'] = original_os
+            os.remove(filename)
diff --git a/textract/cli.py b/textract/cli.py
index 47e1fe84..19a0f290 100644
--- a/textract/cli.py
+++ b/textract/cli.py
@@ -14,7 +14,7 @@
 import argcomplete
 
 from . import VERSION
-from .parsers import DEFAULT_ENCODING, _get_available_extensions
+from .parsers import DEFAULT_OUTPUT_ENCODING, _get_available_extensions
 
 
 class AddToNamespaceAction(argparse.Action):
@@ -58,7 +58,7 @@ def get_parser():
         'filename', help='Filename to extract text.',
     ).completer = argcomplete.completers.FilesCompleter
     parser.add_argument(
-        '-e', '--encoding', type=str, default=DEFAULT_ENCODING,
+        '-e', '--encoding', type=str, default=DEFAULT_OUTPUT_ENCODING,
         choices=_get_available_encodings(),
         help='Specify the encoding of the output.',
     )
diff --git a/textract/exceptions.py b/textract/exceptions.py
index 3453cf71..29d97c04 100644
--- a/textract/exceptions.py
+++ b/textract/exceptions.py
@@ -7,12 +7,14 @@ class CommandLineError(Exception):
     errors occur on the command line to provide a useful command line
     interface.
     """
+
     def render(self, msg):
         return msg % vars(self)
 
 
 class ExtensionNotSupported(CommandLineError):
     """This error is raised with unsupported extensions"""
+
     def __init__(self, ext):
         self.ext = ext
 
@@ -36,6 +38,7 @@ class MissingFileError(CommandLineError):
     """This error is raised when the file can not be located at the
     specified path.
     """
+
     def __init__(self, filename):
         self.filename = filename
         self.root, self.ext = os.path.splitext(filename)
@@ -51,6 +54,7 @@ class UnknownMethod(CommandLineError):
     """This error is raised when the specified --method on the command
     line is unknown.
     """
+
     def __init__(self, method):
         self.method = method
 
@@ -64,6 +68,7 @@ class ShellError(CommandLineError):
     """This error is raised when a shell.run returns a non-zero exit code
     (meaning the command failed).
     """
+
     def __init__(self, command, exit_code, stdout, stderr):
         self.command = command
         self.exit_code = exit_code
@@ -97,3 +102,32 @@ def __str__(self):
             return self.not_installed_message()
         else:
             return self.failed_message()
+
+
+class MissingModuleError(CommandLineError):
+    """This error is raised when a dependency module is not installed.
+
+    ``import_error`` is the ImportError that was caught; its text is kept
+    in ``self.import_error`` and the name of the module that failed to
+    import in ``self.missing_module``.
+    """
+
+    def __init__(self, import_error):
+        self.import_error = str(import_error)
+        # Python 3 exposes the module name on the exception; otherwise
+        # parse it out of the message, and fall back to the whole message
+        # when it is not of the form "No module named ..." (for example
+        # "cannot import name ...") rather than fail with an IndexError.
+        name = getattr(import_error, 'name', None)
+        if not name:
+            _, found, rest = self.import_error.partition('No module named ')
+            name = rest if found else self.import_error
+        self.missing_module = name.strip('\'"')
+
+    def __str__(self):
+        return self.render((
+            'Module %(missing_module)s is not installed on your system.\n'
+            'Please make sure the appropriate dependencies are installed \n'
+            'before using textract:\n\n'
+            '    http://textract.readthedocs.org/en/latest/installation.html\n'
+        ))
diff --git a/textract/parsers/__init__.py b/textract/parsers/__init__.py
index 679660a8..d1a63d9d 100755
--- a/textract/parsers/__init__.py
+++ b/textract/parsers/__init__.py
@@ -6,6 +6,7 @@
 import importlib
 import glob
 import re
+import pkgutil
 
 from .. import exceptions
 
@@ -62,17 +63,24 @@ def process(filename, input_encoding=None, output_encoding=DEFAULT_OUTPUT_ENCODI
     # the _parser extension
     rel_module = ext + _FILENAME_SUFFIX
 
-    # If we can't import the module, the file extension isn't currently
-    # supported
+    # check if we can import the parser module related to the file extension
     try:
-        filetype_module = importlib.import_module(
-            rel_module, 'textract.parsers'
-        )
-    except ImportError:
-        raise exceptions.ExtensionNotSupported(ext)
+        # check if the module exists in the system
+        is_module = pkgutil.find_loader('textract.parsers'+rel_module)
+
+        if is_module is not None:
+            filetype_module = importlib.import_module(
+                rel_module, 'textract.parsers'
+            )
+        else:
+            # If we can't import the module, the file extension isn't currently
+            # supported
+            raise exceptions.ExtensionNotSupported(ext)
+    except ImportError as e:
+        # Raise the exception of the import failure
+        raise exceptions.MissingModuleError(e)
 
     # do the extraction
-
     parser = filetype_module.Parser()
     return parser.process(filename, input_encoding, output_encoding, **kwargs)
 