diff --git a/nlu/pipe/utils/data_conversion_utils.py b/nlu/pipe/utils/data_conversion_utils.py
index 3741a21e..7743d238 100644
--- a/nlu/pipe/utils/data_conversion_utils.py
+++ b/nlu/pipe/utils/data_conversion_utils.py
@@ -13,18 +13,25 @@ from pyspark.sql.types import StringType, StructType, StructField
 
 
+class NluDataParseException(Exception):
+    """Raised when input data cannot be parsed into a format NLU can process"""
+
+    def __init__(self, message="An error occurred parsing data with NLU"):
+        self.message = message
+        super().__init__(self.message)
+
 class DataConversionUtils:
     # Modin aswell but optional, so we dont import the type yet
     supported_types = [pyspark.sql.DataFrame, pd.DataFrame, pd.Series, np.ndarray]
 
     @staticmethod
     def except_text_col_not_found(cols):
-        raise ValueError(
+        raise NluDataParseException(
             f'Could not find column named "text" in input Pandas Dataframe. Please ensure one column named such exists. Columns in DF are : {cols} ')
 
     @staticmethod
     def except_invalid_question_data_format(cols):
-        raise ValueError(
+        raise NluDataParseException(
             f'You input data format is invalid for question answering with span classification.'
             f'Make sure you have at least 2 columns in you dataset, named context/question for pandas Dataframes'
             f'For Strings/Iterables/Tuples make sure to use the format `question|||context` or (question,context) ')
@@ -301,7 +308,6 @@ def to_spark_df(data, spark_sess, raw_text_column='text', is_span_data=False, is
                 # TODO invalid Table Data Format Exception
                 pass
             if isinstance(data[0], str):
-
                 return DataConversionUtils.table_question_str_to_sdf(data, spark_sess)
             if isinstance(data[0], pd.DataFrame):
                 return DataConversionUtils.table_question_pdf_to_sdf(data, spark_sess)
@@ -321,6 +327,8 @@ def to_spark_df(data, spark_sess, raw_text_column='text', is_span_data=False, is
                     return DataConversionUtils.question_tuple_iterable_to_sdf(data, spark_sess)
                 elif isinstance(data[0], str):
                     return DataConversionUtils.question_str_iterable_to_sdf(data, spark_sess)
+            except NluDataParseException as err:
+                raise err
             except:
                 ValueError("Data could not be converted to Spark Dataframe for internal conversion.")
         else:
diff --git a/nlu/pipe/utils/predict_helper.py b/nlu/pipe/utils/predict_helper.py
index d4d4e72d..9a2cb4de 100644
--- a/nlu/pipe/utils/predict_helper.py
+++ b/nlu/pipe/utils/predict_helper.py
@@ -8,7 +8,7 @@ from sparknlp.common import AnnotatorType
 from nlu.pipe.utils.audio_data_conversion_utils import AudioDataConversionUtils
-from nlu.pipe.utils.data_conversion_utils import DataConversionUtils
+from nlu.pipe.utils.data_conversion_utils import DataConversionUtils, NluDataParseException
 from nlu.pipe.utils.ocr_data_conversion_utils import OcrDataConversionUtils
 
 logger = logging.getLogger('nlu')
@@ -364,12 +364,14 @@ def __predict__(pipe, data, output_level, positions, keep_stranger_features, met
     try:
         return __predict_standard_spark(pipe, data, output_level, positions, keep_stranger_features, metadata,
                                         drop_irrelevant_cols, return_spark_df, get_embeddings)
+    except NluDataParseException as err:
+        logger.warning(f"Predictions Failed={err}")
+        raise err
     except Exception as err:
         logger.warning(f"Predictions Failed={err}")
         pipe.print_exception_err(err)
         raise Exception("Failure to process data with NLU")
 
-
 def debug_print_pipe_cols(pipe):
     for c in pipe.components:
         print(f'{c.spark_input_column_names}->{c.name}->{c.spark_output_column_names}')
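
Below is a minimal usage sketch (not part of the diff) of what this change enables: callers can now catch data-parsing failures specifically, instead of the generic Exception("Failure to process data with NLU") that __predict__ previously raised for every failure. The 'sentiment' model name and the malformed DataFrame are illustrative; any input DataFrame lacking the required 'text' column takes the except_text_col_not_found path changed above.

import pandas as pd
import nlu
from nlu.pipe.utils.data_conversion_utils import NluDataParseException

pipe = nlu.load('sentiment')  # illustrative model; any NLU pipeline works
# A DataFrame without a 'text' column cannot be converted to a Spark DataFrame
bad_df = pd.DataFrame({'txt': ['NLU now raises a typed exception']})

try:
    pipe.predict(bad_df)
except NluDataParseException as err:
    # With this patch the parse error propagates as-is through __predict__
    # instead of being swallowed and re-raised as a generic Exception.
    print(f'Input data could not be parsed: {err}')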