From fe82871324c9ff77528197a41b8fe625a32be4ac Mon Sep 17 00:00:00 2001 From: Manish Zope Date: Sun, 19 May 2019 14:50:04 +0530 Subject: [PATCH 1/5] SDC-11380 Add test case for Directory origin's Escape Character configuration. * Added test case with reusable code --- stage/configuration/test_directory_origin.py | 74 ++++++++++++++++++-- 1 file changed, 69 insertions(+), 5 deletions(-) diff --git a/stage/configuration/test_directory_origin.py b/stage/configuration/test_directory_origin.py index a499cac0..e7da0282 100644 --- a/stage/configuration/test_directory_origin.py +++ b/stage/configuration/test_directory_origin.py @@ -477,9 +477,41 @@ def test_directory_origin_configuration_error_directory(sdc_builder, sdc_executo @pytest.mark.parametrize('delimiter_format_type', ['CUSTOM']) @pytest.mark.parametrize('data_format', ['DELIMITED']) -@pytest.mark.skip('Not yet implemented') -def test_directory_origin_configuration_escape_character(sdc_builder, sdc_executor, delimiter_format_type, data_format): - pass +@pytest.mark.parametrize('escape_character', ['\t', ';' , ' ']) +@pytest.mark.parametrize('delimiter_character', ['@']) +def test_directory_origin_configuration_escape_character(sdc_builder, sdc_executor, delimiter_format_type, + data_format, escape_character, shell_executor, + delimited_file_writer, delimiter_character): + """ Verify if DC can read the delimited file with custom escape character""" + file_name = 'custom_delimited_file.csv' + f = lambda ip_string: ip_string.format(escape_character=escape_character, delimiter_character=delimiter_character) + f1 = lambda ip_string: ip_string.replace(escape_character, "") + data = [[f('Field11{escape_character}{delimiter_character}'), 'Field12', f('{escape_character}"Field13')], + [f('Field{escape_character}{delimiter_character}21'), 'Field22', 'Field23']] + + try: + files_directory = DirectoryOriginCommon.create_file_directory(file_name, data, shell_executor,delimited_file_writer, + delimiter_format_type, delimiter_character) + + attributes = {'data_format':data_format, + 'files_directory':files_directory, + 'file_name_pattern':'custom_delimited_*', + 'file_name_pattern_mode':'GLOB', + 'delimiter_format_type':delimiter_format_type, + 'delimiter_character':delimiter_character, + 'escape_character':escape_character} + directory, pipeline = DirectoryOriginCommon.get_directory_trash_pipeline(sdc_builder, attributes) + + sdc_executor.add_pipeline(pipeline) + snapshot = sdc_executor.capture_snapshot(pipeline, start_pipeline=True, batch_size=3).snapshot + sdc_executor.stop_pipeline(pipeline) + output_records = snapshot[directory.instance_name].output + + assert 2 == len(output_records) + assert output_records[0].field == OrderedDict(zip([str(i) for i in range(0, 3)], map(f1, data[0]))) + assert output_records[1].field == OrderedDict(zip([str(i) for i in range(0, 3)], map(f1, data[1]))) + finally: + shell_executor(f'rm -r {files_directory}') @pytest.mark.parametrize('data_format', ['EXCEL']) @@ -918,5 +950,37 @@ def test_directory_origin_configuration_use_custom_log_format(sdc_builder, sdc_e ## Start of general supportive functions -def get_text_file_content(file_number): - return '\n'.join(['This is line{}{}'.format(str(file_number), i) for i in range(1, 4)]) + + +# Class with common functionalities +class DirectoryOriginCommon(object): + + def __init__(self): + pass + + @staticmethod + def get_directory_trash_pipeline(sdc_builder, attributes): + pipeline_builder = sdc_builder.get_pipeline_builder() + directory = 
pipeline_builder.add_stage('Directory') + directory.set_attributes(**attributes) + trash = pipeline_builder.add_stage('Trash') + directory >> trash + pipeline = pipeline_builder.build() + return (directory, pipeline) + + @staticmethod + def create_file_directory(file_name, file_content, shell_executor, file_writer, delimiter_format="CSV", delimiter_character=None): + files_directory = os.path.join('/tmp', get_random_string()) + logger.debug('Creating files directory %s ...', files_directory) + shell_executor(f'mkdir {files_directory}') + filepath = os.path.join(files_directory, file_name) + if delimiter_character: + file_writer(filepath, file_content, delimiter_format, delimiter_character) + else: + file_writer(filepath, file_content) + return files_directory + + @staticmethod + def get_text_file_content(file_number, lines_needed=3): + return '\n'.join(['This is line{}{}'.format(str(file_number), i) for i in range(1, (lines_needed + 1))]) + From 0788ec160d5b4685c7decea52f41432b4e98964d Mon Sep 17 00:00:00 2001 From: Manish Zope Date: Sun, 19 May 2019 16:22:45 +0530 Subject: [PATCH 2/5] SDC-11380 Add test case for Directory origin's Escape Character configuration. * Restructured code to integrate common code snippets. --- stage/configuration/test_directory_origin.py | 76 ++++++++++---------- 1 file changed, 40 insertions(+), 36 deletions(-) diff --git a/stage/configuration/test_directory_origin.py b/stage/configuration/test_directory_origin.py index e7da0282..66864247 100644 --- a/stage/configuration/test_directory_origin.py +++ b/stage/configuration/test_directory_origin.py @@ -420,43 +420,34 @@ def test_directory_origin_configuration_delimiter_format_type(sdc_builder, sdc_e """Test for Directory origin can read delimited file with different delimiter format type. Here we will be creating delimited files in different formats for testing. e.g. 
POSTGRES_CSV, TDF, RFC4180, etc., """ - files_directory = os.path.join('/tmp', get_random_string()) - FILE_NAME = 'delimited_file.csv' - FILE_CONTENTS = [['field1', 'field2', 'field3'], ['Field11', 'Field12', 'fält13'], ['стол', 'Field22', 'Field23']] + file_name = 'delimited_file.csv' + file_contents = [['field1', 'field2', 'field3'], ['Field11', 'Field12', 'fält13'], ['стол', 'Field22', 'Field23']] delimiter_character_map = {'CUSTOM': '^'} delimiter_character = '^' if delimiter_format_type == 'CUSTOM' else None try: - logger.debug('Creating files directory %s ...', files_directory) - shell_executor(f'mkdir {files_directory}') - delimited_file_writer(os.path.join(files_directory, FILE_NAME), - FILE_CONTENTS, delimiter_format_type, delimiter_character) + files_directory = DirectoryOriginCommon.create_file_directory(file_name, file_contents, shell_executor, + delimited_file_writer, delimiter_format_type, + delimiter_character) - pipeline_builder = sdc_builder.get_pipeline_builder() - directory = pipeline_builder.add_stage('Directory') - directory.set_attributes(data_format=data_format, - files_directory=files_directory, - file_name_pattern='delimited_*', - file_name_pattern_mode='GLOB', - delimiter_format_type=delimiter_format_type, - delimiter_character=delimiter_character, - root_field_type=root_field_type, - header_line=header_line) - trash = pipeline_builder.add_stage('Trash') - directory >> trash - pipeline = pipeline_builder.build() + attributes = {'data_format':data_format, + 'files_directory':files_directory, + 'file_name_pattern':'delimited_*', + 'file_name_pattern_mode':'GLOB', + 'delimiter_format_type':delimiter_format_type, + 'delimiter_character':delimiter_character, + 'root_field_type':root_field_type, + 'header_line':header_line} + directory, pipeline = DirectoryOriginCommon.get_directory_trash_pipeline(sdc_builder, attributes) sdc_executor.add_pipeline(pipeline) snapshot = sdc_executor.capture_snapshot(pipeline, start_pipeline=True, batch_size=3).snapshot sdc_executor.stop_pipeline(pipeline) output_records = snapshot[directory.instance_name].output - new_line_field = 'Field12\nSTR' if delimiter_format_type == 'EXCEL' else 'Field12' - assert 2 == len(output_records) - assert output_records[0].field == OrderedDict( - [('field1', 'Field11'), ('field2', new_line_field), ('field3', 'fält13')]) - assert output_records[1].field == OrderedDict( - [('field1', 'стол'), ('field2', 'Field22'), ('field3', 'Field23')]) + new_line_field = 'Field12\nSTR' if delimiter_format_type == 'EXCEL' else 'Field12' + file_contents[1][1] = new_line_field + DirectoryOriginCommon.verify_delimited_output(output_records, file_contents[1:3], file_contents[0]) finally: shell_executor(f'rm -r {files_directory}') @@ -482,7 +473,12 @@ def test_directory_origin_configuration_error_directory(sdc_builder, sdc_executo def test_directory_origin_configuration_escape_character(sdc_builder, sdc_executor, delimiter_format_type, data_format, escape_character, shell_executor, delimited_file_writer, delimiter_character): - """ Verify if DC can read the delimited file with custom escape character""" + """Verify if directory origin can read delimited data custom escape character. + This TC check for different escape characters. Input data fields have delimiter characters. + Directory origin should read this data and produce field without escape character. + e.g. ;|Field is value of field with "|" as delimiter character and ";" as escape character + then output field should be "|Field". 
+ """ file_name = 'custom_delimited_file.csv' f = lambda ip_string: ip_string.format(escape_character=escape_character, delimiter_character=delimiter_character) f1 = lambda ip_string: ip_string.replace(escape_character, "") @@ -507,9 +503,8 @@ def test_directory_origin_configuration_escape_character(sdc_builder, sdc_execut sdc_executor.stop_pipeline(pipeline) output_records = snapshot[directory.instance_name].output - assert 2 == len(output_records) - assert output_records[0].field == OrderedDict(zip([str(i) for i in range(0, 3)], map(f1, data[0]))) - assert output_records[1].field == OrderedDict(zip([str(i) for i in range(0, 3)], map(f1, data[1]))) + expected_output = [map(f1, data[0]), map(f1, data[1])] + DirectoryOriginCommon.verify_delimited_output(output_records, expected_output) finally: shell_executor(f'rm -r {files_directory}') @@ -966,21 +961,30 @@ def get_directory_trash_pipeline(sdc_builder, attributes): trash = pipeline_builder.add_stage('Trash') directory >> trash pipeline = pipeline_builder.build() - return (directory, pipeline) + return directory, pipeline @staticmethod - def create_file_directory(file_name, file_content, shell_executor, file_writer, delimiter_format="CSV", delimiter_character=None): + def create_file_directory(file_name, file_content, shell_executor, file_writer, delimiter_format_type=None, + delimiter_character=None): files_directory = os.path.join('/tmp', get_random_string()) logger.debug('Creating files directory %s ...', files_directory) shell_executor(f'mkdir {files_directory}') - filepath = os.path.join(files_directory, file_name) - if delimiter_character: - file_writer(filepath, file_content, delimiter_format, delimiter_character) + file_path = os.path.join(files_directory, file_name) + if delimiter_format_type: + file_writer(file_path, file_content, delimiter_format_type, delimiter_character) else: - file_writer(filepath, file_content) + file_writer(file_path, file_content) return files_directory @staticmethod def get_text_file_content(file_number, lines_needed=3): return '\n'.join(['This is line{}{}'.format(str(file_number), i) for i in range(1, (lines_needed + 1))]) + @staticmethod + def verify_delimited_output(output_records, data, header=None): + if not header: + header = [str(i) for i in range(0, 3)] + assert 2 == len(output_records) + assert output_records[0].field == OrderedDict(zip(header, data[0])) + assert output_records[1].field == OrderedDict(zip(header, data[1])) + From ad4b097d63c136d9336fafa9f9a1fa7332425890 Mon Sep 17 00:00:00 2001 From: Manish Zope Date: Sun, 19 May 2019 14:50:04 +0530 Subject: [PATCH 3/5] SDC-11380 Add test case for Directory origin's Escape Character configuration. 
* Added test case with reusable code --- stage/configuration/test_directory_origin.py | 39 ++++++++++++++++++-- 1 file changed, 36 insertions(+), 3 deletions(-) diff --git a/stage/configuration/test_directory_origin.py b/stage/configuration/test_directory_origin.py index cebff561..c5961bed 100644 --- a/stage/configuration/test_directory_origin.py +++ b/stage/configuration/test_directory_origin.py @@ -606,9 +606,41 @@ def test_directory_origin_configuration_error_directory(sdc_builder, sdc_executo @pytest.mark.parametrize('delimiter_format_type', ['CUSTOM']) @pytest.mark.parametrize('data_format', ['DELIMITED']) -@pytest.mark.skip('Not yet implemented') -def test_directory_origin_configuration_escape_character(sdc_builder, sdc_executor, delimiter_format_type, data_format): - pass +@pytest.mark.parametrize('escape_character', ['\t', ';' , ' ']) +@pytest.mark.parametrize('delimiter_character', ['@']) +def test_directory_origin_configuration_escape_character(sdc_builder, sdc_executor, delimiter_format_type, + data_format, escape_character, shell_executor, + delimited_file_writer, delimiter_character): + """ Verify if DC can read the delimited file with custom escape character""" + file_name = 'custom_delimited_file.csv' + f = lambda ip_string: ip_string.format(escape_character=escape_character, delimiter_character=delimiter_character) + f1 = lambda ip_string: ip_string.replace(escape_character, "") + data = [[f('Field11{escape_character}{delimiter_character}'), 'Field12', f('{escape_character}"Field13')], + [f('Field{escape_character}{delimiter_character}21'), 'Field22', 'Field23']] + + try: + files_directory = DirectoryOriginCommon.create_file_directory(file_name, data, shell_executor,delimited_file_writer, + delimiter_format_type, delimiter_character) + + attributes = {'data_format':data_format, + 'files_directory':files_directory, + 'file_name_pattern':'custom_delimited_*', + 'file_name_pattern_mode':'GLOB', + 'delimiter_format_type':delimiter_format_type, + 'delimiter_character':delimiter_character, + 'escape_character':escape_character} + directory, pipeline = DirectoryOriginCommon.get_directory_trash_pipeline(sdc_builder, attributes) + + sdc_executor.add_pipeline(pipeline) + snapshot = sdc_executor.capture_snapshot(pipeline, start_pipeline=True, batch_size=3).snapshot + sdc_executor.stop_pipeline(pipeline) + output_records = snapshot[directory.instance_name].output + + assert 2 == len(output_records) + assert output_records[0].field == OrderedDict(zip([str(i) for i in range(0, 3)], map(f1, data[0]))) + assert output_records[1].field == OrderedDict(zip([str(i) for i in range(0, 3)], map(f1, data[1]))) + finally: + shell_executor(f'rm -r {files_directory}') @pytest.mark.parametrize('data_format', ['EXCEL']) @@ -1470,3 +1502,4 @@ def execute_pipeline_and_verify_output(sdc_executor, directory, pipeline, data_f assert msg_field[0]['request'][0]['value'] == 'GET /index.html 200' elif data_format == 'SDC_JSON': assert output_records[0].field == json_data[0] + From ca97c3fbea460453ca19f6d5cf0396e0ca688c65 Mon Sep 17 00:00:00 2001 From: Manish Zope Date: Sun, 19 May 2019 16:22:45 +0530 Subject: [PATCH 4/5] SDC-11380 Add test case for Directory origin's Escape Character configuration. * Restructured code to integrate common code snippets. 
--- stage/configuration/test_directory_origin.py | 63 ++++++++++---------- 1 file changed, 33 insertions(+), 30 deletions(-) diff --git a/stage/configuration/test_directory_origin.py b/stage/configuration/test_directory_origin.py index c5961bed..0f08e674 100644 --- a/stage/configuration/test_directory_origin.py +++ b/stage/configuration/test_directory_origin.py @@ -549,43 +549,34 @@ def test_directory_origin_configuration_delimiter_format_type(sdc_builder, sdc_e """Test for Directory origin can read delimited file with different delimiter format type. Here we will be creating delimited files in different formats for testing. e.g. POSTGRES_CSV, TDF, RFC4180, etc., """ - files_directory = os.path.join('/tmp', get_random_string()) - FILE_NAME = 'delimited_file.csv' - FILE_CONTENTS = [['field1', 'field2', 'field3'], ['Field11', 'Field12', 'fält13'], ['стол', 'Field22', 'Field23']] + file_name = 'delimited_file.csv' + file_contents = [['field1', 'field2', 'field3'], ['Field11', 'Field12', 'fält13'], ['стол', 'Field22', 'Field23']] delimiter_character_map = {'CUSTOM': '^'} delimiter_character = '^' if delimiter_format_type == 'CUSTOM' else None try: - logger.debug('Creating files directory %s ...', files_directory) - shell_executor(f'mkdir {files_directory}') - delimited_file_writer(os.path.join(files_directory, FILE_NAME), - FILE_CONTENTS, delimiter_format_type, delimiter_character) + files_directory = DirectoryOriginCommon.create_file_directory(file_name, file_contents, shell_executor, + delimited_file_writer, delimiter_format_type, + delimiter_character) - pipeline_builder = sdc_builder.get_pipeline_builder() - directory = pipeline_builder.add_stage('Directory') - directory.set_attributes(data_format=data_format, - files_directory=files_directory, - file_name_pattern='delimited_*', - file_name_pattern_mode='GLOB', - delimiter_format_type=delimiter_format_type, - delimiter_character=delimiter_character, - root_field_type=root_field_type, - header_line=header_line) - trash = pipeline_builder.add_stage('Trash') - directory >> trash - pipeline = pipeline_builder.build() + attributes = {'data_format':data_format, + 'files_directory':files_directory, + 'file_name_pattern':'delimited_*', + 'file_name_pattern_mode':'GLOB', + 'delimiter_format_type':delimiter_format_type, + 'delimiter_character':delimiter_character, + 'root_field_type':root_field_type, + 'header_line':header_line} + directory, pipeline = DirectoryOriginCommon.get_directory_trash_pipeline(sdc_builder, attributes) sdc_executor.add_pipeline(pipeline) snapshot = sdc_executor.capture_snapshot(pipeline, start_pipeline=True, batch_size=3).snapshot sdc_executor.stop_pipeline(pipeline) output_records = snapshot[directory.instance_name].output - new_line_field = 'Field12\nSTR' if delimiter_format_type == 'EXCEL' else 'Field12' - assert 2 == len(output_records) - assert output_records[0].field == OrderedDict( - [('field1', 'Field11'), ('field2', new_line_field), ('field3', 'fält13')]) - assert output_records[1].field == OrderedDict( - [('field1', 'стол'), ('field2', 'Field22'), ('field3', 'Field23')]) + new_line_field = 'Field12\nSTR' if delimiter_format_type == 'EXCEL' else 'Field12' + file_contents[1][1] = new_line_field + DirectoryOriginCommon.verify_delimited_output(output_records, file_contents[1:3], file_contents[0]) finally: shell_executor(f'rm -r {files_directory}') @@ -611,7 +602,12 @@ def test_directory_origin_configuration_error_directory(sdc_builder, sdc_executo def test_directory_origin_configuration_escape_character(sdc_builder, 
sdc_executor, delimiter_format_type, data_format, escape_character, shell_executor, delimited_file_writer, delimiter_character): - """ Verify if DC can read the delimited file with custom escape character""" + """Verify if directory origin can read delimited data custom escape character. + This TC check for different escape characters. Input data fields have delimiter characters. + Directory origin should read this data and produce field without escape character. + e.g. ;|Field is value of field with "|" as delimiter character and ";" as escape character + then output field should be "|Field". + """ file_name = 'custom_delimited_file.csv' f = lambda ip_string: ip_string.format(escape_character=escape_character, delimiter_character=delimiter_character) f1 = lambda ip_string: ip_string.replace(escape_character, "") @@ -636,9 +632,8 @@ def test_directory_origin_configuration_escape_character(sdc_builder, sdc_execut sdc_executor.stop_pipeline(pipeline) output_records = snapshot[directory.instance_name].output - assert 2 == len(output_records) - assert output_records[0].field == OrderedDict(zip([str(i) for i in range(0, 3)], map(f1, data[0]))) - assert output_records[1].field == OrderedDict(zip([str(i) for i in range(0, 3)], map(f1, data[1]))) + expected_output = [map(f1, data[0]), map(f1, data[1])] + DirectoryOriginCommon.verify_delimited_output(output_records, expected_output) finally: shell_executor(f'rm -r {files_directory}') @@ -1503,3 +1498,11 @@ def execute_pipeline_and_verify_output(sdc_executor, directory, pipeline, data_f elif data_format == 'SDC_JSON': assert output_records[0].field == json_data[0] + @staticmethod + def verify_delimited_output(output_records, data, header=None): + if not header: + header = [str(i) for i in range(0, 3)] + assert 2 == len(output_records) + assert output_records[0].field == OrderedDict(zip(header, data[0])) + assert output_records[1].field == OrderedDict(zip(header, data[1])) + From aaeaa0a0a7800db91aa3d9c3b82c6b79f1289290 Mon Sep 17 00:00:00 2001 From: Manish Zope Date: Fri, 14 Jun 2019 14:38:24 +0530 Subject: [PATCH 5/5] SDC-11380 Add test case for Directory origin's Escape Character configuration. 
* Rebased util function and some nits --- stage/configuration/test_directory_origin.py | 24 ++++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/stage/configuration/test_directory_origin.py b/stage/configuration/test_directory_origin.py index 0f08e674..eb54e8aa 100644 --- a/stage/configuration/test_directory_origin.py +++ b/stage/configuration/test_directory_origin.py @@ -615,26 +615,26 @@ def test_directory_origin_configuration_escape_character(sdc_builder, sdc_execut [f('Field{escape_character}{delimiter_character}21'), 'Field22', 'Field23']] try: - files_directory = DirectoryOriginCommon.create_file_directory(file_name, data, shell_executor,delimited_file_writer, - delimiter_format_type, delimiter_character) + files_directory = create_file_and_directory(file_name, data, shell_executor,delimited_file_writer, + delimiter_format_type, delimiter_character) - attributes = {'data_format':data_format, - 'files_directory':files_directory, - 'file_name_pattern':'custom_delimited_*', - 'file_name_pattern_mode':'GLOB', - 'delimiter_format_type':delimiter_format_type, - 'delimiter_character':delimiter_character, - 'escape_character':escape_character} - directory, pipeline = DirectoryOriginCommon.get_directory_trash_pipeline(sdc_builder, attributes) + attributes = {'data_format': data_format, + 'files_directory': files_directory, + 'file_name_pattern': 'custom_delimited_*', + 'file_name_pattern_mode': 'GLOB', + 'delimiter_format_type': delimiter_format_type, + 'delimiter_character': delimiter_character, + 'escape_character': escape_character} + directory, pipeline = get_directory_to_trash_pipeline(sdc_builder, attributes) sdc_executor.add_pipeline(pipeline) snapshot = sdc_executor.capture_snapshot(pipeline, start_pipeline=True, batch_size=3).snapshot - sdc_executor.stop_pipeline(pipeline) output_records = snapshot[directory.instance_name].output expected_output = [map(f1, data[0]), map(f1, data[1])] - DirectoryOriginCommon.verify_delimited_output(output_records, expected_output) + verify_delimited_output(output_records, expected_output) finally: + sdc_executor.stop_pipeline(pipeline) shell_executor(f'rm -r {files_directory}')
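
The escape-character semantics that test_directory_origin_configuration_escape_character expects from the Directory origin (the escape character removes the special meaning of the delimiter or quote character that follows it and is itself dropped from the parsed field) can be illustrated with a small standalone sketch. This sketch is an assumption-laden illustration only: it uses Python's built-in csv module as a stand-in for the origin's delimited parser, with the same '@' delimiter and the ';' escape character parametrized in the test; it is not part of the test suite and does not call any SDC API.

    # Standalone sketch (not part of the test suite): illustrates the expected
    # escape-character behaviour using Python's csv module as a stand-in for the
    # Directory origin's delimited parser.
    import csv
    import io

    DELIMITER = '@'          # custom delimiter character parametrized in the test
    ESCAPE_CHARACTER = ';'   # one of the parametrized escape characters

    # Raw file content analogous to what delimited_file_writer would produce for
    # the test data: delimiter and quote characters inside fields are pre-escaped.
    raw_content = ('Field11;@@Field12@;"Field13\n'
                   'Field;@21@Field22@Field23\n')

    reader = csv.reader(io.StringIO(raw_content),
                        delimiter=DELIMITER,
                        escapechar=ESCAPE_CHARACTER)
    records = list(reader)

    # The escape character strips the special meaning of the character after it
    # and is not included in the parsed field, so the escaped '@' and '"' survive
    # inside the fields. This mirrors the f1-transformed rows the test asserts on.
    assert records == [['Field11@', 'Field12', '"Field13'],
                       ['Field@21', 'Field22', 'Field23']]

The parsed rows above correspond to the expected_output the test passes to verify_delimited_output. Note also that the final patch moves sdc_executor.stop_pipeline(pipeline) into the finally block, so the pipeline is stopped even when the record assertions fail.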