diff --git a/core-plugins/src/e2e-test/features/distinctplugin/DistinctMacro.feature b/core-plugins/src/e2e-test/features/distinctplugin/DistinctMacro.feature
index 6d13c1a34..6e2f5fe46 100644
--- a/core-plugins/src/e2e-test/features/distinctplugin/DistinctMacro.feature
+++ b/core-plugins/src/e2e-test/features/distinctplugin/DistinctMacro.feature
@@ -50,3 +50,141 @@ Feature: Distinct analytics - Verify File data transfer scenarios using Distinct
     Then Close the pipeline logs
     Then Validate OUT record count of distinct is equal to IN record count of sink
     Then Validate output file generated by file sink plugin "fileSinkTargetBucket" is equal to expected output file "distinctMacroOutputFile"
+
+  @GCS_DISTINCT_TEST1 @FILE_SINK_TEST
+  Scenario: To verify data is getting transferred from File source to File sink with number of partitions set as a macro argument
+    Given Open Datafusion Project to configure pipeline
+    When Select plugin: "File" from the plugins list as: "Source"
+    When Expand Plugin group in the LHS plugins list: "Analytics"
+    When Select plugin: "Distinct" from the plugins list as: "Analytics"
+    Then Connect plugins: "File" and "Distinct" to establish connection
+    When Expand Plugin group in the LHS plugins list: "Sink"
+    When Select plugin: "File" from the plugins list as: "Sink"
+    Then Connect plugins: "Distinct" and "File2" to establish connection
+    Then Navigate to the properties page of plugin: "File"
+    Then Enter input plugin property: "referenceName" with value: "FileReferenceName"
+    Then Enter input plugin property: "path" with value: "gcsDistinctTest1"
+    Then Select dropdown plugin property: "format" with option value: "csv"
+    Then Click plugin property: "skipHeader"
+    Then Click on the Get Schema button
+    Then Verify the Output Schema matches the Expected Schema: "distinctCsvAllDataTypeFileSchema"
+    Then Validate "File" plugin properties
+    Then Close the Plugin Properties page
+    Then Navigate to the properties page of plugin: "Distinct"
+    Then Enter the Distinct plugin fields as list "distinctValidSingleFieldName"
+    Then Click on the Macro button of Property: "numberOfPartitions" and set the value to: "distinctValidPartitions"
+    Then Click on the Get Schema button
+    Then Verify the Output Schema matches the Expected Schema: "distinctOutputFileSchema"
+    Then Validate "Distinct" plugin properties
+    Then Close the Plugin Properties page
+    Then Navigate to the properties page of plugin: "File2"
+    Then Enter input plugin property: "referenceName" with value: "FileReferenceName"
+    Then Enter input plugin property: "path" with value: "fileSinkTargetBucket"
+    Then Replace input plugin property: "pathSuffix" with value: "yyyy-MM-dd-HH-mm-ss"
+    Then Select dropdown plugin property: "format" with option value: "tsv"
+    Then Validate "File2" plugin properties
+    Then Close the Plugin Properties page
+    Then Save the pipeline
+    Then Preview and run the pipeline
+    Then Enter runtime argument value "distinctValidPartitions" for key "distinctValidPartitions"
+    Then Run the preview of pipeline with runtime arguments
+    Then Wait till pipeline preview is in running state
+    Then Open and capture pipeline preview logs
+    Then Verify the preview run status of pipeline in the logs is "succeeded"
+    Then Close the pipeline logs
+    Then Close the preview
+    Then Deploy the pipeline
+    Then Run the Pipeline in Runtime
+    Then Enter runtime argument value "distinctValidPartitions" for key "distinctValidPartitions"
+    Then Run the Pipeline in Runtime with runtime arguments
+    Then Wait till pipeline is in running state
+    Then Open and capture logs
+    Then Verify the pipeline status is "Succeeded"
+    Then Close the pipeline logs
+    Then Validate OUT record count of distinct is equal to IN record count of sink
+    Then Validate output file generated by file sink plugin "fileSinkTargetBucket" is equal to expected output file "distinctDatatypeOutputFile"
+
+  @GCS_DISTINCT_TEST2 @FILE_SINK_TEST
+  Scenario: To verify pipeline fails when the fields property is set as a macro argument with an invalid value
+    Given Open Datafusion Project to configure pipeline
+    When Select plugin: "File" from the plugins list as: "Source"
+    When Expand Plugin group in the LHS plugins list: "Analytics"
+    When Select plugin: "Distinct" from the plugins list as: "Analytics"
+    Then Connect plugins: "File" and "Distinct" to establish connection
+    When Expand Plugin group in the LHS plugins list: "Sink"
+    When Select plugin: "File" from the plugins list as: "Sink"
+    Then Connect plugins: "Distinct" and "File2" to establish connection
+    Then Navigate to the properties page of plugin: "File"
+    Then Enter input plugin property: "referenceName" with value: "FileReferenceName"
+    Then Enter input plugin property: "path" with value: "gcsDistinctTest2"
+    Then Select dropdown plugin property: "format" with option value: "csv"
+    Then Click plugin property: "skipHeader"
+    Then Click on the Get Schema button
+    Then Verify the Output Schema matches the Expected Schema: "distinctCsvFileSchema"
+    Then Validate "File" plugin properties
+    Then Close the Plugin Properties page
+    Then Navigate to the properties page of plugin: "Distinct"
+    Then Click on the Macro button of Property: "fields" and set the value to: "DistinctFieldName"
+    Then Validate "Distinct" plugin properties
+    Then Close the Plugin Properties page
+    Then Navigate to the properties page of plugin: "File2"
+    Then Enter input plugin property: "referenceName" with value: "FileReferenceName"
+    Then Enter input plugin property: "path" with value: "fileSinkTargetBucket"
+    Then Replace input plugin property: "pathSuffix" with value: "yyyy-MM-dd-HH-mm-ss"
+    Then Select dropdown plugin property: "format" with option value: "tsv"
+    Then Validate "File2" plugin properties
+    Then Close the Plugin Properties page
+    Then Save the pipeline
+    Then Deploy the pipeline
+    Then Run the Pipeline in Runtime
+    Then Enter runtime argument value "distinctInvalidFields" for key "DistinctFieldName"
+    Then Run the Pipeline in Runtime with runtime arguments
+    Then Wait till pipeline is in running state
+    Then Verify the pipeline status is "Failed"
+    Then Open Pipeline logs and verify Log entries having below listed Level and Message:
+      | Level | Message                               |
+      | ERROR | errorLogsMessageDistinctInvalidFields |
+
+  @GCS_DISTINCT_TEST1 @FILE_SINK_TEST
+  Scenario: To verify pipeline fails when number of partitions is set as a macro argument with an invalid value
+    Given Open Datafusion Project to configure pipeline
+    When Select plugin: "File" from the plugins list as: "Source"
+    When Expand Plugin group in the LHS plugins list: "Analytics"
+    When Select plugin: "Distinct" from the plugins list as: "Analytics"
+    Then Connect plugins: "File" and "Distinct" to establish connection
+    When Expand Plugin group in the LHS plugins list: "Sink"
+    When Select plugin: "File" from the plugins list as: "Sink"
+    Then Connect plugins: "Distinct" and "File2" to establish connection
+    Then Navigate to the properties page of plugin: "File"
+    Then Enter input plugin property: "referenceName" with value: "FileReferenceName"
+    Then Enter input plugin property: "path" with value: "gcsDistinctTest1"
+    Then Select dropdown plugin property: "format" with option value: "csv"
+    Then Click plugin property: "skipHeader"
+    Then Click on the Get Schema button
+    Then Verify the Output Schema matches the Expected Schema: "distinctCsvAllDataTypeFileSchema"
+    Then Validate "File" plugin properties
+    Then Close the Plugin Properties page
+    Then Navigate to the properties page of plugin: "Distinct"
+    Then Enter the Distinct plugin fields as list "distinctValidSingleFieldName"
+    Then Click on the Macro button of Property: "numberOfPartitions" and set the value to: "distinctInvalidPartitions"
+    Then Click on the Get Schema button
+    Then Verify the Output Schema matches the Expected Schema: "distinctOutputFileSchema"
+    Then Validate "Distinct" plugin properties
+    Then Close the Plugin Properties page
+    Then Navigate to the properties page of plugin: "File2"
+    Then Enter input plugin property: "referenceName" with value: "FileReferenceName"
+    Then Enter input plugin property: "path" with value: "fileSinkTargetBucket"
+    Then Replace input plugin property: "pathSuffix" with value: "yyyy-MM-dd-HH-mm-ss"
+    Then Select dropdown plugin property: "format" with option value: "tsv"
+    Then Validate "File2" plugin properties
+    Then Close the Plugin Properties page
+    Then Save the pipeline
+    Then Deploy the pipeline
+    Then Run the Pipeline in Runtime
+    Then Enter runtime argument value "distinctInvalidPartitions" for key "distinctInvalidPartitions"
+    Then Run the Pipeline in Runtime with runtime arguments
+    Then Wait till pipeline is in running state
+    Then Verify the pipeline status is "Failed"
+    Then Open Pipeline logs and verify Log entries having below listed Level and Message:
+      | Level | Message                                           |
+      | ERROR | errorLogsMessageDistinctInvalidNumberOfPartitions |
diff --git a/core-plugins/src/e2e-test/features/distinctplugin/DistinctWithFile.feature b/core-plugins/src/e2e-test/features/distinctplugin/DistinctWithFile.feature
index e36476939..a3161f7cf 100644
--- a/core-plugins/src/e2e-test/features/distinctplugin/DistinctWithFile.feature
+++ b/core-plugins/src/e2e-test/features/distinctplugin/DistinctWithFile.feature
@@ -96,3 +96,98 @@ Feature: Distinct Analytics - Verify File source data transfer using Distinct an
     Then Close the pipeline logs
     Then Validate OUT record count of distinct is equal to IN record count of sink
     Then Validate output file generated by file sink plugin "fileSinkTargetBucket" is equal to expected output file "distinctCsvOutputFile"
+
+  @GCS_DISTINCT_TEST2 @FILE_SINK_TEST
+  Scenario: To verify distinct records are getting transferred from File source to File sink plugin successfully using distinct plugin without any field names given
+    Given Open Datafusion Project to configure pipeline
+    When Select plugin: "File" from the plugins list as: "Source"
+    When Expand Plugin group in the LHS plugins list: "Analytics"
+    When Select plugin: "Distinct" from the plugins list as: "Analytics"
+    Then Connect plugins: "File" and "Distinct" to establish connection
+    When Expand Plugin group in the LHS plugins list: "Sink"
+    When Select plugin: "File" from the plugins list as: "Sink"
+    Then Connect plugins: "Distinct" and "File2" to establish connection
+    Then Navigate to the properties page of plugin: "File"
+    Then Enter input plugin property: "referenceName" with value: "FileReferenceName"
+    Then Enter input plugin property: "path" with value: "gcsDistinctTest2"
+    Then Select dropdown plugin property: "format" with option value: "csv"
+    Then Click plugin property: "skipHeader"
+    Then Click on the Get Schema button
+    Then Verify the Output Schema matches the Expected Schema: "distinctCsvFileSchema"
+    Then Validate "File" plugin properties
+    Then Close the Plugin Properties page
+    Then Navigate to the properties page of plugin: "Distinct"
+    Then Click on the Get Schema button
+    Then Verify the Output Schema matches the Expected Schema: "distinctCsvFileSchema"
+    Then Validate "Distinct" plugin properties
+    Then Close the Plugin Properties page
+    Then Navigate to the properties page of plugin: "File2"
+    Then Enter input plugin property: "referenceName" with value: "FileReferenceName"
+    Then Enter input plugin property: "path" with value: "fileSinkTargetBucket"
+    Then Replace input plugin property: "pathSuffix" with value: "yyyy-MM-dd-HH-mm-ss"
+    Then Select dropdown plugin property: "format" with option value: "csv"
+    Then Validate "File2" plugin properties
+    Then Close the Plugin Properties page
+    Then Save the pipeline
+    Then Preview and run the pipeline
+    Then Wait till pipeline preview is in running state
+    Then Open and capture pipeline preview logs
+    Then Verify the preview run status of pipeline in the logs is "succeeded"
+    Then Close the pipeline logs
+    Then Close the preview
+    Then Deploy the pipeline
+    Then Run the Pipeline in Runtime
+    Then Wait till pipeline is in running state
+    Then Open and capture logs
+    Then Verify the pipeline status is "Succeeded"
+    Then Close the pipeline logs
+    Then Validate OUT record count of distinct is equal to IN record count of sink
+    Then Validate output file generated by file sink plugin "fileSinkTargetBucket" is equal to expected output file "distinctOutputFile"
+
+  @GCS_DISTINCT_TEST1 @FILE_SINK_TEST @Distinct_Required
+  Scenario: To verify data is getting transferred from File source to File sink plugin successfully with field names having unique records
+    Given Open Datafusion Project to configure pipeline
+    When Select plugin: "File" from the plugins list as: "Source"
+    When Expand Plugin group in the LHS plugins list: "Analytics"
+    When Select plugin: "Distinct" from the plugins list as: "Analytics"
+    Then Connect plugins: "File" and "Distinct" to establish connection
+    When Expand Plugin group in the LHS plugins list: "Sink"
+    When Select plugin: "File" from the plugins list as: "Sink"
+    Then Connect plugins: "Distinct" and "File2" to establish connection
+    Then Navigate to the properties page of plugin: "File"
+    Then Enter input plugin property: "referenceName" with value: "FileReferenceName"
+    Then Enter input plugin property: "path" with value: "gcsDistinctTest1"
+    Then Select dropdown plugin property: "format" with option value: "csv"
+    Then Click plugin property: "skipHeader"
+    Then Click on the Get Schema button
+    Then Verify the Output Schema matches the Expected Schema: "distinctCsvAllDataTypeFileSchema"
+    Then Validate "File" plugin properties
+    Then Close the Plugin Properties page
+    Then Navigate to the properties page of plugin: "Distinct"
+    Then Enter the Distinct plugin fields as list "distinctFieldsWithUniqueRecords"
+    Then Click on the Get Schema button
+    Then Verify the Output Schema matches the Expected Schema: "distinctFieldsWithUniqueRecordsOutputSchema"
+    Then Validate "Distinct" plugin properties
+    Then Close the Plugin Properties page
+    Then Navigate to the properties page of plugin: "File2"
+    Then Enter input plugin property: "referenceName" with value: "FileReferenceName"
+    Then Enter input plugin property: "path" with value: "fileSinkTargetBucket"
+    Then Replace input plugin property: "pathSuffix" with value: "yyyy-MM-dd-HH-mm-ss"
+    Then Select dropdown plugin property: "format" with option value: "csv"
+    Then Validate "File2" plugin properties
+    Then Close the Plugin Properties page
+    Then Save the pipeline
+    Then Preview and run the pipeline
+    Then Wait till pipeline preview is in running state
+    Then Open and capture pipeline preview logs
+    Then Verify the preview run status of pipeline in the logs is "succeeded"
+    Then Close the pipeline logs
+    Then Close the preview
+    Then Deploy the pipeline
+    Then Run the Pipeline in Runtime
+    Then Wait till pipeline is in running state
+    Then Open and capture logs
+    Then Verify the pipeline status is "Succeeded"
+    Then Close the pipeline logs
+    Then Validate OUT record count of distinct is equal to IN record count of sink
+    Then Validate output file generated by file sink plugin "fileSinkTargetBucket" is equal to expected output file "distinctFieldsWithUniqueRecordsOutputFile"
diff --git a/core-plugins/src/e2e-test/resources/errorMessage.properties b/core-plugins/src/e2e-test/resources/errorMessage.properties
index f42abe51a..16e73cfbb 100644
--- a/core-plugins/src/e2e-test/resources/errorMessage.properties
+++ b/core-plugins/src/e2e-test/resources/errorMessage.properties
@@ -23,3 +23,8 @@ errorMessageJoinerBasicJoinCondition=Join keys cannot be empty
 errorMessageJoinerAdvancedJoinCondition=A join condition must be specified.
 errorMessageJoinerInputLoadMemory=Advanced outer joins must specify an input to load in memory.
 errorMessageJoinerAdvancedJoinConditionType=Advanced join conditions can only be used when there are two inputs.
+errorLogsMessageDistinctInvalidFields=Spark program 'phase-1' failed with error: Errors were encountered during validation.\
+  \ Field $^&* does not exist in input schema.. Please check the system logs for more details.
+errorLogsMessageDistinctInvalidNumberOfPartitions=Spark program 'phase-1' failed with error: Unable to create config \
+  for batchaggregator Distinct 'numPartitions' is invalid: Value of field class io.cdap.plugin.\
+  batch.aggregator.AggregatorConfig.numPartitions is expected to be a number.. Please check the system logs for more details.
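Note on the two errorLogsMessage* values added above: java.util.Properties treats a trailing backslash as a line continuation and "\ " as an escaped (preserved) space, so each multi-line entry loads as a single message string. A minimal sketch that loads the file and prints one joined value to confirm the continuation; the class name and classpath location are illustrative assumptions, not part of the change:

import java.io.InputStream;
import java.util.Properties;

public class ErrorMessageSmokeCheck {
    public static void main(String[] args) throws Exception {
        Properties props = new Properties();
        // Hypothetical resource path; assumes errorMessage.properties is on the test classpath.
        try (InputStream in =
                 ErrorMessageSmokeCheck.class.getResourceAsStream("/errorMessage.properties")) {
            // Properties.load() joins logical lines ending in '\' into one value.
            props.load(in);
        }
        System.out.println(props.getProperty("errorLogsMessageDistinctInvalidFields"));
    }
}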
diff --git a/core-plugins/src/e2e-test/resources/pluginParameters.properties b/core-plugins/src/e2e-test/resources/pluginParameters.properties
index f89e75c42..9388817fa 100644
--- a/core-plugins/src/e2e-test/resources/pluginParameters.properties
+++ b/core-plugins/src/e2e-test/resources/pluginParameters.properties
@@ -169,6 +169,11 @@ distinctCsvAllDataTypeFileSchema=[{"key":"id","value":"int"},{"key":"name","valu
 distinctDatatypeOutputFile=e2e-tests/expected_outputs/CSV_DISTINCT_TEST1_Output.csv
 distinctCsvOutputFile=e2e-tests/expected_outputs/CSV_DISTINCT_TEST2_Output.csv
 distinctMacroOutputFile=e2e-tests/expected_outputs/CSV_DISTINCT_TEST3_Output.csv
+distinctOutputFile=e2e-tests/expected_outputs/CSV_DISTINCT_Output
+distinctFieldsWithUniqueRecords=id, name, yearofbirth
+distinctFieldsWithUniqueRecordsOutputSchema=[{"key":"id","value":"int"},{"key":"name","value":"string"},\
+  {"key":"yearofbirth","value":"int"}]
+distinctFieldsWithUniqueRecordsOutputFile=e2e-tests/expected_outputs/CSV_FIELDWITHUNIQUERECORDS_OUTPUT.csv
 ## DISTINCT-PLUGIN-PROPERTIES-END
 
 ## Deduplicate-PLUGIN-PROPERTIES-START
diff --git a/core-plugins/src/e2e-test/resources/testdata/expected_outputs/CSV_DISTINCT_Output b/core-plugins/src/e2e-test/resources/testdata/expected_outputs/CSV_DISTINCT_Output
new file mode 100644
index 000000000..cf16a47eb
--- /dev/null
+++ b/core-plugins/src/e2e-test/resources/testdata/expected_outputs/CSV_DISTINCT_Output
@@ -0,0 +1,6 @@
+bob,coffee,buy,2019-03-11 04:50:01 UTC
+bob,coffee,drink,2019-03-12 04:50:01 UTC
+bob,donut,eat,2019-03-08 04:50:01 UTC
+bob,donut,buy,2019-03-10 04:50:01 UTC
+bob,donut,buy,2019-03-11 04:50:01 UTC
+bob,donut,eat,2019-03-09 04:50:01 UTC
diff --git a/core-plugins/src/e2e-test/resources/testdata/expected_outputs/CSV_FIELDWITHUNIQUERECORDS_OUTPUT.csv b/core-plugins/src/e2e-test/resources/testdata/expected_outputs/CSV_FIELDWITHUNIQUERECORDS_OUTPUT.csv
new file mode 100644
index 000000000..c00c276f4
--- /dev/null
+++ b/core-plugins/src/e2e-test/resources/testdata/expected_outputs/CSV_FIELDWITHUNIQUERECORDS_OUTPUT.csv
@@ -0,0 +1,4 @@
+4,galilée,1564
+3,marie curie,1867
+2,isaac newton,1643
+1,albert einstein,1879
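Note on the expected_outputs fixtures: the File sink writes rows out of Spark partitions, so on-disk row order is not guaranteed to match a fixture line for line. A minimal, hypothetical sketch of an order-insensitive comparison; the class and method names are illustrative and this is not the suite's actual validation step (set semantics also collapse duplicate rows, which is acceptable for distinct output):

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.HashSet;
import java.util.Set;

public class OutputFileComparer {
    // Returns true when both files contain the same set of rows, ignoring order.
    static boolean sameRows(String actualPath, String expectedPath) throws IOException {
        Set<String> actual = new HashSet<>(Files.readAllLines(Paths.get(actualPath)));
        Set<String> expected = new HashSet<>(Files.readAllLines(Paths.get(expectedPath)));
        return actual.equals(expected);
    }

    public static void main(String[] args) throws IOException {
        // Usage: java OutputFileComparer <sink-output-file> <expected-fixture-file>
        System.out.println(sameRows(args[0], args[1]) ? "MATCH" : "MISMATCH");
    }
}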