From 002e0cfcb1245ae4e8667a9fc6b612c1b9591ec0 Mon Sep 17 00:00:00 2001
From: suryakumari
Date: Thu, 5 Dec 2024 19:05:55 +0530
Subject: [PATCH] e2e groupby additional tests

---
 .../features/groupby/GroupByMacros.feature    | 150 +++++++++++++
 .../features/groupby/GroupByWithFile.feature  | 205 ++++++++++++++++++
 .../common/stepsdesign/TestSetupHooks.java    |  10 +-
 .../resources/errorMessage.properties         |   3 +
 .../resources/pluginParameters.properties     |  34 +++
 .../CSV_GROUPBY_TEST10_Output                 |   5 +
 .../CSV_GROUPBY_TEST11_Output                 |   5 +
 .../expected_outputs/CSV_GROUPBY_TEST7_Output |   5 +
 .../expected_outputs/CSV_GROUPBY_TEST8_Output |  12 +
 .../expected_outputs/CSV_GROUPBY_TEST9_Output |  12 +
 .../testdata/file/CSV_GROUPBY_TEST2.csv       |  11 +
 11 files changed, 451 insertions(+), 1 deletion(-)
 create mode 100644 core-plugins/src/e2e-test/resources/testdata/expected_outputs/CSV_GROUPBY_TEST10_Output
 create mode 100644 core-plugins/src/e2e-test/resources/testdata/expected_outputs/CSV_GROUPBY_TEST11_Output
 create mode 100644 core-plugins/src/e2e-test/resources/testdata/expected_outputs/CSV_GROUPBY_TEST7_Output
 create mode 100644 core-plugins/src/e2e-test/resources/testdata/expected_outputs/CSV_GROUPBY_TEST8_Output
 create mode 100644 core-plugins/src/e2e-test/resources/testdata/expected_outputs/CSV_GROUPBY_TEST9_Output
 create mode 100644 core-plugins/src/e2e-test/resources/testdata/file/CSV_GROUPBY_TEST2.csv

diff --git a/core-plugins/src/e2e-test/features/groupby/GroupByMacros.feature b/core-plugins/src/e2e-test/features/groupby/GroupByMacros.feature
index 293280a42..c26b54e40 100644
--- a/core-plugins/src/e2e-test/features/groupby/GroupByMacros.feature
+++ b/core-plugins/src/e2e-test/features/groupby/GroupByMacros.feature
@@ -57,3 +57,153 @@ Feature:GroupBy - Verification of GroupBy pipeline with File as source and File
     Then Close the pipeline logs
     Then Validate OUT record count of groupby is equal to IN record count of sink
     Then Validate output file generated by file sink plugin "fileSinkTargetBucket" is equal to expected output file "groupByMacroOutputFile"
+
+  @GROUP_BY_TEST @FILE_SINK_TEST
+  Scenario: To verify pipeline is failed when fields and aggregates are set as macro arguments with invalid values
+    Given Open Datafusion Project to configure pipeline
+    When Select plugin: "File" from the plugins list as: "Source"
+    When Expand Plugin group in the LHS plugins list: "Analytics"
+    When Select plugin: "Group By" from the plugins list as: "Analytics"
+    Then Connect plugins: "File" and "Group By" to establish connection
+    When Expand Plugin group in the LHS plugins list: "Sink"
+    When Select plugin: "File" from the plugins list as: "Sink"
+    Then Connect plugins: "Group By" and "File2" to establish connection
+    Then Navigate to the properties page of plugin: "File"
+    Then Enter input plugin property: "referenceName" with value: "FileReferenceName"
+    Then Enter input plugin property: "path" with value: "groupByTest"
+    Then Select dropdown plugin property: "format" with option value: "csv"
+    Then Click plugin property: "skipHeader"
+    Then Click on the Get Schema button
+    Then Verify the Output Schema matches the Expected Schema: "groupByCsvDataTypeFileSchema"
+    Then Validate "File" plugin properties
+    Then Close the Plugin Properties page
+    Then Navigate to the properties page of plugin: "Group By"
+    Then Click on the Macro button of Property: "groupByFields" and set the value to: "groupByField"
+    Then Click on the Macro button of Property: "aggregates" and set the value to: "groupByAggregatesField"
+    Then Click on the Get Schema button
+    Then Click on the Validate button
+    Then Close the Plugin Properties page
+    Then Navigate to the properties page of plugin: "File2"
+    Then Enter input plugin property: "referenceName" with value: "FileReferenceName"
+    Then Enter input plugin property: "path" with value: "fileSinkTargetBucket"
+    Then Replace input plugin property: "pathSuffix" with value: "yyyy-MM-dd-HH-mm-ss"
+    Then Select dropdown plugin property: "format" with option value: "tsv"
+    Then Validate "File2" plugin properties
+    Then Close the Plugin Properties page
+    Then Save the pipeline
+    Then Deploy the pipeline
+    Then Run the Pipeline in Runtime
+    Then Enter runtime argument value "invalidGroupByFields" for key "groupByField"
+    Then Enter runtime argument value "invalidGroupByAggregatesField" for key "groupByAggregatesField"
+    Then Run the Pipeline in Runtime with runtime arguments
+    Then Wait till pipeline is in running state
+    Then Open and capture logs
+    Then Verify the pipeline status is "Failed"
+    Then Close the pipeline logs
+    Then Open Pipeline logs and verify Log entries having below listed Level and Message:
+      | Level | Message                              |
+      | ERROR | errorLogsMessageGroupByInvalidFields |
+
+  @GROUP_BY_TEST @FILE_SINK_TEST
+  Scenario: To verify pipeline is failed when number of partitions is set as a macro argument with an invalid value
+    Given Open Datafusion Project to configure pipeline
+    When Select plugin: "File" from the plugins list as: "Source"
+    When Expand Plugin group in the LHS plugins list: "Analytics"
+    When Select plugin: "Group By" from the plugins list as: "Analytics"
+    Then Connect plugins: "File" and "Group By" to establish connection
+    When Expand Plugin group in the LHS plugins list: "Sink"
+    When Select plugin: "File" from the plugins list as: "Sink"
+    Then Connect plugins: "Group By" and "File2" to establish connection
+    Then Navigate to the properties page of plugin: "File"
+    Then Enter input plugin property: "referenceName" with value: "FileReferenceName"
+    Then Enter input plugin property: "path" with value: "groupByTest"
+    Then Select dropdown plugin property: "format" with option value: "csv"
+    Then Click plugin property: "skipHeader"
+    Then Click on the Get Schema button
+    Then Verify the Output Schema matches the Expected Schema: "groupByCsvDataTypeFileSchema"
+    Then Validate "File" plugin properties
+    Then Close the Plugin Properties page
+    Then Navigate to the properties page of plugin: "Group By"
+    Then Select dropdown plugin property: "groupByFields" with option value: "groupByValidFirstField"
+    Then Press ESC key to close the unique fields dropdown
+    Then Select dropdown plugin property: "groupByFields" with option value: "groupByValidSecondField"
+    Then Press ESC key to close the unique fields dropdown
+    Then Enter GroupBy plugin Fields to be Aggregate "groupByGcsAggregateFields"
+    Then Click on the Macro button of Property: "numberOfPartitions" and set the value to: "groupByInvalidPartitions"
+    Then Click on the Get Schema button
+    Then Click on the Validate button
+    Then Close the Plugin Properties page
+    Then Navigate to the properties page of plugin: "File2"
+    Then Enter input plugin property: "referenceName" with value: "FileReferenceName"
+    Then Enter input plugin property: "path" with value: "fileSinkTargetBucket"
+    Then Replace input plugin property: "pathSuffix" with value: "yyyy-MM-dd-HH-mm-ss"
+    Then Select dropdown plugin property: "format" with option value: "tsv"
+    Then Validate "File2" plugin properties
+    Then Close the Plugin Properties page
+    Then Save the pipeline
+    Then Deploy the pipeline
+    Then Run the Pipeline in Runtime
+    Then Enter runtime argument value "groupByInvalidPartitionValue" for key "groupByInvalidPartitions"
+    Then Run the Pipeline in Runtime with runtime arguments
+    Then Wait till pipeline is in running state
+    Then Open and capture logs
+    Then Verify the pipeline status is "Failed"
+    Then Close the pipeline logs
+    Then Open Pipeline logs and verify Log entries having below listed Level and Message:
+      | Level | Message                                          |
+      | ERROR | errorLogsMessageGroupByInvalidNumberOfPartitions |
+
+  @GROUP_BY_TEST @FILE_SINK_TEST
+  Scenario: To verify complete flow of data extract and transfer from File source to File sink with GroupBy plugin using macro-enabled count(Asterisk) aggregate
+    Given Open Datafusion Project to configure pipeline
+    When Select plugin: "File" from the plugins list as: "Source"
+    When Expand Plugin group in the LHS plugins list: "Analytics"
+    When Select plugin: "Group By" from the plugins list as: "Analytics"
+    Then Connect plugins: "File" and "Group By" to establish connection
+    When Expand Plugin group in the LHS plugins list: "Sink"
+    When Select plugin: "File" from the plugins list as: "Sink"
+    Then Connect plugins: "Group By" and "File2" to establish connection
+    Then Navigate to the properties page of plugin: "File"
+    Then Enter input plugin property: "referenceName" with value: "FileReferenceName"
+    Then Enter input plugin property: "path" with value: "groupByTest"
+    Then Select dropdown plugin property: "format" with option value: "csv"
+    Then Click plugin property: "skipHeader"
+    Then Click on the Get Schema button
+    Then Verify the Output Schema matches the Expected Schema: "groupByCsvDataTypeFileSchema"
+    Then Validate "File" plugin properties
+    Then Close the Plugin Properties page
+    Then Navigate to the properties page of plugin: "Group By"
+    Then Select dropdown plugin property: "groupByFields" with option value: "groupByValidFirstField"
+    Then Press ESC key to close the unique fields dropdown
+    Then Select dropdown plugin property: "groupByFields" with option value: "groupByValidSecondField"
+    Then Press ESC key to close the unique fields dropdown
+    Then Click on the Macro button of Property: "aggregates" and set the value to: "groupByAggregate"
+    Then Click on the Get Schema button
+    Then Click on the Validate button
+    Then Close the Plugin Properties page
+    Then Navigate to the properties page of plugin: "File2"
+    Then Enter input plugin property: "referenceName" with value: "FileReferenceName"
+    Then Enter input plugin property: "path" with value: "fileSinkTargetBucket"
+    Then Replace input plugin property: "pathSuffix" with value: "yyyy-MM-dd-HH-mm-ss"
+    Then Select dropdown plugin property: "format" with option value: "csv"
+    Then Validate "File2" plugin properties
+    Then Close the Plugin Properties page
+    Then Save the pipeline
+    Then Preview and run the pipeline
+    Then Enter runtime argument value "groupByMacroCountAggregate" for key "groupByAggregate"
+    Then Run the preview of pipeline with runtime arguments
+    Then Wait till pipeline preview is in running state
+    Then Open and capture pipeline preview logs
+    Then Verify the preview run status of pipeline in the logs is "succeeded"
+    Then Close the pipeline logs
+    Then Close the preview
+    Then Deploy the pipeline
+    Then Run the Pipeline in Runtime
+    Then Enter runtime argument value "groupByMacroCountAggregate" for key "groupByAggregate"
+    Then Run the Pipeline in Runtime with runtime arguments
+    Then Wait till pipeline is in running state
+    Then Open and capture logs
+    Then Verify the pipeline status is "Succeeded"
+    Then Close the pipeline logs
+    Then Validate OUT record count of groupby is equal to IN record count of sink
+    Then Validate output file generated by file sink plugin "fileSinkTargetBucket" is equal to expected output file "groupByTest7OutputFile"
diff --git a/core-plugins/src/e2e-test/features/groupby/GroupByWithFile.feature b/core-plugins/src/e2e-test/features/groupby/GroupByWithFile.feature
index 7c33daf07..618e9514d 100644
--- a/core-plugins/src/e2e-test/features/groupby/GroupByWithFile.feature
+++ b/core-plugins/src/e2e-test/features/groupby/GroupByWithFile.feature
@@ -257,3 +257,208 @@ Feature: GroupBy - Verify File source to File sink data transfer using GroupBy a
     Then Validate OUT record count of groupby is equal to IN record count of sink
     Then Validate output file generated by file sink plugin "fileSinkTargetBucket" is equal to expected output file "groupByTest6OutputFile"
 
+  @GROUP_BY_TEST @FILE_SINK_TEST
+  Scenario: To verify complete flow of data extract and transfer from File source to File sink with GroupBy plugin using first,last,longestString,shortestString,countNulls aggregates
+    Given Open Datafusion Project to configure pipeline
+    When Select plugin: "File" from the plugins list as: "Source"
+    When Expand Plugin group in the LHS plugins list: "Analytics"
+    When Select plugin: "Group By" from the plugins list as: "Analytics"
+    Then Connect plugins: "File" and "Group By" to establish connection
+    When Expand Plugin group in the LHS plugins list: "Sink"
+    When Select plugin: "File" from the plugins list as: "Sink"
+    Then Connect plugins: "Group By" and "File2" to establish connection
+    Then Navigate to the properties page of plugin: "File"
+    Then Enter input plugin property: "referenceName" with value: "FileReferenceName"
+    Then Enter input plugin property: "path" with value: "groupByTest"
+    Then Select dropdown plugin property: "format" with option value: "csv"
+    Then Click plugin property: "skipHeader"
+    Then Click on the Get Schema button
+    Then Verify the Output Schema matches the Expected Schema: "groupByCsvDataTypeFileSchema"
+    Then Validate "File" plugin properties
+    Then Close the Plugin Properties page
+    Then Navigate to the properties page of plugin: "Group By"
+    Then Select dropdown plugin property: "groupByFields" with option value: "groupByValidFirstField"
+    Then Press ESC key to close the unique fields dropdown
+    Then Select dropdown plugin property: "groupByFields" with option value: "groupByValidSecondField"
+    Then Press ESC key to close the unique fields dropdown
+    Then Select dropdown plugin property: "groupByFields" with option value: "groupByValidThirdField"
+    Then Press ESC key to close the unique fields dropdown
+    Then Enter GroupBy plugin Fields to be Aggregate "groupBySet1Aggregates"
+    Then Click on the Get Schema button
+    Then Click on the Validate button
+    Then Close the Plugin Properties page
+    Then Navigate to the properties page of plugin: "File2"
+    Then Enter input plugin property: "referenceName" with value: "FileReferenceName"
+    Then Enter input plugin property: "path" with value: "fileSinkTargetBucket"
+    Then Replace input plugin property: "pathSuffix" with value: "yyyy-MM-dd-HH-mm-ss"
+    Then Select dropdown plugin property: "format" with option value: "csv"
+    Then Validate "File2" plugin properties
+    Then Close the Plugin Properties page
+    Then Save the pipeline
+    Then Preview and run the pipeline
+    Then Wait till pipeline preview is in running state
+    Then Open and capture pipeline preview logs
+    Then Verify the preview run status of pipeline in the logs is "succeeded"
+    Then Close the pipeline logs
+    Then Close the preview
+    Then Deploy the pipeline
+    Then Run the Pipeline in Runtime
+    Then Wait till pipeline is in running state
+    Then Open and capture logs
+    Then Verify the pipeline status is "Succeeded"
+    Then Close the pipeline logs
+    Then Validate OUT record count of groupby is equal to IN record count of sink
+    Then Validate output file generated by file sink plugin "fileSinkTargetBucket" is equal to expected output file "groupByTest8OutputFile"
+
+  @GROUP_BY_TEST @FILE_SINK_TEST
+  Scenario: To verify complete flow of data extract and transfer from File source to File sink with GroupBy plugin using sumOfSquaresIf,correctedSumOfSquaresIf,sumOfSquares,correctedSumOfSquares aggregates
+    Given Open Datafusion Project to configure pipeline
+    When Select plugin: "File" from the plugins list as: "Source"
+    When Expand Plugin group in the LHS plugins list: "Analytics"
+    When Select plugin: "Group By" from the plugins list as: "Analytics"
+    Then Connect plugins: "File" and "Group By" to establish connection
+    When Expand Plugin group in the LHS plugins list: "Sink"
+    When Select plugin: "File" from the plugins list as: "Sink"
+    Then Connect plugins: "Group By" and "File2" to establish connection
+    Then Navigate to the properties page of plugin: "File"
+    Then Enter input plugin property: "referenceName" with value: "FileReferenceName"
+    Then Enter input plugin property: "path" with value: "groupByTest"
+    Then Select dropdown plugin property: "format" with option value: "csv"
+    Then Click plugin property: "skipHeader"
+    Then Click on the Get Schema button
+    Then Verify the Output Schema matches the Expected Schema: "groupByCsvDataTypeFileSchema"
+    Then Validate "File" plugin properties
+    Then Close the Plugin Properties page
+    Then Navigate to the properties page of plugin: "Group By"
+    Then Select dropdown plugin property: "groupByFields" with option value: "groupByValidFirstField"
+    Then Press ESC key to close the unique fields dropdown
+    Then Select dropdown plugin property: "groupByFields" with option value: "groupByValidSecondField"
+    Then Press ESC key to close the unique fields dropdown
+    Then Select dropdown plugin property: "groupByFields" with option value: "groupByValidThirdField"
+    Then Press ESC key to close the unique fields dropdown
+    Then Enter GroupBy plugin Fields to be Aggregate "groupBySet2Aggregates"
+    Then Click on the Get Schema button
+    Then Click on the Validate button
+    Then Close the Plugin Properties page
+    Then Navigate to the properties page of plugin: "File2"
+    Then Enter input plugin property: "referenceName" with value: "FileReferenceName"
+    Then Enter input plugin property: "path" with value: "fileSinkTargetBucket"
+    Then Replace input plugin property: "pathSuffix" with value: "yyyy-MM-dd-HH-mm-ss"
+    Then Select dropdown plugin property: "format" with option value: "csv"
+    Then Validate "File2" plugin properties
+    Then Close the Plugin Properties page
+    Then Save the pipeline
+    Then Preview and run the pipeline
+    Then Wait till pipeline preview is in running state
+    Then Open and capture pipeline preview logs
+    Then Verify the preview run status of pipeline in the logs is "succeeded"
+    Then Close the pipeline logs
+    Then Close the preview
+    Then Deploy the pipeline
+    Then Run the Pipeline in Runtime
+    Then Wait till pipeline is in running state
+    Then Open and capture logs
+    Then Verify the pipeline status is "Succeeded"
+    Then Close the pipeline logs
+    Then Validate OUT record count of groupby is equal to IN record count of sink
+    Then Validate output file generated by file sink plugin "fileSinkTargetBucket" is equal to expected output file "groupByTest9OutputFile"
+
+  @GROUPBY_TEST @FILE_SINK_TEST
+  Scenario: To verify complete flow of data extract and transfer from File source to File sink with GroupBy plugin using collectList,collectSet,collectListIf,collectSetIf aggregates
+    Given Open Datafusion Project to configure pipeline
+    When Select plugin: "File" from the plugins list as: "Source"
+    When Expand Plugin group in the LHS plugins list: "Analytics"
+    When Select plugin: "Group By" from the plugins list as: "Analytics"
+    Then Connect plugins: "File" and "Group By" to establish connection
+    When Expand Plugin group in the LHS plugins list: "Sink"
+    When Select plugin: "File" from the plugins list as: "Sink"
+    Then Connect plugins: "Group By" and "File2" to establish connection
+    Then Navigate to the properties page of plugin: "File"
+    Then Enter input plugin property: "referenceName" with value: "FileReferenceName"
+    Then Enter input plugin property: "path" with value: "groupByTest"
+    Then Select dropdown plugin property: "format" with option value: "csv"
+    Then Click plugin property: "skipHeader"
+    Then Click on the Get Schema button
+    Then Verify the Output Schema matches the Expected Schema: "groupByCsvTestFileSchema"
+    Then Validate "File" plugin properties
+    Then Close the Plugin Properties page
+    Then Navigate to the properties page of plugin: "Group By"
+    Then Select dropdown plugin property: "groupByFields" with option value: "groupByField"
+    Then Press ESC key to close the unique fields dropdown
+    Then Enter GroupBy plugin Fields to be Aggregate "groupBySet3Aggregates"
+    Then Click on the Get Schema button
+    Then Click on the Validate button
+    Then Close the Plugin Properties page
+    Then Navigate to the properties page of plugin: "File2"
+    Then Enter input plugin property: "referenceName" with value: "FileReferenceName"
+    Then Enter input plugin property: "path" with value: "fileSinkTargetBucket"
+    Then Replace input plugin property: "pathSuffix" with value: "yyyy-MM-dd-HH-mm-ss"
+    Then Select dropdown plugin property: "format" with option value: "json"
+    Then Validate "File2" plugin properties
+    Then Close the Plugin Properties page
+    Then Save the pipeline
+    Then Preview and run the pipeline
+    Then Wait till pipeline preview is in running state
+    Then Open and capture pipeline preview logs
+    Then Verify the preview run status of pipeline in the logs is "succeeded"
+    Then Close the pipeline logs
+    Then Close the preview
+    Then Deploy the pipeline
+    Then Run the Pipeline in Runtime
+    Then Wait till pipeline is in running state
+    Then Open and capture logs
+    Then Verify the pipeline status is "Succeeded"
+    Then Close the pipeline logs
+    Then Validate OUT record count of groupby is equal to IN record count of sink
+    Then Validate output file generated by file sink plugin "fileSinkTargetBucket" is equal to expected output file "groupByTest10OutputFile"
+
+  @GROUPBY_TEST @FILE_SINK_TEST
+  Scenario: To verify complete flow of data extract and transfer from File source to File sink with GroupBy plugin using countDistinctIf,
+  longestStringIf,shortestStringIf,concatIf,varianceIf,concatDistinctIf,stdDevIf,logicalAndIf,logicalOrIf,logicalAnd,logicalOr aggregates
+    Given Open Datafusion Project to configure pipeline
+    When Select plugin: "File" from the plugins list as: "Source"
+    When Expand Plugin group in the LHS plugins list: "Analytics"
+    When Select plugin: "Group By" from the plugins list as: "Analytics"
+    Then Connect plugins: "File" and "Group By" to establish connection
+    When Expand Plugin group in the LHS plugins list: "Sink"
+    When Select plugin: "File" from the plugins list as: "Sink"
+    Then Connect plugins: "Group By" and "File2" to establish connection
+    Then Navigate to the properties page of plugin: "File"
+    Then Enter input plugin property: "referenceName" with value: "FileReferenceName"
+    Then Enter input plugin property: "path" with value: "groupByTest"
+    Then Select dropdown plugin property: "format" with option value: "csv"
+    Then Click plugin property: "skipHeader"
+    Then Click on the Get Schema button
+    Then Verify the Output Schema matches the Expected Schema: "groupByCsvTestFileSchema"
+    Then Validate "File" plugin properties
+    Then Close the Plugin Properties page
+    Then Navigate to the properties page of plugin: "Group By"
+    Then Select dropdown plugin property: "groupByFields" with option value: "groupByField"
+    Then Press ESC key to close the unique fields dropdown
+    Then Enter GroupBy plugin Fields to be Aggregate "groupBySet4Aggregates"
+    Then Click on the Get Schema button
+    Then Click on the Validate button
+    Then Close the Plugin Properties page
+    Then Navigate to the properties page of plugin: "File2"
+    Then Enter input plugin property: "referenceName" with value: "FileReferenceName"
+    Then Enter input plugin property: "path" with value: "fileSinkTargetBucket"
+    Then Replace input plugin property: "pathSuffix" with value: "yyyy-MM-dd-HH-mm-ss"
+    Then Select dropdown plugin property: "format" with option value: "csv"
+    Then Validate "File2" plugin properties
+    Then Close the Plugin Properties page
+    Then Save the pipeline
+    Then Preview and run the pipeline
+    Then Wait till pipeline preview is in running state
+    Then Open and capture pipeline preview logs
+    Then Verify the preview run status of pipeline in the logs is "succeeded"
+    Then Close the pipeline logs
+    Then Close the preview
+    Then Deploy the pipeline
+    Then Run the Pipeline in Runtime
+    Then Wait till pipeline is in running state
+    Then Open and capture logs
+    Then Verify the pipeline status is "Succeeded"
+    Then Close the pipeline logs
+    Then Validate OUT record count of groupby is equal to IN record count of sink
+    Then Validate output file generated by file sink plugin "fileSinkTargetBucket" is equal to expected output file "groupByTest11OutputFile"
+
diff --git a/core-plugins/src/e2e-test/java/io/cdap/plugin/common/stepsdesign/TestSetupHooks.java b/core-plugins/src/e2e-test/java/io/cdap/plugin/common/stepsdesign/TestSetupHooks.java
index dab8c670c..8ef6d1fd2 100644
--- a/core-plugins/src/e2e-test/java/io/cdap/plugin/common/stepsdesign/TestSetupHooks.java
+++ b/core-plugins/src/e2e-test/java/io/cdap/plugin/common/stepsdesign/TestSetupHooks.java
@@ -167,7 +167,7 @@ public static void createBucketWithGroupByTest1File() throws IOException, URISyn
     BeforeActions.scenario.write("Group by bucket name - " + gcsSourceBucketName1);
   }
 
-  @After(order = 1, value = "@GROUP_BY_TEST or @ROW_DENO_TEST")
+  @After(order = 1, value = "@GROUP_BY_TEST or @ROW_DENO_TEST or @GROUPBY_TEST")
   public static void deleteSourceBucketWithGroupByTest1File() {
     deleteGCSBucket(gcsSourceBucketName1);
     gcsSourceBucketName1 = StringUtils.EMPTY;
@@ -542,4 +542,12 @@ public static void createBucketWithRowDenormalizerTestFile() throws IOException,
       PluginPropertyUtils.pluginProp("rowDenoGcsCsvFile"));
     BeforeActions.scenario.write("RowDenormalizer bucket name - " + gcsSourceBucketName1);
   }
+
+  @Before(order = 1, value = "@GROUPBY_TEST")
+  public static void createBucketWithGroupByTest2File() throws IOException, URISyntaxException {
+    gcsSourceBucketName1 = createGCSBucketWithFile(PluginPropertyUtils.pluginProp("groupByGcsCsvTestFile"));
+    PluginPropertyUtils.addPluginProp("groupByTest", "gs://" + gcsSourceBucketName1 + "/" +
+      PluginPropertyUtils.pluginProp("groupByGcsCsvTestFile"));
+    BeforeActions.scenario.write("Group by bucket name - " + gcsSourceBucketName1);
+  }
 }
diff --git a/core-plugins/src/e2e-test/resources/errorMessage.properties b/core-plugins/src/e2e-test/resources/errorMessage.properties
index 02bfce937..a5b3108b8 100644
--- a/core-plugins/src/e2e-test/resources/errorMessage.properties
+++ b/core-plugins/src/e2e-test/resources/errorMessage.properties
@@ -29,3 +29,6 @@ errorLogsMessageDistinctInvalidFields=Spark program 'phase-1' failed with error:
 errorLogsMessageDistinctInvalidNumberOfPartitions=Spark program 'phase-1' failed with error: Unable to create config \
   for batchaggregator Distinct 'numPartitions' is invalid: Value of field class io.cdap.plugin.\
   batch.aggregator.AggregatorConfig.numPartitions is expected to be a number.. Please check the system logs for more details.
+errorLogsMessageGroupByInvalidFields=Spark program 'phase-1' failed with error: null.
+errorLogsMessageGroupByInvalidNumberOfPartitions=Spark program 'phase-1' failed with error: Unable to create config for \
+  batchaggregator GroupByAggregate 'numPartitions' is invalid: Value of field class io.cdap.plugin.batch.aggregator.AggregatorConfig.numPartitions is expected to be a number.. Please check the system logs for more details.
diff --git a/core-plugins/src/e2e-test/resources/pluginParameters.properties b/core-plugins/src/e2e-test/resources/pluginParameters.properties
index f2d0553a6..2dd9fef4b 100644
--- a/core-plugins/src/e2e-test/resources/pluginParameters.properties
+++ b/core-plugins/src/e2e-test/resources/pluginParameters.properties
@@ -232,6 +232,40 @@ groupByFileAggregateMultipleSetFields2=[{"key":"price#Max","value":"MaxPrice"},\
 {"key":"price#Sum","value":"SumPrice"},{"key":"item#Count","value":"CountItem"}]
 groupByTest5OutputFile=e2e-tests/expected_outputs/CSV_GROUPBY_TEST5_Output.csv
 groupByTest6OutputFile=e2e-tests/expected_outputs/CSV_GROUPBY_TEST6_Output.csv
+invalidGroupByFields=abc
+invalidGroupByAggregatesField=Average:Avg(abc)
+groupByInvalidPartitionValue=&*^*
+groupByValidThirdField=price
+groupByMacroCountAggregate=totalSpent:sum(price),numPurchased:count(*),avgItemPrice:avgIf(price):condition(price>=0.50)
+groupBySet1Aggregates=[{"key":"user#First","value":"testfirst"}, {"key":"user#Last","value":"testlast"},\
+  {"key":"item#LongestString","value":"longstring"}, {"key":"user#ShortestString","value":"shortstring"},\
+  {"key":"user#CountNulls","value":"numberofnulls"}]
+groupBySet2Aggregates=[{"key":"price#CorrectedSumOfSquares","value":"correctedsumofsquares"}, \
+  {"key":"price#CorrectedSumOfSquaresIf#price>=0.6","value":"correctedsumofsquaresif"},\
+  {"key":"price#SumOfSquaresIf#price>=0.50","value":"sumofsquaresif"}, {"key":"price#SumOfSquares","value":"sumofsquares"}]
+groupBySet3Aggregates=[{"key":"product#CollectList","value":"products_CollectList"}, \
+  {"key":"product#CollectSet","value":"unique_products_CollectSet"},\
+  {"key":"product#CollectListIf#amount>0.50","value":"products_above_5"}, \
+  {"key":"product#CollectSetIf#amount>0.50","value":"unique_products_above_5"}]
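+# The aggregate properties above and below use the key format <field>#<Function>[#<condition>] mapped to
+# an output alias; the conditional (*If) functions only aggregate records that match the predicate. Each
+# alias must match the corresponding column name in the expected output files under testdata/expected_outputs.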
+groupBySet4Aggregates=[{"key":"product#CountDistinctIf#amount>5","value":"distinct_products_above_5"},{"key":"product#LongestStringIf#amount>3","value":"longest_product_above_3"},\
+  {"key":"product#ShortestStringIf#amount>3","value":"shortest_product_above_3"},{"key":"product#ConcatIf#amount>5","value":"concatenated_products_above_5"},\
+  {"key":"amount#VarianceIf#amount>3","value":"variance_amount_above_3"},{"key":"product#ConcatDistinctIf#amount>5","value":"distinct_concatenated_products_above_5"},\
+  {"key":"amount#StddevIf#amount>=5","value":"stddev_amount_above_5"},{"key":"is_promotion#LogicalAndIf#is_promotion = true","value":"all_promotions_LogicalAndIf"},\
+  {"key":"is_promotion#LogicalOrIf#is_promotion = true","value":"any_promotion_LogicalOrIf"},{"key":"is_promotion#LogicalAnd","value":"all_promotions"},\
+  {"key":"is_promotion#LogicalOr","value":"any_promotion"}]
+groupByGcsCsvTestFile=testdata/file/CSV_GROUPBY_TEST2.csv
+groupByField=customer_id
+groupByCsvTestFileSchema=[{"key":"transaction_id","value":"int"}, {"key":"customer_id","value":"int"},\
+  {"key":"product","value":"string"},{"key":"amount","value":"int"}, {"key":"transaction_date","value":"string"},\
+  {"key":"is_promotion","value":"boolean"}]
+groupByTest7OutputFile=e2e-tests/expected_outputs/CSV_GROUPBY_TEST7_Output
+groupByTest8OutputFile=e2e-tests/expected_outputs/CSV_GROUPBY_TEST8_Output
+groupByTest9OutputFile=e2e-tests/expected_outputs/CSV_GROUPBY_TEST9_Output
+groupByTest10OutputFile=e2e-tests/expected_outputs/CSV_GROUPBY_TEST10_Output
+groupByTest11OutputFile=e2e-tests/expected_outputs/CSV_GROUPBY_TEST11_Output
 ## GROUPBY-PLUGIN-PROPERTIES-END
 
 ## JOINER-PLUGIN-PROPERTIES-START
diff --git a/core-plugins/src/e2e-test/resources/testdata/expected_outputs/CSV_GROUPBY_TEST10_Output b/core-plugins/src/e2e-test/resources/testdata/expected_outputs/CSV_GROUPBY_TEST10_Output
new file mode 100644
index 000000000..f8d6c4f7e
--- /dev/null
+++ b/core-plugins/src/e2e-test/resources/testdata/expected_outputs/CSV_GROUPBY_TEST10_Output
@@ -0,0 +1,5 @@
+{"customer_id":103,"products_CollectList":["Peach","Plum"],"unique_products_CollectSet":["Plum","Peach"],"unique_products_above_5":["Plum","Peach"],"products_above_5":["Peach","Plum"]}
+{"customer_id":101,"products_CollectList":["Apple","Banana"],"unique_products_CollectSet":["Apple","Banana"],"unique_products_above_5":["Apple","Banana"],"products_above_5":["Apple","Banana"]}
+{"customer_id":105,"products_CollectList":["Watermelon","Banana"],"unique_products_CollectSet":["Watermelon","Banana"],"unique_products_above_5":["Watermelon","Banana"],"products_above_5":["Watermelon","Banana"]}
+{"customer_id":102,"products_CollectList":["Orange","Grapes"],"unique_products_CollectSet":["Grapes","Orange"],"unique_products_above_5":["Grapes","Orange"],"products_above_5":["Orange","Grapes"]}
+{"customer_id":104,"products_CollectList":["Pineapple","Apple"],"unique_products_CollectSet":["Apple","Pineapple"],"unique_products_above_5":["Apple","Pineapple"],"products_above_5":["Pineapple","Apple"]}
diff --git a/core-plugins/src/e2e-test/resources/testdata/expected_outputs/CSV_GROUPBY_TEST11_Output b/core-plugins/src/e2e-test/resources/testdata/expected_outputs/CSV_GROUPBY_TEST11_Output
new file mode 100644
index 000000000..fc4ae1d8c
--- /dev/null
+++ b/core-plugins/src/e2e-test/resources/testdata/expected_outputs/CSV_GROUPBY_TEST11_Output
@@ -0,0 +1,5 @@
+103,false,Peach,4.0,false,1,Plum,0.0,true,Peach,Peach,true
+101,false,Apple,6.25,false,1,Apple,2.5,true,Banana,Apple,true
+105,false,,0.0,false,0,Watermelon,0.0,true,Watermelon,,true
+102,false,Orange, Grapes,16.0,false,2,Orange,4.0,true,Orange,Orange, Grapes,true
+104,true,Pineapple, Apple,16.0,true,2,Apple,4.0,true,Pineapple,Pineapple, Apple,true
diff --git a/core-plugins/src/e2e-test/resources/testdata/expected_outputs/CSV_GROUPBY_TEST7_Output b/core-plugins/src/e2e-test/resources/testdata/expected_outputs/CSV_GROUPBY_TEST7_Output
new file mode 100644
index 000000000..9fa5a2064
--- /dev/null
+++ b/core-plugins/src/e2e-test/resources/testdata/expected_outputs/CSV_GROUPBY_TEST7_Output
@@ -0,0 +1,5 @@
+bob,coffee,3.85,2,3.5
+bob,donut,3.25,4,0.9333333333333332
+alice,cookie,1.4,2,0.7
+alice,tea,3.79,3,1.745
+bob,cofee,2.05,1,2.05
diff --git a/core-plugins/src/e2e-test/resources/testdata/expected_outputs/CSV_GROUPBY_TEST8_Output b/core-plugins/src/e2e-test/resources/testdata/expected_outputs/CSV_GROUPBY_TEST8_Output
new file mode 100644
index 000000000..cd14a6809
--- /dev/null
+++ b/core-plugins/src/e2e-test/resources/testdata/expected_outputs/CSV_GROUPBY_TEST8_Output
@@ -0,0 +1,12 @@
+alice,cookie,0.8,cookie,alice,alice,alice,0
+bob,coffee,0.35,coffee,bob,bob,bob,0
+bob,cofee,2.05,cofee,bob,bob,bob,0
+bob,donut,0.8,donut,bob,bob,bob,0
+bob,donut,0.45,donut,bob,bob,bob,0
+alice,cookie,0.6,cookie,alice,alice,alice,0
+bob,donut,1.5,donut,bob,bob,bob,0
+alice,tea,0.3,tea,alice,alice,alice,0
+alice,tea,1.99,tea,alice,alice,alice,0
+bob,coffee,3.5,coffee,bob,bob,bob,0
+bob,donut,0.5,donut,bob,bob,bob,0
+alice,tea,1.5,tea,alice,alice,alice,0
diff --git a/core-plugins/src/e2e-test/resources/testdata/expected_outputs/CSV_GROUPBY_TEST9_Output b/core-plugins/src/e2e-test/resources/testdata/expected_outputs/CSV_GROUPBY_TEST9_Output
new file mode 100644
index 000000000..9c954dc5f
--- /dev/null
+++ b/core-plugins/src/e2e-test/resources/testdata/expected_outputs/CSV_GROUPBY_TEST9_Output
@@ -0,0 +1,12 @@
+alice,cookie,0.8,0.6400000000000001,0.0,0.0,0.6400000000000001
+bob,coffee,0.35,0.0,0.0,0.0,0.12249999999999998
+bob,cofee,2.05,4.2025,0.0,0.0,4.2025
+bob,donut,0.8,0.6400000000000001,0.0,0.0,0.6400000000000001
+bob,donut,0.45,0.0,0.0,0.0,0.2025
+alice,cookie,0.6,0.36,0.0,0.0,0.36
+bob,donut,1.5,2.25,0.0,0.0,2.25
+alice,tea,0.3,0.0,0.0,0.0,0.09
+alice,tea,1.99,3.9601,0.0,0.0,3.9601
+bob,coffee,3.5,12.25,0.0,0.0,12.25
+bob,donut,0.5,0.25,0.0,0.0,0.25
+alice,tea,1.5,2.25,0.0,0.0,2.25
diff --git a/core-plugins/src/e2e-test/resources/testdata/file/CSV_GROUPBY_TEST2.csv b/core-plugins/src/e2e-test/resources/testdata/file/CSV_GROUPBY_TEST2.csv
new file mode 100644
index 000000000..a8cf8bd46
--- /dev/null
+++ b/core-plugins/src/e2e-test/resources/testdata/file/CSV_GROUPBY_TEST2.csv
@@ -0,0 +1,11 @@
+transaction_id,customer_id,product,amount,transaction_date,is_promotion
+1,101,Apple,10,01/01/2024,TRUE
+2,101,Banana,5,02/01/2024,FALSE
+3,102,Orange,7,03/01/2024,TRUE
+4,102,Grapes,15,04/01/2024,FALSE
+5,103,Peach,8,05/01/2024,TRUE
+6,103,Plum,4,06/01/2024,FALSE
+7,104,Pineapple,20,07/01/2024,TRUE
+8,104,Apple,12,08/01/2024,TRUE
+9,105,Watermelon,5,09/01/2024,TRUE
+10,105,Banana,3,10/01/2024,FALSE
\ No newline at end of file
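The new @GROUPBY_TEST and @GROUP_BY_TEST tags above only take effect when a Cucumber JUnit runner selects them. For reference, a minimal runner sketch follows; the package, class name, features path, and glue packages are illustrative assumptions, not part of this patch:

import io.cucumber.junit.Cucumber;
import io.cucumber.junit.CucumberOptions;
import org.junit.runner.RunWith;

// Hypothetical runner for the GroupBy e2e scenarios; adjust features/glue
// to match the repository's actual runner configuration.
@RunWith(Cucumber.class)
@CucumberOptions(
  features = {"src/e2e-test/features/groupby"},
  glue = {"io.cdap.plugin.common.stepsdesign", "stepsdesign"},
  tags = "@GROUPBY_TEST or @GROUP_BY_TEST",
  monochrome = true,
  plugin = {"pretty"}
)
public class GroupByTestRunner {
}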