From 59560a698979b68b308d433ad51e5175ec3c2298 Mon Sep 17 00:00:00 2001 From: nj1973 Date: Mon, 8 Apr 2024 16:46:00 +0000 Subject: [PATCH 01/28] feat: Add --ddl-file option --- bin/offload | 2 +- src/goe/goe.py | 23 ++- src/goe/listener/schemas/orchestration.py | 9 + src/goe/offload/offload.py | 183 ++++------------- src/goe/offload/offload_constants.py | 3 + src/goe/offload/offload_status_report.py | 2 +- src/goe/offload/option_validation.py | 205 +++++++++++++++++++ src/goe/util/misc_functions.py | 2 + tests/unit/offload/test_option_validation.py | 108 ++++++++++ tests/unit/offload/test_staging_file.py | 2 +- 10 files changed, 385 insertions(+), 154 deletions(-) create mode 100644 src/goe/offload/option_validation.py create mode 100644 tests/unit/offload/test_option_validation.py diff --git a/bin/offload b/bin/offload index 48346ef0..3af40a95 100755 --- a/bin/offload +++ b/bin/offload @@ -18,7 +18,6 @@ import sys from goe.config.config_checks import check_cli_path -from goe.offload.offload import OffloadOptionError, get_offload_options check_cli_path() @@ -34,6 +33,7 @@ from goe.goe import ( OFFLOAD_OP_NAME, get_log_fh, ) +from goe.offload.offload import OffloadOptionError, get_offload_options from goe.orchestration.cli_entry_points import offload_by_cli from goe.util.goe_log import log_exception diff --git a/src/goe/goe.py b/src/goe/goe.py index 42fbd05a..f2ea0950 100644 --- a/src/goe/goe.py +++ b/src/goe/goe.py @@ -84,6 +84,7 @@ TOTAL_ROWS_OFFLOADED_LOG_TEXT, ) from goe.offload.offload_functions import convert_backend_identifier_case, data_db_name +from goe.offload.option_validation import normalise_ddl_file from goe.offload.offload_source_data import ( get_offload_type_for_config, OFFLOAD_SOURCE_CLIENT_OFFLOAD, @@ -114,9 +115,7 @@ ) from goe.offload.offload import ( OffloadException, - OffloadOptionError, active_data_append_options, - check_opt_is_posint, check_ipa_predicate_type_option_conflicts, check_table_structure, create_final_backend_table_step, @@ -126,11 +125,7 @@ get_prior_offloaded_hv, offload_backend_db_message, offload_type_force_effects, - normalise_data_sampling_options, normalise_less_than_options, - normalise_offload_predicate_options, - normalise_stats_options, - normalise_verify_options, ) from goe.offload.operation.partition_controls import ( derive_partition_digits, @@ -139,6 +134,14 @@ validate_offload_partition_functions, validate_offload_partition_granularity, ) +from goe.offload.option_validation import ( + OffloadOptionError, + check_opt_is_posint, + normalise_data_sampling_options, + normalise_offload_predicate_options, + normalise_stats_options, + normalise_verify_options, +) from goe.offload.operation.sort_columns import sort_columns_csv_to_sort_columns from goe.orchestration import command_steps from goe.orchestration.execution_id import ExecutionId @@ -205,6 +208,7 @@ "data_sample_parallelism", "data_sample_pct", "date_columns_csv", + "ddl_file", "decimal_columns_csv_list", "decimal_columns_type_list", "decimal_padding_digits", @@ -397,7 +401,7 @@ def get_db_unique_name(opts): elif opts.db_type == DBTYPE_MSSQL: try: return opts.rdbms_dsn.split("=")[1] - except: + except Exception: return "" @@ -813,7 +817,7 @@ def normalise_column_transformations( ] for ct in column_transformation_list: - if not ":" in ct: + if ":" not in ct: raise OffloadOptionError("Missing transformation for column: %s" % ct) m = re.search(r"^([\w$#]+):([\w$#]+)(\(%s\))?$" % param_match, ct) @@ -2252,6 +2256,7 @@ def __init__( normalise_offload_predicate_options(self) 
normalise_verify_options(self) normalise_data_sampling_options(self) + normalise_ddl_file(self.ddl_file, self.owner, self.table_name, config) self._setup_offload_step(messages) @@ -2350,6 +2355,7 @@ def from_options( data_sample_parallelism=options.data_sample_parallelism, data_sample_pct=options.data_sample_pct, date_columns_csv=options.date_columns_csv, + ddl_file=options.ddl_file, decimal_columns_csv_list=options.decimal_columns_csv_list, decimal_columns_type_list=options.decimal_columns_type_list, decimal_padding_digits=options.decimal_padding_digits, @@ -2482,6 +2488,7 @@ def from_dict( "data_sample_pct", orchestration_defaults.data_sample_pct_default() ), date_columns_csv=operation_dict.get("date_columns_csv"), + ddl_file=operation_dict.get("ddl_file"), decimal_columns_csv_list=operation_dict.get("decimal_columns_csv_list"), decimal_columns_type_list=operation_dict.get("decimal_columns_type_list"), decimal_padding_digits=operation_dict.get( diff --git a/src/goe/listener/schemas/orchestration.py b/src/goe/listener/schemas/orchestration.py index 92dae14f..6a75ef04 100644 --- a/src/goe/listener/schemas/orchestration.py +++ b/src/goe/listener/schemas/orchestration.py @@ -248,6 +248,15 @@ def data_sample_pct_validator(cls, v): description="CSV list of columns to treat as date columns", cli=("--date-columns"), ) + ddl_file: Optional[str] = Field( + default=None, + title="Path to output generated target table DDL", + description=( + "Output generated target table DDL to a file, should include full path or literal AUTO. " + "Supports local paths or cloud storage URIs" + ), + cli=("--ddl-file"), + ) decimal_columns_csv_list: Optional[List[str]] = Field( default=None, title="Decimal columns", diff --git a/src/goe/offload/offload.py b/src/goe/offload/offload.py index 2fe0e7c0..7163ed37 100644 --- a/src/goe/offload/offload.py +++ b/src/goe/offload/offload.py @@ -18,8 +18,7 @@ """ from datetime import datetime, timedelta -from optparse import OptionValueError, SUPPRESS_HELP -import re +from optparse import SUPPRESS_HELP from textwrap import dedent from typing import TYPE_CHECKING @@ -40,14 +39,16 @@ ) from goe.offload.offload_source_data import offload_source_data_factory from goe.offload.offload_source_table import ( - DATA_SAMPLE_SIZE_AUTO, OFFLOAD_PARTITION_TYPE_RANGE, OFFLOAD_PARTITION_TYPE_LIST, ) from goe.offload.offload_transport import VALID_OFFLOAD_TRANSPORT_METHODS from goe.offload.operation.sort_columns import check_and_alter_backend_sort_columns from goe.offload.operation.data_type_controls import DECIMAL_COL_TYPE_SYNTAX_TEMPLATE -from goe.offload.predicate_offload import GenericPredicate +from goe.offload.option_validation import ( + active_data_append_options, + check_opt_is_posint, +) from goe.orchestration import command_steps from goe.persistence.orchestration_metadata import ( hwm_column_names_from_predicates, @@ -57,7 +58,7 @@ INCREMENTAL_PREDICATE_TYPE_RANGE, INCREMENTAL_PREDICATE_TYPES_WITH_PREDICATE_IN_HV, ) -from goe.util.misc_functions import format_list_for_logging, is_pos_int +from goe.util.misc_functions import format_list_for_logging if TYPE_CHECKING: from goe.offload.backend_table import BackendTableInterface @@ -73,14 +74,6 @@ class OffloadException(Exception): pass -class OffloadOptionError(Exception): - def __init__(self, detail): - self.detail = detail - - def __str__(self): - return repr(self.detail) - - def check_ipa_predicate_type_option_conflicts( options, exc_cls=OffloadException, rdbms_table=None ): @@ -138,17 +131,6 @@ def 
check_ipa_predicate_type_option_conflicts( ) -def check_opt_is_posint( - opt_name, opt_val, exception_class=OptionValueError, allow_zero=False -): - if is_pos_int(opt_val, allow_zero=allow_zero): - return int(opt_val) - else: - raise exception_class( - "option %s: invalid positive integer value: %s" % (opt_name, opt_val) - ) - - def check_table_structure(frontend_table, backend_table, messages: OffloadMessages): """Compare frontend and backend columns by name and throw an exception if there is a mismatch. Ideally we would use SchemaSyncAnalyzer for this but circular dependencies prevent that for the time being. @@ -509,82 +491,9 @@ def offload_type_force_effects( hybrid_operation.force = True -def active_data_append_options( - opts, - partition_type=None, - from_options=False, - ignore_partition_names_opt=False, - ignore_pbo=False, -): - rpa_opts = { - "--less-than-value": opts.less_than_value, - "--partition-names": opts.partition_names_csv, - } - lpa_opts = { - "--equal-to-values": opts.equal_to_values, - "--partition-names": opts.partition_names_csv, - } - ida_opts = {"--offload-predicate": opts.offload_predicate} - - if from_options: - # options has a couple of synonyms for less_than_value - rpa_opts.update( - { - "--older-than-days": opts.older_than_days, - "--older-than-date": opts.older_than_date, - } - ) - - if ignore_partition_names_opt: - del rpa_opts["--partition-names"] - del lpa_opts["--partition-names"] - - if partition_type == OFFLOAD_PARTITION_TYPE_RANGE: - chk_opts = rpa_opts - elif partition_type == OFFLOAD_PARTITION_TYPE_LIST: - chk_opts = lpa_opts - elif not partition_type: - chk_opts = {} if ignore_pbo else ida_opts.copy() - chk_opts.update(lpa_opts) - chk_opts.update(rpa_opts) - - active_pa_opts = [_ for _ in chk_opts if chk_opts[_]] - return active_pa_opts - - -def normalise_verify_options(options): - if getattr(options, "verify_parallelism", None): - options.verify_parallelism = check_opt_is_posint( - "--verify-parallelism", options.verify_parallelism, allow_zero=True - ) - - -def normalise_data_sampling_options(options): - if hasattr(options, "data_sample_pct"): - if type(options.data_sample_pct) == str and re.search( - r"^[\d\.]+$", options.data_sample_pct - ): - options.data_sample_pct = float(options.data_sample_pct) - elif options.data_sample_pct == "AUTO": - options.data_sample_pct = DATA_SAMPLE_SIZE_AUTO - elif type(options.data_sample_pct) not in (int, float): - raise OffloadOptionError( - 'Invalid value "%s" for --data-sample-percent' % options.data_sample_pct - ) - else: - options.data_sample_pct = 0 - - if hasattr(options, "data_sample_parallelism"): - options.data_sample_parallelism = check_opt_is_posint( - "--data-sample-parallelism", - options.data_sample_parallelism, - allow_zero=True, - ) - - def normalise_less_than_options(options, exc_cls=OffloadException): if not hasattr(options, "older_than_date"): - # We mustn't be in offload or present so should just drop out + # We mustn't be in offload so should just drop out return active_pa_opts = active_data_append_options(options, from_options=True) @@ -618,45 +527,6 @@ def normalise_less_than_options(options, exc_cls=OffloadException): check_ipa_predicate_type_option_conflicts(options, exc_cls=exc_cls) -def normalise_offload_predicate_options(options): - if options.offload_predicate: - if isinstance(options.offload_predicate, str): - options.offload_predicate = GenericPredicate(options.offload_predicate) - - if ( - options.less_than_value - or options.older_than_date - or options.older_than_days - ): - 
raise OffloadOptionError( - "Predicate offload cannot be used with incremental partition offload options: (--less-than-value/--older-than-date/--older-than-days)" - ) - - no_modify_hybrid_view_option_used = not options.offload_predicate_modify_hybrid_view - if no_modify_hybrid_view_option_used and not options.offload_predicate: - raise OffloadOptionError( - "--no-modify-hybrid-view can only be used with --offload-predicate" - ) - - -def normalise_stats_options(options, target_backend): - if options.offload_stats_method not in [ - offload_constants.OFFLOAD_STATS_METHOD_NATIVE, - offload_constants.OFFLOAD_STATS_METHOD_HISTORY, - offload_constants.OFFLOAD_STATS_METHOD_COPY, - offload_constants.OFFLOAD_STATS_METHOD_NONE, - ]: - raise OffloadOptionError( - "Unsupported value for --offload-stats: %s" % options.offload_stats_method - ) - - if ( - options.offload_stats_method == offload_constants.OFFLOAD_STATS_METHOD_HISTORY - and target_backend == offload_constants.DBTYPE_IMPALA - ): - options.offload_stats_method = offload_constants.OFFLOAD_STATS_METHOD_NATIVE - - def parse_yyyy_mm_dd(ds): return datetime.strptime(ds, "%Y-%m-%d") @@ -716,6 +586,14 @@ def get_offload_options(opt): default=orchestration_defaults.data_sample_parallelism_default(), help=config_descriptions.DATA_SAMPLE_PARALLELISM, ) + opt.add_option( + "--ddl-file", + dest="ddl_file", + help=( + "Output generated target table DDL to a file, should include full path or literal AUTO. " + "Supports local paths or cloud storage URIs" + ), + ) opt.add_option( "--not-null-columns", dest="not_null_columns_csv", @@ -724,8 +602,11 @@ def get_offload_options(opt): opt.add_option( "--offload-predicate-type", dest="ipa_predicate_type", - help="Override the default INCREMENTAL_PREDICATE_TYPE for a partitioned table. Used to offload LIST partitioned tables using RANGE logic with --offload-predicate-type=%s or used for specialized cases of Incremental Partition Append and Predicate-Based Offload offloading" - % INCREMENTAL_PREDICATE_TYPE_LIST_AS_RANGE, + help=( + "Override the default INCREMENTAL_PREDICATE_TYPE for a partitioned table. " + f"Used to offload LIST partitioned tables using RANGE logic with --offload-predicate-type={INCREMENTAL_PREDICATE_TYPE_LIST_AS_RANGE} or " + "used for specialized cases of Incremental Partition Append and Predicate-Based Offload offloading" + ), ) opt.add_option( "--offload-fs-scheme", @@ -747,7 +628,10 @@ def get_offload_options(opt): opt.add_option( "--offload-type", dest="offload_type", - help="Identifies a range partitioned offload as FULL or INCREMENTAL. FULL dictates that all data is offloaded. INCREMENTAL dictates that data up to an incremental threshold will be offloaded", + help=( + "Identifies a range partitioned offload as FULL or INCREMENTAL. FULL dictates that all data is offloaded. " + "INCREMENTAL dictates that data up to an incremental threshold will be offloaded" + ), ) opt.add_option( @@ -780,7 +664,12 @@ def get_offload_options(opt): "--decimal-columns", dest="decimal_columns_csv_list", action="append", - help='CSV list of columns to offload as DECIMAL(p,s) where "p,s" is specified in a paired --decimal-columns-type option. 
--decimal-columns and --decimal-columns-type allow repeat inclusion for flexible data type specification, for example "--decimal-columns-type=18,2 --decimal-columns=price,cost --decimal-columns-type=6,4 --decimal-columns=location" (only effective for numeric columns)', + help=( + 'CSV list of columns to offload as DECIMAL(p,s) where "p,s" is specified in a paired --decimal-columns-type option. ' + "--decimal-columns and --decimal-columns-type allow repeat inclusion for flexible data type specification, " + 'for example "--decimal-columns-type=18,2 --decimal-columns=price,cost --decimal-columns-type=6,4 --decimal-columns=location" ' + "(only effective for numeric columns)" + ), ) opt.add_option( "--decimal-columns-type", @@ -862,7 +751,10 @@ def get_offload_options(opt): "--offload-transport-small-table-threshold", dest="offload_transport_small_table_threshold", default=orchestration_defaults.offload_transport_small_table_threshold_default(), - help="Threshold above which Query Import is no longer considered the correct offload choice for non-partitioned tables. [\\d.]+[MG] eg. 100M, 0.5G, 1G", + help=( + "Threshold above which Query Import is no longer considered the correct offload choice " + "for non-partitioned tables. [\\d.]+[MG] eg. 100M, 0.5G, 1G" + ), ) opt.add_option( "--offload-transport-spark-properties", @@ -874,7 +766,12 @@ def get_offload_options(opt): "--offload-transport-validation-polling-interval", dest="offload_transport_validation_polling_interval", default=orchestration_defaults.offload_transport_validation_polling_interval_default(), - help="Polling interval in seconds for validation of Spark transport row count. -1 disables retrieval of RDBMS SQL statistics. 0 disables polling resulting in a single capture of SQL statistics. A value greater than 0 polls transport SQL statistics using the specified interval", + help=( + "Polling interval in seconds for validation of Spark transport row count. " + "-1 disables retrieval of RDBMS SQL statistics. " + "0 disables polling resulting in a single capture of SQL statistics. 
" + "A value greater than 0 polls transport SQL statistics using the specified interval" + ), ) opt.add_option( diff --git a/src/goe/offload/offload_constants.py b/src/goe/offload/offload_constants.py index 636534f0..2d424c16 100644 --- a/src/goe/offload/offload_constants.py +++ b/src/goe/offload/offload_constants.py @@ -120,6 +120,9 @@ OFFLOAD_TRANSPORT_GCP = "GCP" OFFLOAD_TRANSPORT_SQOOP = "SQOOP" +# DDL file +DDL_FILE_AUTO = "AUTO" + # Exception markers ADJUSTED_BACKEND_IDENTIFIER_MESSAGE_TEXT = "Using adjusted backend table name" CONFLICTING_DATA_ID_OPTIONS_EXCEPTION_TEXT = "Conflicting data identification options" diff --git a/src/goe/offload/offload_status_report.py b/src/goe/offload/offload_status_report.py index 759fbc79..c0127dad 100755 --- a/src/goe/offload/offload_status_report.py +++ b/src/goe/offload/offload_status_report.py @@ -46,7 +46,7 @@ from goe.offload.factory.backend_api_factory import backend_api_factory from goe.offload.factory.backend_table_factory import backend_table_factory from goe.offload.factory.frontend_api_factory import frontend_api_factory -from goe.offload.offload import OffloadOptionError +from goe.offload.option_validation import OffloadOptionError from goe.offload.offload_functions import STARTS_WITH_DATE_PATTERN_RE from goe.offload.offload_messages import ( OffloadMessages, diff --git a/src/goe/offload/option_validation.py b/src/goe/offload/option_validation.py new file mode 100644 index 00000000..2d65e32a --- /dev/null +++ b/src/goe/offload/option_validation.py @@ -0,0 +1,205 @@ +# Copyright 2024 The GOE Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from optparse import OptionValueError +import os +import re + +from typing import TYPE_CHECKING + +from goe.filesystem.goe_dfs import get_scheme_from_location_uri +from goe.offload import offload_constants +from goe.offload.predicate_offload import GenericPredicate +from goe.offload.offload_source_table import ( + DATA_SAMPLE_SIZE_AUTO, + OFFLOAD_PARTITION_TYPE_RANGE, + OFFLOAD_PARTITION_TYPE_LIST, +) +from goe.util.misc_functions import standard_file_name, is_pos_int + +if TYPE_CHECKING: + from goe.config.orchestration_config import OrchestrationConfig + + +class OffloadOptionError(Exception): + def __init__(self, detail): + self.detail = detail + + def __str__(self): + return repr(self.detail) + + +def active_data_append_options( + opts, + partition_type=None, + from_options=False, + ignore_partition_names_opt=False, + ignore_pbo=False, +): + rpa_opts = { + "--less-than-value": opts.less_than_value, + "--partition-names": opts.partition_names_csv, + } + lpa_opts = { + "--equal-to-values": opts.equal_to_values, + "--partition-names": opts.partition_names_csv, + } + ida_opts = {"--offload-predicate": opts.offload_predicate} + + if from_options: + # options has a couple of synonyms for less_than_value + rpa_opts.update( + { + "--older-than-days": opts.older_than_days, + "--older-than-date": opts.older_than_date, + } + ) + + if ignore_partition_names_opt: + del rpa_opts["--partition-names"] + del lpa_opts["--partition-names"] + + if partition_type == OFFLOAD_PARTITION_TYPE_RANGE: + chk_opts = rpa_opts + elif partition_type == OFFLOAD_PARTITION_TYPE_LIST: + chk_opts = lpa_opts + elif not partition_type: + chk_opts = {} if ignore_pbo else ida_opts.copy() + chk_opts.update(lpa_opts) + chk_opts.update(rpa_opts) + + active_pa_opts = [_ for _ in chk_opts if chk_opts[_]] + return active_pa_opts + + +def check_opt_is_posint( + opt_name, opt_val, exception_class=OptionValueError, allow_zero=False +): + if is_pos_int(opt_val, allow_zero=allow_zero): + return int(opt_val) + else: + raise exception_class( + "option %s: invalid positive integer value: %s" % (opt_name, opt_val) + ) + + +def generate_ddl_file_path( + owner: str, table_name: str, config: "OrchestrationConfig" +) -> str: + """Generates a default path when DDL file option == AUTO.""" + file_name = standard_file_name( + f"{owner}.{table_name}", extension=".sql", with_datetime=True + ) + log_path = os.path.join(config.log_path, file_name) + return log_path + + +def normalise_data_sampling_options(options): + if hasattr(options, "data_sample_pct"): + if isinstance(options.data_sample_pct, str) and re.search( + r"^[\d\.]+$", options.data_sample_pct + ): + options.data_sample_pct = float(options.data_sample_pct) + elif options.data_sample_pct == "AUTO": + options.data_sample_pct = DATA_SAMPLE_SIZE_AUTO + elif type(options.data_sample_pct) not in (int, float): + raise OffloadOptionError( + 'Invalid value "%s" for --data-sample-percent' % options.data_sample_pct + ) + else: + options.data_sample_pct = 0 + + if hasattr(options, "data_sample_parallelism"): + options.data_sample_parallelism = check_opt_is_posint( + "--data-sample-parallelism", + options.data_sample_parallelism, + allow_zero=True, + ) + + +def normalise_ddl_file(options, config: "OrchestrationConfig"): + """Validates path pointed to by ddl_file and generates a new path if AUTO. 
Mutates options.""" + if options.ddl_file: + options.ddl_file = options.ddl_file.strip() + else: + return options.ddl_file + + if options.ddl_file.upper() == offload_constants.DDL_FILE_AUTO: + # Use an auto-generated path. + options.ddl_file = generate_ddl_file_path( + options.owner, options.table_name, config + ) + return + + # Simplistic check that the file path looks like a cloud storage one. + if ":" in options.ddl_file: + # We don't need to know the scheme right now, just validation that it is supported. + _ = get_scheme_from_location_uri(options.ddl_file) + return + + # Assume local filesystem, we can validate the path. + + if os.path.exists(options.ddl_file): + raise OffloadOptionError(f"DDL path already exists: {options.ddl_file}") + + if "/" in options.ddl_file[1:]: + dirname = os.path.dirname(options.ddl_file) + if not os.path.isdir(dirname): + raise OffloadOptionError(f"DDL file directory does not exist: {dirname}") + + +def normalise_offload_predicate_options(options): + if options.offload_predicate: + if isinstance(options.offload_predicate, str): + options.offload_predicate = GenericPredicate(options.offload_predicate) + + if ( + options.less_than_value + or options.older_than_date + or options.older_than_days + ): + raise OffloadOptionError( + "Predicate offload cannot be used with incremental partition offload options: (--less-than-value/--older-than-date/--older-than-days)" + ) + + no_modify_hybrid_view_option_used = not options.offload_predicate_modify_hybrid_view + if no_modify_hybrid_view_option_used and not options.offload_predicate: + raise OffloadOptionError( + "--no-modify-hybrid-view can only be used with --offload-predicate" + ) + + +def normalise_stats_options(options, target_backend): + if options.offload_stats_method not in [ + offload_constants.OFFLOAD_STATS_METHOD_NATIVE, + offload_constants.OFFLOAD_STATS_METHOD_HISTORY, + offload_constants.OFFLOAD_STATS_METHOD_COPY, + offload_constants.OFFLOAD_STATS_METHOD_NONE, + ]: + raise OffloadOptionError( + "Unsupported value for --offload-stats: %s" % options.offload_stats_method + ) + + if ( + options.offload_stats_method == offload_constants.OFFLOAD_STATS_METHOD_HISTORY + and target_backend == offload_constants.DBTYPE_IMPALA + ): + options.offload_stats_method = offload_constants.OFFLOAD_STATS_METHOD_NATIVE + + +def normalise_verify_options(options): + if getattr(options, "verify_parallelism", None): + options.verify_parallelism = check_opt_is_posint( + "--verify-parallelism", options.verify_parallelism, allow_zero=True + ) diff --git a/src/goe/util/misc_functions.py b/src/goe/util/misc_functions.py index 6cc7635c..7a59a149 100644 --- a/src/goe/util/misc_functions.py +++ b/src/goe/util/misc_functions.py @@ -227,6 +227,8 @@ def is_pos_int(val, allow_zero=False): n = int(val) if n < lower_bound: return False + if decimal.Decimal(val) != n: + return False return True except (TypeError, ValueError): return False diff --git a/tests/unit/offload/test_option_validation.py b/tests/unit/offload/test_option_validation.py new file mode 100644 index 00000000..88666216 --- /dev/null +++ b/tests/unit/offload/test_option_validation.py @@ -0,0 +1,108 @@ +# Copyright 2024 The GOE Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import TYPE_CHECKING + +import pytest + +from goe.offload import offload_constants, option_validation as module_under_test + +from tests.unit.test_functions import ( + build_mock_offload_operation, + build_mock_options, + FAKE_ORACLE_BQ_ENV, +) + +if TYPE_CHECKING: + from goe.config.orchestration_config import OrchestrationConfig + + +@pytest.fixture(scope="module") +def config(): + return build_mock_options(FAKE_ORACLE_BQ_ENV) + + +@pytest.mark.parametrize( + "input,expect_exception", + [ + ("s", True), + (None, True), + (-1, True), + (0, True), + (1.1, True), + (0.1, True), + (123456789012345.1, True), + (1, False), + (123456789012345, False), + ], +) +def test_check_opt_is_posint(input: str, expect_exception: bool): + if expect_exception: + with pytest.raises(Exception): + _ = module_under_test.check_opt_is_posint("fake-option", input) + else: + output = module_under_test.check_opt_is_posint("fake-option", input) + assert output == input + + +@pytest.mark.parametrize( + "schema,table_name", + [ + ("my_user", "my_table123"), + ("MY-USER-123", "MY-TABLE"), + ], +) +def test_generate_ddl_file_path( + schema: str, table_name: str, config: "OrchestrationConfig" +): + path = module_under_test.generate_ddl_file_path(schema, table_name, config) + assert schema in path + assert table_name in path + offload_log = FAKE_ORACLE_BQ_ENV["OFFLOAD_LOG"] + assert path.startswith(offload_log) + assert path.endswith(".sql") + + +def test_normalise_ddl_file_auto(config: "OrchestrationConfig"): + fake_operation = build_mock_offload_operation() + fake_operation.ddl_file = offload_constants.DDL_FILE_AUTO + module_under_test.normalise_ddl_file(fake_operation, config) + assert isinstance(fake_operation.ddl_file, str) + + +@pytest.mark.parametrize( + "path,expect_exception", + [ + ("/tmp", True), + ("/tmp/", True), + ("/tmp/ddl.sql", False), + # Should fail because "not-a-dir" should not exist. + ("/tmp/not-a-dir/not-a-file.sql", True), + # Cloud storage paths will pass as long as the scheme is valid. + ("gs://bucket/path/ddl.sql", False), + ("s3://bucket/path/ddl.sql", False), + ("unknown-scheme://bucket/path/ddl.sql", True), + ], +) +def test_normalise_ddl_file_path( + path: str, expect_exception: bool, config: "OrchestrationConfig" +): + fake_operation = build_mock_offload_operation() + fake_operation.ddl_file = path + if expect_exception: + with pytest.raises(Exception): + _ = module_under_test.normalise_ddl_file(fake_operation, config) + else: + # No exception expected. + _ = module_under_test.normalise_ddl_file(fake_operation, config) diff --git a/tests/unit/offload/test_staging_file.py b/tests/unit/offload/test_staging_file.py index 17b10675..8b5c6985 100644 --- a/tests/unit/offload/test_staging_file.py +++ b/tests/unit/offload/test_staging_file.py @@ -13,7 +13,7 @@ # limitations under the License. 
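The new test_check_opt_is_posint cases exercise the tightened is_pos_int() in misc_functions.py: the added decimal.Decimal comparison rejects fractional values that int() would otherwise silently truncate. A minimal sketch of the behaviour being asserted, with an illustrative option name:

    from goe.offload.option_validation import check_opt_is_posint

    check_opt_is_posint("--verify-parallelism", 4)                   # returns 4
    check_opt_is_posint("--verify-parallelism", 0, allow_zero=True)  # returns 0
    check_opt_is_posint("--verify-parallelism", 1.1)                 # raises OptionValueError
    check_opt_is_posint("--verify-parallelism", -1)                  # raises OptionValueError
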
import os -from unittest import TestCase, main +from unittest import TestCase from goe.offload.column_metadata import ( CanonicalColumn, From 16d3175e202c18d389e243416413d472ba7f2e15 Mon Sep 17 00:00:00 2001 From: nj1973 Date: Tue, 9 Apr 2024 10:32:20 +0000 Subject: [PATCH 02/28] feat: Add --ddl-file option --- src/goe/goe.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/goe/goe.py b/src/goe/goe.py index f2ea0950..c7a44aeb 100644 --- a/src/goe/goe.py +++ b/src/goe/goe.py @@ -2256,7 +2256,7 @@ def __init__( normalise_offload_predicate_options(self) normalise_verify_options(self) normalise_data_sampling_options(self) - normalise_ddl_file(self.ddl_file, self.owner, self.table_name, config) + normalise_ddl_file(self, config) self._setup_offload_step(messages) @@ -2890,6 +2890,7 @@ def offload_table( messages, "staging area", offload_target_table, offload_options.execute ) + # TODO act upon ddl-file here perhaps existing_metadata = None if offload_target_table.exists() and not offload_operation.reset_backend_table: # We need to pickup defaults for an existing table here, BEFORE we start looking for data to offload (get_offload_data_manager()) From 110643419b762baa65e44e3705c913a134ea2957 Mon Sep 17 00:00:00 2001 From: nj1973 Date: Tue, 9 Apr 2024 17:59:11 +0000 Subject: [PATCH 03/28] feat: Prep for --ddl-file, move execute from config to operation --- src/goe/conductor/hybrid_view_service.py | 2 + src/goe/config/orchestration_config.py | 5 - src/goe/connect/connect.py | 3 +- src/goe/connect/connect_backend.py | 7 +- .../data_governance/hadoop_data_governance.py | 26 +-- src/goe/filesystem/goe_dfs_factory.py | 5 +- src/goe/goe.py | 182 ++++++++++-------- src/goe/listener/services/hybrid_view.py | 2 + src/goe/listener/services/system.py | 3 +- src/goe/offload/backend_table.py | 5 +- .../bigquery/bigquery_backend_table.py | 2 +- .../offload/factory/backend_api_factory.py | 2 + .../offload/factory/backend_table_factory.py | 6 +- .../offload/hadoop/hadoop_backend_table.py | 2 +- .../offload/hadoop/sqoop_offload_transport.py | 14 +- .../microsoft/synapse_backend_table.py | 2 +- src/goe/offload/offload_constants.py | 3 + src/goe/offload/offload_metadata_functions.py | 2 +- src/goe/offload/offload_status_report.py | 3 +- src/goe/offload/offload_transport.py | 16 +- src/goe/offload/offload_validation.py | 14 +- src/goe/offload/operation/stats_controls.py | 2 +- src/goe/offload/operation/transport.py | 4 +- src/goe/offload/option_validation.py | 9 +- .../snowflake/snowflake_backend_table.py | 2 +- .../spark/dataproc_offload_transport.py | 2 +- .../offload/spark/livy_offload_transport.py | 14 +- src/goe/orchestration/cli_entry_points.py | 1 - src/goe/orchestration/orchestration_runner.py | 51 +++-- .../oracle_orchestration_repo_client.py | 18 +- src/goe/schema_sync/schema_sync_analyzer.py | 1 - src/goe/schema_sync/schema_sync_processor.py | 1 - tests/integration/offload/test_backend_api.py | 10 +- .../integration/offload/test_backend_table.py | 2 + .../offload/test_predicate_offload.py | 9 +- .../orchestration/test_command_steps.py | 4 +- .../test_orchestration_runner.py | 9 +- .../test_orchestration_metadata.py | 2 - .../test_orchestration_repo_client.py | 7 +- .../scenarios/assertion_functions.py | 7 +- .../integration/scenarios/scenario_runner.py | 1 - .../scenarios/test_column_controls.py | 35 +++- tests/integration/scenarios/test_ddl_file.py | 120 ++++++++++++ .../integration/scenarios/test_identifiers.py | 24 ++- .../scenarios/test_offload_basic.py | 30 ++- 
.../scenarios/test_offload_data.py | 20 +- .../scenarios/test_offload_hash_column.py | 3 + .../scenarios/test_offload_list_rpa.py | 12 +- .../integration/scenarios/test_offload_lpa.py | 27 +++ .../scenarios/test_offload_misc.py | 13 +- .../scenarios/test_offload_part_fn.py | 16 +- .../integration/scenarios/test_offload_pbo.py | 33 +++- .../scenarios/test_offload_pbo_intra.py | 17 +- .../scenarios/test_offload_pbo_late.py | 13 +- .../integration/scenarios/test_offload_rpa.py | 17 +- .../scenarios/test_offload_sorting.py | 10 + .../scenarios/test_offload_subpart.py | 11 ++ .../scenarios/test_offload_transport.py | 7 +- .../test_orchestration_step_control.py | 18 +- tests/integration/test_functions.py | 2 +- .../test_framework/backend_testing_api.py | 4 +- .../hadoop/hadoop_backend_testing_api.py | 52 ++--- tests/unit/offload/test_data_type_mappings.py | 56 +++++- tests/unit/offload/test_option_validation.py | 11 +- tests/unit/test_functions.py | 2 +- 65 files changed, 728 insertions(+), 287 deletions(-) create mode 100644 tests/integration/scenarios/test_ddl_file.py diff --git a/src/goe/conductor/hybrid_view_service.py b/src/goe/conductor/hybrid_view_service.py index c75c6a90..cd749226 100644 --- a/src/goe/conductor/hybrid_view_service.py +++ b/src/goe/conductor/hybrid_view_service.py @@ -193,6 +193,7 @@ def _get_backend_table(self): self._connection_options, self._messages, hybrid_metadata=self._offload_metadata, + dry_run=self._dry_run, ) def _get_backend_detail(self, attribute_name=None): @@ -315,6 +316,7 @@ def validate_by_aggregation(self, lower_hv=None, upper_hv=None, as_json=True): messages=self._messages, backend_db=self._backend_table_owner, backend_table=self._backend_table_name, + execute=(not self._dry_run), ) status, agg_msg = validator.validate( safe=False, diff --git a/src/goe/config/orchestration_config.py b/src/goe/config/orchestration_config.py index bde2bcd6..76f90e72 100644 --- a/src/goe/config/orchestration_config.py +++ b/src/goe/config/orchestration_config.py @@ -79,7 +79,6 @@ "data_governance_auto_tags_csv", "data_governance_auto_properties_csv", "dev_log_level", - "execute", "error_on_token", "frontend_odbc_driver_name", "google_dataproc_batches_subnet", @@ -251,7 +250,6 @@ class OrchestrationConfig: db_type: str dev_log_level: str error_on_token: Optional[str] - execute: bool frontend_odbc_driver_name: Optional[str] google_dataproc_batches_subnet: Optional[str] google_dataproc_batches_version: Optional[str] @@ -417,9 +415,6 @@ def from_dict(config_dict, do_not_connect=False): "dev_log_level", orchestration_defaults.dev_log_level_default() ), error_on_token=config_dict.get("error_on_token"), - execute=config_dict.get( - "execute", orchestration_defaults.execute_default() - ), frontend_odbc_driver_name=config_dict.get( "frontend_odbc_driver_name", orchestration_defaults.frontend_odbc_driver_name_default(), diff --git a/src/goe/connect/connect.py b/src/goe/connect/connect.py index 6f801a02..a76126c5 100755 --- a/src/goe/connect/connect.py +++ b/src/goe/connect/connect.py @@ -448,7 +448,7 @@ def check_environment(options, orchestration_config): def test_offload_fs_container(orchestration_config, messages): test_name = "Offload filesystem container" test_header(test_name) - dfs_client = get_dfs_from_options(orchestration_config, messages) + dfs_client = get_dfs_from_options(orchestration_config, messages, dry_run=False) display_uri = dfs_client.gen_uri( orchestration_config.offload_fs_scheme, orchestration_config.offload_fs_container, @@ -467,7 +467,6 @@ def 
test_offload_fs_container(orchestration_config, messages): def get_config_with_connect_overrides(connect_options): override_dict = { - "execute": True, "verbose": connect_options.verbose, "hive_timeout_s": CONNECT_HIVE_TIMEOUT_S, } diff --git a/src/goe/connect/connect_backend.py b/src/goe/connect/connect_backend.py index 9883ddc9..8c9dad8b 100644 --- a/src/goe/connect/connect_backend.py +++ b/src/goe/connect/connect_backend.py @@ -121,10 +121,11 @@ def test_raw_conn(hadoop_host, hadoop_port): def get_cli_hdfs(orchestration_config, host, messages): + # dry_run always = False in connect. return CliHdfs( host, orchestration_config.hadoop_ssh_user, - dry_run=(not orchestration_config.execute), + dry_run=False, messages=messages, db_path_suffix=orchestration_config.hdfs_db_path_suffix, hdfs_data=orchestration_config.hdfs_data, @@ -241,7 +242,7 @@ def test_webhdfs_config(orchestration_config, messages): orchestration_config.hadoop_ssh_user, True if orchestration_config.kerberos_service else False, orchestration_config.webhdfs_verify_ssl, - dry_run=not orchestration_config.execute, + dry_run=False, messages=messages, db_path_suffix=orchestration_config.hdfs_db_path_suffix, hdfs_data=orchestration_config.hdfs_data, @@ -264,7 +265,7 @@ def test_sentry_privs(orchestration_config, backend_api, messages): log("Skipping Sentry steps due to backend system", detail=VVERBOSE) return - dfs_client = get_dfs_from_options(orchestration_config, messages) + dfs_client = get_dfs_from_options(orchestration_config, messages, dry_run=False) uris_left_to_check = get_hdfs_dirs( orchestration_config, dfs_client, include_hdfs_home=False ) diff --git a/src/goe/data_governance/hadoop_data_governance.py b/src/goe/data_governance/hadoop_data_governance.py index 9d9fe54e..54f4fb7d 100644 --- a/src/goe/data_governance/hadoop_data_governance.py +++ b/src/goe/data_governance/hadoop_data_governance.py @@ -208,12 +208,12 @@ def data_governance_auto_property_defaults( """ property_defaults = {} now = datetime.datetime.now().replace(microsecond=0) - property_defaults[ - DATA_GOVERNANCE_DYNAMIC_PROPERTY_INITIAL_OPERATION_DATETIME - ] = now.isoformat() - property_defaults[ - DATA_GOVERNANCE_DYNAMIC_PROPERTY_LATEST_OPERATION_DATETIME - ] = now.isoformat() + property_defaults[DATA_GOVERNANCE_DYNAMIC_PROPERTY_INITIAL_OPERATION_DATETIME] = ( + now.isoformat() + ) + property_defaults[DATA_GOVERNANCE_DYNAMIC_PROPERTY_LATEST_OPERATION_DATETIME] = ( + now.isoformat() + ) if source_rdbms_object: property_defaults[DATA_GOVERNANCE_DYNAMIC_PROPERTY_SOURCE_RDBMS_OBJECT] = ( "%s.%s.%s" % (rdbms_name, rdbms_schema, source_rdbms_object) @@ -223,9 +223,9 @@ def data_governance_auto_property_defaults( "%s.%s.%s" % (rdbms_name, rdbms_schema, target_rdbms_object) ).upper() if goe_object_type: - property_defaults[ - DATA_GOVERNANCE_DYNAMIC_PROPERTY_GOE_OBJECT_TYPE - ] = goe_object_type + property_defaults[DATA_GOVERNANCE_DYNAMIC_PROPERTY_GOE_OBJECT_TYPE] = ( + goe_object_type + ) property_defaults = filter_properties_by_goe_object_type( property_defaults, goe_object_type ) @@ -436,8 +436,8 @@ def data_governance_register_new_db( def data_governance_register_new_db_step( hadoop_db, data_gov_client, messages, goe_object_type, options=None ): - opts_execute = options.execute if options else True if data_gov_client: + opts_execute = options.execute if options else True def step_fn(): data_governance_register_new_db( @@ -463,8 +463,8 @@ def data_governance_register_new_object_step( renaming_from_object_name=None, dg_object_type=None, ): - opts_execute = 
options.execute if options else True if data_gov_client: + opts_execute = options.execute if options else True def step_fn(): data_governance_register_new_object( @@ -526,8 +526,8 @@ def data_governance_register_new_view_step( def data_governance_update_metadata_step( hadoop_db, hadoop_object_name, data_gov_client, messages, options=None ): - opts_execute = options.execute if options else True if data_gov_client: + opts_execute = options.execute if options else True def step_fn(): data_governance_update_metadata( @@ -550,8 +550,8 @@ def data_governance_register_new_multi_db_step( """ assert hadoop_db_list assert type(hadoop_db_list) is list - opts_execute = options.execute if options else True if data_gov_client: + opts_execute = options.execute if options else True def step_fn(): for hadoop_db, goe_object_type in hadoop_db_list: diff --git a/src/goe/filesystem/goe_dfs_factory.py b/src/goe/filesystem/goe_dfs_factory.py index 70514a7f..60423f9b 100644 --- a/src/goe/filesystem/goe_dfs_factory.py +++ b/src/goe/filesystem/goe_dfs_factory.py @@ -24,12 +24,9 @@ def get_dfs_from_options( - offload_options, messages=None, force_ssh=False, dry_run=None, do_not_connect=False + offload_options, messages=None, force_ssh=False, dry_run=True, do_not_connect=False ): """Helper function to get an appropriate GOEDfs object based on offload options.""" - if dry_run is None: - dry_run = bool(not offload_options.execute) - if offload_options.backend_distribution in HADOOP_BASED_BACKEND_DISTRIBUTIONS: if ( offload_options.webhdfs_host diff --git a/src/goe/goe.py b/src/goe/goe.py index 71a51801..7994d470 100644 --- a/src/goe/goe.py +++ b/src/goe/goe.py @@ -24,7 +24,7 @@ from optparse import OptionParser, Option, OptionValueError, SUPPRESS_HELP import re import traceback -from typing import Union +from typing import Union, TYPE_CHECKING import orjson @@ -36,6 +36,7 @@ ) from goe.filesystem.goe_dfs_factory import get_dfs_from_options +from goe.offload import offload_constants from goe.offload.backend_api import IMPALA_SHUFFLE_HINT, IMPALA_NOSHUFFLE_HINT from goe.offload.factory.backend_api_factory import backend_api_factory from goe.offload.factory.backend_table_factory import ( @@ -59,28 +60,6 @@ GOE_TYPE_VARIABLE_STRING, GOE_TYPE_TIMESTAMP_TZ, ) -from goe.offload.offload_constants import ( - ADJUSTED_BACKEND_IDENTIFIER_MESSAGE_TEXT, - DBTYPE_BIGQUERY, - DBTYPE_IMPALA, - DBTYPE_ORACLE, - DBTYPE_MSSQL, - FILE_STORAGE_COMPRESSION_CODEC_GZIP, - FILE_STORAGE_COMPRESSION_CODEC_SNAPPY, - FILE_STORAGE_COMPRESSION_CODEC_ZLIB, - IPA_PREDICATE_TYPE_CHANGE_EXCEPTION_TEXT, - IPA_PREDICATE_TYPE_EXCEPTION_TEXT, - IPA_PREDICATE_TYPE_FIRST_OFFLOAD_EXCEPTION_TEXT, - LOG_LEVEL_INFO, - LOG_LEVEL_DETAIL, - LOG_LEVEL_DEBUG, - MISSING_METADATA_EXCEPTION_TEMPLATE, - OFFLOAD_STATS_METHOD_COPY, - OFFLOAD_STATS_METHOD_NATIVE, - OFFLOAD_TRANSPORT_VALIDATION_POLLER_DISABLED, - SORT_COLUMNS_NO_CHANGE, - TOTAL_ROWS_OFFLOADED_LOG_TEXT, -) from goe.offload.offload_functions import convert_backend_identifier_case, data_db_name from goe.offload.option_validation import normalise_ddl_file from goe.offload.offload_source_data import ( @@ -169,6 +148,10 @@ from goe.util.ora_query import get_oracle_connection from goe.util.redis_tools import RedisClient +if TYPE_CHECKING: + from goe.offload.backend_table import BackendTableInterface + from goe.config.orchestration_config import OrchestrationConfig + dev_logger = logging.getLogger("goe") @@ -213,6 +196,7 @@ "equal_to_values", "error_before_step", "error_after_step", + "execute", "force", 
"hive_column_stats", "impala_insert_hint", @@ -377,7 +361,7 @@ def ora_single_item_query(opts, qry, ora_conn=None, params={}): def get_db_unique_name(opts): - if opts.db_type == DBTYPE_ORACLE: + if opts.db_type == offload_constants.DBTYPE_ORACLE: sql = """ SELECT SYS_CONTEXT('USERENV', 'DB_UNIQUE_NAME') || CASE @@ -393,7 +377,7 @@ def get_db_unique_name(opts): ) """ return ora_single_item_query(opts, sql) - elif opts.db_type == DBTYPE_MSSQL: + elif opts.db_type == offload_constants.DBTYPE_MSSQL: try: return opts.rdbms_dsn.split("=")[1] except Exception: @@ -401,7 +385,7 @@ def get_db_unique_name(opts): def get_rdbms_db_name(opts, ora_conn=None): - if opts.db_type == DBTYPE_ORACLE: + if opts.db_type == offload_constants.DBTYPE_ORACLE: sql = """ SELECT CASE WHEN version < 12 @@ -414,7 +398,7 @@ def get_rdbms_db_name(opts, ora_conn=None): ) """ return ora_single_item_query(opts, sql, ora_conn) - elif opts.db_type == DBTYPE_MSSQL: + elif opts.db_type == offload_constants.DBTYPE_MSSQL: try: return opts.rdbms_dsn.split("=")[1] except: @@ -448,7 +432,7 @@ def set_nls_lang_default(opts): def check_and_set_nls_lang(opts, messages=None): # TODO: We believe that we need to have NLS_LANG set correctly in order for query_import to offload data correctly? # If that is the case if/when we implement query_import for non-Oracle, we need to cater for this. - if opts.db_type == DBTYPE_ORACLE: + if opts.db_type == offload_constants.DBTYPE_ORACLE: if not nls_lang_exists(): set_nls_lang_default(opts) if messages: @@ -568,23 +552,23 @@ def incremental_offload_partition_overrides( def verify_offload_by_backend_count( offload_source_table, offload_target_table, - ipa_predicate_type, - offload_options, + offload_operation, messages, verification_hvs, prior_hvs, - verify_parallelism, inflight_offload_predicate=None, ): """Verify (by row counts) the data offloaded in the current operation. For partitioned tables the partition columns and verification_hvs and prior_hvs are used to limit scanning to the relevant data in both frontend abd backend. 
""" + ipa_predicate_type = offload_operation.ipa_predicate_type + verify_parallelism = offload_operation.verify_parallelism validator = BackendCountValidator( offload_source_table, offload_target_table, messages, - dry_run=bool(not offload_options.execute), + dry_run=bool(not offload_operation.execute), ) bind_predicates = bool( ipa_predicate_type @@ -623,17 +607,18 @@ def verify_offload_by_backend_count( def verify_row_count_by_aggs( offload_source_table, offload_target_table, - ipa_predicate_type, + offload_operation, options, messages, verification_hvs, prior_hvs, - verify_parallelism, inflight_offload_predicate=None, ): """Light verification by running aggregate queries in both Oracle and backend and comparing their results """ + ipa_predicate_type = offload_operation.ipa_predicate_type + verify_parallelism = offload_operation.verify_parallelism validator = CrossDbValidator( db_name=offload_source_table.owner, table_name=offload_source_table.table_name, @@ -642,6 +627,7 @@ def verify_row_count_by_aggs( messages=messages, backend_db=offload_target_table.db_name, backend_table=offload_target_table.table_name, + execute=offload_operation.execute, ) bind_predicates = bool( @@ -669,7 +655,7 @@ def verify_row_count_by_aggs( status, _ = validator.validate( safe=False, filters=backend_filters, - execute=options.execute, + execute=offload_operation.execute, frontend_filters=frontend_filters, frontend_query_params=query_binds, frontend_parallelism=verify_parallelism, @@ -726,20 +712,18 @@ def offload_data_verification( verify_fn = lambda: verify_offload_by_backend_count( offload_source_table, offload_target_table, - offload_operation.ipa_predicate_type, - offload_options, + offload_operation, messages, new_hvs, prior_hvs, - offload_operation.verify_parallelism, inflight_offload_predicate=source_data_client.get_inflight_offload_predicate(), ) verify_by_count_results = messages.offload_step( command_steps.STEP_VERIFY_EXPORTED_DATA, verify_fn, - execute=offload_options.execute, + execute=offload_operation.execute, ) - if offload_options.execute and verify_by_count_results: + if offload_operation.execute and verify_by_count_results: num_diff, source_rows, hybrid_rows = verify_by_count_results if num_diff == 0: messages.log( @@ -760,18 +744,17 @@ def offload_data_verification( verify_fn = lambda: verify_row_count_by_aggs( offload_source_table, offload_target_table, - offload_operation.ipa_predicate_type, + offload_operation, offload_options, messages, new_hvs, prior_hvs, - offload_operation.verify_parallelism, inflight_offload_predicate=source_data_client.get_inflight_offload_predicate(), ) if messages.offload_step( command_steps.STEP_VERIFY_EXPORTED_DATA, verify_fn, - execute=offload_options.execute, + execute=offload_operation.execute, ): messages.log( "Source and target table data matches: offload successful%s" @@ -982,7 +965,7 @@ def normalise_offload_transport_user_options(options): r"^[\d\.]+$", options.offload_transport_validation_polling_interval ) or options.offload_transport_validation_polling_interval - == str(OFFLOAD_TRANSPORT_VALIDATION_POLLER_DISABLED) + == str(offload_constants.OFFLOAD_TRANSPORT_VALIDATION_POLLER_DISABLED) ): options.offload_transport_validation_polling_interval = float( options.offload_transport_validation_polling_interval @@ -1028,7 +1011,11 @@ def normalise_options(options, normalise_owner_table=True): if hasattr(options, "log_level") and options.log_level: options.log_level = options.log_level.lower() - if options.log_level not in [LOG_LEVEL_INFO, LOG_LEVEL_DETAIL, 
LOG_LEVEL_DEBUG]: + if options.log_level not in [ + offload_constants.LOG_LEVEL_INFO, + offload_constants.LOG_LEVEL_DETAIL, + offload_constants.LOG_LEVEL_DEBUG, + ]: raise OptionValueError( "Invalid value for LOG_LEVEL: %s" % options.log_level ) @@ -1045,9 +1032,9 @@ def normalise_options(options, normalise_owner_table=True): "NONE", "HIGH", "MED", - FILE_STORAGE_COMPRESSION_CODEC_GZIP, - FILE_STORAGE_COMPRESSION_CODEC_SNAPPY, - FILE_STORAGE_COMPRESSION_CODEC_ZLIB, + offload_constants.FILE_STORAGE_COMPRESSION_CODEC_GZIP, + offload_constants.FILE_STORAGE_COMPRESSION_CODEC_SNAPPY, + offload_constants.FILE_STORAGE_COMPRESSION_CODEC_ZLIB, ]: raise OptionValueError( "Invalid value for --storage-compression, valid values: HIGH|MED|NONE|GZIP|ZLIB|SNAPPY" @@ -1058,7 +1045,7 @@ def normalise_options(options, normalise_owner_table=True): normalise_offload_transport_user_options(options) - if options.target == DBTYPE_IMPALA: + if options.target == offload_constants.DBTYPE_IMPALA: options.offload_distribute_enabled = False normalise_less_than_options(options, exc_cls=OffloadOptionError) @@ -1275,6 +1262,8 @@ class BaseOperation(object): to completely merge them, using this base class to centralise some code. """ + execute: bool + def __init__( self, operation_name, @@ -1308,12 +1297,19 @@ def __init__( # The sorts of checks we do here do not require a backend connection: do_not_connect=True backend_api = backend_api_factory( - config.target, config, messages, do_not_connect=True + config.target, + config, + messages, + dry_run=(not self.execute), + do_not_connect=True, ) - if self.offload_stats_method == OFFLOAD_STATS_METHOD_COPY and not ( - backend_api.table_stats_get_supported() - and backend_api.table_stats_set_supported() + if ( + self.offload_stats_method == offload_constants.OFFLOAD_STATS_METHOD_COPY + and not ( + backend_api.table_stats_get_supported() + and backend_api.table_stats_set_supported() + ) ): raise OptionValueError( "%s for %s backend: %s" @@ -1325,13 +1321,13 @@ def __init__( ) if ( - self.offload_stats_method == OFFLOAD_STATS_METHOD_COPY + self.offload_stats_method == offload_constants.OFFLOAD_STATS_METHOD_COPY and self.offload_predicate ): messages.warning( "Offload stats method COPY in incompatible with predicate-based offload" ) - self.offload_stats_method = OFFLOAD_STATS_METHOD_NATIVE + self.offload_stats_method = offload_constants.OFFLOAD_STATS_METHOD_NATIVE self._hash_distribution_threshold = config.hash_distribution_threshold @@ -1544,7 +1540,7 @@ def repo_client(self): self._repo_client = orchestration_repo_client_factory( self._orchestration_config, self._messages, - dry_run=bool(not self._orchestration_config.execute), + dry_run=bool(not self.execute), trace_action="repo_client(OffloadOperation)", ) return self._repo_client @@ -1615,8 +1611,8 @@ def get_hybrid_metadata(self, force=False): ) return self._existing_metadata - def reset_hybrid_metadata(self, execute, new_metadata): - if execute: + def reset_hybrid_metadata(self, new_metadata): + if self.execute: self._existing_metadata = self.get_hybrid_metadata(force=True) else: # If we're not in execute mode then we need to re-use the in-flight metadata @@ -1679,7 +1675,8 @@ def defaults_for_existing_table(self, messages): if not existing_metadata: raise OffloadException( - MISSING_METADATA_EXCEPTION_TEMPLATE % (self.owner, self.table_name) + offload_constants.MISSING_METADATA_EXCEPTION_TEMPLATE + % (self.owner, self.table_name) ) self.set_bucket_info_from_metadata(existing_metadata, messages) @@ -1695,7 +1692,9 @@ def 
defaults_for_existing_table(self, messages): != existing_metadata.incremental_predicate_type ): # We are overwriting user input with value from metadata - raise OffloadException(IPA_PREDICATE_TYPE_CHANGE_EXCEPTION_TEXT) + raise OffloadException( + offload_constants.IPA_PREDICATE_TYPE_CHANGE_EXCEPTION_TEXT + ) self.ipa_predicate_type = existing_metadata.incremental_predicate_type self.pre_offload_hybrid_metadata = existing_metadata @@ -1873,7 +1872,7 @@ def validate_ipa_predicate_type(self, offload_source_table): raise OffloadException( "%s: %s/%s" % ( - IPA_PREDICATE_TYPE_EXCEPTION_TEXT, + offload_constants.IPA_PREDICATE_TYPE_EXCEPTION_TEXT, self.ipa_predicate_type, offload_source_table.partition_type, ) @@ -1934,7 +1933,7 @@ def validate_ipa_predicate_type(self, offload_source_table): raise OffloadException( "%s: %s" % ( - IPA_PREDICATE_TYPE_FIRST_OFFLOAD_EXCEPTION_TEXT, + offload_constants.IPA_PREDICATE_TYPE_FIRST_OFFLOAD_EXCEPTION_TEXT, self.ipa_predicate_type, ) ) @@ -2039,7 +2038,7 @@ def validate_sort_columns( offload_options.target, offload_options, messages, - dry_run=bool(not offload_options.execute), + dry_run=bool(not self.execute), ) created_api = False @@ -2047,7 +2046,8 @@ def validate_sort_columns( if not backend_api.sorted_table_supported(): if ( self.sort_columns_csv - and self.sort_columns_csv != SORT_COLUMNS_NO_CHANGE + and self.sort_columns_csv + != offload_constants.SORT_COLUMNS_NO_CHANGE ): # Only warn the user if they input a specific value messages.warning( @@ -2113,7 +2113,7 @@ def __init__( normalise_offload_predicate_options(self) normalise_verify_options(self) normalise_data_sampling_options(self) - normalise_ddl_file(self, config) + normalise_ddl_file(self, config, messages) self._setup_offload_step(messages) @@ -2220,6 +2220,7 @@ def from_options( equal_to_values=options.equal_to_values, error_after_step=options.error_after_step, error_before_step=options.error_before_step, + execute=options.execute, force=options.force, hive_column_stats=options.hive_column_stats, impala_insert_hint=options.impala_insert_hint, @@ -2354,6 +2355,9 @@ def from_dict( equal_to_values=operation_dict.get("equal_to_values"), error_after_step=operation_dict.get("error_after_step"), error_before_step=operation_dict.get("error_before_step"), + execute=operation_dict.get( + "execute", orchestration_defaults.execute_default() + ), force=operation_dict.get("force", orchestration_defaults.force_default()), hive_column_stats=operation_dict.get( "hive_column_stats", orchestration_defaults.hive_column_stats_default() @@ -2579,7 +2583,7 @@ def offload_operation_logic( offload_options.not_null_propagation, messages, ), - execute=offload_options.execute, + execute=offload_operation.execute, mandatory_step=True, ) @@ -2661,16 +2665,16 @@ def offload_operation_logic( def offload_table( - offload_options, - offload_operation, - offload_source_table, - offload_target_table, - messages, + offload_options: "OrchestrationConfig", + offload_operation: OffloadOperation, + offload_source_table: OffloadSourceTableInterface, + offload_target_table: "BackendTableInterface", + messages: OffloadMessages, ): global suppress_stdout_override global execution_id - if offload_options.db_type == DBTYPE_ORACLE: + if offload_options.db_type == offload_constants.DBTYPE_ORACLE: abort, v_goe, v_ora = version_abort( offload_operation.ver_check, offload_source_table.get_frontend_api() ) @@ -2683,7 +2687,7 @@ def offload_table( + ")" ) - if offload_options.target != DBTYPE_BIGQUERY: + if offload_options.target != 
offload_constants.DBTYPE_BIGQUERY: # As of GOE-2334 we only support BigQuery as a target. OffloadException(f"Unsupported Offload target: {offload_options.target}") @@ -2707,7 +2711,7 @@ def offload_table( offload_operation.target_owner_name.split(".")[0] ) ) - if offload_options.execute: + if offload_operation.execute: raise OffloadOptionError("Unsupported character(s) in Oracle schema name.") if not offload_source_table.columns: @@ -2731,14 +2735,16 @@ def offload_table( messages, "target database %s" % offload_target_table.db_name, offload_target_table, - offload_options.execute, + offload_operation.execute, ) if not offload_target_table.staging_area_exists(): offload_backend_db_message( - messages, "staging area", offload_target_table, offload_options.execute + messages, + "staging area", + offload_target_table, + offload_operation.execute, ) - # TODO act upon ddl-file here perhaps existing_metadata = None if offload_target_table.exists() and not offload_operation.reset_backend_table: # We need to pickup defaults for an existing table here, BEFORE we start looking for data to offload (get_offload_data_manager()) @@ -2776,7 +2782,7 @@ def offload_table( existing_metadata, OFFLOAD_SOURCE_CLIENT_OFFLOAD, ), - execute=offload_options.execute, + execute=offload_operation.execute, mandatory_step=True, ) @@ -2807,13 +2813,16 @@ def offload_table( data_gov_client = get_data_gov_client( offload_options, messages, + offload_operation, rdbms_schema=offload_source_table.owner, source_rdbms_object_name=offload_source_table.table_name, ) offload_operation.offload_transport_method = choose_offload_transport_method( offload_operation, offload_source_table, offload_options, messages ) - dfs_client = get_dfs_from_options(offload_options, messages) + dfs_client = get_dfs_from_options( + offload_options, messages, dry_run=(not offload_operation.execute) + ) # For a fresh offload we may have tuned offload_operation attributes offload_target_table.refresh_operational_settings( @@ -2832,14 +2841,14 @@ def offload_table( offload_target_table, messages, repo_client, - offload_options.execute, + offload_operation.execute, purge=offload_operation.purge_backend_table, ) rows_offloaded = None pre_offload_snapshot = None - if offload_options.db_type == DBTYPE_ORACLE: + if offload_options.db_type == offload_constants.DBTYPE_ORACLE: # Pre-offload SCN will be stored in metadata. 
pre_offload_snapshot = offload_source_table.get_current_scn( return_none_on_failure=True @@ -2876,7 +2885,9 @@ def offload_table( data_gov_client, ) messages.log( - "%s: %s" % (TOTAL_ROWS_OFFLOADED_LOG_TEXT, str(rows_offloaded)), detail=VVERBOSE + "%s: %s" + % (offload_constants.TOTAL_ROWS_OFFLOADED_LOG_TEXT, str(rows_offloaded)), + detail=VVERBOSE, ) if not offload_operation.preserve_load_table: @@ -2895,7 +2906,7 @@ def offload_table( pre_offload_snapshot, existing_metadata, ) - offload_operation.reset_hybrid_metadata(offload_options.execute, new_metadata) + offload_operation.reset_hybrid_metadata(new_metadata) if offload_operation.verify_row_count: if rows_offloaded != 0: @@ -2948,7 +2959,7 @@ def get_offload_target_table( offload_operation.target_name, ): messages.log( - f"{ADJUSTED_BACKEND_IDENTIFIER_MESSAGE_TEXT}: {db_name}.{table_name}", + f"{offload_constants.ADJUSTED_BACKEND_IDENTIFIER_MESSAGE_TEXT}: {db_name}.{table_name}", detail=VVERBOSE, ) backend_table = backend_table_factory( @@ -2969,13 +2980,14 @@ def get_synthetic_partition_cols(backend_cols): def get_data_gov_client( options, messages, + execute, rdbms_schema=None, source_rdbms_object_name=None, target_rdbms_object_name=None, ): if options.data_governance_api_url: data_gov_client = get_hadoop_data_governance_client_from_options( - options, messages, dry_run=bool(not options.execute) + options, messages, dry_run=bool(not execute) ) data_gov_client.healthcheck_api() data_gov_client.cache_property_values( diff --git a/src/goe/listener/services/hybrid_view.py b/src/goe/listener/services/hybrid_view.py index 5f0e02bb..268d9303 100644 --- a/src/goe/listener/services/hybrid_view.py +++ b/src/goe/listener/services/hybrid_view.py @@ -211,6 +211,7 @@ def validate_by_aggregation(self, lower_hv=None, upper_hv=None, as_json=True): messages=self._messages, backend_db=self._backend_table_owner, backend_table=self._backend_table_name, + execute=(not self._dry_run), ) status, agg_msg = validator.validate( safe=False, @@ -293,6 +294,7 @@ def _get_backend_table(self): self._connection_options, self._messages, hybrid_metadata=self._offload_metadata, + dry_run=self._dry_run, ) def _get_backend_detail(self, attribute_name=None): diff --git a/src/goe/listener/services/system.py b/src/goe/listener/services/system.py index e814c93d..2785c73b 100644 --- a/src/goe/listener/services/system.py +++ b/src/goe/listener/services/system.py @@ -48,8 +48,9 @@ def __init__(self): def get_repo( config: OrchestrationConfig, messages: OffloadMessages ) -> OrchestrationRepoClientInterface: + # TODO We need to find another way of setting dry_run below. 
return orchestration_repo_client_factory( - config, messages, dry_run=bool(not config.execute) + config, messages, dry_run=False # bool(not config.execute) ) def generate_listener_group_id(self) -> UUID3: diff --git a/src/goe/offload/backend_table.py b/src/goe/offload/backend_table.py index d513031a..30dc1189 100644 --- a/src/goe/offload/backend_table.py +++ b/src/goe/offload/backend_table.py @@ -989,6 +989,7 @@ def _get_dfs_client(self): self._dfs_client = get_dfs_from_options( self._orchestration_config, messages=self._messages, + dry_run=self._dry_run, do_not_connect=self._do_not_connect, ) self._backend_dfs = self._dfs_client.backend_dfs @@ -2440,7 +2441,7 @@ def create_backend_table_step(self, goe_object_type): ), ) pre_register_data_gov_fn() - self._offload_step( + executed_commands: list = self._offload_step( command_steps.STEP_CREATE_TABLE, lambda: self.create_backend_table() ) post_register_data_gov_fn() @@ -2494,7 +2495,7 @@ def compute_final_table_stats(self, incremental_stats, materialized_join=False): pass @abstractmethod - def create_backend_table(self): + def create_backend_table(self) -> list: pass @abstractmethod diff --git a/src/goe/offload/bigquery/bigquery_backend_table.py b/src/goe/offload/bigquery/bigquery_backend_table.py index 3064758d..7422687c 100644 --- a/src/goe/offload/bigquery/bigquery_backend_table.py +++ b/src/goe/offload/bigquery/bigquery_backend_table.py @@ -469,7 +469,7 @@ def compute_final_table_stats(self, incremental_stats, materialized_join=False): """Do nothing on BigQuery""" pass - def create_backend_table(self): + def create_backend_table(self) -> list: """Create a table in BigQuery based on object state. Creating a new table may change our world view so the function drops state if in execute mode. If dry_run then we leave state in place to allow other operations to preview. diff --git a/src/goe/offload/factory/backend_api_factory.py b/src/goe/offload/factory/backend_api_factory.py index ffafb8e2..402ac5a3 100644 --- a/src/goe/offload/factory/backend_api_factory.py +++ b/src/goe/offload/factory/backend_api_factory.py @@ -41,6 +41,8 @@ def backend_api_factory( if dry_run is None: if hasattr(connection_options, "execute"): dry_run = bool(not connection_options.execute) + elif do_not_connect: + dry_run = True else: dry_run = False if backend_type == DBTYPE_HIVE: diff --git a/src/goe/offload/factory/backend_table_factory.py b/src/goe/offload/factory/backend_table_factory.py index 16febede..411a5901 100644 --- a/src/goe/offload/factory/backend_table_factory.py +++ b/src/goe/offload/factory/backend_table_factory.py @@ -44,8 +44,12 @@ def backend_table_factory( an orchestration_operation or a means of getting hybrid_metadata. """ if dry_run is None: - if hasattr(orchestration_options, "execute"): + if orchestration_operation: + dry_run = bool(not orchestration_operation.execute) + elif hasattr(orchestration_options, "execute"): dry_run = bool(not orchestration_options.execute) + elif do_not_connect: + dry_run = True else: dry_run = False diff --git a/src/goe/offload/hadoop/hadoop_backend_table.py b/src/goe/offload/hadoop/hadoop_backend_table.py index 8f0c3a00..383faf00 100644 --- a/src/goe/offload/hadoop/hadoop_backend_table.py +++ b/src/goe/offload/hadoop/hadoop_backend_table.py @@ -646,7 +646,7 @@ def _tzoffset_to_timestamp_sql_expression(self, col_name): def cleanup_staging_area(self): self._drop_load_table(sync=True) - def create_backend_table(self): + def create_backend_table(self) -> list: """Create a table in the backend based on object state. 
Creating a new table may change our world view so the function drops state if in execute mode. If dry_run then we leave state in place to allow other operations to preview. diff --git a/src/goe/offload/hadoop/sqoop_offload_transport.py b/src/goe/offload/hadoop/sqoop_offload_transport.py index 26611ecd..b68b8f76 100644 --- a/src/goe/offload/hadoop/sqoop_offload_transport.py +++ b/src/goe/offload/hadoop/sqoop_offload_transport.py @@ -301,9 +301,11 @@ def _sqoop_import(self, partition_chunk=None): + ["--fetch-size=%d" % int(self._offload_transport_fetch_size)] + self._column_type_read_remappings() + [ - "--as-avrodatafile" - if self._staging_format == FILE_STORAGE_FORMAT_AVRO - else "--as-parquetfile", + ( + "--as-avrodatafile" + if self._staging_format == FILE_STORAGE_FORMAT_AVRO + else "--as-parquetfile" + ), "--outdir=" + self._offload_options.sqoop_outdir, ] ) @@ -345,7 +347,7 @@ def _sqoop_import(self, partition_chunk=None): ) # In order to let Impala/Hive drop the load table in the future we need g+w self.log_dfs_cmd('chmod(%s, "g+w")' % self._staging_table_location) - if self._offload_options.execute: + if not self._dry_run: self._dfs_client.chmod(self._staging_table_location, mode="g+w") except: # Even in a sqoop failure we still want to chmod the load directory - if it exists @@ -354,7 +356,7 @@ def _sqoop_import(self, partition_chunk=None): % self._staging_table_location, detail=VVERBOSE, ) - if self._offload_options.execute: + if not self._dry_run: try: self._dfs_client.chmod(self._staging_table_location, mode="g+w") except Exception as exc: @@ -438,7 +440,7 @@ def step_fn(): return self._messages.offload_step( command_steps.STEP_STAGING_TRANSPORT, step_fn, - execute=self._offload_options.execute, + execute=(not self._dry_run), ) def ping_source_rdbms(self): diff --git a/src/goe/offload/microsoft/synapse_backend_table.py b/src/goe/offload/microsoft/synapse_backend_table.py index c2cbf089..8888b748 100644 --- a/src/goe/offload/microsoft/synapse_backend_table.py +++ b/src/goe/offload/microsoft/synapse_backend_table.py @@ -433,7 +433,7 @@ def cleanup_staging_area(self): def compute_final_table_stats(self, incremental_stats, materialized_join=False): return self._db_api.compute_stats(self.db_name, self.table_name) - def create_backend_table(self): + def create_backend_table(self) -> list: """Create a table in Synapse based on object state. For efficiency, we compute backend stats immediately after table creation to initialise empty stats objects on each column. These will be updated using a single table level command after the final load. 
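The "-> list" annotations and the captured executed_commands above suggest that create_backend_table() now surfaces the DDL it generates so it can be routed to the new --ddl-file target instead of being executed. A minimal sketch of that idea, assuming the returned list contains SQL strings; the write_ddl_file and capture_create_table_ddl names below are hypothetical and only for illustration, not part of this patch:

# Illustrative sketch only: assumes create_backend_table() returns the DDL
# statements it generated (per the new "-> list" annotations) and that they
# should be persisted rather than executed when a DDL file was requested.
def write_ddl_file(ddl_statements: list, ddl_file_path: str) -> None:
    # Join the generated statements into a single script file.
    with open(ddl_file_path, "w") as f:
        f.write("\n\n".join(f"{stmt};" for stmt in ddl_statements))
        f.write("\n")

def capture_create_table_ddl(backend_table, ddl_file_path: str) -> None:
    # In non-execute (dry run) mode the statements are only collected, not run.
    ddl_statements = backend_table.create_backend_table()
    write_ddl_file(ddl_statements, ddl_file_path)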
diff --git a/src/goe/offload/offload_constants.py b/src/goe/offload/offload_constants.py index 7d5c85f0..7657d612 100644 --- a/src/goe/offload/offload_constants.py +++ b/src/goe/offload/offload_constants.py @@ -144,6 +144,9 @@ "Offload data identification options required with --reset-hybrid-view" ) TOTAL_ROWS_OFFLOADED_LOG_TEXT = "Total rows offloaded" +DDL_FILE_EXECUTE_MESSAGE_TEXT = ( + "Switching command to non-execute mode due to --ddl-file option" +) # Offload capabilities we can switch on/off by backend db type # Any capabilities that are version specific will have extra code in the BackendApi method diff --git a/src/goe/offload/offload_metadata_functions.py b/src/goe/offload/offload_metadata_functions.py index 51bad9c9..19da9131 100755 --- a/src/goe/offload/offload_metadata_functions.py +++ b/src/goe/offload/offload_metadata_functions.py @@ -214,7 +214,7 @@ def gen_and_save_offload_metadata( messages.offload_step( command_steps.STEP_SAVE_METADATA, lambda: goe_metadata.save(), - execute=config.execute, + execute=offload_operation.execute, ) return goe_metadata diff --git a/src/goe/offload/offload_status_report.py b/src/goe/offload/offload_status_report.py index c0127dad..6eb8976f 100755 --- a/src/goe/offload/offload_status_report.py +++ b/src/goe/offload/offload_status_report.py @@ -200,6 +200,7 @@ def __init__(self, orchestration_config, messages, ora_adm_conn=None): self._orchestration_config.target, self._orchestration_config, self._messages, + dry_run=(not self._execute), ) self._backend_info = self._backend_api.backend_report_info() self._backend_db_type = self._orchestration_config.target @@ -255,7 +256,7 @@ def __init__(self, orchestration_config, messages, ora_adm_conn=None): ) self._repo_client = orchestration_repo_client_factory( - self._orchestration_config, self._messages + self._orchestration_config, self._messages, dry_run=(not self._execute) ) # General initialisations...
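A recurring pattern in this patch is that dry-run behaviour is now derived from an explicit execute flag (typically the operation's) rather than from config.execute. A minimal sketch of the calling convention using the repo client factory shown above; the wrapper function itself is illustrative only and not part of the patch:

# Illustrative only: callers now pass dry_run explicitly, derived from the
# per-command execute flag rather than from global config.
from goe.persistence.factory.orchestration_repo_client_factory import (
    orchestration_repo_client_factory,
)

def build_repo_client(config, messages, execute: bool):
    return orchestration_repo_client_factory(
        config,
        messages,
        dry_run=(not execute),
    )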
diff --git a/src/goe/offload/offload_transport.py b/src/goe/offload/offload_transport.py index ed09e078..c779d785 100755 --- a/src/goe/offload/offload_transport.py +++ b/src/goe/offload/offload_transport.py @@ -699,7 +699,7 @@ def __init__( self._messages = messages self._dfs_client = dfs_client self._backend_dfs = self._dfs_client.backend_dfs - self._dry_run = bool(not offload_options.execute) + self._dry_run = bool(not offload_operation.execute) # Details of the source of the offload self._rdbms_owner = offload_source_table.owner self._rdbms_table_name = offload_source_table.table_name @@ -1313,7 +1313,7 @@ def log_file_size(path, uri_attribs) -> int: return None self.debug("Logging contents of URI: %s" % self._staging_table_location) - if not self._offload_options.execute: + if self._dry_run: return None try: @@ -2161,9 +2161,7 @@ def step_fn(): return row_count return self._messages.offload_step( - command_steps.STEP_STAGING_TRANSPORT, - step_fn, - execute=self._offload_options.execute, + command_steps.STEP_STAGING_TRANSPORT, step_fn, execute=(not self._dry_run) ) def ping_source_rdbms(self): @@ -2515,9 +2513,7 @@ def step_fn(): return row_count return self._messages.offload_step( - command_steps.STEP_STAGING_TRANSPORT, - step_fn, - execute=self._offload_options.execute, + command_steps.STEP_STAGING_TRANSPORT, step_fn, execute=(not self._dry_run) ) def ping_source_rdbms(self): @@ -2751,9 +2747,7 @@ def step_fn(): return rows_imported rows_imported = self._messages.offload_step( - command_steps.STEP_STAGING_TRANSPORT, - step_fn, - execute=self._offload_options.execute, + command_steps.STEP_STAGING_TRANSPORT, step_fn, execute=(not self._dry_run) ) return rows_imported diff --git a/src/goe/offload/offload_validation.py b/src/goe/offload/offload_validation.py index a52de9c3..4c217dd1 100755 --- a/src/goe/offload/offload_validation.py +++ b/src/goe/offload/offload_validation.py @@ -273,6 +273,7 @@ def __init__( messages=None, backend_db=None, backend_table=None, + execute=True, ): """CONSTRUCTOR @@ -284,6 +285,7 @@ def __init__( """ assert db_name and table_name + self._execute = execute self._db_name = db_name self._table_name = table_name self._db_table = "%s.%s" % (db_name, table_name) @@ -292,6 +294,7 @@ def __init__( connection_options, messages, conn_user_override=connection_options.rdbms_app_user, + dry_run=(not self._execute), trace_action=self.__class__.__name__, ) if backend_obj: @@ -301,7 +304,7 @@ def __init__( connection_options.target, connection_options, messages, - dry_run=bool(not connection_options.execute), + dry_run=bool(not self._execute), ) self._connection_options = connection_options self._messages = messages @@ -473,9 +476,11 @@ def expand(lst, elem_func, concat_func): if filters: sql += "\nWHERE %s" % expand( filters, - lambda x: (" ".join((x[0], x[1], str(x[2])))) - if type(x) in (list, tuple) - else x, + lambda x: ( + (" ".join((x[0], x[1], str(x[2])))) + if type(x) in (list, tuple) + else x + ), lambda x, y: "%s\nAND %s" % (x, y), ) @@ -763,6 +768,7 @@ def _extract_offload_boundary(self, frontend=False): self._messages, hybrid_metadata=self._offload_metadata, existing_backend_api=self._backend, + dry_run=(not self._execute), ) ( inc_keys, diff --git a/src/goe/offload/operation/stats_controls.py b/src/goe/offload/operation/stats_controls.py index 3f3bb4a9..1c9cac46 100644 --- a/src/goe/offload/operation/stats_controls.py +++ b/src/goe/offload/operation/stats_controls.py @@ -113,7 +113,7 @@ def filter_for_affected_partitions_list( % 
offload_target_table.backend_db_name() ) - dry_run = bool(not offload_options.execute) + dry_run = bool(not offload_operation.execute) rdbms_tab_stats = offload_source_table.table_stats rdbms_col_stats = rdbms_tab_stats["column_stats"] tab_stats = { diff --git a/src/goe/offload/operation/transport.py b/src/goe/offload/operation/transport.py index 642cb62d..2a22da95 100644 --- a/src/goe/offload/operation/transport.py +++ b/src/goe/offload/operation/transport.py @@ -94,7 +94,7 @@ def transport_and_load_offload_chunk_fn( chunk_count=chunk_count, sync=sync, offload_predicate=offload_operation.inflight_offload_predicate, - dry_run=bool(not offload_options.execute), + dry_run=bool(not offload_operation.execute), ) if discarded_all_partitions: @@ -148,7 +148,7 @@ def transport_and_load_offload_chunk_fn( offload_options, messages, ), - execute=offload_options.execute, + execute=offload_operation.execute, optional=True, ) else: diff --git a/src/goe/offload/option_validation.py b/src/goe/offload/option_validation.py index 2d65e32a..8442186e 100644 --- a/src/goe/offload/option_validation.py +++ b/src/goe/offload/option_validation.py @@ -30,6 +30,7 @@ if TYPE_CHECKING: from goe.config.orchestration_config import OrchestrationConfig + from goe.offload.offload_messages import OffloadMessages class OffloadOptionError(Exception): @@ -128,13 +129,19 @@ def normalise_data_sampling_options(options): ) -def normalise_ddl_file(options, config: "OrchestrationConfig"): +def normalise_ddl_file( + options, config: "OrchestrationConfig", messages: "OffloadMessages" +): """Validates path pointed to by ddl_file and generates a new path if AUTO. Mutates options.""" if options.ddl_file: options.ddl_file = options.ddl_file.strip() else: return options.ddl_file + if options.execute and options.ddl_file: + messages.notice(offload_constants.DDL_FILE_EXECUTE_MESSAGE_TEXT) + options.execute = False + if options.ddl_file.upper() == offload_constants.DDL_FILE_AUTO: # Use an auto-generated path. options.ddl_file = generate_ddl_file_path( diff --git a/src/goe/offload/snowflake/snowflake_backend_table.py b/src/goe/offload/snowflake/snowflake_backend_table.py index a7eb2c6c..9a415a56 100644 --- a/src/goe/offload/snowflake/snowflake_backend_table.py +++ b/src/goe/offload/snowflake/snowflake_backend_table.py @@ -492,7 +492,7 @@ def compute_final_table_stats(self, incremental_stats, materialized_join=False): """We cannot influence stats on Snowflake and this should never be called due to capability setting""" pass - def create_backend_table(self): + def create_backend_table(self) -> list: """Create a table in Snowflake based on object state. Creating a new table may change our world view so the function drops state if in execute mode. If dry_run then we leave state in place to allow other operations to preview. 
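The intent of the new guard in normalise_ddl_file above is that requesting a DDL file switches the command into non-execute mode and records a notice to that effect. A self-contained sketch of that behaviour using simplified stand-ins (these are not the real GOE classes, only an illustration under that assumption):

# Simplified stand-ins, for illustration only.
DDL_FILE_EXECUTE_MESSAGE_TEXT = "Switching command to non-execute mode due to --ddl-file option"

class StubMessages:
    def __init__(self):
        self.notices = []

    def notice(self, msg):
        self.notices.append(msg)

class StubOptions:
    def __init__(self, ddl_file, execute):
        self.ddl_file = ddl_file
        self.execute = execute

def apply_ddl_file_guard(options, messages):
    # Mirrors the new logic: a DDL file request downgrades execute mode.
    if options.ddl_file and options.execute:
        messages.notice(DDL_FILE_EXECUTE_MESSAGE_TEXT)
        options.execute = False

options = StubOptions(ddl_file="/tmp/sales.sql", execute=True)
messages = StubMessages()
apply_ddl_file_guard(options, messages)
assert options.execute is False and messages.notices == [DDL_FILE_EXECUTE_MESSAGE_TEXT]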
diff --git a/src/goe/offload/spark/dataproc_offload_transport.py b/src/goe/offload/spark/dataproc_offload_transport.py index b92db460..56b60ee8 100644 --- a/src/goe/offload/spark/dataproc_offload_transport.py +++ b/src/goe/offload/spark/dataproc_offload_transport.py @@ -334,7 +334,7 @@ def step_fn(): return self._messages.offload_step( command_steps.STEP_STAGING_TRANSPORT, step_fn, - execute=self._offload_options.execute, + execute=(not self._dry_run), ) def ping_source_rdbms(self): diff --git a/src/goe/offload/spark/livy_offload_transport.py b/src/goe/offload/spark/livy_offload_transport.py index 5a5710ca..7b7d9364 100644 --- a/src/goe/offload/spark/livy_offload_transport.py +++ b/src/goe/offload/spark/livy_offload_transport.py @@ -126,9 +126,11 @@ def _log_app_info(self, resp_json, last_log_msg): if log_msg != last_log_msg: self._messages.log_timestamp(detail=VVERBOSE) self.log( - "\n".join(_ for _ in log_msg) - if isinstance(log_msg, list) - else str(log_msg), + ( + "\n".join(_ for _ in log_msg) + if isinstance(log_msg, list) + else str(log_msg) + ), detail=VVERBOSE, ) return log_msg @@ -458,7 +460,7 @@ def step_fn(): return self._messages.offload_step( command_steps.STEP_STAGING_TRANSPORT, step_fn, - execute=self._offload_options.execute, + execute=(not self._dry_run), ) def ping_source_rdbms(self): @@ -492,7 +494,9 @@ def __init__(self, offload_options, messages): self._idle_session_timeout = 30 self._dfs_client = get_dfs_from_options( - self._offload_options, messages=self._messages + self._offload_options, + messages=self._messages, + dry_run=self._dry_run, ) self._backend_dfs = self._dfs_client.backend_dfs diff --git a/src/goe/orchestration/cli_entry_points.py b/src/goe/orchestration/cli_entry_points.py index b81cbaf1..58fddf30 100644 --- a/src/goe/orchestration/cli_entry_points.py +++ b/src/goe/orchestration/cli_entry_points.py @@ -47,7 +47,6 @@ def offload_by_cli(options, messages_override=None): normalise_options(options) config_overrides = { - "execute": options.execute, "verbose": options.verbose, "vverbose": options.vverbose, "offload_transport_dsn": options.offload_transport_dsn, diff --git a/src/goe/orchestration/orchestration_runner.py b/src/goe/orchestration/orchestration_runner.py index fff05414..4daee1d9 100644 --- a/src/goe/orchestration/orchestration_runner.py +++ b/src/goe/orchestration/orchestration_runner.py @@ -91,9 +91,9 @@ class OrchestrationRunnerException(Exception): class OrchestrationRunner: """OrchestrationRunner: Library providing simple entry point for orchestration commands.""" - def __init__(self, config_overrides=None, dry_run=False, suppress_stdout=False): + def __init__(self, config_overrides=None, suppress_stdout=False): self._config = self._gen_config( - config_overrides, dry_run, suppress_stdout=suppress_stdout + config_overrides, suppress_stdout=suppress_stdout ) # State refreshed by each command, not necessarily static. 
self._execution_id: Optional[ExecutionId] = None @@ -109,11 +109,13 @@ def _build_offload_source_table(self, operation): operation.owner, operation.table_name, self._config, self._messages ) - def _build_repo_client(self, messages) -> "OrchestrationRepoClientInterface": + def _build_repo_client( + self, messages, dry_run=False + ) -> "OrchestrationRepoClientInterface": return orchestration_repo_client_factory( self._config, messages, - dry_run=bool(not self._config.execute), + dry_run=dry_run, trace_action="repo_client(OrchestrationRunner)", ) @@ -218,13 +220,16 @@ def _command_fail( ) raise + def _execute_from_params(self, params) -> bool: + if isinstance(params, dict): + return params["execute"] + else: + return params.execute + def _gen_config( - self, config_overrides, dry_run, suppress_stdout=False + self, config_overrides, suppress_stdout=False ) -> OrchestrationConfig: - # options.execute remains a disease in goe.py and IU therefore we need this preamble for config overrides = config_overrides or {} - if "execute" not in overrides: - overrides["execute"] = bool(not dry_run) if suppress_stdout: overrides["suppress_stdout"] = suppress_stdout return OrchestrationConfig.from_dict(overrides) @@ -239,10 +244,15 @@ def _gen_messages(self, execution_id, command_type): ) def _gen_offload_operation( - self, params, repo_client: "OrchestrationRepoClientInterface" + self, + params, + repo_client: "OrchestrationRepoClientInterface", ): """Return an OffloadOperation object based on either a parameter dict or OptParse object.""" try: + max_hybrid_name_length = self._get_max_hybrid_identifier_length( + dry_run=bool(not self._execute_from_params(params)) + ) if isinstance(params, dict): # Non-CLI APIs are dict driven therefore we construct via "from_dict". # Also will be threaded and not-safe to pass in shared repo_client. @@ -252,7 +262,7 @@ def _gen_offload_operation( self._messages, repo_client=repo_client, execution_id=self._execution_id, - max_hybrid_name_length=self._get_max_hybrid_identifier_length(), + max_hybrid_name_length=max_hybrid_name_length, ) else: # CLI has an OptParse object therefore we construct via "from_options". @@ -263,7 +273,7 @@ def _gen_offload_operation( self._messages, repo_client=repo_client, execution_id=self._execution_id, - max_hybrid_name_length=self._get_max_hybrid_identifier_length(), + max_hybrid_name_length=max_hybrid_name_length, ) return op except Exception as exc: @@ -284,7 +294,7 @@ def _get_execution_id( execution_id = ExecutionId() return execution_id - def _get_max_hybrid_identifier_length(self) -> int: + def _get_max_hybrid_identifier_length(self, dry_run: bool) -> int: """Get the max supported hybrid identifier (table/view/column) length for the frontend RDBMS. This is not ideal because it is making a frontend connection just to get this information but at the point this is called we don't already have a connection we can use. 
@@ -295,7 +305,7 @@ def _get_max_hybrid_identifier_length(self) -> int: self._config.db_type, self._config, self._messages, - dry_run=bool(not self._config.execute), + dry_run=dry_run, trace_action="_get_max_hybrid_identifier_length", ) self._max_hybrid_name_length = frontend_api.max_table_name_length() @@ -332,7 +342,9 @@ def _init_command( detail=VVERBOSE, ) init_redis_execution_id(self._execution_id) - return self._build_repo_client(self._messages) + return self._build_repo_client( + self._messages, dry_run=(not self._execute_from_params(params)) + ) except Exception as exc: self._log_error( f"Exception initializing command {command}: {str(exc)}", detail=VVERBOSE @@ -392,21 +404,21 @@ def _log_error(self, msg, detail=NORMAL): logger.error(msg) self._messages.log(msg, detail=detail) - def _log_final_messages(self, command_type, repo_client=None): + def _log_final_messages(self, command_type, dry_run): self._messages.log_step_deltas() if self._messages.get_messages(): self._messages.offload_step( command_steps.STEP_MESSAGES, self._messages.log_messages, command_type=command_type, - execute=self._config.execute, + execute=(not dry_run), ) def _offload(self, operation, offload_source_table, offload_target_table): with orchestration_lock_for_table( offload_source_table.owner, offload_source_table.table_name, - dry_run=bool(not self._config.execute), + dry_run=(not operation.execute), ): try: return offload_table( @@ -451,6 +463,7 @@ def offload( messages_override: Allows us to pass in an existing messages object so a parent can inspect the messages, used for testing. """ + dry_run = bool(not self._execute_from_params(params)) repo_client = self._init_command( orchestration_constants.COMMAND_OFFLOAD, params, @@ -478,9 +491,7 @@ def offload( status = self._offload( operation, offload_source_table, offload_target_table ) - self._log_final_messages( - orchestration_constants.COMMAND_OFFLOAD, repo_client=repo_client - ) + self._log_final_messages(orchestration_constants.COMMAND_OFFLOAD, dry_run) self._command_end(command_id, repo_client) self._cleanup_objects( diff --git a/src/goe/persistence/oracle/oracle_orchestration_repo_client.py b/src/goe/persistence/oracle/oracle_orchestration_repo_client.py index e17accaf..7d768db2 100644 --- a/src/goe/persistence/oracle/oracle_orchestration_repo_client.py +++ b/src/goe/persistence/oracle/oracle_orchestration_repo_client.py @@ -204,16 +204,18 @@ def _ora_object_to_metadata_dict(self, metadata_obj): OFFLOADED_OWNER: metadata_obj.FRONTEND_OBJECT_OWNER, OFFLOADED_TABLE: metadata_obj.FRONTEND_OBJECT_NAME, INCREMENTAL_KEY: metadata_obj.OFFLOAD_KEY or None, - INCREMENTAL_HIGH_VALUE: metadata_obj.OFFLOAD_HIGH_VALUE.read() - if metadata_obj.OFFLOAD_HIGH_VALUE - else None, + INCREMENTAL_HIGH_VALUE: ( + metadata_obj.OFFLOAD_HIGH_VALUE.read() + if metadata_obj.OFFLOAD_HIGH_VALUE + else None + ), INCREMENTAL_RANGE: metadata_obj.OFFLOAD_RANGE_TYPE or None, INCREMENTAL_PREDICATE_TYPE: metadata_obj.OFFLOAD_PREDICATE_TYPE or None, - INCREMENTAL_PREDICATE_VALUE: json.loads( - metadata_obj.OFFLOAD_PREDICATE_VALUE.read() - ) - if metadata_obj.OFFLOAD_PREDICATE_VALUE - else None, + INCREMENTAL_PREDICATE_VALUE: ( + json.loads(metadata_obj.OFFLOAD_PREDICATE_VALUE.read()) + if metadata_obj.OFFLOAD_PREDICATE_VALUE + else None + ), OFFLOAD_BUCKET_COLUMN: metadata_obj.OFFLOAD_HASH_COLUMN or None, OFFLOAD_SORT_COLUMNS: metadata_obj.OFFLOAD_SORT_COLUMNS or None, OFFLOAD_SNAPSHOT: metadata_obj.OFFLOAD_SNAPSHOT or None, diff --git a/src/goe/schema_sync/schema_sync_analyzer.py 
b/src/goe/schema_sync/schema_sync_analyzer.py index f8069778..d53473da 100644 --- a/src/goe/schema_sync/schema_sync_analyzer.py +++ b/src/goe/schema_sync/schema_sync_analyzer.py @@ -100,7 +100,6 @@ def __init__(self, options, messages): self._messages = messages self._orchestration_options = OrchestrationConfig.from_dict( { - "execute": options.execute, "verbose": options.verbose, "vverbose": options.vverbose, } diff --git a/src/goe/schema_sync/schema_sync_processor.py b/src/goe/schema_sync/schema_sync_processor.py index 7b3faf11..1a33b670 100644 --- a/src/goe/schema_sync/schema_sync_processor.py +++ b/src/goe/schema_sync/schema_sync_processor.py @@ -60,7 +60,6 @@ def __init__( self._repo_client = repo_client self._orchestration_options = OrchestrationConfig.from_dict( { - "execute": options.execute, "verbose": options.verbose, "vverbose": options.vverbose, } diff --git a/tests/integration/offload/test_backend_api.py b/tests/integration/offload/test_backend_api.py index 8e6ef18b..86cf3f3a 100644 --- a/tests/integration/offload/test_backend_api.py +++ b/tests/integration/offload/test_backend_api.py @@ -95,6 +95,7 @@ def _create_test_tables(self): { "owner_table": self.schema + "." + self.table, "create_backend_db": True, + "execute": True, } ) except OffloadException: @@ -104,6 +105,7 @@ def _create_test_tables(self): "owner_table": self.schema + "." + self.table, "reset_backend_table": True, "create_backend_db": True, + "execute": True, } ) @@ -118,13 +120,19 @@ def _create_test_tables(self): ) # Ignore return status, if the table has already been offloaded previously then we'll re-use it. try: - run_offload({"owner_table": self.schema + "." + self.part_table}) + run_offload( + { + "owner_table": self.schema + "." + self.part_table, + "execute": True, + }, + ) except OffloadException: # If this one fails then we let the exception bubble up. run_offload( { "owner_table": self.schema + "." + self.part_table, "reset_backend_table": True, + "execute": True, } ) diff --git a/tests/integration/offload/test_backend_table.py b/tests/integration/offload/test_backend_table.py index bc28194b..367906f2 100644 --- a/tests/integration/offload/test_backend_table.py +++ b/tests/integration/offload/test_backend_table.py @@ -148,6 +148,7 @@ def _create_test_table(self): { "owner_table": self.schema + "." + FACT_NAME, "create_backend_db": True, + "execute": True, } ) except OffloadException: @@ -157,6 +158,7 @@ def _create_test_table(self): "owner_table": self.schema + "." + FACT_NAME, "reset_backend_table": True, "create_backend_db": True, + "execute": True, } ) diff --git a/tests/integration/offload/test_predicate_offload.py b/tests/integration/offload/test_predicate_offload.py index 56088ee7..0c859471 100644 --- a/tests/integration/offload/test_predicate_offload.py +++ b/tests/integration/offload/test_predicate_offload.py @@ -76,7 +76,13 @@ def create_and_offload_dim_table(config, frontend_api, messages, schema): ) # Ignore return status, if the table has already been offloaded previously then we'll re-use it. try: - run_offload({"owner_table": schema + "." + DIM_NAME, "create_backend_db": True}) + run_offload( + { + "owner_table": schema + "." + DIM_NAME, + "create_backend_db": True, + "execute": True, + } + ) except OffloadException: # If this one fails then we let the exception bubble up. run_offload( @@ -84,6 +90,7 @@ def create_and_offload_dim_table(config, frontend_api, messages, schema): "owner_table": schema + "." 
+ DIM_NAME, "reset_backend_table": True, "create_backend_db": True, + "execute": True, } ) diff --git a/tests/integration/orchestration/test_command_steps.py b/tests/integration/orchestration/test_command_steps.py index fa53deb1..3bbea4c6 100644 --- a/tests/integration/orchestration/test_command_steps.py +++ b/tests/integration/orchestration/test_command_steps.py @@ -45,7 +45,9 @@ def test_command_steps_repo(self): config = cached_current_options() messages = get_test_messages(config, "TestCommandStepsIntegration") client = orchestration_repo_client_factory( - config, messages, trace_action="repo_client(test_command_steps_repo)" + config, + messages, + trace_action="repo_client(test_command_steps_repo)", ) codes = client.get_command_step_codes() step_constants = self._get_step_constants() diff --git a/tests/integration/orchestration/test_orchestration_runner.py b/tests/integration/orchestration/test_orchestration_runner.py index 3c44db56..ceb4a19d 100644 --- a/tests/integration/orchestration/test_orchestration_runner.py +++ b/tests/integration/orchestration/test_orchestration_runner.py @@ -203,8 +203,12 @@ def test_orchestration_runner_offload(config, schema): frontend_api.standard_dimension_frontend_ddl(schema, TABLE_NAME), ) - params = {"owner_table": f"{schema}.{TABLE_NAME}", "reset_backend_table": True} - status = OrchestrationRunner(dry_run=True).offload(params) + params = { + "owner_table": f"{schema}.{TABLE_NAME}", + "reset_backend_table": True, + "execute": True, + } + status = OrchestrationRunner().offload(params) assert status @@ -228,6 +232,7 @@ def test_fn(): params = { "owner_table": f"{schema}.{LOCK_TABLE}", "reset_backend_table": True, + "execute": True, } OrchestrationRunner().offload(params) diff --git a/tests/integration/persistence/test_orchestration_metadata.py b/tests/integration/persistence/test_orchestration_metadata.py index 8153809b..7bedc3a8 100644 --- a/tests/integration/persistence/test_orchestration_metadata.py +++ b/tests/integration/persistence/test_orchestration_metadata.py @@ -15,8 +15,6 @@ """ TestOrchestrationMetadata: Unit test library to test orchestration metadata API for configured frontend. """ -import copy - from unittest import TestCase, main from goe.orchestration import orchestration_constants diff --git a/tests/integration/persistence/test_orchestration_repo_client.py b/tests/integration/persistence/test_orchestration_repo_client.py index 7e5173a7..860f84be 100644 --- a/tests/integration/persistence/test_orchestration_repo_client.py +++ b/tests/integration/persistence/test_orchestration_repo_client.py @@ -25,7 +25,6 @@ from goe.persistence.factory.orchestration_repo_client_factory import ( orchestration_repo_client_factory, ) -from goe.offload.offload_messages import OffloadMessages from goe.offload.offload_source_data import OffloadSourcePartition from goe.orchestration import command_steps, orchestration_constants from goe.orchestration.execution_id import ExecutionId @@ -49,7 +48,7 @@ def test_orchestration_command_logging_cli(self): Tests a command is if launched from the CLI. Pretends to offload a multi chunk partitioned table. """ # execute=True because we want to actually insert and update the repo records for this test. 
- config = OrchestrationConfig.from_dict({"verbose": False, "execute": True}) + config = OrchestrationConfig.from_dict({"verbose": False}) execution_id = ExecutionId() messages = get_test_messages( config, "test_orchestration_command_logging_cli", execution_id=execution_id @@ -57,6 +56,7 @@ def test_orchestration_command_logging_cli(self): client = orchestration_repo_client_factory( config, messages, + dry_run=False, trace_action="repo_client(test_orchestration_command_logging_cli)", ) @@ -182,7 +182,7 @@ def test_orchestration_command_logging_api(self): Tests a command is if launched from an API. Pretends to offload a non-partitioned table. """ # execute=True because we want to actually insert and update the repo records for this test. - config = OrchestrationConfig.from_dict({"verbose": False, "execute": True}) + config = OrchestrationConfig.from_dict({"verbose": False}) execution_id = ExecutionId() messages = get_test_messages( config, "test_orchestration_command_logging_api", execution_id=execution_id @@ -190,6 +190,7 @@ def test_orchestration_command_logging_api(self): client = orchestration_repo_client_factory( config, messages, + dry_run=False, trace_action="repo_client(test_orchestration_command_logging_api)", ) # Start an API based Offload diff --git a/tests/integration/scenarios/assertion_functions.py b/tests/integration/scenarios/assertion_functions.py index a98fb2e2..3a5764ee 100644 --- a/tests/integration/scenarios/assertion_functions.py +++ b/tests/integration/scenarios/assertion_functions.py @@ -395,7 +395,12 @@ def load_table_is_compressed(db_name, table_name, config, dfs_client, messages): "load_table_is_compressed(%s, %s)" % (db_name, table_name), detail=VERBOSE ) backend_table = backend_table_factory( - db_name, table_name, config.target, config, messages + db_name, + table_name, + config.target, + config, + messages, + dry_run=False, ) path = backend_table.get_staging_table_location() files = [_ for _ in dfs_client.list_dir(path) if _ and _[0] != "."] diff --git a/tests/integration/scenarios/scenario_runner.py b/tests/integration/scenarios/scenario_runner.py index ac3192e4..8f78a9bd 100644 --- a/tests/integration/scenarios/scenario_runner.py +++ b/tests/integration/scenarios/scenario_runner.py @@ -46,7 +46,6 @@ def get_config_overrides( ): """Return config from story enhanced with certain attributes from orchestration_config""" base_config = { - "execute": True, "verbose": orchestration_config.verbose, "vverbose": orchestration_config.vverbose, } diff --git a/tests/integration/scenarios/test_column_controls.py b/tests/integration/scenarios/test_column_controls.py index e98be2ef..59bab399 100644 --- a/tests/integration/scenarios/test_column_controls.py +++ b/tests/integration/scenarios/test_column_controls.py @@ -739,6 +739,7 @@ def test_numeric_controls(config, schema, data_db): "reset_backend_table": True, "decimal_padding_digits": 2, "create_backend_db": True, + "execute": True, } run_offload(options, config, messages) nums_assertion( @@ -760,6 +761,7 @@ def test_numeric_controls(config, schema, data_db): "decimal_columns_type_list": ["10,0", "13,9", "15,9", "36,3", "37,3", "38,3"], "data_sample_pct": DATA_SAMPLE_SIZE_AUTO, "decimal_padding_digits": 2, + "execute": True, } run_offload(options, config, messages) nums_assertion( @@ -791,6 +793,7 @@ def test_numeric_controls(config, schema, data_db): "decimal_columns_type_list": ["36,3", "37,3", "38,3"], "data_sample_pct": DATA_SAMPLE_SIZE_AUTO, "decimal_padding_digits": 2, + "execute": True, } run_offload(options, 
config, messages) nums_assertion( @@ -813,6 +816,7 @@ def test_numeric_controls(config, schema, data_db): "data_sample_pct": DATA_SAMPLE_SIZE_AUTO, "reset_backend_table": True, "verify_row_count": False, + "execute": True, } log_test_marker(messages, f"{id}:samp1") run_offload(options, config, messages) @@ -827,6 +831,7 @@ def test_numeric_controls(config, schema, data_db): "data_sample_pct": DATA_SAMPLE_SIZE_AUTO, "reset_backend_table": True, "verify_row_count": False, + "execute": True, } log_test_marker(messages, f"{id}:samp2") run_offload(options, config, messages) @@ -848,6 +853,7 @@ def test_numeric_controls(config, schema, data_db): ], "decimal_columns_type_list": ["10,2"], "data_sample_pct": DATA_SAMPLE_SIZE_AUTO, + "execute": True, } run_offload( options, @@ -862,6 +868,7 @@ def test_numeric_controls(config, schema, data_db): "decimal_columns_csv_list": [",".join([STORY_TEST_OFFLOAD_NUMS_DEC_36_3])], "decimal_columns_type_list": ["100,10"], "reset_backend_table": True, + "execute": True, } run_offload( options, @@ -878,6 +885,7 @@ def test_numeric_controls(config, schema, data_db): "decimal_columns_csv_list": [",".join([STORY_TEST_OFFLOAD_NUMS_DEC_36_3])], "decimal_columns_type_list": ["10,100"], "reset_backend_table": True, + "execute": True, } run_offload( options, @@ -915,6 +923,7 @@ def test_date_controls(config, schema, data_db): "data_sample_pct": 0, "reset_backend_table": True, "create_backend_db": True, + "execute": True, } run_offload(options, config, messages) date_assertion(config, frontend_api, backend_api, messages, data_db, DATE_DIM) @@ -926,6 +935,7 @@ def test_date_controls(config, schema, data_db): "data_sample_pct": 0, "date_columns_csv": "dt,ts0,ts6", "reset_backend_table": True, + "execute": True, } run_offload(options, config, messages) date_assertion( @@ -944,6 +954,7 @@ def test_date_controls(config, schema, data_db): "data_sample_pct": 0, "timestamp_tz_columns_csv": "dt,ts0,ts6,ts0tz,ts6tz", "reset_backend_table": True, + "execute": True, } run_offload(options, config, messages) date_assertion( @@ -987,6 +998,7 @@ def test_date_sampling(config, schema, data_db): "data_sample_pct": DATA_SAMPLE_SIZE_AUTO, "reset_backend_table": True, "create_backend_db": True, + "execute": True, } run_offload(options, config, messages) samp_date_assertion(config, backend_api, frontend_api, messages, data_db, DATE_SDIM) @@ -1008,6 +1020,7 @@ def test_date_sampling(config, schema, data_db): "allow_nanosecond_timestamp_columns": True, "data_sample_pct": DATA_SAMPLE_SIZE_AUTO, "reset_backend_table": True, + "execute": True, } run_offload(options, config, messages) samp_date_assertion( @@ -1039,6 +1052,7 @@ def test_date_sampling(config, schema, data_db): "data_sample_pct": DATA_SAMPLE_SIZE_AUTO, "date_columns_csv": "good_date,good_ts", "reset_backend_table": True, + "execute": True, } run_offload(options, config, messages) samp_date_assertion( @@ -1084,6 +1098,7 @@ def test_precision_scale_overflow(config, schema, data_db): "data_sample_pct": DATA_SAMPLE_SIZE_AUTO, "reset_backend_table": True, "create_backend_db": True, + "execute": True, } run_offload( options, @@ -1110,6 +1125,7 @@ def test_precision_scale_overflow(config, schema, data_db): "owner_table": schema + "." + NUM_TOO_BIG_DIM, "data_sample_pct": DATA_SAMPLE_SIZE_AUTO, "reset_backend_table": True, + "execute": True, } run_offload( options, @@ -1124,6 +1140,7 @@ def test_precision_scale_overflow(config, schema, data_db): "owner_table": schema + "." 
+ NUM_TOO_BIG_DIM, "data_sample_pct": 0, "reset_backend_table": True, + "execute": True, } run_offload( options, @@ -1154,6 +1171,7 @@ def test_precision_scale_overflow(config, schema, data_db): "reset_backend_table": True, "decimal_padding_digits": 0, "create_backend_db": True, + "execute": True, } run_offload( options, @@ -1170,6 +1188,7 @@ def test_precision_scale_overflow(config, schema, data_db): "allow_decimal_scale_rounding": True, "reset_backend_table": True, "decimal_padding_digits": 0, + "execute": True, } run_offload( options, @@ -1219,6 +1238,7 @@ def test_column_controls_column_name_checks(config, schema, data_db, load_db): "unicode_string_columns_csv": "*_desc", "decimal_padding_digits": 0, "create_backend_db": True, + "execute": True, } run_offload(options, config, messages) wildcard_assertion(backend_api, data_db, wildcard_dim_be) @@ -1229,6 +1249,7 @@ def test_column_controls_column_name_checks(config, schema, data_db, load_db): "reset_backend_table": True, "integer_1_columns_csv": "*_id", "integer_2_columns_csv": "*id", + "execute": True, } run_offload( options, @@ -1259,6 +1280,7 @@ def test_column_controls_column_name_checks(config, schema, data_db, load_db): "owner_table": schema + "." + OFFLOAD_DIM, "reset_backend_table": True, "variable_string_columns_csv": "prod_id", + "execute": True, } run_offload( options, @@ -1272,6 +1294,7 @@ def test_column_controls_column_name_checks(config, schema, data_db, load_db): "owner_table": schema + "." + OFFLOAD_DIM, "reset_backend_table": True, "date_columns_csv": "prod_id", + "execute": True, } run_offload( options, @@ -1285,6 +1308,7 @@ def test_column_controls_column_name_checks(config, schema, data_db, load_db): "owner_table": schema + "." + OFFLOAD_DIM, "reset_backend_table": True, "integer_8_columns_csv": "TXN_date", + "execute": True, } run_offload( options, @@ -1298,6 +1322,7 @@ def test_column_controls_column_name_checks(config, schema, data_db, load_db): "owner_table": schema + "." + OFFLOAD_DIM, "reset_backend_table": True, "date_columns_csv": "TXN_DESC", + "execute": True, } run_offload( options, @@ -1311,6 +1336,7 @@ def test_column_controls_column_name_checks(config, schema, data_db, load_db): "owner_table": schema + "." + OFFLOAD_DIM, "reset_backend_table": True, "timestamp_tz_columns_csv": "TXN_DESC", + "execute": True, } run_offload( options, @@ -1324,6 +1350,7 @@ def test_column_controls_column_name_checks(config, schema, data_db, load_db): "owner_table": schema + "." + OFFLOAD_DIM, "reset_backend_table": True, "integer_4_columns_csv": "TXN_DESC", + "execute": True, } run_offload( options, @@ -1337,6 +1364,7 @@ def test_column_controls_column_name_checks(config, schema, data_db, load_db): "owner_table": schema + "." 
+ OFFLOAD_DIM, "reset_backend_table": True, "unicode_string_columns_csv": "PROD_ID", + "execute": True, } run_offload( options, @@ -1351,6 +1379,7 @@ def test_column_controls_column_name_checks(config, schema, data_db, load_db): "reset_backend_table": True, "unicode_string_columns_csv": "TXN_DESC", "skip": ["verify_exported_data"], + "execute": True, } run_offload( options, @@ -1393,6 +1422,7 @@ def test_column_controls_not_null(config, schema, data_db): "owner_table": f"{schema}.{NOT_NULL_DIM}", "reset_backend_table": True, "create_backend_db": True, + "execute": True, } run_offload( options, @@ -1410,6 +1440,7 @@ def test_column_controls_not_null(config, schema, data_db): options = { "owner_table": f"{schema}.{NOT_NULL_DIM}", "reset_backend_table": True, + "execute": True, } run_offload( options, @@ -1428,6 +1459,7 @@ def test_column_controls_not_null(config, schema, data_db): "owner_table": f"{schema}.{NOT_NULL_DIM}", "not_null_columns_csv": "DT*", "reset_backend_table": True, + "execute": True, } run_offload( options, @@ -1448,12 +1480,12 @@ def test_column_controls_not_null(config, schema, data_db): "owner_table": f"{schema}.{NOT_NULL_DIM}", "not_null_columns_csv": "not-a-column", "reset_backend_table": True, + "execute": False, } run_offload( options, config, messages, - config_overrides={"execute": False}, expected_exception_string=UNKNOWN_NOT_NULL_COLUMN_EXCEPTION_TEXT, ) @@ -1462,6 +1494,7 @@ def test_column_controls_not_null(config, schema, data_db): "owner_table": f"{schema}.{NOT_NULL_DIM}", "not_null_columns_csv": "With_Nulls", "reset_backend_table": True, + "execute": True, } run_offload( options, diff --git a/tests/integration/scenarios/test_ddl_file.py b/tests/integration/scenarios/test_ddl_file.py new file mode 100644 index 00000000..e1c69253 --- /dev/null +++ b/tests/integration/scenarios/test_ddl_file.py @@ -0,0 +1,120 @@ +# Copyright 2024 The GOE Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + import os + + import pytest + + from goe.offload import offload_constants + from goe.offload.offload_functions import ( + convert_backend_identifier_case, + data_db_name, + ) + from goe.util.misc_functions import get_temp_path + + from tests.integration.scenarios.assertion_functions import ( + backend_table_exists, + text_in_messages, + ) + from tests.integration.scenarios.scenario_runner import ( + run_offload, + run_setup, + ) + from tests.integration.scenarios.setup_functions import ( + drop_backend_test_table, + ) + from tests.integration.test_functions import ( + cached_current_options, + cached_default_test_user, + ) + from tests.testlib.test_framework.test_functions import ( + get_backend_testing_api, + get_frontend_testing_api, + get_test_messages, + ) + + + TEST_TABLE = "DDL_FILE_DIM" + + + @pytest.fixture + def config(): + return cached_current_options() + + + @pytest.fixture + def schema(): + return cached_default_test_user() + + + @pytest.fixture + def data_db(schema, config): + data_db = data_db_name(schema, config) + data_db = convert_backend_identifier_case(config, data_db) + return data_db + + + def test_ddl_file_local_fs(config, schema, data_db): + id = "test_ddl_file_local_fs" + messages = get_test_messages(config, id) + backend_api = get_backend_testing_api(config, messages) + frontend_api = get_frontend_testing_api(config, messages) + + # Setup + run_setup( + frontend_api, + backend_api, + config, + messages, + frontend_sqls=frontend_api.standard_dimension_frontend_ddl(schema, TEST_TABLE), + python_fns=lambda: drop_backend_test_table( + config, backend_api, messages, data_db, TEST_TABLE + ), + ) + + # Offload in execute mode asking for ddl_file. + ddl_file = get_temp_path(prefix=id, suffix=".sql") + options = { + "owner_table": schema + "." + TEST_TABLE, + "reset_backend_table": True, + "ddl_file": ddl_file, + "execute": True, + } + offload_messages = run_offload(options, config, messages) + # When using DDL file no table should be created, even in execute mode. + assert not backend_table_exists( + config, backend_api, messages, data_db, TEST_TABLE + ), f"Backend table for {schema}.{TEST_TABLE} should not exist" + assert text_in_messages( + offload_messages, offload_constants.DDL_FILE_EXECUTE_MESSAGE_TEXT + ) + assert os.path.isfile(ddl_file) + + # Offload in non-execute mode asking for ddl_file. + ddl_file = get_temp_path(prefix=id, suffix=".sql") + options = { + "owner_table": schema + "." + TEST_TABLE, + "reset_backend_table": True, + "ddl_file": ddl_file, + "execute": False, + } + offload_messages = run_offload(options, config, messages) + assert not backend_table_exists( + config, backend_api, messages, data_db, TEST_TABLE + ), f"Backend table for {schema}.{TEST_TABLE} should not exist" + assert text_in_messages( + offload_messages, offload_constants.DDL_FILE_EXECUTE_MESSAGE_TEXT + ) + # Even in non-execute mode we expect to see a DDL file.
+ assert os.path.isfile(ddl_file) diff --git a/tests/integration/scenarios/test_identifiers.py b/tests/integration/scenarios/test_identifiers.py index c74d4093..a3dca29f 100644 --- a/tests/integration/scenarios/test_identifiers.py +++ b/tests/integration/scenarios/test_identifiers.py @@ -156,6 +156,7 @@ def test_identifiers_keyword_column_names(config, schema, data_db): "offload_transport_method": OFFLOAD_TRANSPORT_METHOD_QUERY_IMPORT, "reset_backend_table": True, "create_backend_db": True, + "execute": True, } run_offload(options, config, messages) @@ -166,6 +167,7 @@ def test_identifiers_keyword_column_names(config, schema, data_db): "offload_transport_method": OFFLOAD_TRANSPORT_METHOD_SQOOP_BY_QUERY, "reset_backend_table": True, "create_backend_db": True, + "execute": True, } run_offload(options, config, messages) @@ -176,6 +178,7 @@ def test_identifiers_keyword_column_names(config, schema, data_db): "offload_transport_method": OFFLOAD_TRANSPORT_METHOD_SQOOP, "reset_backend_table": True, "create_backend_db": True, + "execute": True, } run_offload(options, config, messages) @@ -186,6 +189,7 @@ def test_identifiers_keyword_column_names(config, schema, data_db): "offload_transport_method": OFFLOAD_TRANSPORT_METHOD_SPARK_SUBMIT, "reset_backend_table": True, "create_backend_db": True, + "execute": True, } run_offload(options, config, messages) @@ -196,6 +200,7 @@ def test_identifiers_keyword_column_names(config, schema, data_db): "offload_transport_method": OFFLOAD_TRANSPORT_METHOD_SPARK_THRIFT, "reset_backend_table": True, "create_backend_db": True, + "execute": True, } run_offload(options, config, messages) @@ -206,6 +211,7 @@ def test_identifiers_keyword_column_names(config, schema, data_db): "offload_transport_method": OFFLOAD_TRANSPORT_METHOD_SPARK_DATAPROC_GCLOUD, "reset_backend_table": True, "create_backend_db": True, + "execute": True, } run_offload(options, config, messages) @@ -246,6 +252,7 @@ def test_identifiers_bad_char_column_names(config, schema, data_db): "offload_transport_method": OFFLOAD_TRANSPORT_METHOD_QUERY_IMPORT, "reset_backend_table": True, "create_backend_db": True, + "execute": True, } run_offload(options, config, messages) @@ -256,6 +263,7 @@ def test_identifiers_bad_char_column_names(config, schema, data_db): "offload_transport_method": OFFLOAD_TRANSPORT_METHOD_SPARK_SUBMIT, "reset_backend_table": True, "create_backend_db": True, + "execute": True, } run_offload(options, config, messages) @@ -266,6 +274,7 @@ def test_identifiers_bad_char_column_names(config, schema, data_db): "offload_transport_method": OFFLOAD_TRANSPORT_METHOD_SPARK_DATAPROC_GCLOUD, "reset_backend_table": True, "create_backend_db": True, + "execute": True, } run_offload(options, config, messages) @@ -304,13 +313,14 @@ def test_identifiers_table_name_case(config, schema, data_db): "owner_table": schema + "." + CASE_DIM, "reset_backend_table": True, "create_backend_db": True, + "execute": False, } log_test_marker(messages, f"{id}1") run_offload( options, config, messages, - config_overrides={"execute": False, "backend_identifier_case": "LOWER"}, + config_overrides={"backend_identifier_case": "LOWER"}, ) assert backend_case_offload_assertion( messages, f"{data_db}.{CASE_DIM}".lower(), f"{id}1" @@ -319,13 +329,14 @@ def test_identifiers_table_name_case(config, schema, data_db): options = { "owner_table": schema + "." 
+ CASE_DIM.lower(), "reset_backend_table": True, + "execute": False, } log_test_marker(messages, f"{id}2") run_offload( options, config, messages, - config_overrides={"execute": False, "backend_identifier_case": "UPPER"}, + config_overrides={"backend_identifier_case": "UPPER"}, ) assert backend_case_offload_assertion( messages, f"{data_db}.{CASE_DIM}".upper(), f"{id}2" @@ -334,13 +345,14 @@ def test_identifiers_table_name_case(config, schema, data_db): options = { "owner_table": schema.upper() + "." + CASE_DIM.capitalize(), "reset_backend_table": True, + "execute": False, } log_test_marker(messages, f"{id}3") run_offload( options, config, messages, - config_overrides={"execute": False, "backend_identifier_case": "NO_MODIFY"}, + config_overrides={"backend_identifier_case": "NO_MODIFY"}, ) assert backend_case_offload_assertion( messages, f"{data_db.upper()}.{CASE_DIM.capitalize()}", f"{id}3" @@ -356,7 +368,7 @@ def test_identifiers_table_name_change_100_0(config, schema, data_db): backend_api = get_backend_testing_api(config, messages) frontend_api = get_frontend_testing_api(config, messages, trace_action=id) repo_client = orchestration_repo_client_factory( - config, messages, trace_action=f"repo_client({id})" + config, messages, dry_run=False, trace_action=f"repo_client({id})" ) # Setup @@ -381,6 +393,7 @@ def test_identifiers_table_name_change_100_0(config, schema, data_db): "target_owner_name": schema + "." + NEW_NAME_DIM2, "reset_backend_table": True, "create_backend_db": True, + "execute": True, } run_offload( options, @@ -406,6 +419,7 @@ def test_identifiers_table_name_change_100_0(config, schema, data_db): # Used to confirm that attempted re-offload exits early and doesn\'t lose sight of previous --target-name. options = { "owner_table": schema + "." + NEW_NAME_DIM1, + "execute": True, } # Uncomment this test after completing GOE-1461 # run_offload( @@ -453,6 +467,7 @@ def test_identifiers_table_name_change_90_10(config, schema, data_db): "older_than_date": SALES_BASED_FACT_HV_1, "reset_backend_table": True, "create_backend_db": True, + "execute": True, } run_offload(options, config, messages) @@ -476,6 +491,7 @@ def test_identifiers_table_name_change_90_10(config, schema, data_db): options = { "owner_table": schema + "." + NEW_NAME_FACT1, "older_than_date": SALES_BASED_FACT_HV_3, + "execute": True, } # Uncomment this test after completing GOE-1461 # run_offload(options, config, messages) diff --git a/tests/integration/scenarios/test_offload_basic.py b/tests/integration/scenarios/test_offload_basic.py index 04b32060..daa2909e 100644 --- a/tests/integration/scenarios/test_offload_basic.py +++ b/tests/integration/scenarios/test_offload_basic.py @@ -300,8 +300,9 @@ def test_offload_basic_dim(config, schema, data_db): options = { "owner_table": schema + "." + OFFLOAD_DIM, "reset_backend_table": True, + "execute": False, } - run_offload(options, config, messages, config_overrides={"execute": False}) + run_offload(options, config, messages) assert not backend_table_exists( config, backend_api, messages, data_db, OFFLOAD_DIM @@ -310,14 +311,17 @@ def test_offload_basic_dim(config, schema, data_db): # Basic offload of a simple dimension. options = { "owner_table": schema + "." 
+ OFFLOAD_DIM, - "offload_stats_method": offload_constants.OFFLOAD_STATS_METHOD_COPY - if copy_stats_available - else offload_constants.OFFLOAD_STATS_METHOD_NATIVE, + "offload_stats_method": ( + offload_constants.OFFLOAD_STATS_METHOD_COPY + if copy_stats_available + else offload_constants.OFFLOAD_STATS_METHOD_NATIVE + ), "compute_load_table_stats": True, "preserve_load_table": True, "impala_insert_hint": IMPALA_NOSHUFFLE_HINT, "reset_backend_table": True, "create_backend_db": True, + "execute": True, } run_offload(options, config, messages) @@ -331,6 +335,7 @@ def test_offload_basic_dim(config, schema, data_db): # Attempt to re-offload, expect to fail. options = { "owner_table": schema + "." + OFFLOAD_DIM, + "execute": True, } run_offload(options, config, messages, expected_status=False) @@ -340,6 +345,7 @@ def test_offload_basic_dim(config, schema, data_db): "offload_partition_lower_value": 0, "offload_partition_upper_value": 1000, "reset_backend_table": True, + "execute": True, } if backend_api.partition_by_column_supported(): if backend_api.max_partition_columns() == 1: @@ -408,8 +414,9 @@ def test_offload_basic_fact(config, schema, data_db): "older_than_date": test_constants.SALES_BASED_FACT_HV_1, "ipa_predicate_type": INCREMENTAL_PREDICATE_TYPE_RANGE, "reset_backend_table": True, + "execute": False, } - run_offload(options, config, messages, config_overrides={"execute": False}) + run_offload(options, config, messages) assert not backend_table_exists( config, backend_api, messages, data_db, OFFLOAD_FACT @@ -421,12 +428,12 @@ def test_offload_basic_fact(config, schema, data_db): "older_than_date": test_constants.SALES_BASED_FACT_HV_1, "ipa_predicate_type": INCREMENTAL_PREDICATE_TYPE_LIST, "reset_backend_table": True, + "execute": False, } run_offload( options, config, messages, - config_overrides={"execute": False}, expected_exception_string=offload_constants.IPA_PREDICATE_TYPE_FILTER_EXCEPTION_TEXT, ) @@ -441,6 +448,7 @@ def test_offload_basic_fact(config, schema, data_db): "older_than_date": test_constants.SALES_BASED_FACT_PRE_HV, "reset_backend_table": True, "create_backend_db": True, + "execute": True, } run_offload(options, config, messages) @@ -469,6 +477,7 @@ def test_offload_basic_fact(config, schema, data_db): "offload_stats_method": offload_stats_method, "reset_backend_table": True, "create_backend_db": True, + "execute": False, } if backend_api.partition_by_column_supported(): if config.target == offload_constants.DBTYPE_BIGQUERY: @@ -485,11 +494,12 @@ def test_offload_basic_fact(config, schema, data_db): + ",1", } ) - run_offload(options, config, messages, config_overrides={"execute": False}) + run_offload(options, config, messages) # Offload some partitions from a fact table. # The fact is partitioned by multiple columns (if possible) with appropriate granularity. # We use COPY stats on this initial offload, also specify some specific data types. + options["execute"] = True run_offload(options, config, messages) assert sales_based_fact_assertion( @@ -512,8 +522,9 @@ def test_offload_basic_fact(config, schema, data_db): options = { "owner_table": schema + "." 
+ OFFLOAD_FACT, "older_than_date": test_constants.SALES_BASED_FACT_HV_2, + "execute": False, } - run_offload(options, config, messages, config_overrides={"execute": False}) + run_offload(options, config, messages) assert offload_basic_fact_1st_incr_assertion( config, backend_api, messages, data_db, backend_name @@ -523,6 +534,7 @@ def test_offload_basic_fact(config, schema, data_db): options = { "owner_table": schema + "." + OFFLOAD_FACT, "older_than_date": test_constants.SALES_BASED_FACT_HV_2, + "execute": True, } run_offload(options, config, messages) assert sales_based_fact_assertion( @@ -563,6 +575,7 @@ def test_offload_basic_fact(config, schema, data_db): backend_api, "promo_id" ), "synthetic_partition_digits": 5, + "execute": True, } run_offload(options, config, messages) assert sales_based_fact_assertion( @@ -595,6 +608,7 @@ def test_offload_basic_fact(config, schema, data_db): options = { "owner_table": schema + "." + OFFLOAD_FACT, "older_than_date": test_constants.SALES_BASED_FACT_HV_4, + "execute": True, } run_offload(options, config, messages) diff --git a/tests/integration/scenarios/test_offload_data.py b/tests/integration/scenarios/test_offload_data.py index aa19ebf0..99a061a5 100644 --- a/tests/integration/scenarios/test_offload_data.py +++ b/tests/integration/scenarios/test_offload_data.py @@ -440,12 +440,12 @@ def test_offload_data_nan_inf_not_supported(config, schema, data_db): "owner_table": schema + "." + NAN_TABLE, "allow_floating_point_conversions": False, "reset_backend_table": True, + "execute": False, } run_offload( options, config, messages, - config_overrides={"execute": False}, expected_status=False, ) @@ -455,6 +455,7 @@ def test_offload_data_nan_inf_not_supported(config, schema, data_db): "allow_floating_point_conversions": True, "reset_backend_table": True, "create_backend_db": True, + "execute": True, } run_offload(options, config, messages) @@ -504,6 +505,7 @@ def test_offload_data_partition_by_microsecond(config, schema, data_db): "less_than_value": "2030-01-02", "reset_backend_table": True, "create_backend_db": True, + "execute": True, } run_offload(options, config, messages) assert sales_based_fact_assertion( @@ -530,6 +532,7 @@ def test_offload_data_partition_by_microsecond(config, schema, data_db): if backend_api.sql_microsecond_predicate_supported() else orchestration_defaults.verify_row_count_default() ), + "execute": True, } run_offload(options, config, messages) assert sales_based_fact_assertion( @@ -551,6 +554,7 @@ def test_offload_data_partition_by_microsecond(config, schema, data_db): options = { "owner_table": schema + "." 
+ US_FACT, "less_than_value": "2030-01-03", + "execute": True, } run_offload(options, config, messages, expected_status=False) @@ -597,6 +601,7 @@ def test_offload_data_partition_by_nanosecond(config, schema, data_db): "allow_nanosecond_timestamp_columns": False, "reset_backend_table": True, "create_backend_db": True, + "execute": True, } run_offload(options, config, messages, expected_status=False) @@ -607,6 +612,7 @@ def test_offload_data_partition_by_nanosecond(config, schema, data_db): "less_than_value": "2030-01-02", "verify_row_count": False, "reset_backend_table": True, + "execute": True, } run_offload(options, config, messages) else: @@ -616,6 +622,7 @@ def test_offload_data_partition_by_nanosecond(config, schema, data_db): "less_than_value": "2030-01-02", "reset_backend_table": True, "create_backend_db": True, + "execute": True, } run_offload(options, config, messages) assert sales_based_fact_assertion( @@ -642,6 +649,7 @@ def test_offload_data_partition_by_nanosecond(config, schema, data_db): if backend_api.sql_microsecond_predicate_supported() else orchestration_defaults.verify_row_count_default() ), + "execute": True, } run_offload(options, config, messages) assert sales_based_fact_assertion( @@ -703,6 +711,7 @@ def test_offload_data_oracle_xmltype(config, schema, data_db): "owner_table": schema + "." + XMLTYPE_TABLE, "reset_backend_table": True, "create_backend_db": True, + "execute": True, } run_offload(options, config, messages) @@ -717,6 +726,7 @@ def test_offload_data_oracle_xmltype(config, schema, data_db): config, no_table_centric_sqoop=True ), "reset_backend_table": True, + "execute": True, } run_offload(options, config, messages) @@ -760,6 +770,7 @@ def test_offload_data_nulls_qi(config, schema, data_db): "allow_floating_point_conversions": True, "reset_backend_table": True, "create_backend_db": True, + "execute": True, } run_offload(options, config, messages) @@ -809,6 +820,7 @@ def test_offload_data_nulls_no_qi(config, schema, data_db): "allow_floating_point_conversions": True, "reset_backend_table": True, "create_backend_db": True, + "execute": True, } run_offload(options, config, messages) @@ -875,6 +887,7 @@ def test_offload_data_large_decimals_lpa(config, schema, data_db): ), "reset_backend_table": True, "create_backend_db": True, + "execute": True, } run_offload(options, config, messages) assert offload_lpa_fact_assertion( @@ -901,6 +914,7 @@ def test_offload_data_large_decimals_lpa(config, schema, data_db): "owner_table": schema + "." + LPA_LARGE_NUMS, "partition_names_csv": "P_2", "offload_transport_method": no_query_import_transport_method(config), + "execute": True, } run_offload(options, config, messages) assert offload_lpa_fact_assertion( @@ -927,6 +941,7 @@ def test_offload_data_large_decimals_lpa(config, schema, data_db): "owner_table": schema + "." + LPA_LARGE_NUMS, "partition_names_csv": "P_3", "offload_transport_method": no_query_import_transport_method(config), + "execute": True, } run_offload(options, config, messages) assert offload_lpa_fact_assertion( @@ -1005,6 +1020,7 @@ def test_offload_data_large_decimals_rpa(config, schema, data_db): ), "reset_backend_table": True, "create_backend_db": True, + "execute": True, } run_offload(options, config, messages) assert sales_based_fact_assertion( @@ -1031,6 +1047,7 @@ def test_offload_data_large_decimals_rpa(config, schema, data_db): "owner_table": schema + "." 
+ RPA_LARGE_NUMS, "partition_names_csv": "P_2", "offload_transport_method": no_query_import_transport_method(config), + "execute": True, } run_offload(options, config, messages) assert sales_based_fact_assertion( @@ -1052,6 +1069,7 @@ def test_offload_data_large_decimals_rpa(config, schema, data_db): "owner_table": schema + "." + RPA_LARGE_NUMS, "partition_names_csv": "P_3", "offload_transport_method": no_query_import_transport_method(config), + "execute": True, } run_offload(options, config, messages) assert sales_based_fact_assertion( diff --git a/tests/integration/scenarios/test_offload_hash_column.py b/tests/integration/scenarios/test_offload_hash_column.py index 24a423f2..b9cc560d 100644 --- a/tests/integration/scenarios/test_offload_hash_column.py +++ b/tests/integration/scenarios/test_offload_hash_column.py @@ -113,6 +113,7 @@ def test_offload_hash_column_synapse(config, schema, data_db): "owner_table": schema + "." + OFFLOAD_DIM, "reset_backend_table": True, "create_backend_db": True, + "execute": True, } run_offload( options, @@ -139,6 +140,7 @@ def test_offload_hash_column_synapse(config, schema, data_db): "owner_table": schema + "." + OFFLOAD_DIM, "bucket_hash_col": "prod_id", "reset_backend_table": True, + "execute": True, } run_offload( options, @@ -164,6 +166,7 @@ def test_offload_hash_column_synapse(config, schema, data_db): options = { "owner_table": schema + "." + OFFLOAD_DIM, "reset_backend_table": True, + "execute": True, } run_offload( options, diff --git a/tests/integration/scenarios/test_offload_list_rpa.py b/tests/integration/scenarios/test_offload_list_rpa.py index 6f24a085..6e58b304 100644 --- a/tests/integration/scenarios/test_offload_list_rpa.py +++ b/tests/integration/scenarios/test_offload_list_rpa.py @@ -145,12 +145,12 @@ def offload_list_as_range_ipa_standard_story_tests( less_than_option: hv_1, "offload_partition_functions": udf, "reset_backend_table": True, + "execute": False, } run_offload( options, config, messages, - config_overrides={"execute": False}, expected_exception_string=offload_constants.IPA_PREDICATE_TYPE_EXCEPTION_TEXT, ) @@ -160,12 +160,12 @@ def offload_list_as_range_ipa_standard_story_tests( less_than_option: hv_1, "offload_partition_functions": udf, "reset_backend_table": True, + "execute": False, } run_offload( options, config, messages, - config_overrides={"execute": False}, expected_status=False, ) @@ -178,6 +178,7 @@ def offload_list_as_range_ipa_standard_story_tests( "offload_partition_upper_value": test_constants.UPPER_YRMON_NUM, "reset_backend_table": True, "create_backend_db": True, + "execute": True, } run_offload( options, @@ -206,6 +207,7 @@ def offload_list_as_range_ipa_standard_story_tests( options = { "owner_table": schema + "." + table_name, less_than_option: hv_3, + "execute": True, } run_offload( options, @@ -233,6 +235,7 @@ def offload_list_as_range_ipa_standard_story_tests( options = { "owner_table": schema + "." + table_name, "equal_to_values": hv_4, + "execute": True, } run_offload( options, @@ -247,6 +250,7 @@ def offload_list_as_range_ipa_standard_story_tests( "partition_names_csv": test_constants.SALES_BASED_LIST_PNAME_4 + "," + test_constants.SALES_BASED_LIST_PNAME_5, + "execute": True, } run_offload( options, @@ -259,6 +263,7 @@ def offload_list_as_range_ipa_standard_story_tests( options = { "owner_table": schema + "." 
+ table_name, "partition_names_csv": test_constants.SALES_BASED_LIST_PNAME_4, + "execute": True, } run_offload( options, @@ -285,6 +290,7 @@ def offload_list_as_range_ipa_standard_story_tests( options = { "owner_table": schema + "." + table_name, less_than_option: hv_4, + "execute": True, } run_offload( options, @@ -315,6 +321,7 @@ def offload_list_as_range_ipa_standard_story_tests( options = { "owner_table": schema + "." + table_name, less_than_option: hv_5, + "execute": True, } run_offload( options, @@ -585,6 +592,7 @@ def test_offload_list_rpa_subpart(config, schema, data_db): "offload_partition_lower_value": 0, "offload_partition_upper_value": 10, "create_backend_db": True, + "execute": True, } run_offload( options, diff --git a/tests/integration/scenarios/test_offload_lpa.py b/tests/integration/scenarios/test_offload_lpa.py index 8e5ba750..e9334ea3 100644 --- a/tests/integration/scenarios/test_offload_lpa.py +++ b/tests/integration/scenarios/test_offload_lpa.py @@ -312,6 +312,7 @@ def test_offload_lpa_num(config, schema, data_db): "offload_partition_upper_value": 1000, "reset_backend_table": True, "create_backend_db": True, + "execute": True, } run_offload(options, config, messages) assert offload_lpa_fact_assertion( @@ -331,6 +332,7 @@ def test_offload_lpa_num(config, schema, data_db): "owner_table": schema + "." + LPA_NUM_PART_KEY_TABLE, "equal_to_values": [LPA_PART2_KEY1], "verify_row_count": "aggregate", + "execute": True, } run_offload(options, config, messages) assert offload_lpa_fact_assertion( @@ -383,6 +385,7 @@ def test_offload_lpa_vc2(config, schema, data_db): "equal_to_values": ["%s,%s" % (LPA_PART1_KEY1, LPA_PART1_KEY2)], "reset_backend_table": True, "create_backend_db": True, + "execute": True, } run_offload(options, config, messages) assert offload_lpa_fact_assertion( @@ -402,6 +405,7 @@ def test_offload_lpa_vc2(config, schema, data_db): "owner_table": schema + "." + LPA_VC2_PART_KEY_TABLE, "equal_to_values": [LPA_PART2_KEY1], "verify_row_count": "aggregate", + "execute": True, } run_offload(options, config, messages) assert offload_lpa_fact_assertion( @@ -454,6 +458,7 @@ def test_offload_lpa_char(config, schema, data_db): "equal_to_values": ["%s,%s" % (LPA_PART1_KEY1, LPA_PART1_KEY2)], "reset_backend_table": True, "create_backend_db": True, + "execute": True, } run_offload(options, config, messages) assert offload_lpa_fact_assertion( @@ -473,6 +478,7 @@ def test_offload_lpa_char(config, schema, data_db): "owner_table": schema + "." + LPA_CHR_PART_KEY_TABLE, "equal_to_values": [LPA_PART2_KEY1], "verify_row_count": "aggregate", + "execute": True, } run_offload(options, config, messages) assert offload_lpa_fact_assertion( @@ -525,6 +531,7 @@ def test_offload_lpa_date(config, schema, data_db): "equal_to_values": ["%s,%s" % (LPA_DT_PART1_KEY1, LPA_DT_PART1_KEY2)], "reset_backend_table": True, "create_backend_db": True, + "execute": True, } run_offload(options, config, messages) assert offload_lpa_fact_assertion( @@ -544,6 +551,7 @@ def test_offload_lpa_date(config, schema, data_db): "owner_table": schema + "." 
+ LPA_DT_PART_KEY_TABLE, "equal_to_values": [LPA_DT_PART2_KEY1], "verify_row_count": "aggregate", + "execute": True, } run_offload(options, config, messages) assert offload_lpa_fact_assertion( @@ -596,6 +604,7 @@ def test_offload_lpa_ts(config, schema, data_db): "equal_to_values": ["%s,%s" % (LPA_DT_PART1_KEY1, LPA_DT_PART1_KEY2)], "reset_backend_table": True, "create_backend_db": True, + "execute": True, } run_offload(options, config, messages) assert offload_lpa_fact_assertion( @@ -615,6 +624,7 @@ def test_offload_lpa_ts(config, schema, data_db): "owner_table": schema + "." + LPA_TS_PART_KEY_TABLE, "equal_to_values": [LPA_DT_PART2_KEY1], "verify_row_count": "aggregate", + "execute": True, } run_offload(options, config, messages) assert offload_lpa_fact_assertion( @@ -678,6 +688,7 @@ def test_offload_lpa_unicode(config, schema, data_db): "offload_partition_upper_value": 10000, "reset_backend_table": True, "create_backend_db": True, + "execute": True, } run_offload(options, config, messages) assert offload_lpa_fact_assertion( @@ -697,6 +708,7 @@ def test_offload_lpa_unicode(config, schema, data_db): "owner_table": schema + "." + LPA_UNICODE_FACT_TABLE, "equal_to_values": [(LPA_UNICODE_PART2_KEY1,)], "verify_row_count": "aggregate", + "execute": True, } run_offload(options, config, messages) assert offload_lpa_fact_assertion( @@ -748,6 +760,7 @@ def test_offload_lpa_fact(config, schema, data_db): "offload_partition_upper_value": test_constants.UPPER_YRMON_NUM, "reset_backend_table": True, "create_backend_db": True, + "execute": True, } run_offload(options, config, messages) assert offload_lpa_fact_assertion( @@ -770,6 +783,7 @@ def test_offload_lpa_fact(config, schema, data_db): "offload_partition_lower_value": test_constants.LOWER_YRMON_NUM, "offload_partition_upper_value": test_constants.UPPER_YRMON_NUM, "reset_backend_table": True, + "execute": True, } run_offload(options, config, messages) assert offload_lpa_fact_assertion( @@ -788,6 +802,7 @@ def test_offload_lpa_fact(config, schema, data_db): options = { "owner_table": schema + "." + LPA_FACT_TABLE, "equal_to_values": [test_constants.SALES_BASED_LIST_HV_2], + "execute": True, } run_offload(options, config, messages) assert offload_lpa_fact_assertion( @@ -808,6 +823,7 @@ def test_offload_lpa_fact(config, schema, data_db): "partition_names_csv": test_constants.SALES_BASED_LIST_PNAME_3 + "," + test_constants.SALES_BASED_LIST_PNAME_4, + "execute": True, } run_offload(options, config, messages) assert offload_lpa_fact_assertion( @@ -831,6 +847,7 @@ def test_offload_lpa_fact(config, schema, data_db): options = { "owner_table": schema + "." + LPA_FACT_TABLE, "equal_to_values": ["DEfauLT"], + "execute": True, } run_offload( options, @@ -843,6 +860,7 @@ def test_offload_lpa_fact(config, schema, data_db): options = { "owner_table": schema + "." + LPA_FACT_TABLE, "partition_names_csv": test_constants.SALES_BASED_LIST_PNAME_3, + "execute": True, } run_offload(options, config, messages, expected_status=False) @@ -850,6 +868,7 @@ def test_offload_lpa_fact(config, schema, data_db): options = { "owner_table": schema + "." + LPA_FACT_TABLE, "reset_hybrid_view": True, + "execute": True, } run_offload( options, @@ -863,6 +882,7 @@ def test_offload_lpa_fact(config, schema, data_db): "owner_table": schema + "." + LPA_FACT_TABLE, "reset_hybrid_view": True, "partition_names_csv": "NOT_A_PARTITION", + "execute": True, } run_offload( options, @@ -877,6 +897,7 @@ def test_offload_lpa_fact(config, schema, data_db): # "owner_table": schema + "." 
+ LPA_FACT_TABLE, # "reset_hybrid_view": True, # "partition_names_csv": test_constants.SALES_BASED_LIST_PNAME_3, + # "execute": True, # } # run_offload(options, config, messages) # assert offload_lpa_fact_assertion( @@ -923,6 +944,7 @@ def test_offload_lpa_fact(config, schema, data_db): options = { "owner_table": schema + "." + LPA_FACT_TABLE, "offload_type": OFFLOAD_TYPE_FULL, + "execute": True, } run_offload(options, config, messages) assert offload_lpa_fact_assertion( @@ -942,6 +964,7 @@ def test_offload_lpa_fact(config, schema, data_db): options = { "owner_table": schema + "." + LPA_FACT_TABLE, "offload_type": OFFLOAD_TYPE_INCREMENTAL, + "execute": True, } run_offload( options, @@ -1004,6 +1027,7 @@ def test_offload_lpa_part_fn(config, schema, data_db): "offload_partition_upper_value": 1000, "reset_backend_table": True, "create_backend_db": True, + "execute": True, } run_offload(options, config, messages) assert offload_lpa_fact_assertion( @@ -1025,6 +1049,7 @@ def test_offload_lpa_part_fn(config, schema, data_db): "owner_table": schema + "." + LPA_NUM_PART_FUNC_TABLE, "equal_to_values": [LPA_PART2_KEY1], "verify_row_count": "aggregate", + "execute": True, } run_offload(options, config, messages) assert offload_lpa_fact_assertion( @@ -1073,6 +1098,7 @@ def test_offload_lpa_full(config, schema, data_db): "offload_partition_upper_value": test_constants.UPPER_YRMON_NUM, "reset_backend_table": True, "create_backend_db": True, + "execute": True, } run_offload(options, config, messages) assert offload_lpa_fact_assertion( @@ -1103,6 +1129,7 @@ def test_offload_lpa_full(config, schema, data_db): # Offload new partition to 100/0 list fact. options = { "owner_table": schema + "." + LPA_FULL_TABLE, + "execute": True, } run_offload(options, config, messages) assert offload_lpa_fact_assertion( diff --git a/tests/integration/scenarios/test_offload_misc.py b/tests/integration/scenarios/test_offload_misc.py index 9b45cc20..fa58a6fd 100644 --- a/tests/integration/scenarios/test_offload_misc.py +++ b/tests/integration/scenarios/test_offload_misc.py @@ -113,9 +113,10 @@ def test_offload_misc_verification_parallel(config, schema, data_db): "verify_parallelism": 3, "data_sample_pct": 0, "reset_backend_table": True, + "execute": False, } log_test_marker(messages, f"{id}1") - run_offload(options, config, messages, config_overrides={"execute": False}) + run_offload(options, config, messages) assert hint_text_in_log(messages, config, 3, f"{id}1") # Offload with verification parallelism=1. @@ -124,9 +125,10 @@ def test_offload_misc_verification_parallel(config, schema, data_db): "verify_parallelism": 1, "data_sample_pct": 0, "reset_backend_table": True, + "execute": False, } log_test_marker(messages, f"{id}2") - run_offload(options, config, messages, config_overrides={"execute": False}) + run_offload(options, config, messages) assert hint_text_in_log(messages, config, 1, f"{id}2") # Offload with verification parallelism=0. @@ -135,9 +137,10 @@ def test_offload_misc_verification_parallel(config, schema, data_db): "verify_parallelism": 0, "data_sample_pct": 0, "reset_backend_table": True, + "execute": False, } log_test_marker(messages, f"{id}3") - run_offload(options, config, messages, config_overrides={"execute": False}) + run_offload(options, config, messages) assert hint_text_in_log(messages, config, 0, f"{id}3") # Offload with aggregation verification parallelism=4. 
@@ -148,6 +151,7 @@ def test_offload_misc_verification_parallel(config, schema, data_db): "data_sample_pct": 0, "reset_backend_table": True, "create_backend_db": True, + "execute": True, } log_test_marker(messages, f"{id}4") run_offload(options, config, messages) @@ -191,6 +195,7 @@ def test_offload_misc_maxvalue_partition(config, schema, data_db): "older_than_date": test_constants.SALES_BASED_FACT_HV_2, "reset_backend_table": True, "create_backend_db": True, + "execute": True, } run_offload(options, config, messages) assert sales_based_fact_assertion( @@ -210,6 +215,7 @@ def test_offload_misc_maxvalue_partition(config, schema, data_db): # Offloads all partitions from a MAXVALUE fact table but in 90/10, the MAXVALUE partition should be skipped. options = { "owner_table": schema + "." + MAXVAL_FACT, + "execute": True, } offload_messages = run_offload(options, config, messages) assert sales_based_fact_assertion( @@ -231,6 +237,7 @@ def test_offload_misc_maxvalue_partition(config, schema, data_db): options = { "owner_table": schema + "." + MAXVAL_FACT, "offload_type": OFFLOAD_TYPE_FULL, + "execute": True, } offload_messages = run_offload(options, config, messages) assert sales_based_fact_assertion( diff --git a/tests/integration/scenarios/test_offload_part_fn.py b/tests/integration/scenarios/test_offload_part_fn.py index 57d72228..13a76ae1 100644 --- a/tests/integration/scenarios/test_offload_part_fn.py +++ b/tests/integration/scenarios/test_offload_part_fn.py @@ -207,12 +207,12 @@ def test_offload_part_fn_exceptions(config, schema, data_db): "owner_table": schema + "." + DIM_NUM, "offload_partition_functions": "anything", "reset_backend_table": True, + "execute": False, } run_offload( options, config, messages, - config_overrides={"execute": False}, expected_exception_string=PARTITION_FUNCTIONS_NOT_SUPPORTED_EXCEPTION_TEXT, ) @@ -242,12 +242,12 @@ def test_offload_part_fn_exceptions(config, schema, data_db): "offload_partition_lower_value": 0, "offload_partition_upper_value": 5000, "reset_backend_table": True, + "execute": False, } run_offload( options, config, messages, - config_overrides={"execute": False}, expected_exception_string=PARTITION_FUNCTION_ARG_COUNT_EXCEPTION_TEXT, ) @@ -257,7 +257,6 @@ def test_offload_part_fn_exceptions(config, schema, data_db): options, config, messages, - config_overrides={"execute": False}, expected_exception_string=PARTITION_FUNCTION_ARG_COUNT_EXCEPTION_TEXT, ) @@ -267,7 +266,6 @@ def test_offload_part_fn_exceptions(config, schema, data_db): options, config, messages, - config_overrides={"execute": False}, expected_exception_string=PARTITION_FUNCTION_DOES_NOT_EXIST_EXCEPTION_TEXT, ) @@ -281,7 +279,6 @@ def test_offload_part_fn_exceptions(config, schema, data_db): options, config, messages, - config_overrides={"execute": False}, expected_exception_string=PARTITION_FUNCTION_ARG_TYPE_EXCEPTION_TEXT, ) @@ -293,12 +290,12 @@ def test_offload_part_fn_exceptions(config, schema, data_db): "offload_partition_granularity": "M", "reset_backend_table": True, "create_backend_db": True, + "execute": False, } run_offload( options, config, messages, - config_overrides={"execute": False}, expected_exception_string=PARTITION_FUNCTION_ARG_TYPE_EXCEPTION_TEXT, ) @@ -309,12 +306,12 @@ def test_offload_part_fn_exceptions(config, schema, data_db): "offload_partition_columns": "TXN_DATE", "offload_partition_granularity": "M", "reset_backend_table": True, + "execute": False, } run_offload( options, config, messages, - config_overrides={"execute": False}, 
expected_exception_string=PARTITION_FUNCTIONS_ELEMENT_EXCEPTION_TEXT, ) @@ -367,6 +364,7 @@ def test_offload_part_fn_num(config, schema, data_db): "offload_partition_upper_value": 5000, "reset_backend_table": True, "create_backend_db": True, + "execute": True, } run_offload(options, config, messages) assert standard_dimension_assertion( @@ -395,6 +393,7 @@ def test_offload_part_fn_num(config, schema, data_db): "offload_partition_lower_value": 0, "offload_partition_upper_value": 5000, "reset_backend_table": True, + "execute": True, } run_offload(options, config, messages) assert standard_dimension_assertion( @@ -460,6 +459,7 @@ def test_offload_part_fn_dec(config, schema, data_db): "offload_partition_upper_value": 5000, "reset_backend_table": True, "create_backend_db": True, + "execute": True, } run_offload(options, config, messages) assert standard_dimension_assertion( @@ -487,6 +487,7 @@ def test_offload_part_fn_dec(config, schema, data_db): "offload_partition_lower_value": 0, "offload_partition_upper_value": 5000, "reset_backend_table": True, + "execute": True, } run_offload(options, config, messages) assert standard_dimension_assertion( @@ -555,6 +556,7 @@ def test_offload_part_fn_str(config, schema, data_db): "offload_partition_upper_value": 5000, "reset_backend_table": True, "create_backend_db": True, + "execute": True, } run_offload(options, config, messages) assert standard_dimension_assertion( diff --git a/tests/integration/scenarios/test_offload_pbo.py b/tests/integration/scenarios/test_offload_pbo.py index f496e0d2..522cf046 100644 --- a/tests/integration/scenarios/test_offload_pbo.py +++ b/tests/integration/scenarios/test_offload_pbo.py @@ -308,12 +308,12 @@ def test_offload_pbo_exceptions(config, schema, data_db): ), "offload_predicate_modify_hybrid_view": False, "reset_backend_table": True, + "execute": False, } run_offload( options, config, messages, - config_overrides={"execute": False}, expected_exception_string=PREDICATE_TYPE_NO_MODIFY_HV_EXCEPTION_TEXT, ) @@ -323,12 +323,12 @@ def test_offload_pbo_exceptions(config, schema, data_db): "offload_predicate": GenericPredicate('column(txn_desc) = string("ABC")'), "older_than_date": "2012-01-01", "reset_backend_table": True, + "execute": False, } run_offload( options, config, messages, - config_overrides={"execute": False}, expected_exception_string=CONFLICTING_DATA_ID_OPTIONS_EXCEPTION_TEXT, ) @@ -338,12 +338,12 @@ def test_offload_pbo_exceptions(config, schema, data_db): "offload_predicate": GenericPredicate('column(txn_desc) = string("ABC")'), "offload_type": OFFLOAD_TYPE_FULL, "reset_backend_table": True, + "execute": False, } run_offload( options, config, messages, - config_overrides={"execute": False}, expected_exception_string=PREDICATE_TYPE_OFFLOAD_TYPE_FULL_EXCEPTION_TEXT, ) @@ -354,12 +354,12 @@ def test_offload_pbo_exceptions(config, schema, data_db): "owner_table": schema + "." 
+ EXC_TABLE, "offload_predicate": GenericPredicate('column(not_a_column) = string("NOPE")'), "reset_backend_table": True, + "execute": False, } run_offload( options, config, messages, - config_overrides={"execute": False}, expected_exception_string="Unable to resolve column", ) @@ -370,12 +370,12 @@ def test_offload_pbo_exceptions(config, schema, data_db): 'column(txn_desc) = string("No such data")' ), "reset_backend_table": True, + "execute": False, } run_offload( options, config, messages, - config_overrides={"execute": False}, expected_status=False, ) @@ -412,12 +412,12 @@ def test_offload_pbo_dim(config, schema, data_db): "owner_table": schema + "." + DIM_TABLE, "offload_predicate": GenericPredicate('column(txn_desc) = string("ABC")'), "reset_backend_table": True, + "execute": False, } run_offload( options, config, messages, - config_overrides={"execute": False}, ) assert not backend_table_exists(config, backend_api, messages, data_db, DIM_TABLE) @@ -427,6 +427,7 @@ def test_offload_pbo_dim(config, schema, data_db): "offload_predicate": GenericPredicate('column(txn_desc) = string("ABC")'), "reset_backend_table": True, "create_backend_db": True, + "execute": True, } messages.log(f"{id}:1", detail=VVERBOSE) run_offload( @@ -464,6 +465,7 @@ def test_offload_pbo_dim(config, schema, data_db): "owner_table": schema + "." + DIM_TABLE, "offload_predicate": GenericPredicate('column(txn_desc) = string("DEF")'), "verify_row_count": "aggregate", + "execute": True, } run_offload( options, @@ -487,6 +489,7 @@ def test_offload_pbo_dim(config, schema, data_db): options = { "owner_table": schema + "." + DIM_TABLE, "offload_predicate": GenericPredicate('column(txn_desc) = string("DEF")'), + "execute": True, } run_offload( options, @@ -501,6 +504,7 @@ def test_offload_pbo_dim(config, schema, data_db): # "owner_table": schema + "." + DIM_TABLE, # "offload_predicate": GenericPredicate('column(txn_desc) = string("DEF")'), # "force": True, + # "execute": True, # } # run_offload( # options, @@ -529,6 +533,7 @@ def test_offload_pbo_dim(config, schema, data_db): "owner_table": schema + "." + DIM_TABLE, "offload_predicate": GenericPredicate('column(txn_desc) = string("GHI")'), "reset_hybrid_view": True, + "execute": True, } messages.log(f"{id}:2", detail=VVERBOSE) run_offload( @@ -601,6 +606,7 @@ def test_offload_pbo_unicode(config, schema, data_db): ), "reset_backend_table": True, "create_backend_db": True, + "execute": True, } messages.log(f"{id}:1", detail=VVERBOSE) run_offload( @@ -634,6 +640,7 @@ def test_offload_pbo_unicode(config, schema, data_db): "offload_predicate": GenericPredicate( '((column(data) = string("%s")))' % UCODE_VALUE2 ), + "execute": True, } messages.log(f"{id}:2", detail=VVERBOSE) run_offload( @@ -695,6 +702,7 @@ def test_offload_pbo_char_pad(config, schema, data_db): "offload_predicate": GenericPredicate('(column(data) = string("a "))'), "reset_backend_table": True, "create_backend_db": True, + "execute": True, } messages.log(f"{id}:1", detail=VVERBOSE) run_offload( @@ -726,6 +734,7 @@ def test_offload_pbo_char_pad(config, schema, data_db): options = { "owner_table": schema + "." 
+ CHAR_TABLE, "offload_predicate": GenericPredicate('(column(data) = string("a "))'), + "execute": True, } run_offload( options, @@ -769,6 +778,7 @@ def test_offload_pbo_ts(config, schema, data_db): "allow_nanosecond_timestamp_columns": True, "reset_backend_table": True, "create_backend_db": True, + "execute": True, } messages.log(f"{id}:1", detail=VVERBOSE) run_offload( @@ -834,13 +844,13 @@ def test_offload_pbo_range(config, schema, data_db): ), "ipa_predicate_type": INCREMENTAL_PREDICATE_TYPE_RANGE, "reset_backend_table": True, + "execute": False, } messages.log(f"{id}:1", detail=VVERBOSE) run_offload( options, config, messages, - config_overrides={"execute": False}, expected_exception_string=IPA_PREDICATE_TYPE_FIRST_OFFLOAD_EXCEPTION_TEXT, ) @@ -856,6 +866,7 @@ def test_offload_pbo_range(config, schema, data_db): ), "reset_backend_table": True, "create_backend_db": True, + "execute": True, } messages.log(f"{id}:1", detail=VVERBOSE) run_offload(options, config, messages) @@ -893,6 +904,7 @@ def test_offload_pbo_range(config, schema, data_db): "offload_predicate": GenericPredicate( "column(time_id) = datetime(%s)" % test_constants.SALES_BASED_FACT_HV_1 ), + "execute": True, } run_offload(options, config, messages) assert pbo_assertion( @@ -917,6 +929,7 @@ def test_offload_pbo_range(config, schema, data_db): options = { "owner_table": schema + "." + RANGE_TABLE, "older_than_date": test_constants.SALES_BASED_FACT_HV_1, + "execute": True, } run_offload( options, @@ -934,6 +947,7 @@ def test_offload_pbo_range(config, schema, data_db): % test_constants.SALES_BASED_FACT_HV_4 ), "offload_type": OFFLOAD_TYPE_FULL, + "execute": True, } run_offload( options, @@ -986,6 +1000,7 @@ def test_offload_pbo_list(config, schema, data_db): "equal_to_values": [test_constants.SALES_BASED_FACT_HV_1], "reset_backend_table": True, "create_backend_db": True, + "execute": True, } run_offload(options, config, messages) @@ -996,6 +1011,7 @@ def test_offload_pbo_list(config, schema, data_db): "((column(yrmon) = datetime(%s)) and (column(channel_id) = numeric(3)))" % (test_constants.SALES_BASED_FACT_HV_1) ), + "execute": True, } run_offload( options, @@ -1012,6 +1028,7 @@ def test_offload_pbo_list(config, schema, data_db): % (test_constants.SALES_BASED_FACT_HV_1) ), "reset_backend_table": True, + "execute": True, } messages.log(f"{id}:1", detail=VVERBOSE) run_offload(options, config, messages) @@ -1044,6 +1061,7 @@ def test_offload_pbo_list(config, schema, data_db): options = { "owner_table": "%s.%s" % (schema, LIST_TABLE), "equal_to_values": [test_constants.SALES_BASED_FACT_HV_1], + "execute": True, } run_offload( options, @@ -1059,6 +1077,7 @@ def test_offload_pbo_list(config, schema, data_db): "((column(yrmon) = datetime(%s)) and (column(channel_id) = numeric(4)))" % (test_constants.SALES_BASED_FACT_HV_1) ), + "execute": True, } messages.log(f"{id}:2", detail=VVERBOSE) run_offload(options, config, messages) diff --git a/tests/integration/scenarios/test_offload_pbo_intra.py b/tests/integration/scenarios/test_offload_pbo_intra.py index 3de49cd6..e1dc5825 100644 --- a/tests/integration/scenarios/test_offload_pbo_intra.py +++ b/tests/integration/scenarios/test_offload_pbo_intra.py @@ -145,6 +145,7 @@ def gen_pred(hv_template, hv_1, hv_2, channel_id): "ipa_predicate_type": ipa_and_predicate_type, "reset_backend_table": True, "create_backend_db": True, + "execute": True, } run_offload( options, @@ -158,6 +159,7 @@ def gen_pred(hv_template, hv_1, hv_2, channel_id): "owner_table": schema + "." 
+ table_name, "older_than_date": hv_1, "reset_backend_table": True, + "execute": True, } run_offload(options, config, messages) assert sales_based_fact_assertion( @@ -181,6 +183,7 @@ def gen_pred(hv_template, hv_1, hv_2, channel_id): "owner_table": schema + "." + table_name, "offload_predicate": GenericPredicate("(column(channel_id) = numeric(2))"), "ipa_predicate_type": ipa_and_predicate_type, + "execute": True, } run_offload( options, @@ -193,6 +196,7 @@ def gen_pred(hv_template, hv_1, hv_2, channel_id): options = { "owner_table": schema + "." + table_name, "offload_predicate": GenericPredicate(gen_pred(hv_pred, hv_1, hv_2, "2")), + "execute": True, } run_offload( options, @@ -207,12 +211,12 @@ def gen_pred(hv_template, hv_1, hv_2, channel_id): "owner_table": schema + "." + table_name, "offload_predicate": GenericPredicate(gen_pred(hv_pred, hv_1, hv_2, "2")), "ipa_predicate_type": ipa_and_predicate_type, + "execute": False, } run_offload( options, config, messages, - config_overrides={"execute": False}, ) # Offload 1st predicate on top of HV_1 which will switch table to ..._AND_PREDICATE. @@ -220,6 +224,7 @@ def gen_pred(hv_template, hv_1, hv_2, channel_id): "owner_table": schema + "." + table_name, "offload_predicate": GenericPredicate(gen_pred(hv_pred, hv_1, hv_2, "2")), "ipa_predicate_type": ipa_and_predicate_type, + "execute": True, } messages.log(f"{test_id}:1", detail=VVERBOSE) run_offload(options, config, messages) @@ -262,6 +267,7 @@ def gen_pred(hv_template, hv_1, hv_2, channel_id): "offload_predicate": GenericPredicate(gen_pred(hv_pred, hv_1, hv_2, "2")), "ipa_predicate_type": ipa_and_predicate_type, "offload_type": OFFLOAD_TYPE_FULL, + "execute": True, } run_offload( options, @@ -276,6 +282,7 @@ def gen_pred(hv_template, hv_1, hv_2, channel_id): "offload_predicate": GenericPredicate( gen_pred(hv_pred, hv_1, hv_2, ["3", "4"]) ), + "execute": True, } messages.log(f"{test_id}:2", detail=VVERBOSE) run_offload(options, config, messages) @@ -316,6 +323,7 @@ def gen_pred(hv_template, hv_1, hv_2, channel_id): "owner_table": schema + "." + table_name, "offload_predicate_modify_hybrid_view": False, "offload_predicate": GenericPredicate(gen_pred(hv_pred, hv_1, hv_2, "5")), + "execute": True, } messages.log(f"{test_id}:3", detail=VVERBOSE) run_offload(options, config, messages) @@ -357,6 +365,7 @@ def gen_pred(hv_template, hv_1, hv_2, channel_id): "reset_hybrid_view": True, "offload_predicate": GenericPredicate(gen_pred(hv_pred, hv_1, hv_2, "1")), "ipa_predicate_type": ipa_and_predicate_type, + "execute": True, } messages.log(f"{test_id}:4", detail=VVERBOSE) run_offload(options, config, messages) @@ -397,6 +406,7 @@ def gen_pred(hv_template, hv_1, hv_2, channel_id): options = { "owner_table": schema + "." + table_name, "older_than_date": hv_2, + "execute": True, } # Disabled until issue-99 is fixed. 
# messages.log(f"{test_id}:5", detail=VVERBOSE) @@ -517,9 +527,6 @@ def test_offload_pbo_intra_list(config, schema, data_db): frontend_api = get_frontend_testing_api( config, messages, trace_action=f"ftest_api({id})" ) - repo_client = orchestration_repo_client_factory( - config, messages, trace_action=f"repo_client({id})" - ) # Setup run_setup( @@ -546,6 +553,7 @@ def test_offload_pbo_intra_list(config, schema, data_db): "equal_to_values": [test_constants.SALES_BASED_FACT_HV_1], "reset_backend_table": True, "create_backend_db": True, + "execute": True, } run_offload(options, config, messages) @@ -556,6 +564,7 @@ def test_offload_pbo_intra_list(config, schema, data_db): "(column(time_id) = datetime(%s)) and (column(channel_id) = numeric(3))" % test_constants.SALES_BASED_FACT_HV_3 ), + "execute": True, } run_offload( options, diff --git a/tests/integration/scenarios/test_offload_pbo_late.py b/tests/integration/scenarios/test_offload_pbo_late.py index c3a37ff8..1cb0f12e 100644 --- a/tests/integration/scenarios/test_offload_pbo_late.py +++ b/tests/integration/scenarios/test_offload_pbo_late.py @@ -176,6 +176,7 @@ def offload_pbo_late_100_x_tests( "ipa_predicate_type": ipa_predicate_type, "reset_backend_table": True, "create_backend_db": True, + "execute": True, } run_offload(options, config, messages) assert sales_based_fact_assertion( @@ -219,6 +220,7 @@ def offload_pbo_late_100_x_tests( ), "ipa_predicate_type": ipa_predicate_type, "offload_type": OFFLOAD_TYPE_INCREMENTAL, + "execute": True, } run_offload( options, @@ -235,6 +237,7 @@ def offload_pbo_late_100_x_tests( "column(time_id) = datetime(%s)" % OLD_HV_1 ), "ipa_predicate_type": ipa_predicate_type, + "execute": True, } messages.log(f"{test_id}:1", detail=VVERBOSE) run_offload(options, config, messages) @@ -349,6 +352,7 @@ def offload_pbo_late_arriving_std_range_tests( "offload_partition_granularity": offload_partition_granularity, "reset_backend_table": True, "create_backend_db": True, + "execute": True, } run_offload(options, config, messages) assert sales_based_fact_assertion( @@ -375,6 +379,7 @@ def offload_pbo_late_arriving_std_range_tests( "owner_table": schema + "." + table_name, "offload_predicate": GenericPredicate(hv_pred), "ipa_predicate_type": ipa_predicate_type, + "execute": True, } run_offload(options, config, messages, expected_status=False) @@ -399,6 +404,7 @@ def offload_pbo_late_arriving_std_range_tests( "offload_predicate": GenericPredicate(hv_pred), "reset_hybrid_view": True, "ipa_predicate_type": ipa_predicate_type, + "execute": True, } run_offload( options, @@ -413,6 +419,7 @@ def offload_pbo_late_arriving_std_range_tests( "owner_table": schema + "." + table_name, "offload_predicate": GenericPredicate(hv_pred), "ipa_predicate_type": INCREMENTAL_PREDICATE_TYPE_LIST_AS_RANGE, + "execute": True, } run_offload( options, @@ -427,6 +434,7 @@ def offload_pbo_late_arriving_std_range_tests( "owner_table": schema + "." + table_name, "offload_predicate": GenericPredicate(hv_pred), "ipa_predicate_type": INCREMENTAL_PREDICATE_TYPE_LIST, + "execute": True, } run_offload( options, @@ -440,6 +448,7 @@ def offload_pbo_late_arriving_std_range_tests( "owner_table": schema + "." + table_name, "offload_predicate": GenericPredicate(hv_pred), "ipa_predicate_type": INCREMENTAL_PREDICATE_TYPE_PREDICATE, + "execute": True, } run_offload( options, @@ -454,6 +463,7 @@ def offload_pbo_late_arriving_std_range_tests( "owner_table": schema + "." 
+ table_name, "offload_predicate": GenericPredicate(hv_pred), "ipa_predicate_type": ipa_predicate_type, + "execute": True, } messages.log(f"{test_id}:1", detail=VVERBOSE) run_offload(options, config, messages) @@ -528,7 +538,8 @@ def test_offload_pbo_late_range_90_10(config, schema, data_db): ) # TODO do we need to create a test for below 100_10 tests? - # offload_pbo_late_100_x_tests(config, backend_api, frontend_api, messages, repo_client, schema, data_db, RANGE_TABLE_LATE, OFFLOAD_PATTERN_100_10, id) + # offload_pbo_late_100_x_tests(config, backend_api, frontend_api, messages, repo_client, schema, + # data_db, RANGE_TABLE_LATE, OFFLOAD_PATTERN_100_10, id) # Connections are being left open, explicitly close them. frontend_api.close() diff --git a/tests/integration/scenarios/test_offload_rpa.py b/tests/integration/scenarios/test_offload_rpa.py index 68e1408a..f8ebc251 100644 --- a/tests/integration/scenarios/test_offload_rpa.py +++ b/tests/integration/scenarios/test_offload_rpa.py @@ -209,8 +209,9 @@ def offload_range_ipa_standard_tests( "offload_partition_upper_value": upper_value, "synthetic_partition_digits": synthetic_partition_digits, "reset_backend_table": True, + "execute": False, } - run_offload(options, config, messages, config_overrides={"execute": False}) + run_offload(options, config, messages) # RANGE Offload 1st Partition. options = { @@ -222,6 +223,7 @@ def offload_range_ipa_standard_tests( "offload_partition_upper_value": upper_value, "reset_backend_table": True, "create_backend_db": True, + "execute": True, } run_offload(options, config, messages) @@ -249,6 +251,7 @@ def offload_range_ipa_standard_tests( "offload_partition_functions": different_udf, "offload_partition_granularity": different_granularity, "synthetic_partition_digits": different_partition_digits, + "execute": True, } run_offload(options, config, messages) @@ -271,8 +274,9 @@ def offload_range_ipa_standard_tests( options = { "owner_table": schema + "." + table_name, less_than_option: hv_3, + "execute": False, } - run_offload(options, config, messages, config_overrides={"execute": False}) + run_offload(options, config, messages) # Assert HV is still from prior offload. assert sales_based_fact_assertion( @@ -296,6 +300,7 @@ def offload_range_ipa_standard_tests( options = { "owner_table": schema + "." + table_name, "partition_names_csv": "P4,P5", + "execute": True, } run_offload( options, @@ -310,6 +315,7 @@ def offload_range_ipa_standard_tests( "owner_table": schema + "." + table_name, offload3_opt_name: offload3_opt_value, "verify_row_count": "aggregate", + "execute": True, } run_offload(options, config, messages) @@ -331,6 +337,7 @@ def offload_range_ipa_standard_tests( options = { "owner_table": schema + "." + table_name, less_than_option: hv_3, + "execute": True, } # On Teradata we can't test by partition name in previous test so this test will not be a no-op. run_offload( @@ -375,6 +382,7 @@ def offload_range_ipa_standard_tests( options = { "owner_table": schema + "." + table_name, less_than_option: hv_4, + "execute": True, } run_offload(options, config, messages) @@ -416,6 +424,7 @@ def offload_range_ipa_standard_tests( options = { "owner_table": schema + "." 
+ table_name, less_than_option: hv_4, + "execute": True, } run_offload(options, config, messages, expected_status=False) @@ -715,6 +724,7 @@ def test_offload_rpa_alpha(config, schema, data_db): "offload_partition_granularity": "1", "reset_backend_table": True, "create_backend_db": True, + "execute": True, } run_offload(options, config, messages) @@ -736,6 +746,7 @@ def test_offload_rpa_alpha(config, schema, data_db): options = { "owner_table": schema + "." + RPA_ALPHA_FACT_TABLE, "less_than_value": "u", + "execute": True, } run_offload(options, config, messages) @@ -800,6 +811,7 @@ def test_offload_rpa_empty_partitions(config, schema, data_db): "older_than_date": test_constants.SALES_BASED_FACT_HV_2, "reset_backend_table": True, "create_backend_db": True, + "execute": True, } run_offload(options, config, messages) assert sales_based_fact_assertion( @@ -819,6 +831,7 @@ def test_offload_rpa_empty_partitions(config, schema, data_db): "owner_table": schema + "." + NOSEG_FACT, "older_than_date": test_constants.SALES_BASED_FACT_HV_5, "max_offload_chunk_count": 1, + "execute": True, } run_offload(options, config, messages) assert sales_based_fact_assertion( diff --git a/tests/integration/scenarios/test_offload_sorting.py b/tests/integration/scenarios/test_offload_sorting.py index 39a4cf66..6949b5e1 100644 --- a/tests/integration/scenarios/test_offload_sorting.py +++ b/tests/integration/scenarios/test_offload_sorting.py @@ -180,6 +180,7 @@ def test_offload_sorting_dim(config, schema, data_db): "sort_columns_csv": offload_constants.SORT_COLUMNS_NO_CHANGE, "reset_backend_table": True, "create_backend_db": True, + "execute": True, } run_offload(options, config, messages) assert sort_story_assertion( @@ -203,6 +204,7 @@ def test_offload_sorting_dim(config, schema, data_db): "owner_table": schema + "." + OFFLOAD_DIM, "sort_columns_csv": "txn_day,Txn_Rate", "reset_backend_table": True, + "execute": True, } run_offload(options, config, messages) assert sort_story_assertion( @@ -223,6 +225,7 @@ def test_offload_sorting_dim(config, schema, data_db): "owner_table": schema + "." + OFFLOAD_DIM, "sort_columns_csv": "txn_day,Txn_Rate,prod_id,txn_desc,TXN_CODE", "reset_backend_table": True, + "execute": True, } run_offload( options, @@ -236,6 +239,7 @@ def test_offload_sorting_dim(config, schema, data_db): "owner_table": schema + "." + OFFLOAD_DIM, "sort_columns_csv": "not_a_column,txn_day", "reset_backend_table": True, + "execute": True, } run_offload( options, @@ -249,6 +253,7 @@ def test_offload_sorting_dim(config, schema, data_db): "owner_table": schema + "." + OFFLOAD_DIM, "sort_columns_csv": "*rate", "reset_backend_table": True, + "execute": True, } run_offload(options, config, messages) assert sort_story_assertion( @@ -308,6 +313,7 @@ def test_offload_sorting_fact(config, schema, data_db): ), "reset_backend_table": True, "create_backend_db": True, + "execute": True, } run_offload(options, config, messages) assert sort_story_assertion( @@ -326,6 +332,7 @@ def test_offload_sorting_fact(config, schema, data_db): "owner_table": schema + "." + OFFLOAD_FACT, "sort_columns_csv": "channel_id,promo_id,prod_id", "older_than_date": test_constants.SALES_BASED_FACT_HV_2, + "execute": True, } if backend_api.sorted_table_modify_supported(): # Fail to modify existing sorting. @@ -361,6 +368,7 @@ def test_offload_sorting_fact(config, schema, data_db): options = { "owner_table": schema + "." 
+ OFFLOAD_FACT, "older_than_date": test_constants.SALES_BASED_FACT_HV_3, + "execute": True, } run_offload(options, config, messages) assert sort_story_assertion( @@ -379,6 +387,7 @@ def test_offload_sorting_fact(config, schema, data_db): "owner_table": schema + "." + OFFLOAD_FACT, "sort_columns_csv": "", "older_than_date": test_constants.SALES_BASED_FACT_HV_4, + "execute": True, } run_offload(options, config, messages) assert sort_story_assertion( @@ -399,6 +408,7 @@ def test_offload_sorting_fact(config, schema, data_db): "sort_columns_csv": "channel_id,promo_id", "older_than_date": test_constants.SALES_BASED_FACT_HV_4, "force": True, + "execute": True, } # run_offload(options, config, messages) # assert sort_story_assertion( diff --git a/tests/integration/scenarios/test_offload_subpart.py b/tests/integration/scenarios/test_offload_subpart.py index 3dc3dc16..dd4ecad6 100644 --- a/tests/integration/scenarios/test_offload_subpart.py +++ b/tests/integration/scenarios/test_offload_subpart.py @@ -107,6 +107,7 @@ def test_offload_subpart_lr_range(config, schema, data_db): "offload_by_subpartition": True, "reset_backend_table": True, "create_backend_db": True, + "execute": True, } run_offload(options, config, messages) @@ -127,6 +128,7 @@ def test_offload_subpart_lr_range(config, schema, data_db): options = { "owner_table": schema + "." + FACT_LIST_RANGE_R, "older_than_date": test_constants.SALES_BASED_FACT_HV_2, + "execute": True, } run_offload(options, config, messages) @@ -147,6 +149,7 @@ def test_offload_subpart_lr_range(config, schema, data_db): options = { "owner_table": schema + "." + FACT_LIST_RANGE_R, "older_than_date": test_constants.SALES_BASED_FACT_HV_3, + "execute": True, } run_offload(options, config, messages, expected_exception_string="common boundary") @@ -154,6 +157,7 @@ def test_offload_subpart_lr_range(config, schema, data_db): options = { "owner_table": schema + "." + FACT_LIST_RANGE_R, "older_than_date": test_constants.SALES_BASED_FACT_HV_6, + "execute": True, } run_offload( options, config, messages, expected_exception_string="--offload-type=FULL" @@ -163,6 +167,7 @@ def test_offload_subpart_lr_range(config, schema, data_db): options = { "owner_table": schema + "." + FACT_LIST_RANGE_R, "offload_type": OFFLOAD_TYPE_FULL, + "execute": True, } run_offload(options, config, messages) @@ -184,6 +189,7 @@ def test_offload_subpart_lr_range(config, schema, data_db): options = { "owner_table": schema + "." 
+ FACT_LIST_RANGE_R, "offload_type": OFFLOAD_TYPE_INCREMENTAL, + "execute": True, } run_offload( options, @@ -229,6 +235,7 @@ def test_offload_subpart_lr_list(config, schema, data_db): "offload_partition_upper_value": 10, "reset_backend_table": True, "create_backend_db": True, + "execute": True, } run_offload(options, config, messages) @@ -250,6 +257,7 @@ def test_offload_subpart_lr_list(config, schema, data_db): "equal_to_values": ["3"], "offload_partition_lower_value": 0, "offload_partition_upper_value": 10, + "execute": True, } run_offload(options, config, messages) @@ -303,6 +311,7 @@ def test_offload_subpart_range_range(config, schema, data_db): "offload_partition_upper_value": 10, "reset_backend_table": True, "create_backend_db": True, + "execute": True, } run_offload(options, config, messages) assert sales_based_fact_assertion( @@ -326,6 +335,7 @@ def test_offload_subpart_range_range(config, schema, data_db): "older_than_date": test_constants.SALES_BASED_FACT_HV_1, "offload_by_subpartition": True, "reset_backend_table": True, + "execute": True, } run_offload(options, config, messages) assert sales_based_fact_assertion( @@ -377,6 +387,7 @@ def test_offload_subpart_hash_range(config, schema, data_db): "older_than_date": test_constants.SALES_BASED_FACT_HV_1, "reset_backend_table": True, "create_backend_db": True, + "execute": True, } run_offload(options, config, messages) assert sales_based_fact_assertion( diff --git a/tests/integration/scenarios/test_offload_transport.py b/tests/integration/scenarios/test_offload_transport.py index 66a1f576..dae1fa34 100644 --- a/tests/integration/scenarios/test_offload_transport.py +++ b/tests/integration/scenarios/test_offload_transport.py @@ -148,6 +148,7 @@ def simple_offload_test( "offload_transport_method": transport_method, "reset_backend_table": True, "create_backend_db": True, + "execute": True, } if transport_method == OFFLOAD_TRANSPORT_METHOD_SPARK_LIVY: @@ -178,7 +179,7 @@ def load_table_compression_tests( repo_client = orchestration_repo_client_factory( config, messages, trace_action=f"repo_client({test_id})" ) - dfs = get_dfs_from_options(config, messages=messages) + dfs = get_dfs_from_options(config, messages=messages, dry_run=False) backend_name = convert_backend_identifier_case(config, table_name) # Setup @@ -206,6 +207,7 @@ def load_table_compression_tests( "compress_load_table": False, "reset_backend_table": True, "create_backend_db": True, + "execute": True, } run_offload(options, config, messages) @@ -224,6 +226,7 @@ def load_table_compression_tests( "preserve_load_table": True, "compress_load_table": True, "reset_backend_table": True, + "execute": True, } run_offload(options, config, messages) @@ -273,6 +276,7 @@ def offload_transport_polling_validation_tests( "offload_transport_validation_polling_interval": 1, "reset_backend_table": True, "create_backend_db": True, + "execute": True, } log_test_marker(messages, f"{test_id}:1") run_offload(options, config, messages) @@ -286,6 +290,7 @@ def offload_transport_polling_validation_tests( "offload_transport_method": transport_method, "offload_transport_validation_polling_interval": OFFLOAD_TRANSPORT_VALIDATION_POLLER_DISABLED, "reset_backend_table": True, + "execute": True, } log_test_marker(messages, f"{test_id}:2") run_offload(options, config, messages) diff --git a/tests/integration/scenarios/test_orchestration_step_control.py b/tests/integration/scenarios/test_orchestration_step_control.py index 56a42087..a04e146c 100644 --- 
a/tests/integration/scenarios/test_orchestration_step_control.py +++ b/tests/integration/scenarios/test_orchestration_step_control.py @@ -89,10 +89,9 @@ def test_offload_step_dim(config, schema, data_db): "owner_table": schema + "." + STEP_DIM, "skip": [step_title_to_step_id(step_title(command_steps.STEP_VALIDATE_DATA))], "reset_backend_table": True, + "execute": False, } - offload_messages = run_offload( - options, config, messages, config_overrides={"execute": False} - ) + offload_messages = run_offload(options, config, messages) assert ( messages_step_executions( offload_messages, step_title(command_steps.STEP_VALIDATE_DATA) @@ -105,10 +104,9 @@ def test_offload_step_dim(config, schema, data_db): "owner_table": schema + "." + STEP_DIM, "skip": [step_title_to_step_id(step_title(command_steps.STEP_VALIDATE_CASTS))], "reset_backend_table": True, + "execute": False, } - offload_messages = run_offload( - options, config, messages, config_overrides={"execute": False} - ) + offload_messages = run_offload(options, config, messages) assert ( messages_step_executions( offload_messages, step_title(command_steps.STEP_VALIDATE_CASTS) @@ -123,10 +121,9 @@ def test_offload_step_dim(config, schema, data_db): step_title_to_step_id(step_title(command_steps.STEP_VERIFY_EXPORTED_DATA)) ], "reset_backend_table": True, + "execute": False, } - offload_messages = run_offload( - options, config, messages, config_overrides={"execute": False} - ) + offload_messages = run_offload(options, config, messages) assert ( messages_step_executions( offload_messages, step_title(command_steps.STEP_VERIFY_EXPORTED_DATA) @@ -139,12 +136,12 @@ def test_offload_step_dim(config, schema, data_db): "owner_table": schema + "." + STEP_DIM, "error_before_step": step_title(command_steps.STEP_CREATE_TABLE), "reset_backend_table": True, + "execute": False, } run_offload( options, config, messages, - config_overrides={"execute": False}, expected_exception_string=FORCED_EXCEPTION_TEXT, ) @@ -153,6 +150,7 @@ def test_offload_step_dim(config, schema, data_db): "owner_table": schema + "." 
+ STEP_DIM, "error_before_step": step_title(command_steps.STEP_FINAL_LOAD), "reset_backend_table": True, + "execute": True, } run_offload( options, config, messages, expected_exception_string=FORCED_EXCEPTION_TEXT diff --git a/tests/integration/test_functions.py b/tests/integration/test_functions.py index 2c8da990..a2e79759 100644 --- a/tests/integration/test_functions.py +++ b/tests/integration/test_functions.py @@ -32,7 +32,7 @@ def build_current_options(): - return OrchestrationConfig.from_dict({"verbose": False, "execute": True}) + return OrchestrationConfig.from_dict({"verbose": False}) @lru_cache(maxsize=None) diff --git a/tests/testlib/test_framework/backend_testing_api.py b/tests/testlib/test_framework/backend_testing_api.py index a43fbd66..304fe97c 100644 --- a/tests/testlib/test_framework/backend_testing_api.py +++ b/tests/testlib/test_framework/backend_testing_api.py @@ -97,10 +97,10 @@ class BackendTestingApiException(Exception): ########################################################################### -def subproc_cmd(cmd, opts, messages, cwd=None, env=None): +def subproc_cmd(cmd, opts, messages, cwd=None, env=None, execute=True): messages.log("Shell cmd: " + " ".join(cmd), detail=VVERBOSE) - if opts.execute: + if execute: proc = subprocess.Popen(cmd, stdout=PIPE, stderr=STDOUT, cwd=cwd, env=env) output = "" for line in proc.stdout: diff --git a/tests/testlib/test_framework/hadoop/hadoop_backend_testing_api.py b/tests/testlib/test_framework/hadoop/hadoop_backend_testing_api.py index 726e692b..42544ab9 100644 --- a/tests/testlib/test_framework/hadoop/hadoop_backend_testing_api.py +++ b/tests/testlib/test_framework/hadoop/hadoop_backend_testing_api.py @@ -713,7 +713,9 @@ def name(*args): def _sudo_hdfs_dfs(self, hdfs_dfs_options): cmd = ["/usr/bin/sudo", "-u", "hdfs", "hdfs", "dfs"] + hdfs_dfs_options self._log("sudo_hdfs_dfs: %s" % cmd, detail=VERBOSE) - returncode, output = subproc_cmd(cmd, self._connection_options, self._messages) + returncode, output = subproc_cmd( + cmd, self._connection_options, self._messages, execute=(not self._dry_run) + ) if returncode != 0: self._log("Non-zero response: %s" % output) return False @@ -1158,12 +1160,12 @@ def select_single_non_null_value( def story_test_offload_nums_expected_backend_types(self, sampling_enabled=True): non_sampled_type = self.gen_default_numeric_column("x").format_data_type() return { - STORY_TEST_OFFLOAD_NUMS_BARE_NUM: "decimal(18,6)" - if sampling_enabled - else non_sampled_type, - STORY_TEST_OFFLOAD_NUMS_BARE_FLT: HADOOP_TYPE_BIGINT - if sampling_enabled - else non_sampled_type, + STORY_TEST_OFFLOAD_NUMS_BARE_NUM: ( + "decimal(18,6)" if sampling_enabled else non_sampled_type + ), + STORY_TEST_OFFLOAD_NUMS_BARE_FLT: ( + HADOOP_TYPE_BIGINT if sampling_enabled else non_sampled_type + ), STORY_TEST_OFFLOAD_NUMS_NUM_4: HADOOP_TYPE_BIGINT, STORY_TEST_OFFLOAD_NUMS_NUM_18: HADOOP_TYPE_BIGINT, STORY_TEST_OFFLOAD_NUMS_NUM_19: "decimal(38,0)", @@ -1174,24 +1176,24 @@ def story_test_offload_nums_expected_backend_types(self, sampling_enabled=True): STORY_TEST_OFFLOAD_NUMS_NUM_STAR_4: "decimal(38,4)", STORY_TEST_OFFLOAD_NUMS_NUM_3_5: "decimal(18,8)", STORY_TEST_OFFLOAD_NUMS_NUM_10_M5: HADOOP_TYPE_BIGINT, - STORY_TEST_OFFLOAD_NUMS_DEC_10_0: "decimal(18,2)" - if sampling_enabled - else non_sampled_type, - STORY_TEST_OFFLOAD_NUMS_DEC_13_9: "decimal(18,12)" - if sampling_enabled - else non_sampled_type, - STORY_TEST_OFFLOAD_NUMS_DEC_15_9: "decimal(38,12)" - if sampling_enabled - else non_sampled_type, - 
STORY_TEST_OFFLOAD_NUMS_DEC_36_3: "decimal(38,4)" - if sampling_enabled - else non_sampled_type, - STORY_TEST_OFFLOAD_NUMS_DEC_37_3: "decimal(38,4)" - if sampling_enabled - else non_sampled_type, - STORY_TEST_OFFLOAD_NUMS_DEC_38_3: "decimal(38,3)" - if sampling_enabled - else non_sampled_type, + STORY_TEST_OFFLOAD_NUMS_DEC_10_0: ( + "decimal(18,2)" if sampling_enabled else non_sampled_type + ), + STORY_TEST_OFFLOAD_NUMS_DEC_13_9: ( + "decimal(18,12)" if sampling_enabled else non_sampled_type + ), + STORY_TEST_OFFLOAD_NUMS_DEC_15_9: ( + "decimal(38,12)" if sampling_enabled else non_sampled_type + ), + STORY_TEST_OFFLOAD_NUMS_DEC_36_3: ( + "decimal(38,4)" if sampling_enabled else non_sampled_type + ), + STORY_TEST_OFFLOAD_NUMS_DEC_37_3: ( + "decimal(38,4)" if sampling_enabled else non_sampled_type + ), + STORY_TEST_OFFLOAD_NUMS_DEC_38_3: ( + "decimal(38,3)" if sampling_enabled else non_sampled_type + ), } def story_test_table_extra_col_info(self): diff --git a/tests/unit/offload/test_data_type_mappings.py b/tests/unit/offload/test_data_type_mappings.py index c0434951..f479032e 100644 --- a/tests/unit/offload/test_data_type_mappings.py +++ b/tests/unit/offload/test_data_type_mappings.py @@ -383,7 +383,12 @@ def setUp(self): self.options = build_mock_options(FAKE_ORACLE_BQ_ENV) messages = OffloadMessages() self.test_table_object = OracleSourceTable( - "no_user", "no_table", self.options, messages, do_not_connect=True + "no_user", + "no_table", + self.options, + messages, + dry_run=True, + do_not_connect=True, ) def _get_rdbms_source_columns(self): @@ -672,6 +677,7 @@ def test_bigquery_to_oracle(self): DBTYPE_BIGQUERY, self.options, OffloadMessages(), + dry_run=True, do_not_connect=True, ) backend_columns = [ @@ -707,6 +713,7 @@ def test_hive_to_oracle(self): DBTYPE_HIVE, self.options, OffloadMessages(), + dry_run=True, do_not_connect=True, ) except ModuleNotFoundError as e: @@ -776,6 +783,7 @@ def test_impala_to_oracle(self): DBTYPE_IMPALA, self.options, OffloadMessages(), + dry_run=True, do_not_connect=True, ) except ModuleNotFoundError as e: @@ -843,6 +851,7 @@ def test_snowflake_to_oracle(self): DBTYPE_SNOWFLAKE, self.options, OffloadMessages(), + dry_run=True, do_not_connect=True, ) except ModuleNotFoundError as e: @@ -924,6 +933,7 @@ def test_synapse_to_oracle(self): DBTYPE_SYNAPSE, self.options, OffloadMessages(), + dry_run=True, do_not_connect=True, ) except ModuleNotFoundError as e: @@ -1125,7 +1135,12 @@ def setUp(self): messages = OffloadMessages() try: self.test_table_object = MSSQLSourceTable( - "no_user", "no_table", self.options, messages, do_not_connect=True + "no_user", + "no_table", + self.options, + messages, + dry_run=True, + do_not_connect=True, ) except ModuleNotFoundError as e: if optional_sql_server_dependency_exception(e): @@ -1389,7 +1404,12 @@ def setUp(self): messages = OffloadMessages() try: self.test_table_object = NetezzaSourceTable( - "no_user", "no_table", self.options, messages, do_not_connect=True + "no_user", + "no_table", + self.options, + messages, + dry_run=True, + do_not_connect=True, ) except ModuleNotFoundError as e: if optional_netezza_dependency_exception(e): @@ -1581,7 +1601,11 @@ def setUp(self): messages = OffloadMessages() try: self.test_api = backend_api_factory( - self.options.target, self.options, messages, do_not_connect=True + self.options.target, + self.options, + messages, + dry_run=True, + do_not_connect=True, ) except ModuleNotFoundError as e: if optional_hadoop_dependency_exception(e): @@ -1760,7 +1784,11 @@ def setUp(self): messages = 
OffloadMessages() try: self.test_api = backend_api_factory( - self.options.target, self.options, messages, do_not_connect=True + self.options.target, + self.options, + messages, + dry_run=True, + do_not_connect=True, ) except ModuleNotFoundError as e: if optional_hadoop_dependency_exception(e): @@ -1927,7 +1955,11 @@ def setUp(self): self.options = build_mock_options(FAKE_ORACLE_BQ_ENV) messages = OffloadMessages() self.test_api = backend_api_factory( - self.options.target, self.options, messages, do_not_connect=True + self.options.target, + self.options, + messages, + dry_run=True, + do_not_connect=True, ) def _get_bigquery_source_columns(self): @@ -2353,7 +2385,11 @@ def setUp(self): messages = OffloadMessages() try: self.test_api = backend_api_factory( - self.options.target, self.options, messages, do_not_connect=True + self.options.target, + self.options, + messages, + dry_run=True, + do_not_connect=True, ) except ModuleNotFoundError as e: if optional_snowflake_dependency_exception(e): @@ -2538,7 +2574,11 @@ def setUp(self): messages = OffloadMessages() try: self.test_api = backend_api_factory( - self.options.target, self.options, messages, do_not_connect=True + self.options.target, + self.options, + messages, + dry_run=True, + do_not_connect=True, ) except ModuleNotFoundError as e: if optional_synapse_dependency_exception(e): diff --git a/tests/unit/offload/test_option_validation.py b/tests/unit/offload/test_option_validation.py index 88666216..23935c9e 100644 --- a/tests/unit/offload/test_option_validation.py +++ b/tests/unit/offload/test_option_validation.py @@ -17,6 +17,7 @@ import pytest from goe.offload import offload_constants, option_validation as module_under_test +from goe.offload.offload_messages import OffloadMessages from tests.unit.test_functions import ( build_mock_offload_operation, @@ -75,9 +76,10 @@ def test_generate_ddl_file_path( def test_normalise_ddl_file_auto(config: "OrchestrationConfig"): + fake_messages = OffloadMessages() fake_operation = build_mock_offload_operation() fake_operation.ddl_file = offload_constants.DDL_FILE_AUTO - module_under_test.normalise_ddl_file(fake_operation, config) + module_under_test.normalise_ddl_file(fake_operation, config, fake_messages) assert isinstance(fake_operation.ddl_file, str) @@ -98,11 +100,14 @@ def test_normalise_ddl_file_auto(config: "OrchestrationConfig"): def test_normalise_ddl_file_path( path: str, expect_exception: bool, config: "OrchestrationConfig" ): + fake_messages = OffloadMessages() fake_operation = build_mock_offload_operation() fake_operation.ddl_file = path if expect_exception: with pytest.raises(Exception): - _ = module_under_test.normalise_ddl_file(fake_operation, config) + _ = module_under_test.normalise_ddl_file( + fake_operation, config, fake_messages + ) else: # No exception expected. 
- _ = module_under_test.normalise_ddl_file(fake_operation, config) + _ = module_under_test.normalise_ddl_file(fake_operation, config, fake_messages) diff --git a/tests/unit/test_functions.py b/tests/unit/test_functions.py index 271b98cf..c94cb1c3 100644 --- a/tests/unit/test_functions.py +++ b/tests/unit/test_functions.py @@ -317,7 +317,7 @@ def build_mock_options(mock_env: dict): assert mock_env k = mock.patch.dict(os.environ, mock_env) k.start() - c = OrchestrationConfig.from_dict({"verbose": False, "execute": False}) + c = OrchestrationConfig.from_dict({"verbose": False}) k.stop() return c From f75ca9c9936ebe9ba390e088051bdfb3f659c6b8 Mon Sep 17 00:00:00 2001 From: nj1973 Date: Wed, 10 Apr 2024 10:18:11 +0000 Subject: [PATCH 04/28] feat: Add --ddl-file option for local filesystem --- src/goe/exceptions.py | 25 +++++ src/goe/goe.py | 5 +- src/goe/offload/backend_table.py | 3 +- src/goe/offload/offload.py | 21 ++-- src/goe/offload/offload_status_report.py | 2 +- src/goe/offload/operation/ddl_file.py | 100 ++++++++++++++++++ src/goe/offload/operation/stats_controls.py | 2 +- src/goe/offload/option_validation.py | 67 +----------- src/goe/util/misc_functions.py | 7 +- tests/integration/offload/test_backend_api.py | 2 +- .../integration/offload/test_backend_table.py | 2 +- .../offload/test_predicate_offload.py | 2 +- tests/integration/scenarios/test_ddl_file.py | 98 ++++++++++++++--- tests/unit/offload/operation/__init__.py | 0 tests/unit/offload/operation/test_ddl_file.py | 91 ++++++++++++++++ tests/unit/offload/test_option_validation.py | 74 +------------ 16 files changed, 327 insertions(+), 174 deletions(-) create mode 100644 src/goe/exceptions.py create mode 100644 src/goe/offload/operation/ddl_file.py create mode 100644 tests/unit/offload/operation/__init__.py create mode 100644 tests/unit/offload/operation/test_ddl_file.py diff --git a/src/goe/exceptions.py b/src/goe/exceptions.py new file mode 100644 index 00000000..f13ffdcc --- /dev/null +++ b/src/goe/exceptions.py @@ -0,0 +1,25 @@ +# Copyright 2024 The GOE Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +class OffloadException(Exception): + pass + + +class OffloadOptionError(Exception): + def __init__(self, detail): + self.detail = detail + + def __str__(self): + return repr(self.detail) diff --git a/src/goe/goe.py b/src/goe/goe.py index 7994d470..749e1f01 100644 --- a/src/goe/goe.py +++ b/src/goe/goe.py @@ -30,6 +30,7 @@ from goe.config import orchestration_defaults from goe.config.config_validation_functions import normalise_size_option +from goe.exceptions import OffloadException, OffloadOptionError from goe.filesystem.goe_dfs import ( get_scheme_from_location_uri, OFFLOAD_FS_SCHEME_INHERIT, @@ -61,7 +62,7 @@ GOE_TYPE_TIMESTAMP_TZ, ) from goe.offload.offload_functions import convert_backend_identifier_case, data_db_name -from goe.offload.option_validation import normalise_ddl_file +from goe.offload.operation.ddl_file import normalise_ddl_file from goe.offload.offload_source_data import ( get_offload_type_for_config, OFFLOAD_SOURCE_CLIENT_OFFLOAD, @@ -91,7 +92,6 @@ offload_data_to_target, ) from goe.offload.offload import ( - OffloadException, active_data_append_options, check_ipa_predicate_type_option_conflicts, check_table_structure, @@ -112,7 +112,6 @@ validate_offload_partition_granularity, ) from goe.offload.option_validation import ( - OffloadOptionError, check_opt_is_posint, normalise_data_sampling_options, normalise_offload_predicate_options, diff --git a/src/goe/offload/backend_table.py b/src/goe/offload/backend_table.py index 30dc1189..fe9123cb 100644 --- a/src/goe/offload/backend_table.py +++ b/src/goe/offload/backend_table.py @@ -2425,7 +2425,7 @@ def create_backend_db_step(self): self._offload_step(command_steps.STEP_CREATE_DB, lambda: self.create_db()) post_register_data_gov_fn() - def create_backend_table_step(self, goe_object_type): + def create_backend_table_step(self, goe_object_type) -> list: ( pre_register_data_gov_fn, post_register_data_gov_fn, @@ -2445,6 +2445,7 @@ def create_backend_table_step(self, goe_object_type): command_steps.STEP_CREATE_TABLE, lambda: self.create_backend_table() ) post_register_data_gov_fn() + return executed_commands def empty_staging_area_step(self, staging_file): self._offload_step( diff --git a/src/goe/offload/offload.py b/src/goe/offload/offload.py index 7163ed37..278026b0 100644 --- a/src/goe/offload/offload.py +++ b/src/goe/offload/offload.py @@ -23,6 +23,7 @@ from typing import TYPE_CHECKING from goe.config import config_descriptions, orchestration_defaults +from goe.exceptions import OffloadException from goe.filesystem.goe_dfs import VALID_OFFLOAD_FS_SCHEMES from goe.data_governance.hadoop_data_governance_constants import ( DATA_GOVERNANCE_GOE_OBJECT_TYPE_BASE_TABLE, @@ -45,6 +46,7 @@ from goe.offload.offload_transport import VALID_OFFLOAD_TRANSPORT_METHODS from goe.offload.operation.sort_columns import check_and_alter_backend_sort_columns from goe.offload.operation.data_type_controls import DECIMAL_COL_TYPE_SYNTAX_TEMPLATE +from goe.offload.operation.ddl_file import write_ddl_to_ddl_file from goe.offload.option_validation import ( active_data_append_options, check_opt_is_posint, @@ -61,6 +63,7 @@ from goe.util.misc_functions import format_list_for_logging if TYPE_CHECKING: + from goe.goe import OffloadOperation from goe.offload.backend_table import BackendTableInterface from goe.persistence.orchestration_repo_client import ( OrchestrationRepoClientInterface, @@ -70,10 +73,6 @@ OFFLOAD_SCHEMA_CHECK_EXCEPTION_TEXT = "Column mismatch detected between the source and backend table. 
Resolve before offloading" -class OffloadException(Exception): - pass - - def check_ipa_predicate_type_option_conflicts( options, exc_cls=OffloadException, rdbms_table=None ): @@ -191,13 +190,19 @@ def check_table_structure(frontend_table, backend_table, messages: OffloadMessag def create_final_backend_table_step( - offload_target_table, - offload_operation, + offload_target_table: "BackendTableInterface", + offload_operation: "OffloadOperation", goe_object_type=DATA_GOVERNANCE_GOE_OBJECT_TYPE_BASE_TABLE, ): """Create the final backend table""" - if not offload_target_table.table_exists() or offload_operation.reset_backend_table: - offload_target_table.create_backend_table_step(goe_object_type) + if ( + not offload_target_table.table_exists() + or offload_operation.reset_backend_table + or offload_operation.ddl_file + ): + ddl = offload_target_table.create_backend_table_step(goe_object_type) + if offload_operation.ddl_file: + write_ddl_to_ddl_file(offload_operation.ddl_file, ddl) else: check_and_alter_backend_sort_columns(offload_target_table, offload_operation) diff --git a/src/goe/offload/offload_status_report.py b/src/goe/offload/offload_status_report.py index 6eb8976f..b37fee7a 100755 --- a/src/goe/offload/offload_status_report.py +++ b/src/goe/offload/offload_status_report.py @@ -37,6 +37,7 @@ get_rdbms_db_name, ) from goe.config.orchestration_config import OrchestrationConfig +from goe.exceptions import OffloadOptionError from goe.offload import offload_constants from goe.offload.backend_api import ( REPORT_ATTR_BACKEND_DISPLAY_NAME, @@ -46,7 +47,6 @@ from goe.offload.factory.backend_api_factory import backend_api_factory from goe.offload.factory.backend_table_factory import backend_table_factory from goe.offload.factory.frontend_api_factory import frontend_api_factory -from goe.offload.option_validation import OffloadOptionError from goe.offload.offload_functions import STARTS_WITH_DATE_PATTERN_RE from goe.offload.offload_messages import ( OffloadMessages, diff --git a/src/goe/offload/operation/ddl_file.py b/src/goe/offload/operation/ddl_file.py new file mode 100644 index 00000000..8170865e --- /dev/null +++ b/src/goe/offload/operation/ddl_file.py @@ -0,0 +1,100 @@ +#! /usr/bin/env python3 + +# Copyright 2024 The GOE Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +from typing import TYPE_CHECKING + +from goe.exceptions import OffloadOptionError +from goe.filesystem.goe_dfs import get_scheme_from_location_uri +from goe.offload import offload_constants +from goe.util.misc_functions import standard_file_name + +if TYPE_CHECKING: + from goe.config.orchestration_config import OrchestrationConfig + from goe.offload.offload_messages import OffloadMessages + + +DDL_FILE_HEADER = """-- TODO +""" + + +def generate_ddl_file_path( + owner: str, table_name: str, config: "OrchestrationConfig" +) -> str: + """Generates a default path when DDL file option == AUTO.""" + file_name = standard_file_name( + f"{owner}.{table_name}", extension=".sql", with_datetime=True + ) + log_path = os.path.join(config.log_path, file_name) + return log_path + + +def validate_ddl_file(ddl_file: str): + """Simple validation that a value supplied via ddl_file looks good. + + Only local paths are fully validated at this point because paths to cloud storage are + prefixes and may not exist until the object is created.""" + # Simplistic check that the file path looks like a cloud storage one. + if ":" in ddl_file: + # We don't need to know the scheme right now, just validation that it is supported. + _ = get_scheme_from_location_uri(ddl_file) + return + + # Assume local filesystem, we can validate the path. + if os.path.exists(ddl_file): + raise OffloadOptionError(f"DDL path already exists: {ddl_file}") + + if "/" in ddl_file[1:]: + dirname = os.path.dirname(ddl_file) + if not os.path.isdir(dirname): + raise OffloadOptionError(f"DDL file directory does not exist: {dirname}") + + +def normalise_ddl_file( + options, config: "OrchestrationConfig", messages: "OffloadMessages" +): + """Validates path pointed to by ddl_file and generates a new path if AUTO. Mutates options.""" + if options.ddl_file: + options.ddl_file = options.ddl_file.strip() + else: + return options.ddl_file + + if options.execute and options.ddl_file: + messages.notice(offload_constants.DDL_FILE_EXECUTE_MESSAGE_TEXT) + options.execute = False + + if options.ddl_file.upper() == offload_constants.DDL_FILE_AUTO: + # Use an auto-generated path. + options.ddl_file = generate_ddl_file_path( + options.owner, options.table_name, config + ) + return + + validate_ddl_file(options.ddl_file) + + +def write_ddl_to_ddl_file(ddl_file: str, ddl: list): + """Take a list of DDL strings and write them to a file""" + assert ddl_file + ddl_str = "\n".join(ddl) + ddl_file_contents = f"{DDL_FILE_HEADER}\n\n{ddl_str}" + if ":" in ddl_file: + # Cloud storage. + pass + else: + # Local filesystem. + with open(ddl_file, "w") as f: + f.write(ddl_file_contents) diff --git a/src/goe/offload/operation/stats_controls.py b/src/goe/offload/operation/stats_controls.py index 1c9cac46..a18cf2c0 100644 --- a/src/goe/offload/operation/stats_controls.py +++ b/src/goe/offload/operation/stats_controls.py @@ -17,8 +17,8 @@ from goe.offload.column_metadata import ( get_partition_source_column_names, ) +from goe.exceptions import OffloadException from goe.offload.offload import ( - OffloadException, get_current_offload_hv, get_prior_offloaded_hv, ) diff --git a/src/goe/offload/option_validation.py b/src/goe/offload/option_validation.py index 8442186e..bc400e34 100644 --- a/src/goe/offload/option_validation.py +++ b/src/goe/offload/option_validation.py @@ -13,12 +13,9 @@ # limitations under the License. 
from optparse import OptionValueError -import os import re -from typing import TYPE_CHECKING - -from goe.filesystem.goe_dfs import get_scheme_from_location_uri +from goe.exceptions import OffloadOptionError from goe.offload import offload_constants from goe.offload.predicate_offload import GenericPredicate from goe.offload.offload_source_table import ( @@ -26,19 +23,7 @@ OFFLOAD_PARTITION_TYPE_RANGE, OFFLOAD_PARTITION_TYPE_LIST, ) -from goe.util.misc_functions import standard_file_name, is_pos_int - -if TYPE_CHECKING: - from goe.config.orchestration_config import OrchestrationConfig - from goe.offload.offload_messages import OffloadMessages - - -class OffloadOptionError(Exception): - def __init__(self, detail): - self.detail = detail - - def __str__(self): - return repr(self.detail) +from goe.util.misc_functions import is_pos_int def active_data_append_options( @@ -95,17 +80,6 @@ def check_opt_is_posint( ) -def generate_ddl_file_path( - owner: str, table_name: str, config: "OrchestrationConfig" -) -> str: - """Generates a default path when DDL file option == AUTO.""" - file_name = standard_file_name( - f"{owner}.{table_name}", extension=".sql", with_datetime=True - ) - log_path = os.path.join(config.log_path, file_name) - return log_path - - def normalise_data_sampling_options(options): if hasattr(options, "data_sample_pct"): if isinstance(options.data_sample_pct, str) and re.search( @@ -129,43 +103,6 @@ def normalise_data_sampling_options(options): ) -def normalise_ddl_file( - options, config: "OrchestrationConfig", messages: "OffloadMessages" -): - """Validates path pointed to by ddl_file and generates a new path if AUTO. Mutates options.""" - if options.ddl_file: - options.ddl_file = options.ddl_file.strip() - else: - return options.ddl_file - - if options.execute and options.ddl_file: - messages.notice(offload_constants.DDL_FILE_EXECUTE_MESSAGE_TEXT) - options.execute = False - - if options.ddl_file.upper() == offload_constants.DDL_FILE_AUTO: - # Use an auto-generated path. - options.ddl_file = generate_ddl_file_path( - options.owner, options.table_name, config - ) - return - - # Simplistic check that the file path looks like a cloud storage one. - if ":" in options.ddl_file: - # We don't need to know the scheme right now, just validation that it is supported. - _ = get_scheme_from_location_uri(options.ddl_file) - return - - # Assume local filesystem, we can validate the path. 
- - if os.path.exists(options.ddl_file): - raise OffloadOptionError(f"DDL path already exists: {options.ddl_file}") - - if "/" in options.ddl_file[1:]: - dirname = os.path.dirname(options.ddl_file) - if not os.path.isdir(dirname): - raise OffloadOptionError(f"DDL file directory does not exist: {dirname}") - - def normalise_offload_predicate_options(options): if options.offload_predicate: if isinstance(options.offload_predicate, str): diff --git a/src/goe/util/misc_functions.py b/src/goe/util/misc_functions.py index 7a59a149..6b62c0d9 100644 --- a/src/goe/util/misc_functions.py +++ b/src/goe/util/misc_functions.py @@ -541,11 +541,10 @@ def get_temp_path(tmp_dir="/tmp", prefix="goe_tmp_", suffix=""): def write_temp_file(data, prefix="goe_tmp_", suffix=""): - """writes some data to a temporary file and returns the path to the file""" + """Writes some data to a temporary file and returns the path to the file""" tmp_path = get_temp_path(prefix=prefix, suffix=suffix) - fh = open(tmp_path, "w") - fh.write(data) - fh.close() + with open(tmp_path, "w") as fh: + fh.write(data) return tmp_path diff --git a/tests/integration/offload/test_backend_api.py b/tests/integration/offload/test_backend_api.py index 86cf3f3a..2c9fb0ee 100644 --- a/tests/integration/offload/test_backend_api.py +++ b/tests/integration/offload/test_backend_api.py @@ -14,8 +14,8 @@ from unittest import main +from goe.exceptions import OffloadException from goe.offload.factory.backend_api_factory import backend_api_factory -from goe.offload.offload import OffloadException from goe.offload.offload_constants import DBTYPE_SPARK from goe.offload.offload_functions import ( convert_backend_identifier_case, diff --git a/tests/integration/offload/test_backend_table.py b/tests/integration/offload/test_backend_table.py index 367906f2..106faf04 100644 --- a/tests/integration/offload/test_backend_table.py +++ b/tests/integration/offload/test_backend_table.py @@ -24,6 +24,7 @@ import logging from unittest import TestCase, main +from goe.exceptions import OffloadException from goe.goe import OffloadOperation from goe.offload.column_metadata import ( CanonicalColumn, @@ -34,7 +35,6 @@ GOE_TYPE_INTEGER_8, ) from goe.offload.factory.backend_table_factory import backend_table_factory -from goe.offload.offload import OffloadException from goe.offload.offload_constants import DBTYPE_IMPALA from goe.offload.offload_functions import ( convert_backend_identifier_case, diff --git a/tests/integration/offload/test_predicate_offload.py b/tests/integration/offload/test_predicate_offload.py index 0c859471..3314c34a 100644 --- a/tests/integration/offload/test_predicate_offload.py +++ b/tests/integration/offload/test_predicate_offload.py @@ -21,10 +21,10 @@ from copy import copy import pytest +from goe.exceptions import OffloadException from goe.goe import OffloadOperation from goe.offload.factory.backend_table_factory import backend_table_factory from goe.offload.factory.offload_source_table_factory import OffloadSourceTable -from goe.offload.offload import OffloadException from goe.offload.offload_functions import ( convert_backend_identifier_case, data_db_name, diff --git a/tests/integration/scenarios/test_ddl_file.py b/tests/integration/scenarios/test_ddl_file.py index e1c69253..9e8f0571 100644 --- a/tests/integration/scenarios/test_ddl_file.py +++ b/tests/integration/scenarios/test_ddl_file.py @@ -45,7 +45,8 @@ ) -TEST_TABLE = "DDL_FILE_DIM" +TEST_TABLE1 = "DDL_FILE_DIM1" +TEST_TABLE2 = "DDL_FILE_DIM2" @pytest.fixture @@ -65,11 +66,13 @@ def data_db(schema, 
config): return data_db -def test_ddl_file_local_fs(config, schema, data_db): - id = "test_ddl_file_local_fs" +def test_ddl_file_new_table_local_fs(config, schema, data_db): + """Test requesting a DDL file to local FS for a new table.""" + id = "test_ddl_file_new_table_local_fs" messages = get_test_messages(config, id) backend_api = get_backend_testing_api(config, messages) frontend_api = get_frontend_testing_api(config, messages) + test_table = TEST_TABLE1 # Setup run_setup( @@ -77,44 +80,109 @@ def test_ddl_file_local_fs(config, schema, data_db): backend_api, config, messages, - frontend_sqls=frontend_api.standard_dimension_frontend_ddl(schema, TEST_TABLE), + frontend_sqls=frontend_api.standard_dimension_frontend_ddl(schema, test_table), python_fns=lambda: drop_backend_test_table( - config, backend_api, messages, data_db, TEST_TABLE + config, backend_api, messages, data_db, test_table ), ) - # Offload in execute mode asking for ddl_file. + # Offload in execute mode requesting a DDL file. ddl_file = get_temp_path(prefix=id, suffix=".sql") options = { - "owner_table": schema + "." + TEST_TABLE, + "owner_table": schema + "." + test_table, "reset_backend_table": True, "ddl_file": ddl_file, + "create_backend_db": True, "execute": True, } offload_messages = run_offload(options, config, messages) # When using DDL file no table should be created, even in execute mode. assert not backend_table_exists( - config, backend_api, messages, data_db, TEST_TABLE - ), f"Backend table for {schema}.{TEST_TABLE} should not exist" + config, backend_api, messages, data_db, test_table + ), f"Backend table for {schema}.{test_table} should not exist" assert text_in_messages( offload_messages, offload_constants.DDL_FILE_EXECUTE_MESSAGE_TEXT ) - assert os.path.isfile(ddl_file) + assert os.path.isfile(ddl_file), f"DDL file has not been created: {ddl_file}" # Offload in non-execute mode asking for ddl_file. ddl_file = get_temp_path(prefix=id, suffix=".sql") options = { - "owner_table": schema + "." + TEST_TABLE, + "owner_table": schema + "." + test_table, "reset_backend_table": True, "ddl_file": ddl_file, "execute": False, } offload_messages = run_offload(options, config, messages) assert not backend_table_exists( - config, backend_api, messages, data_db, TEST_TABLE - ), f"Backend table for {schema}.{TEST_TABLE} should not exist" + config, backend_api, messages, data_db, test_table + ), f"Backend table for {schema}.{test_table} should not exist" + # Even in non-execture mode we expect to see a DDL file. + assert os.path.isfile(ddl_file), f"DDL file has not been created: {ddl_file}" + + +def test_ddl_file_existing_table_local_fs(config, schema, data_db): + """Test requesting a DDL file to local FS for a previously offloaded table.""" + id = "test_ddl_file_existing_table_local_fs" + messages = get_test_messages(config, id) + backend_api = get_backend_testing_api(config, messages) + frontend_api = get_frontend_testing_api(config, messages) + test_table = TEST_TABLE2 + + # Setup + run_setup( + frontend_api, + backend_api, + config, + messages, + frontend_sqls=frontend_api.standard_dimension_frontend_ddl(schema, test_table), + python_fns=lambda: drop_backend_test_table( + config, backend_api, messages, data_db, test_table + ), + ) + + # First offload the table + options = { + "owner_table": schema + "." + test_table, + "reset_backend_table": True, + "create_backend_db": True, + "execute": True, + } + run_offload(options, config, messages) + + # Now request a DDL file, in execute mode. 
+ ddl_file = get_temp_path(prefix=id, suffix=".sql") + options = { + "owner_table": schema + "." + test_table, + "ddl_file": ddl_file, + "reset_backend_table": True, + "execute": True, + } + offload_messages = run_offload(options, config, messages) + assert backend_table_exists( + config, backend_api, messages, data_db, test_table + ), f"Backend table for {schema}.{test_table} should exist" assert text_in_messages( offload_messages, offload_constants.DDL_FILE_EXECUTE_MESSAGE_TEXT ) - # Even in non-execture mode we expect to see a DDL file. - assert os.path.isfile(ddl_file) + assert os.path.isfile(ddl_file), f"DDL file has not been created: {ddl_file}" + + # Request a DDL file, in non-execute mode. + ddl_file = get_temp_path(prefix=id, suffix=".sql") + options = { + "owner_table": schema + "." + test_table, + "ddl_file": ddl_file, + "reset_backend_table": True, + "execute": False, + } + offload_messages = run_offload(options, config, messages) + assert backend_table_exists( + config, backend_api, messages, data_db, test_table + ), f"Backend table for {schema}.{test_table} should exist" + assert text_in_messages( + offload_messages, offload_constants.DDL_FILE_EXECUTE_MESSAGE_TEXT + ) + assert os.path.isfile(ddl_file), f"DDL file has not been created: {ddl_file}" + + +# TODO Cloud storage diff --git a/tests/unit/offload/operation/__init__.py b/tests/unit/offload/operation/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/unit/offload/operation/test_ddl_file.py b/tests/unit/offload/operation/test_ddl_file.py new file mode 100644 index 00000000..6c798ed6 --- /dev/null +++ b/tests/unit/offload/operation/test_ddl_file.py @@ -0,0 +1,91 @@ +# Copyright 2024 The GOE Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import TYPE_CHECKING + +import pytest + +from goe.offload import offload_constants +from goe.offload.operation import ddl_file as module_under_test +from goe.offload.offload_messages import OffloadMessages + +from tests.unit.test_functions import ( + build_mock_offload_operation, + build_mock_options, + FAKE_ORACLE_BQ_ENV, +) + +if TYPE_CHECKING: + from goe.config.orchestration_config import OrchestrationConfig + + +@pytest.fixture(scope="module") +def config(): + return build_mock_options(FAKE_ORACLE_BQ_ENV) + + +@pytest.mark.parametrize( + "schema,table_name", + [ + ("my_user", "my_table123"), + ("MY-USER-123", "MY-TABLE"), + ], +) +def test_generate_ddl_file_path( + schema: str, table_name: str, config: "OrchestrationConfig" +): + path = module_under_test.generate_ddl_file_path(schema, table_name, config) + assert schema in path + assert table_name in path + offload_log = FAKE_ORACLE_BQ_ENV["OFFLOAD_LOG"] + assert path.startswith(offload_log) + assert path.endswith(".sql") + + +def test_normalise_ddl_file_auto(config: "OrchestrationConfig"): + fake_messages = OffloadMessages() + fake_operation = build_mock_offload_operation() + fake_operation.ddl_file = offload_constants.DDL_FILE_AUTO + module_under_test.normalise_ddl_file(fake_operation, config, fake_messages) + assert isinstance(fake_operation.ddl_file, str) + + +@pytest.mark.parametrize( + "path,expect_exception", + [ + ("/tmp", True), + ("/tmp/", True), + ("/tmp/ddl.sql", False), + # Should fail because "not-a-dir" should not exist. + ("/tmp/not-a-dir/not-a-file.sql", True), + # Cloud storage paths will pass as long as the scheme is valid. + ("gs://bucket/path/ddl.sql", False), + ("s3://bucket/path/ddl.sql", False), + ("unknown-scheme://bucket/path/ddl.sql", True), + ], +) +def test_normalise_ddl_file_path( + path: str, expect_exception: bool, config: "OrchestrationConfig" +): + fake_messages = OffloadMessages() + fake_operation = build_mock_offload_operation() + fake_operation.ddl_file = path + if expect_exception: + with pytest.raises(Exception): + _ = module_under_test.normalise_ddl_file( + fake_operation, config, fake_messages + ) + else: + # No exception expected. + _ = module_under_test.normalise_ddl_file(fake_operation, config, fake_messages) diff --git a/tests/unit/offload/test_option_validation.py b/tests/unit/offload/test_option_validation.py index 23935c9e..e982ba48 100644 --- a/tests/unit/offload/test_option_validation.py +++ b/tests/unit/offload/test_option_validation.py @@ -12,26 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import TYPE_CHECKING import pytest -from goe.offload import offload_constants, option_validation as module_under_test -from goe.offload.offload_messages import OffloadMessages - -from tests.unit.test_functions import ( - build_mock_offload_operation, - build_mock_options, - FAKE_ORACLE_BQ_ENV, -) - -if TYPE_CHECKING: - from goe.config.orchestration_config import OrchestrationConfig - - -@pytest.fixture(scope="module") -def config(): - return build_mock_options(FAKE_ORACLE_BQ_ENV) +from goe.offload import option_validation as module_under_test @pytest.mark.parametrize( @@ -55,59 +39,3 @@ def test_check_opt_is_posint(input: str, expect_exception: bool): else: output = module_under_test.check_opt_is_posint("fake-option", input) assert output == input - - -@pytest.mark.parametrize( - "schema,table_name", - [ - ("my_user", "my_table123"), - ("MY-USER-123", "MY-TABLE"), - ], -) -def test_generate_ddl_file_path( - schema: str, table_name: str, config: "OrchestrationConfig" -): - path = module_under_test.generate_ddl_file_path(schema, table_name, config) - assert schema in path - assert table_name in path - offload_log = FAKE_ORACLE_BQ_ENV["OFFLOAD_LOG"] - assert path.startswith(offload_log) - assert path.endswith(".sql") - - -def test_normalise_ddl_file_auto(config: "OrchestrationConfig"): - fake_messages = OffloadMessages() - fake_operation = build_mock_offload_operation() - fake_operation.ddl_file = offload_constants.DDL_FILE_AUTO - module_under_test.normalise_ddl_file(fake_operation, config, fake_messages) - assert isinstance(fake_operation.ddl_file, str) - - -@pytest.mark.parametrize( - "path,expect_exception", - [ - ("/tmp", True), - ("/tmp/", True), - ("/tmp/ddl.sql", False), - # Should fail because "not-a-dir" should not exist. - ("/tmp/not-a-dir/not-a-file.sql", True), - # Cloud storage paths will pass as long as the scheme is valid. - ("gs://bucket/path/ddl.sql", False), - ("s3://bucket/path/ddl.sql", False), - ("unknown-scheme://bucket/path/ddl.sql", True), - ], -) -def test_normalise_ddl_file_path( - path: str, expect_exception: bool, config: "OrchestrationConfig" -): - fake_messages = OffloadMessages() - fake_operation = build_mock_offload_operation() - fake_operation.ddl_file = path - if expect_exception: - with pytest.raises(Exception): - _ = module_under_test.normalise_ddl_file( - fake_operation, config, fake_messages - ) - else: - # No exception expected. 
- _ = module_under_test.normalise_ddl_file(fake_operation, config, fake_messages) From 4f0d519e91029cc746c9895c02aa05bca6340e5a Mon Sep 17 00:00:00 2001 From: nj1973 Date: Thu, 11 Apr 2024 11:10:49 +0000 Subject: [PATCH 05/28] feat: Add --ddl-file option --- bin/offload | 3 +- src/goe/filesystem/goe_dfs.py | 15 +- src/goe/filesystem/goe_dfs_factory.py | 68 +++--- src/goe/filesystem/goe_gcs.py | 2 +- src/goe/goe.py | 5 +- src/goe/offload/offload.py | 16 +- src/goe/offload/offload_constants.py | 3 +- src/goe/offload/offload_messages.py | 9 + src/goe/offload/operation/ddl_file.py | 49 +++-- src/goe/util/misc_functions.py | 12 +- tests/integration/scenarios/test_ddl_file.py | 206 +++++++++++++----- .../scenarios/test_offload_data.py | 2 +- tests/unit/offload/operation/test_ddl_file.py | 34 ++- 13 files changed, 303 insertions(+), 121 deletions(-) diff --git a/bin/offload b/bin/offload index 3af40a95..afcb1ab4 100755 --- a/bin/offload +++ b/bin/offload @@ -18,6 +18,7 @@ import sys from goe.config.config_checks import check_cli_path +from goe.exceptions import OffloadOptionError check_cli_path() @@ -33,7 +34,7 @@ from goe.goe import ( OFFLOAD_OP_NAME, get_log_fh, ) -from goe.offload.offload import OffloadOptionError, get_offload_options +from goe.offload.offload import get_offload_options from goe.orchestration.cli_entry_points import offload_by_cli from goe.util.goe_log import log_exception diff --git a/src/goe/filesystem/goe_dfs.py b/src/goe/filesystem/goe_dfs.py index f63efb91..a1280139 100755 --- a/src/goe/filesystem/goe_dfs.py +++ b/src/goe/filesystem/goe_dfs.py @@ -162,13 +162,13 @@ def get_scheme_from_location_uri(dfs_path): def gen_fs_uri( - path_prefix, + path_prefix: str, db_path_suffix=None, scheme=None, container=None, backend_db=None, table_name=None, -): +) -> str: """Generates a file URI Note this is not getting an existing URI for an existing db or table, it generates a new one based on inputs The return value can be at any of 3 levels: @@ -210,7 +210,7 @@ def gen_fs_uri( def gen_load_uri_from_options( offload_options, hadoop_db=None, table_name=None, scheme_override=None -): +) -> str: """Returns a Hadoop file URI based on options config.""" uri = gen_fs_uri( offload_options.hdfs_load, @@ -250,7 +250,6 @@ def client(self): @abstractmethod def delete(self, dfs_path, recursive=False): """Delete a file or directory and contents.""" - pass @abstractmethod def chmod(self, dfs_path, mode): @@ -285,19 +284,16 @@ def gen_uri( """Generate a URI for the DFS in use, mostly a wrapper for gen_fs_uri() but some DFSs have own specifics. Some implementations may format the container, container_override provides a way around that. """ - pass @abstractmethod def mkdir(self, dfs_path): """Create an empty directory, some file systems (such as object stores) will not support this and will raise NotImplementedError() """ - pass @abstractmethod def read(self, dfs_path, as_str=False): """Return the contents of a remote file as bytes unless as_str=True""" - pass @abstractmethod def rename(self, hdfs_src_path, hdfs_dst_path): @@ -318,17 +314,14 @@ def stat(self, dfs_path): 'permission': Octal permissions e.g. 
'755' or '640' } """ - pass @abstractmethod - def write(self, dfs_path, data, overwrite=False): + def write(self, dfs_path: str, data, overwrite=False): """Write some data to a remote file""" - pass @abstractmethod def list_dir(self, dfs_path): """Return a list of file/directory names within dfs_path""" - pass ########################################################################### # PRIVATE METHODS diff --git a/src/goe/filesystem/goe_dfs_factory.py b/src/goe/filesystem/goe_dfs_factory.py index 60423f9b..ab31e89c 100644 --- a/src/goe/filesystem/goe_dfs_factory.py +++ b/src/goe/filesystem/goe_dfs_factory.py @@ -14,6 +14,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from typing import TYPE_CHECKING + from goe.filesystem.goe_dfs import ( OFFLOAD_FS_SCHEME_GS, OFFLOAD_FS_SCHEME_S3, @@ -22,53 +24,57 @@ ) from goe.offload.offload_constants import HADOOP_BASED_BACKEND_DISTRIBUTIONS +if TYPE_CHECKING: + from goe.config.orchestration_config import OrchestrationConfig + from goe.filesystem.goe_dfs import GOEDfs + def get_dfs_from_options( - offload_options, messages=None, force_ssh=False, dry_run=True, do_not_connect=False -): + config: "OrchestrationConfig", + messages=None, + force_ssh=False, + dry_run=True, + do_not_connect=False, +) -> "GOEDfs": """Helper function to get an appropriate GOEDfs object based on offload options.""" - if offload_options.backend_distribution in HADOOP_BASED_BACKEND_DISTRIBUTIONS: - if ( - offload_options.webhdfs_host - and offload_options.webhdfs_port - and not force_ssh - ): + if config.backend_distribution in HADOOP_BASED_BACKEND_DISTRIBUTIONS: + if config.webhdfs_host and config.webhdfs_port and not force_ssh: from goe.filesystem.web_hdfs import WebHdfs return WebHdfs( - offload_options.webhdfs_host, - offload_options.webhdfs_port, - offload_options.hadoop_ssh_user, - True if offload_options.kerberos_service else False, - offload_options.webhdfs_verify_ssl, + config.webhdfs_host, + config.webhdfs_port, + config.hadoop_ssh_user, + True if config.kerberos_service else False, + config.webhdfs_verify_ssl, dry_run=dry_run, messages=messages, do_not_connect=do_not_connect, - db_path_suffix=offload_options.hdfs_db_path_suffix, - hdfs_data=offload_options.hdfs_data, + db_path_suffix=config.hdfs_db_path_suffix, + hdfs_data=config.hdfs_data, ) else: from goe.filesystem.cli_hdfs import CliHdfs return CliHdfs( - offload_options.hdfs_host, - offload_options.hadoop_ssh_user, + config.hdfs_host, + config.hadoop_ssh_user, dry_run=dry_run, messages=messages, do_not_connect=do_not_connect, - db_path_suffix=offload_options.hdfs_db_path_suffix, - hdfs_data=offload_options.hdfs_data, + db_path_suffix=config.hdfs_db_path_suffix, + hdfs_data=config.hdfs_data, ) - elif offload_options.offload_fs_scheme == OFFLOAD_FS_SCHEME_GS: + elif config.offload_fs_scheme == OFFLOAD_FS_SCHEME_GS: from goe.filesystem.goe_gcs import GOEGcs return GOEGcs( messages, dry_run=dry_run, do_not_connect=do_not_connect, - db_path_suffix=offload_options.hdfs_db_path_suffix, + db_path_suffix=config.hdfs_db_path_suffix, ) - elif offload_options.offload_fs_scheme in ( + elif config.offload_fs_scheme in ( OFFLOAD_FS_SCHEME_S3, OFFLOAD_FS_SCHEME_S3A, ): @@ -78,27 +84,27 @@ def get_dfs_from_options( messages, dry_run=dry_run, do_not_connect=do_not_connect, - db_path_suffix=offload_options.hdfs_db_path_suffix, + db_path_suffix=config.hdfs_db_path_suffix, ) - elif offload_options.offload_fs_scheme in AZURE_OFFLOAD_FS_SCHEMES: + elif config.offload_fs_scheme 
in AZURE_OFFLOAD_FS_SCHEMES: from goe.filesystem.goe_azure import GOEAzure return GOEAzure( - offload_options.offload_fs_azure_account_name, - offload_options.offload_fs_azure_account_key, - offload_options.offload_fs_azure_account_domain, + config.offload_fs_azure_account_name, + config.offload_fs_azure_account_key, + config.offload_fs_azure_account_domain, messages, dry_run=dry_run, do_not_connect=do_not_connect, - db_path_suffix=offload_options.hdfs_db_path_suffix, + db_path_suffix=config.hdfs_db_path_suffix, ) else: - if offload_options.offload_fs_scheme: + if config.offload_fs_scheme: raise NotImplementedError( "Backend system/scheme has not been implemented: %s/%s" - % (offload_options.target, offload_options.offload_fs_scheme) + % (config.target, config.offload_fs_scheme) ) else: raise NotImplementedError( - "Backend system has not been implemented: %s" % offload_options.target + "Backend system has not been implemented: %s" % config.target ) diff --git a/src/goe/filesystem/goe_gcs.py b/src/goe/filesystem/goe_gcs.py index 9c97e4c2..fbeb8aed 100644 --- a/src/goe/filesystem/goe_gcs.py +++ b/src/goe/filesystem/goe_gcs.py @@ -334,7 +334,7 @@ def stat(self, dfs_path): else: return None - def write(self, dfs_path, data, overwrite=False): + def write(self, dfs_path: str, data, overwrite=False): assert dfs_path assert isinstance(dfs_path, str) logger.info("write(%s)" % dfs_path) diff --git a/src/goe/goe.py b/src/goe/goe.py index 749e1f01..063445cf 100644 --- a/src/goe/goe.py +++ b/src/goe/goe.py @@ -2853,7 +2853,10 @@ def offload_table( return_none_on_failure=True ) - create_final_backend_table_step(offload_target_table, offload_operation) + if not create_final_backend_table_step( + offload_target_table, offload_operation, offload_options, messages + ): + return True data_transport_client = offload_transport_factory( offload_operation.offload_transport_method, diff --git a/src/goe/offload/offload.py b/src/goe/offload/offload.py index 278026b0..0b3508b9 100644 --- a/src/goe/offload/offload.py +++ b/src/goe/offload/offload.py @@ -63,6 +63,7 @@ from goe.util.misc_functions import format_list_for_logging if TYPE_CHECKING: + from goe.config.orchestration_config import OrchestrationConfig from goe.goe import OffloadOperation from goe.offload.backend_table import BackendTableInterface from goe.persistence.orchestration_repo_client import ( @@ -132,6 +133,7 @@ def check_ipa_predicate_type_option_conflicts( def check_table_structure(frontend_table, backend_table, messages: OffloadMessages): """Compare frontend and backend columns by name and throw an exception if there is a mismatch. + Ideally we would use SchemaSyncAnalyzer for this but circular dependencies prevent that for the time being. FIXME revisit this in the future to see if we can hook into SchemaSyncAnalyzer for comparison, see GOE-1307 """ @@ -192,9 +194,15 @@ def check_table_structure(frontend_table, backend_table, messages: OffloadMessag def create_final_backend_table_step( offload_target_table: "BackendTableInterface", offload_operation: "OffloadOperation", + config: "OrchestrationConfig", + messages: OffloadMessages, goe_object_type=DATA_GOVERNANCE_GOE_OBJECT_TYPE_BASE_TABLE, -): - """Create the final backend table""" +) -> bool: + """Create the final backend table. 
+ + Returns: + A boolean for "do not exit early" + """ if ( not offload_target_table.table_exists() or offload_operation.reset_backend_table @@ -202,9 +210,11 @@ ): ddl = offload_target_table.create_backend_table_step(goe_object_type) if offload_operation.ddl_file: - write_ddl_to_ddl_file(offload_operation.ddl_file, ddl) + write_ddl_to_ddl_file(offload_operation.ddl_file, ddl, config, messages) + return False else: check_and_alter_backend_sort_columns(offload_target_table, offload_operation) + return True def drop_backend_table_step( diff --git a/src/goe/offload/offload_constants.py index 7657d612..7cab05e2 100644 --- a/src/goe/offload/offload_constants.py +++ b/src/goe/offload/offload_constants.py @@ -145,8 +145,9 @@ ) TOTAL_ROWS_OFFLOADED_LOG_TEXT = "Total rows offloaded" DDL_FILE_EXECUTE_MESSAGE_TEXT = ( - "Switch command to non-exectute mode due to --ddl-file option" + "Switching command to non-execute mode due to --ddl-file option" ) +DDL_FILE_WRITE_MESSAGE_TEMPLATE = "Table DDL has been written to file: {}" # Offload capabilities we can switch on/off by backend db type # Any capabilities that are version specific will have extra code in the BackendApi method diff --git a/src/goe/offload/offload_messages.py index 093c4699..179e9c91 100755 --- a/src/goe/offload/offload_messages.py +++ b/src/goe/offload/offload_messages.py @@ -500,6 +500,8 @@ def step_repo_logging(check_command_type): self.log("Done", ansi_code="green") if record_step_delta: self.step_delta(title, td) + else: + self.step_no_delta(title) if step_repo_logging(parent_command_type): self._repo_client.end_command_step( @@ -591,6 +593,13 @@ def step_delta(self, step, time_delta): else: self.steps[step] = {"seconds": time_delta.total_seconds(), "count": 1} + def step_no_delta(self, step): + """Record a step without any time delta, for non-execute mode.""" + if step in self.steps: + self.steps[step]["count"] += 1 + else: + self.steps[step] = {"seconds": 0, "count": 1} + def log_step_deltas(self, topn=10, detail=VVERBOSE): logger.info("log_step_deltas()") if not self.steps: diff --git a/src/goe/offload/operation/ddl_file.py index 8170865e..93bed0eb 100644 --- a/src/goe/offload/operation/ddl_file.py +++ b/src/goe/offload/operation/ddl_file.py @@ -14,20 +14,26 @@ # See the License for the specific language governing permissions and # limitations under the License. +import datetime import os from typing import TYPE_CHECKING from goe.exceptions import OffloadOptionError from goe.filesystem.goe_dfs import get_scheme_from_location_uri +from goe.filesystem.goe_dfs_factory import get_dfs_from_options from goe.offload import offload_constants from goe.util.misc_functions import standard_file_name if TYPE_CHECKING: from goe.config.orchestration_config import OrchestrationConfig + from goe.goe import OffloadOperation from goe.offload.offload_messages import OffloadMessages -DDL_FILE_HEADER = """-- TODO +DDL_FILE_HEADER = "Table DDL generated by GOE" +DDL_FILE_HEADER_TEMPLATE = f"""-- {DDL_FILE_HEADER} +-- Time: {{}} + """ @@ -64,37 +70,52 @@ def validate_ddl_file(ddl_file: str): def normalise_ddl_file( - options, config: "OrchestrationConfig", messages: "OffloadMessages" + offload_operation: "OffloadOperation", + config: "OrchestrationConfig", + messages: "OffloadMessages", ): """Validates path pointed to by ddl_file and generates a new path if AUTO.
Mutates options.""" - if options.ddl_file: - options.ddl_file = options.ddl_file.strip() + if offload_operation.ddl_file: + offload_operation.ddl_file = offload_operation.ddl_file.strip() else: - return options.ddl_file + return offload_operation.ddl_file - if options.execute and options.ddl_file: + if offload_operation.execute and offload_operation.ddl_file: messages.notice(offload_constants.DDL_FILE_EXECUTE_MESSAGE_TEXT) - options.execute = False + offload_operation.execute = False - if options.ddl_file.upper() == offload_constants.DDL_FILE_AUTO: + if offload_operation.ddl_file.upper() == offload_constants.DDL_FILE_AUTO: # Use an auto-generated path. - options.ddl_file = generate_ddl_file_path( - options.owner, options.table_name, config + offload_operation.ddl_file = generate_ddl_file_path( + offload_operation.owner, offload_operation.table_name, config ) return - validate_ddl_file(options.ddl_file) + validate_ddl_file(offload_operation.ddl_file) -def write_ddl_to_ddl_file(ddl_file: str, ddl: list): +def write_ddl_to_ddl_file( + ddl_file: str, + ddl: list[str], + config: "OrchestrationConfig", + messages: "OffloadMessages", +): """Take a list of DDL strings and write them to a file""" assert ddl_file ddl_str = "\n".join(ddl) - ddl_file_contents = f"{DDL_FILE_HEADER}\n\n{ddl_str}" + header = DDL_FILE_HEADER_TEMPLATE.format( + datetime.datetime.now().replace(microsecond=0).isoformat() + ) + ddl_file_contents = f"{header}\n{ddl_str}" if ":" in ddl_file: # Cloud storage. - pass + # dry_run=False below because, even in preview mode we need to write the file. + dfs_client = get_dfs_from_options(config, messages, dry_run=False) + if dfs_client.stat(ddl_file): + raise OffloadOptionError(f"DDL path already exists: {ddl_file}") + dfs_client.write(ddl_file, ddl_file_contents) else: # Local filesystem. with open(ddl_file, "w") as f: f.write(ddl_file_contents) + messages.notice(offload_constants.DDL_FILE_WRITE_MESSAGE_TEMPLATE.format(ddl_file)) diff --git a/src/goe/util/misc_functions.py b/src/goe/util/misc_functions.py index 6b62c0d9..cd83b23c 100644 --- a/src/goe/util/misc_functions.py +++ b/src/goe/util/misc_functions.py @@ -23,7 +23,7 @@ import inspect import logging import math -import os, os.path +import os import random import re import string @@ -144,7 +144,7 @@ def parse_python_from_string(value): # datetime ? 
try: value = parser.parse(value) - except ValueError as e: + except ValueError: pass return value @@ -216,7 +216,7 @@ def is_number(s): try: float(s) return True - except (ValueError, TypeError) as e: + except (ValueError, TypeError): return False @@ -535,12 +535,12 @@ def human_size_to_bytes(size, binary_sizes=True): # not used tempfile.mkstemp for get_temp_path/write_temp_file as sometimes # want to generate a file name to use remotely or fully control the random section -def get_temp_path(tmp_dir="/tmp", prefix="goe_tmp_", suffix=""): +def get_temp_path(tmp_dir: str = "/tmp", prefix: str = "goe_tmp_", suffix: str = ""): suffix_str = (".%s" % suffix.lstrip(".")) if suffix else "" - return "%s/%s%s%s" % (tmp_dir, prefix, str(uuid.uuid4()), suffix_str) + return os.path.join(tmp_dir, "%s%s%s" % (prefix, str(uuid.uuid4()), suffix_str)) -def write_temp_file(data, prefix="goe_tmp_", suffix=""): +def write_temp_file(data, prefix: str = "goe_tmp_", suffix: str = ""): """Writes some data to a temporary file and returns the path to the file""" tmp_path = get_temp_path(prefix=prefix, suffix=suffix) with open(tmp_path, "w") as fh: diff --git a/tests/integration/scenarios/test_ddl_file.py b/tests/integration/scenarios/test_ddl_file.py index 9e8f0571..d82aa263 100644 --- a/tests/integration/scenarios/test_ddl_file.py +++ b/tests/integration/scenarios/test_ddl_file.py @@ -16,11 +16,13 @@ import pytest +from goe.filesystem.goe_dfs_factory import get_dfs_from_options from goe.offload import offload_constants from goe.offload.offload_functions import ( convert_backend_identifier_case, data_db_name, ) +from goe.orchestration import command_steps from goe.util.misc_functions import get_temp_path from tests.integration.scenarios.assertion_functions import ( @@ -45,8 +47,10 @@ ) -TEST_TABLE1 = "DDL_FILE_DIM1" -TEST_TABLE2 = "DDL_FILE_DIM2" +TEST_TABLE_LFS_1 = "DDL_FILE_DIM_LFS_1" +TEST_TABLE_LFS_2 = "DDL_FILE_DIM_LFS_2" +TEST_TABLE_CS_1 = "DDL_FILE_DIM_CS_1" +TEST_TABLE_CS_2 = "DDL_FILE_DIM_CS_2" @pytest.fixture @@ -66,28 +70,36 @@ def data_db(schema, config): return data_db -def test_ddl_file_new_table_local_fs(config, schema, data_db): - """Test requesting a DDL file to local FS for a new table.""" - id = "test_ddl_file_new_table_local_fs" - messages = get_test_messages(config, id) - backend_api = get_backend_testing_api(config, messages) - frontend_api = get_frontend_testing_api(config, messages) - test_table = TEST_TABLE1 - - # Setup - run_setup( - frontend_api, - backend_api, - config, - messages, - frontend_sqls=frontend_api.standard_dimension_frontend_ddl(schema, test_table), - python_fns=lambda: drop_backend_test_table( - config, backend_api, messages, data_db, test_table - ), +def step_assertions(offload_messages): + """Check that we didn't run Offload steps that come after the DDL file is produced.""" + assert ( + command_steps.step_title(command_steps.STEP_CREATE_TABLE) + in offload_messages.steps ) + # After creating the DDL file Offload should stop, therefore + # we should never see data staged or loaded. 
+ assert ( + command_steps.step_title(command_steps.STEP_STAGING_TRANSPORT) + not in offload_messages.steps + ), f"We ran an offload step that shouldn't be run: {command_steps.step_title(command_steps.STEP_STAGING_TRANSPORT)}" + assert ( + command_steps.step_title(command_steps.STEP_FINAL_LOAD) + not in offload_messages.steps + ), f"We ran an offload step that shouldn't be run: {command_steps.step_title(command_steps.STEP_FINAL_LOAD)}" + +def new_table_ddl_file_tests( + config, + schema: str, + data_db: str, + test_table: str, + ddl_file_prefix: str, + backend_api, + messages, + dfs_client=None, +): # Offload in execute mode requesting a DDL file. - ddl_file = get_temp_path(prefix=id, suffix=".sql") + ddl_file = ddl_file_prefix + "_1.sql" options = { "owner_table": schema + "." + test_table, "reset_backend_table": True, @@ -103,44 +115,45 @@ def test_ddl_file_new_table_local_fs(config, schema, data_db): assert text_in_messages( offload_messages, offload_constants.DDL_FILE_EXECUTE_MESSAGE_TEXT ) - assert os.path.isfile(ddl_file), f"DDL file has not been created: {ddl_file}" + step_assertions(offload_messages) + + if dfs_client: + assert dfs_client.stat(ddl_file), f"DDL file has not been created: {ddl_file}" + else: + assert os.path.isfile(ddl_file), f"DDL file has not been created: {ddl_file}" # Offload in non-execute mode asking for ddl_file. - ddl_file = get_temp_path(prefix=id, suffix=".sql") + ddl_file = ddl_file_prefix + "_2.sql" options = { "owner_table": schema + "." + test_table, "reset_backend_table": True, "ddl_file": ddl_file, "execute": False, } - offload_messages = run_offload(options, config, messages) + run_offload(options, config, messages) assert not backend_table_exists( config, backend_api, messages, data_db, test_table ), f"Backend table for {schema}.{test_table} should not exist" + step_assertions(offload_messages) # Even in non-execture mode we expect to see a DDL file. - assert os.path.isfile(ddl_file), f"DDL file has not been created: {ddl_file}" + if dfs_client: + assert dfs_client.stat(ddl_file), f"DDL file has not been created: {ddl_file}" + else: + assert os.path.isfile(ddl_file), f"DDL file has not been created: {ddl_file}" + # Re-use same file name, should be rejected + run_offload(options, config, messages, expected_exception_string=ddl_file) -def test_ddl_file_existing_table_local_fs(config, schema, data_db): - """Test requesting a DDL file to local FS for a previously offloaded table.""" - id = "test_ddl_file_existing_table_local_fs" - messages = get_test_messages(config, id) - backend_api = get_backend_testing_api(config, messages) - frontend_api = get_frontend_testing_api(config, messages) - test_table = TEST_TABLE2 - - # Setup - run_setup( - frontend_api, - backend_api, - config, - messages, - frontend_sqls=frontend_api.standard_dimension_frontend_ddl(schema, test_table), - python_fns=lambda: drop_backend_test_table( - config, backend_api, messages, data_db, test_table - ), - ) +def exsting_table_ddl_file_tests( + config, + schema: str, + data_db: str, + test_table: str, + ddl_file_prefix: str, + backend_api, + messages, +): # First offload the table options = { "owner_table": schema + "." + test_table, @@ -151,7 +164,7 @@ def test_ddl_file_existing_table_local_fs(config, schema, data_db): run_offload(options, config, messages) # Now request a DDL file, in execute mode. - ddl_file = get_temp_path(prefix=id, suffix=".sql") + ddl_file = ddl_file_prefix + "_1.sql" options = { "owner_table": schema + "." 
+ test_table, "ddl_file": ddl_file, @@ -165,10 +178,11 @@ def test_ddl_file_existing_table_local_fs(config, schema, data_db): assert text_in_messages( offload_messages, offload_constants.DDL_FILE_EXECUTE_MESSAGE_TEXT ) + step_assertions(offload_messages) assert os.path.isfile(ddl_file), f"DDL file has not been created: {ddl_file}" # Request a DDL file, in non-execute mode. - ddl_file = get_temp_path(prefix=id, suffix=".sql") + ddl_file = ddl_file_prefix + "_2.sql" options = { "owner_table": schema + "." + test_table, "ddl_file": ddl_file, @@ -179,10 +193,102 @@ def test_ddl_file_existing_table_local_fs(config, schema, data_db): assert backend_table_exists( config, backend_api, messages, data_db, test_table ), f"Backend table for {schema}.{test_table} should exist" - assert text_in_messages( - offload_messages, offload_constants.DDL_FILE_EXECUTE_MESSAGE_TEXT - ) + step_assertions(offload_messages) assert os.path.isfile(ddl_file), f"DDL file has not been created: {ddl_file}" -# TODO Cloud storage +def test_ddl_file_new_table_local_fs(config, schema, data_db): + """Test requesting a DDL file to local FS for a new table.""" + id = "test_ddl_file_new_table_local_fs" + messages = get_test_messages(config, id) + backend_api = get_backend_testing_api(config, messages) + frontend_api = get_frontend_testing_api(config, messages) + test_table = TEST_TABLE_LFS_1 + + # Setup + run_setup( + frontend_api, + backend_api, + config, + messages, + frontend_sqls=frontend_api.standard_dimension_frontend_ddl(schema, test_table), + python_fns=lambda: drop_backend_test_table( + config, backend_api, messages, data_db, test_table + ), + ) + + ddl_file_prefix = get_temp_path(prefix=id) + new_table_ddl_file_tests( + config, schema, data_db, test_table, ddl_file_prefix, backend_api, messages + ) + + +def test_ddl_file_existing_table_local_fs(config, schema, data_db): + """Test requesting a DDL file to local FS for a previously offloaded table.""" + id = "test_ddl_file_existing_table_local_fs" + messages = get_test_messages(config, id) + backend_api = get_backend_testing_api(config, messages) + frontend_api = get_frontend_testing_api(config, messages) + test_table = TEST_TABLE_LFS_2 + + # Setup + run_setup( + frontend_api, + backend_api, + config, + messages, + frontend_sqls=frontend_api.standard_dimension_frontend_ddl(schema, test_table), + python_fns=lambda: drop_backend_test_table( + config, backend_api, messages, data_db, test_table + ), + ) + + ddl_file_prefix = get_temp_path(prefix=id) + exsting_table_ddl_file_tests( + config, schema, data_db, test_table, ddl_file_prefix, backend_api, messages + ) + + +def test_ddl_file_new_table_cloud_storage(config, schema, data_db): + """Test requesting a DDL file to cloud storage for a new table.""" + id = "test_ddl_file_new_table_cloud_storage" + messages = get_test_messages(config, id) + + if not config.offload_fs_container: + messages.log(f"Skipping {id} when OFFLOAD_FS_CONTAINER is empty") + pytest.skip(f"Skipping {id} when OFFLOAD_FS_CONTAINER is empty") + + backend_api = get_backend_testing_api(config, messages) + frontend_api = get_frontend_testing_api(config, messages) + test_table = TEST_TABLE_CS_1 + + # Setup + run_setup( + frontend_api, + backend_api, + config, + messages, + frontend_sqls=frontend_api.standard_dimension_frontend_ddl(schema, test_table), + python_fns=lambda: drop_backend_test_table( + config, backend_api, messages, data_db, test_table + ), + ) + + dfs_client = get_dfs_from_options(config, messages) + bucket_path = dfs_client.gen_uri( + 
config.offload_fs_scheme, + config.offload_fs_container, + config.offload_fs_prefix, + ) + + ddl_file_prefix = get_temp_path(tmp_dir=bucket_path, prefix=id) + new_table_ddl_file_tests( + config, + schema, + data_db, + test_table, + ddl_file_prefix, + backend_api, + messages, + dfs_client=dfs_client, + ) diff --git a/tests/integration/scenarios/test_offload_data.py b/tests/integration/scenarios/test_offload_data.py index 99a061a5..be7eca1f 100644 --- a/tests/integration/scenarios/test_offload_data.py +++ b/tests/integration/scenarios/test_offload_data.py @@ -570,7 +570,7 @@ def test_offload_data_partition_by_nanosecond(config, schema, data_db): if not frontend_api.nanoseconds_supported(): messages.log(f"Skipping {id} on frontend system") - pytest.skip(f"Skipping {id} on rontend system") + pytest.skip(f"Skipping {id} on frontend system") backend_api = get_backend_testing_api(config, messages) repo_client = orchestration_repo_client_factory( diff --git a/tests/unit/offload/operation/test_ddl_file.py b/tests/unit/offload/operation/test_ddl_file.py index 6c798ed6..ce3fab3e 100644 --- a/tests/unit/offload/operation/test_ddl_file.py +++ b/tests/unit/offload/operation/test_ddl_file.py @@ -13,12 +13,14 @@ # limitations under the License. from typing import TYPE_CHECKING +from unittest import mock import pytest from goe.offload import offload_constants from goe.offload.operation import ddl_file as module_under_test from goe.offload.offload_messages import OffloadMessages +from goe.util.misc_functions import get_temp_path from tests.unit.test_functions import ( build_mock_offload_operation, @@ -31,7 +33,7 @@ @pytest.fixture(scope="module") -def config(): +def config() -> "OrchestrationConfig": return build_mock_options(FAKE_ORACLE_BQ_ENV) @@ -89,3 +91,33 @@ def test_normalise_ddl_file_path( else: # No exception expected. _ = module_under_test.normalise_ddl_file(fake_operation, config, fake_messages) + + +@pytest.mark.parametrize( + "ddl_list", + [ + [ + "CREATE TABLE foo (bar INT);", + ], + [ + "DROP TABLE foo;", + "CREATE TABLE foo (bar INT);", + ], + [], + ], +) +def test_write_ddl_to_ddl_file(ddl_list: list, config): + fake_messages = OffloadMessages() + ddl_file = get_temp_path(prefix="test_write_ddl_to_ddl_file", suffix=".sql") + m = mock.mock_open() + with mock.patch("goe.offload.operation.ddl_file.open", m): + module_under_test.write_ddl_to_ddl_file( + ddl_file, ddl_list, config, fake_messages + ) + fh = m() + assert fh.write.mock_calls + write_arg = fh.write.mock_calls[0].args[0] + # Check the header is included in the write call. + assert module_under_test.DDL_FILE_HEADER in write_arg + # Check all lines of the DDL are in the write call. 
+ assert all(_ in write_arg for _ in ddl_list) From 0fd66662558931a9db70d7083c642a74c51376c7 Mon Sep 17 00:00:00 2001 From: nj1973 Date: Thu, 11 Apr 2024 11:51:24 +0000 Subject: [PATCH 06/28] feat: Add --ddl-file option --- src/goe/offload/operation/ddl_file.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/goe/offload/operation/ddl_file.py b/src/goe/offload/operation/ddl_file.py index 93bed0eb..58145f95 100644 --- a/src/goe/offload/operation/ddl_file.py +++ b/src/goe/offload/operation/ddl_file.py @@ -96,7 +96,7 @@ def normalise_ddl_file( def write_ddl_to_ddl_file( ddl_file: str, - ddl: list[str], + ddl: list, config: "OrchestrationConfig", messages: "OffloadMessages", ): From 92d44c6726bebdafc6895714cc835282cc62282d Mon Sep 17 00:00:00 2001 From: nj1973 Date: Fri, 12 Apr 2024 08:01:58 +0000 Subject: [PATCH 07/28] feat: Add --ddl-file option --- src/goe/goe.py | 36 ++++++++++--------------- src/goe/offload/offload.py | 37 ++++++++++++++------------ src/goe/orchestration/command_steps.py | 2 ++ 3 files changed, 36 insertions(+), 39 deletions(-) diff --git a/src/goe/goe.py b/src/goe/goe.py index 063445cf..c788d4bd 100644 --- a/src/goe/goe.py +++ b/src/goe/goe.py @@ -95,6 +95,7 @@ active_data_append_options, check_ipa_predicate_type_option_conflicts, check_table_structure, + create_ddl_file_step, create_final_backend_table_step, drop_backend_table_step, get_current_offload_hv, @@ -411,10 +412,6 @@ def get_db_charset(opts): ) -def next_power_of_two(x): - return int(2 ** (math.ceil(math.log(x, 2)))) - - def nls_lang_exists(): return os.environ.get("NLS_LANG") @@ -491,20 +488,10 @@ def get_offload_type( def log_timestamp(ansi_code="grey"): - if options and options.execute: - global ts - ts = datetime.now() - ts = ts.replace(microsecond=0) - log(ts.strftime("%c"), detail=verbose, ansi_code=ansi_code) - - -def log_timedelta(ansi_code="grey", hybrid_options=None): - use_opts = hybrid_options or options - if use_opts.execute: - ts2 = datetime.now() - ts2 = ts2.replace(microsecond=0) - log("Step time: %s" % (ts2 - ts), detail=verbose, ansi_code=ansi_code) - return ts2 - ts + global ts + ts = datetime.now() + ts = ts.replace(microsecond=0) + log(ts.strftime("%c"), detail=verbose, ansi_code=ansi_code) # TODO Should really be named oracle_adm_connection @@ -2830,6 +2817,14 @@ def offload_table( data_gov_client=data_gov_client, ) + if offload_operation.ddl_file: + # For DDL file creation we need to drop out early, before we concern + # ourselves with database creation or table drop commands. 
+ create_ddl_file_step( + offload_target_table, offload_operation, offload_options, messages + ) + return True + if offload_operation.create_backend_db: offload_target_table.create_backend_db_step() @@ -2853,10 +2848,7 @@ def offload_table( return_none_on_failure=True ) - if not create_final_backend_table_step( - offload_target_table, offload_operation, offload_options, messages - ): - return True + create_final_backend_table_step(offload_target_table, offload_operation) data_transport_client = offload_transport_factory( offload_operation.offload_transport_method, diff --git a/src/goe/offload/offload.py b/src/goe/offload/offload.py index 0b3508b9..ea7f43c4 100644 --- a/src/goe/offload/offload.py +++ b/src/goe/offload/offload.py @@ -191,30 +191,33 @@ def check_table_structure(frontend_table, backend_table, messages: OffloadMessag ) -def create_final_backend_table_step( +def create_ddl_file_step( offload_target_table: "BackendTableInterface", offload_operation: "OffloadOperation", config: "OrchestrationConfig", messages: OffloadMessages, - goe_object_type=DATA_GOVERNANCE_GOE_OBJECT_TYPE_BASE_TABLE, -) -> bool: - """Create the final backend table. +): + """Create a DDL file for the final backend table.""" + if not offload_operation.ddl_file: + return - Returns: - A boolean for "do not exit early" - """ - if ( - not offload_target_table.table_exists() - or offload_operation.reset_backend_table - or offload_operation.ddl_file - ): - ddl = offload_target_table.create_backend_table_step(goe_object_type) - if offload_operation.ddl_file: - write_ddl_to_ddl_file(offload_operation.ddl_file, ddl, config, messages) - return False + def step_fn(): + ddl = offload_target_table.create_backend_table() + write_ddl_to_ddl_file(offload_operation.ddl_file, ddl, config, messages) + + messages.offload_step(command_steps.STEP_DDL_FILE, step_fn, execute=False) + + +def create_final_backend_table_step( + offload_target_table: "BackendTableInterface", + offload_operation: "OffloadOperation", + goe_object_type=DATA_GOVERNANCE_GOE_OBJECT_TYPE_BASE_TABLE, +): + """Create the final backend table.""" + if not offload_target_table.table_exists() or offload_operation.reset_backend_table: + offload_target_table.create_backend_table_step(goe_object_type) else: check_and_alter_backend_sort_columns(offload_target_table, offload_operation) - return True def drop_backend_table_step( diff --git a/src/goe/orchestration/command_steps.py b/src/goe/orchestration/command_steps.py index e1a46e21..3710d7a6 100644 --- a/src/goe/orchestration/command_steps.py +++ b/src/goe/orchestration/command_steps.py @@ -35,6 +35,7 @@ class CommandStepsException(Exception): STEP_COPY_STATS_TO_BACKEND = "COPY_STATS_TO_BACKEND" STEP_CREATE_DB = "CREATE_DB" STEP_CREATE_TABLE = "CREATE_TABLE" +STEP_DDL_FILE = "DDL_FILE" STEP_DROP_TABLE = "DROP_TABLE" STEP_FINAL_LOAD = "FINAL_LOAD" STEP_FIND_OFFLOAD_DATA = "FIND_OFFLOAD_DATA" @@ -75,6 +76,7 @@ class CommandStepsException(Exception): STEP_COPY_STATS_TO_BACKEND: "Copy RDBMS stats to Backend", STEP_CREATE_DB: "Create backend database", STEP_CREATE_TABLE: "Create backend table", + STEP_DDL_FILE: "Create DDL file", STEP_DROP_TABLE: "Drop backend table", STEP_FINAL_LOAD: "Load staged data", STEP_FIND_OFFLOAD_DATA: "Find data to offload", From 0470bda5e302ed079bee9bf8fd1ed2f0c1ac2e1b Mon Sep 17 00:00:00 2001 From: nj1973 Date: Mon, 15 Apr 2024 16:45:18 +0000 Subject: [PATCH 08/28] feat: Decouple table creation and data loading --- src/goe/goe.py | 66 +++-- src/goe/offload/backend_api.py | 8 +- 
src/goe/offload/backend_table.py | 19 +- .../offload/bigquery/bigquery_backend_api.py | 8 +- .../bigquery/bigquery_backend_table.py | 12 +- src/goe/offload/frontend_api.py | 4 + src/goe/offload/hadoop/hadoop_backend_api.py | 8 +- .../offload/hadoop/hadoop_backend_table.py | 12 +- .../offload/microsoft/mssql_frontend_api.py | 1 + .../mssql_offload_transport_rdbms_api.py | 1 - .../offload/microsoft/synapse_backend_api.py | 40 +-- .../offload/netezza/netezza_frontend_api.py | 1 + .../netezza_offload_transport_rdbms_api.py | 1 - src/goe/offload/offload.py | 92 ++----- src/goe/offload/offload_source_data.py | 6 +- src/goe/offload/option_validation.py | 72 +++++- src/goe/offload/oracle/oracle_frontend_api.py | 7 +- .../oracle_offload_transport_rdbms_api.py | 1 - .../snowflake/snowflake_backend_api.py | 8 +- .../spark/dataproc_offload_transport.py | 17 +- .../offload/teradata/teradata_frontend_api.py | 1 + .../teradata_offload_transport_rdbms_api.py | 1 - .../oracle_orchestration_repo_client.py | 2 + .../integration/offload/test_backend_table.py | 4 + .../integration/scenarios/setup_functions.py | 70 ++++-- tests/integration/scenarios/test_ddl_file.py | 9 + .../scenarios/test_offload_basic.py | 232 ++++++++++++++---- .../test_framework/backend_testing_api.py | 1 - .../testlib/test_framework/test_functions.py | 140 ----------- tests/unit/offload/test_backend_api.py | 7 + 30 files changed, 507 insertions(+), 344 deletions(-) diff --git a/src/goe/goe.py b/src/goe/goe.py index c788d4bd..f2993002 100644 --- a/src/goe/goe.py +++ b/src/goe/goe.py @@ -14,13 +14,11 @@ import os import sys - from copy import copy from datetime import datetime, timedelta import json import logging import os.path -import math from optparse import OptionParser, Option, OptionValueError, SUPPRESS_HELP import re import traceback @@ -93,7 +91,6 @@ ) from goe.offload.offload import ( active_data_append_options, - check_ipa_predicate_type_option_conflicts, check_table_structure, create_ddl_file_step, create_final_backend_table_step, @@ -114,6 +111,7 @@ ) from goe.offload.option_validation import ( check_opt_is_posint, + check_ipa_predicate_type_option_conflicts, normalise_data_sampling_options, normalise_offload_predicate_options, normalise_stats_options, @@ -149,8 +147,9 @@ from goe.util.redis_tools import RedisClient if TYPE_CHECKING: - from goe.offload.backend_table import BackendTableInterface from goe.config.orchestration_config import OrchestrationConfig + from goe.offload.backend_table import BackendTableInterface + from goe.offload.offload_source_data import OffloadSourceDataInterface dev_logger = logging.getLogger("goe") @@ -650,12 +649,12 @@ def verify_row_count_by_aggs( def offload_data_verification( - offload_source_table, - offload_target_table, + offload_source_table: OffloadSourceTableInterface, + offload_target_table: "BackendTableInterface", offload_operation, - offload_options, - messages, - source_data_client, + offload_options: "OrchestrationConfig", + messages: OffloadMessages, + source_data_client: "OffloadSourceDataInterface", ): """Verify offloaded data by either rowcount or sampling aggregation functions. Boundary conditions used to verify only those partitions offloaded by the current operation. 
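The hunks below change how Offload treats a backend table that already exists: a table with rows but without offload metadata still aborts the command, while a pre-created empty table (for example one built ahead of time from a generated DDL file) may now be populated. A minimal sketch of that decision flow, using plain booleans in place of the real BackendTableInterface and metadata lookups shown in the diff; the names here are illustrative, not part of the codebase:

class MissingMetadataError(Exception):
    """Stand-in for the OffloadException built from MISSING_METADATA_EXCEPTION_TEMPLATE."""


def route_for_existing_backend_table(exists: bool, has_rows: bool, has_metadata: bool) -> str:
    """Return the route an offload takes for its target backend table."""
    if not exists:
        return "create table and load"
    if has_metadata:
        return "use existing metadata (incremental append)"
    if has_rows:
        # Rows but no metadata looks like a broken earlier offload: abort.
        raise MissingMetadataError("backend table has rows but no offload metadata")
    # Empty, pre-created table: continue and let this offload populate it.
    return "populate existing empty table"


assert route_for_existing_backend_table(True, False, False) == "populate existing empty table"
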
@@ -1651,7 +1650,11 @@ def defaults_for_fresh_offload(
             offload_target_table.bucket_hash_column_supported(),
         )
 
-    def defaults_for_existing_table(self, messages):
+    def defaults_for_existing_table(
+        self,
+        offload_target_table: "BackendTableInterface",
+        messages: OffloadMessages,
+    ):
         """Default bucket hash column and datatype mappings from existing table
 
         This is required for setting up a pre-existing table and is used by incremental partition append.
@@ -1660,10 +1663,18 @@ def defaults_for_existing_table(self, messages):
         existing_metadata = self.get_hybrid_metadata()
 
         if not existing_metadata:
-            raise OffloadException(
-                offload_constants.MISSING_METADATA_EXCEPTION_TEMPLATE
-                % (self.owner, self.table_name)
+            if offload_target_table.has_rows():
+                # If the table has rows but no metadata then we need to abort.
+                raise OffloadException(
+                    offload_constants.MISSING_METADATA_EXCEPTION_TEMPLATE
+                    % (self.owner, self.table_name)
+                )
+            # If the table is empty then we allow the offload to continue.
+            messages.log(
+                f"Allowing Offload to populate existing empty table: {offload_target_table.db_name}.{offload_target_table.table_name}",
+                detail=VERBOSE,
             )
+            return None
 
         self.set_bucket_info_from_metadata(existing_metadata, messages)
 
@@ -2518,14 +2529,14 @@ def canonical_to_rdbms_mappings(
 
 
 def offload_operation_logic(
-    offload_operation,
-    offload_source_table,
-    offload_target_table,
-    offload_options,
-    source_data_client,
+    offload_operation: OffloadOperation,
+    offload_source_table: OffloadSourceTableInterface,
+    offload_target_table: "BackendTableInterface",
+    offload_options: "OrchestrationConfig",
+    source_data_client: "OffloadSourceDataInterface",
     existing_metadata,
-    messages,
-):
+    messages: OffloadMessages,
+) -> bool:
     """Logic defining what will be offloaded and what the final objects will look like
     There's a lot goes on in here but one key item to note is there are 2 distinct routes through:
     1) The table either is new or is being reset, we take on board lots of options to define the final table
@@ -2640,7 +2651,11 @@ def offload_operation_logic(
             messages=messages,
         )
 
-    if offload_target_table.exists() and not offload_operation.reset_backend_table:
+    if (
+        offload_target_table.exists()
+        and offload_target_table.has_rows()
+        and not offload_operation.reset_backend_table
+    ):
         if incr_append_capable:
             if source_data_client.nothing_to_offload():
                 return False
@@ -2733,8 +2748,11 @@ def offload_table(
 
     existing_metadata = None
     if offload_target_table.exists() and not offload_operation.reset_backend_table:
-        # We need to pickup defaults for an existing table here, BEFORE we start looking for data to offload (get_offload_data_manager())
-        existing_metadata = offload_operation.defaults_for_existing_table(messages)
+        # We need to pick up defaults for an existing table here,
+        # BEFORE we start looking for data to offload (get_offload_data_manager()).
+ existing_metadata = offload_operation.defaults_for_existing_table( + offload_target_table, messages + ) check_table_structure(offload_source_table, offload_target_table, messages) offload_target_table.refresh_operational_settings( offload_operation, rdbms_columns=offload_source_table.columns @@ -2757,7 +2775,7 @@ def offload_table( ) ) - source_data_client = messages.offload_step( + source_data_client: "OffloadSourceDataInterface" = messages.offload_step( command_steps.STEP_FIND_OFFLOAD_DATA, lambda: get_offload_data_manager( offload_source_table, diff --git a/src/goe/offload/backend_api.py b/src/goe/offload/backend_api.py index fb8b457d..62598b12 100644 --- a/src/goe/offload/backend_api.py +++ b/src/goe/offload/backend_api.py @@ -1849,8 +1849,12 @@ def table_distribution(self, db_name, table_name): pass @abstractmethod - def table_exists(self, db_name, table_name): - pass + def table_exists(self, db_name: str, table_name: str) -> bool: + """Return bool depending whether the table exists or not.""" + + @abstractmethod + def table_has_rows(self, db_name: str, table_name: str) -> bool: + """Return bool depending whether the table has rows or not.""" @abstractmethod def target_version(self): diff --git a/src/goe/offload/backend_table.py b/src/goe/offload/backend_table.py index fe9123cb..a1381c9b 100644 --- a/src/goe/offload/backend_table.py +++ b/src/goe/offload/backend_table.py @@ -26,7 +26,7 @@ import collections import inspect import logging -from typing import Callable, Optional +from typing import Callable, Optional, TYPE_CHECKING from goe.data_governance.hadoop_data_governance import ( data_governance_register_new_db_step, @@ -70,6 +70,9 @@ from goe.offload.hadoop.hadoop_column import HADOOP_TYPE_STRING from goe.util.misc_functions import csv_split +if TYPE_CHECKING: + from goe.config.orchestration_config import OrchestrationConfig + class BackendTableException(Exception): pass @@ -128,10 +131,10 @@ class BackendTableInterface(metaclass=ABCMeta): def __init__( self, - db_name, - table_name, - backend_type, - orchestration_options, + db_name: str, + table_name: str, + backend_type: str, + orchestration_options: "OrchestrationConfig", messages, orchestration_operation=None, hybrid_metadata=None, @@ -208,6 +211,7 @@ def __init__( # Cache some attributes in state self._columns = None self._partition_columns = None + self._has_rows = None self._log_profile_after_final_table_load = None self._log_profile_after_verification_queries = None @@ -2093,6 +2097,11 @@ def get_verification_cast(self, column): column_name = column.upper() return self._final_table_casts[column_name]["verify_cast"] + def has_rows(self): + if self._has_rows is None: + self._has_rows = self._db_api.table_has_rows(self.db_name, self.table_name) + return self._has_rows + def identifier_contains_invalid_characters(self, identifier): return self._db_api.identifier_contains_invalid_characters(identifier) diff --git a/src/goe/offload/bigquery/bigquery_backend_api.py b/src/goe/offload/bigquery/bigquery_backend_api.py index 50fba6ea..a35a7c35 100644 --- a/src/goe/offload/bigquery/bigquery_backend_api.py +++ b/src/goe/offload/bigquery/bigquery_backend_api.py @@ -2191,9 +2191,15 @@ def synthetic_partition_numbers_are_string(self): def table_distribution(self, db_name, table_name): return None - def table_exists(self, db_name, table_name): + def table_exists(self, db_name: str, table_name: str) -> bool: return self._object_exists(db_name, table_name, table_type="TABLE") + def table_has_rows(self, db_name: str, table_name: str) -> 
bool: + """Return bool depending whether the table has rows or not.""" + sql = f"SELECT 1 FROM {self.enclose_object_reference(db_name, table_name)} LIMIT 1" + row = self.execute_query_fetch_one(sql, log_level=VVERBOSE) + return bool(row) + def target_version(self): """No version available via SQL or API for BigQuery""" return None diff --git a/src/goe/offload/bigquery/bigquery_backend_table.py b/src/goe/offload/bigquery/bigquery_backend_table.py index 7422687c..9059418a 100644 --- a/src/goe/offload/bigquery/bigquery_backend_table.py +++ b/src/goe/offload/bigquery/bigquery_backend_table.py @@ -21,6 +21,7 @@ """ import logging +from typing import TYPE_CHECKING from google.cloud import bigquery @@ -55,6 +56,9 @@ PARQUET_TYPE_INT64, ) +if TYPE_CHECKING: + from goe.config.orchestration_config import OrchestrationConfig + ############################################################################### # CONSTANTS @@ -75,10 +79,10 @@ class BackendBigQueryTable(BackendTableInterface): def __init__( self, - db_name, - table_name, - backend_type, - orchestration_options, + db_name: str, + table_name: str, + backend_type: str, + orchestration_options: "OrchestrationConfig", messages, orchestration_operation=None, hybrid_metadata=None, diff --git a/src/goe/offload/frontend_api.py b/src/goe/offload/frontend_api.py index f43fb7ef..c771d090 100644 --- a/src/goe/offload/frontend_api.py +++ b/src/goe/offload/frontend_api.py @@ -357,6 +357,7 @@ def enclose_query_hints(self, hint_contents): @abstractmethod def enclosure_character(self): """Correct character for the RDBMS for enclose_identifier(). + Superceded by enclose_identifier(), ideally we shouldn't use enclosure_character() any longer. """ @@ -369,8 +370,10 @@ def execute_function( arg_list=None, log_level=VERBOSE, not_when_dry_running=False, + commit=False, ): """Execute a function in the frontend system and return the result. + Parameters are based on Oracle functionality so may need to change as new frontends are implemented. not_when_dry_running: Because we don't know if a frontend function is read-only or read/write we assume we should run the function even in read-only mode. If the function is making changes @@ -380,6 +383,7 @@ def execute_function( @abstractmethod def format_query_parameter(self, param_name): """For engines that support query parameters this method prefixes or suffixes a parameter name. + For example on Oracle it prefixes a ":" symbol. 
""" diff --git a/src/goe/offload/hadoop/hadoop_backend_api.py b/src/goe/offload/hadoop/hadoop_backend_api.py index 8d3d834a..ec43e5fe 100644 --- a/src/goe/offload/hadoop/hadoop_backend_api.py +++ b/src/goe/offload/hadoop/hadoop_backend_api.py @@ -1619,9 +1619,15 @@ def synthetic_partition_numbers_are_string(self): def table_distribution(self, db_name, table_name): return None - def table_exists(self, db_name, table_name): + def table_exists(self, db_name: str, table_name: str) -> bool: return self.exists(db_name, table_name) + def table_has_rows(self, db_name: str, table_name: str) -> bool: + """Return bool depending whether the table has rows or not.""" + sql = f"SELECT 1 FROM {self.enclose_object_reference(db_name, table_name)} LIMIT 1" + row = self.execute_query_fetch_one(sql, log_level=VVERBOSE) + return bool(row) + def target_version(self): if self._target_version is None: if self._hive_conn is None: diff --git a/src/goe/offload/hadoop/hadoop_backend_table.py b/src/goe/offload/hadoop/hadoop_backend_table.py index 383faf00..d64e7972 100644 --- a/src/goe/offload/hadoop/hadoop_backend_table.py +++ b/src/goe/offload/hadoop/hadoop_backend_table.py @@ -24,6 +24,7 @@ import logging import os +from typing import TYPE_CHECKING from goe.data_governance.hadoop_data_governance import ( data_governance_register_new_table_step, @@ -69,6 +70,9 @@ PARQUET_TYPE_INT64, ) +if TYPE_CHECKING: + from goe.config.orchestration_config import OrchestrationConfig + ############################################################################### # CONSTANTS @@ -99,10 +103,10 @@ class BackendHadoopTable(BackendTableInterface): def __init__( self, - db_name, - table_name, - backend_type, - orchestration_options, + db_name: str, + table_name: str, + backend_type: str, + orchestration_options: "OrchestrationConfig", messages, orchestration_operation=None, hybrid_metadata=None, diff --git a/src/goe/offload/microsoft/mssql_frontend_api.py b/src/goe/offload/microsoft/mssql_frontend_api.py index 5628164c..e0d2326a 100644 --- a/src/goe/offload/microsoft/mssql_frontend_api.py +++ b/src/goe/offload/microsoft/mssql_frontend_api.py @@ -407,6 +407,7 @@ def execute_function( arg_list=None, log_level=VERBOSE, not_when_dry_running=False, + commit=False, ): raise NotImplementedError("MSSQL execute_function not implemented.") diff --git a/src/goe/offload/microsoft/mssql_offload_transport_rdbms_api.py b/src/goe/offload/microsoft/mssql_offload_transport_rdbms_api.py index 1b4cfaf7..d0f3c72a 100644 --- a/src/goe/offload/microsoft/mssql_offload_transport_rdbms_api.py +++ b/src/goe/offload/microsoft/mssql_offload_transport_rdbms_api.py @@ -82,7 +82,6 @@ def get_rdbms_query_cast( nan_values_as_null=False, ): """Returns an expression suitable for reading a specific column from the RDBMS table""" - self.debug("get_rdbms_query_cast()") raise NotImplementedError("MSSQL get_rdbms_query_cast() pending implementation") def get_rdbms_session_setup_commands( diff --git a/src/goe/offload/microsoft/synapse_backend_api.py b/src/goe/offload/microsoft/synapse_backend_api.py index 86c723ea..62490866 100644 --- a/src/goe/offload/microsoft/synapse_backend_api.py +++ b/src/goe/offload/microsoft/synapse_backend_api.py @@ -1910,24 +1910,30 @@ def get_table_stats(self, db_name, table_name, as_dict=False): stats.extend( [ col_stat_name[0], # name - int( - 1 - / stats_row[ + ( + int( + 1 + / stats_row[ + dbcc_showstatistics_stat_header.density.value + ] + ) + if stats_row[ dbcc_showstatistics_stat_header.density.value ] - ) - if 
stats_row[dbcc_showstatistics_stat_header.density.value] - else 0, # ndv + else 0 + ), # ndv None, # num_nulls - int( - stats_row[ + ( + int( + stats_row[ + dbcc_showstatistics_stat_header.average_key_length.value + ] + ) + if stats_row[ dbcc_showstatistics_stat_header.average_key_length.value ] - ) - if stats_row[ - dbcc_showstatistics_stat_header.average_key_length.value - ] - else 0, # avg_col_len + else 0 + ), # avg_col_len None, # low_value None, # high_value col_stat_name[2], # max_col_len @@ -2393,7 +2399,7 @@ def table_distribution(self, db_name, table_name): ) return row[0] if row else row - def table_exists(self, db_name, table_name): + def table_exists(self, db_name: str, table_name: str) -> bool: sql = dedent( """\ SELECT table_name @@ -2410,6 +2416,12 @@ def table_exists(self, db_name, table_name): ) return bool(row) + def table_has_rows(self, db_name: str, table_name: str) -> bool: + """Return bool depending whether the table has rows or not.""" + sql = f"SELECT 1 FROM {self.enclose_object_reference(db_name, table_name)} LIMIT 1" + row = self.execute_query_fetch_one(sql, log_level=VVERBOSE) + return bool(row) + def target_version(self): """Return version of the backend SQL engine in x.y.z format that can be used by GOEVersion(). This is different to backend_version() even though it appears similar in function. diff --git a/src/goe/offload/netezza/netezza_frontend_api.py b/src/goe/offload/netezza/netezza_frontend_api.py index fe9295c2..35e9f867 100644 --- a/src/goe/offload/netezza/netezza_frontend_api.py +++ b/src/goe/offload/netezza/netezza_frontend_api.py @@ -358,6 +358,7 @@ def execute_function( arg_list=None, log_level=VERBOSE, not_when_dry_running=False, + commit=False, ): raise NotImplementedError("Netezza execute_function not implemented.") diff --git a/src/goe/offload/netezza/netezza_offload_transport_rdbms_api.py b/src/goe/offload/netezza/netezza_offload_transport_rdbms_api.py index 489e7be7..0ac7df97 100644 --- a/src/goe/offload/netezza/netezza_offload_transport_rdbms_api.py +++ b/src/goe/offload/netezza/netezza_offload_transport_rdbms_api.py @@ -83,7 +83,6 @@ def get_rdbms_query_cast( nan_values_as_null=False, ): """Returns an expression suitable for reading a specific column from the RDBMS table""" - self.debug("get_rdbms_query_cast()") raise NotImplementedError( "Netezza get_rdbms_query_cast() pending implementation" ) diff --git a/src/goe/offload/offload.py b/src/goe/offload/offload.py index ea7f43c4..3c59c7a0 100644 --- a/src/goe/offload/offload.py +++ b/src/goe/offload/offload.py @@ -40,7 +40,6 @@ ) from goe.offload.offload_source_data import offload_source_data_factory from goe.offload.offload_source_table import ( - OFFLOAD_PARTITION_TYPE_RANGE, OFFLOAD_PARTITION_TYPE_LIST, ) from goe.offload.offload_transport import VALID_OFFLOAD_TRANSPORT_METHODS @@ -49,15 +48,14 @@ from goe.offload.operation.ddl_file import write_ddl_to_ddl_file from goe.offload.option_validation import ( active_data_append_options, + check_ipa_predicate_type_option_conflicts, check_opt_is_posint, ) from goe.orchestration import command_steps from goe.persistence.orchestration_metadata import ( hwm_column_names_from_predicates, - INCREMENTAL_PREDICATE_TYPE_PREDICATE, INCREMENTAL_PREDICATE_TYPE_LIST, INCREMENTAL_PREDICATE_TYPE_LIST_AS_RANGE, - INCREMENTAL_PREDICATE_TYPE_RANGE, INCREMENTAL_PREDICATE_TYPES_WITH_PREDICATE_IN_HV, ) from goe.util.misc_functions import format_list_for_logging @@ -66,6 +64,8 @@ from goe.config.orchestration_config import OrchestrationConfig from goe.goe import 
OffloadOperation from goe.offload.backend_table import BackendTableInterface + from goe.offload.offload_source_data import OffloadSourceDataInterface + from goe.offload.offload_source_table import OffloadSourceTableInterface from goe.persistence.orchestration_repo_client import ( OrchestrationRepoClientInterface, ) @@ -74,63 +74,6 @@ OFFLOAD_SCHEMA_CHECK_EXCEPTION_TEXT = "Column mismatch detected between the source and backend table. Resolve before offloading" -def check_ipa_predicate_type_option_conflicts( - options, exc_cls=OffloadException, rdbms_table=None -): - ipa_predicate_type = getattr(options, "ipa_predicate_type", None) - active_lpa_opts = active_data_append_options( - options, - partition_type=OFFLOAD_PARTITION_TYPE_LIST, - ignore_partition_names_opt=True, - ) - active_rpa_opts = active_data_append_options( - options, - partition_type=OFFLOAD_PARTITION_TYPE_RANGE, - ignore_partition_names_opt=True, - ) - if ipa_predicate_type in [ - INCREMENTAL_PREDICATE_TYPE_RANGE, - INCREMENTAL_PREDICATE_TYPE_LIST_AS_RANGE, - ]: - if active_lpa_opts: - raise exc_cls( - "LIST %s with %s: %s" - % ( - offload_constants.IPA_PREDICATE_TYPE_FILTER_EXCEPTION_TEXT, - ipa_predicate_type, - ", ".join(active_lpa_opts), - ) - ) - if rdbms_table and active_rpa_opts: - # If we have access to an RDBMS table then we can check if the partition column data types are valid for IPA - unsupported_types = rdbms_table.unsupported_partition_data_types( - partition_type_override=OFFLOAD_PARTITION_TYPE_RANGE - ) - if unsupported_types: - raise exc_cls( - "RANGE %s with partition data types: %s" - % ( - offload_constants.IPA_PREDICATE_TYPE_FILTER_EXCEPTION_TEXT, - ", ".join(unsupported_types), - ) - ) - elif ipa_predicate_type == INCREMENTAL_PREDICATE_TYPE_LIST: - if active_rpa_opts: - raise exc_cls( - "RANGE %s with %s: %s" - % ( - offload_constants.IPA_PREDICATE_TYPE_FILTER_EXCEPTION_TEXT, - ipa_predicate_type, - ", ".join(active_rpa_opts), - ) - ) - elif ipa_predicate_type == INCREMENTAL_PREDICATE_TYPE_PREDICATE: - if not options.offload_predicate: - raise exc_cls( - offload_constants.IPA_PREDICATE_TYPE_REQUIRES_PREDICATE_EXCEPTION_TEXT - ) - - def check_table_structure(frontend_table, backend_table, messages: OffloadMessages): """Compare frontend and backend columns by name and throw an exception if there is a mismatch. 
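check_table_structure() is documented just above as a name-based comparison of frontend and backend columns that raises when they disagree. A rough, self-contained illustration of that idea, using plain column-name strings rather than GOE column objects; the helper name and the case-insensitive match are assumptions of this sketch, not taken from the real implementation:

def column_name_mismatches(frontend_cols, backend_cols):
    """Return the names that are present on only one side of the comparison."""
    frontend = {name.upper() for name in frontend_cols}
    backend = {name.upper() for name in backend_cols}
    return {
        "missing_in_backend": sorted(frontend - backend),
        "missing_in_frontend": sorted(backend - frontend),
    }


mismatches = column_name_mismatches(["ID", "TXN_DATE"], ["id", "txn_desc"])
if any(mismatches.values()):
    # The real function raises an exception built from a formatted report of the differences.
    print(mismatches)
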
@@ -239,8 +182,8 @@ def step_fn(): def get_current_offload_hv( - offload_source_table, - source_data_client, + offload_source_table: "OffloadSourceTableInterface", + source_data_client: "OffloadSourceDataInterface", offload_operation, messages: OffloadMessages, ): @@ -293,7 +236,10 @@ def get_current_offload_hv( def get_prior_offloaded_hv( - rdbms_table, source_data_client, offload_operation, messages: OffloadMessages + rdbms_table: "OffloadSourceTableInterface", + source_data_client: "OffloadSourceDataInterface", + offload_operation, + messages: OffloadMessages, ): """Identifies the HV for a RANGE offload of the partition prior to the offload If there is pre-offload metadata we can use that otherwise we need to go back to the list of partitions @@ -348,16 +294,16 @@ def get_prior_offloaded_hv( def get_offload_data_manager( - offload_source_table, - offload_target_table, - offload_operation, + offload_source_table: "OffloadSourceTableInterface", + offload_target_table: "BackendTableInterface", + offload_operation: "OffloadOperation", offload_options, messages: OffloadMessages, existing_metadata, source_client_type, partition_columns=None, include_col_offload_source_table=False, -): +) -> "OffloadSourceDataInterface": """Return a source data manager object which has methods for slicing and dicing RDBMS partitions and state containing which partitions to offload and data to construct hybrid view/verification predicates """ @@ -368,10 +314,6 @@ def get_offload_data_manager( ): # "not offload_target_table.is_view()" because we pass through here for presented joins too and do not expect previous metadata messages.log("Pre-offload metadata: " + str(existing_metadata), detail=VVERBOSE) - if not existing_metadata: - messages.warning( - "Backend table exists but hybrid metadata is missing, this appears to be recovery from a failed offload" - ) if include_col_offload_source_table and existing_metadata: col_offload_source_table_override = OffloadSourceTable.create( @@ -424,11 +366,11 @@ def offload_backend_db_message( def offload_type_force_effects( - hybrid_operation, - source_data_client, + hybrid_operation: "OffloadOperation", + source_data_client: "OffloadSourceDataInterface", original_metadata, - offload_source_table, - messages, + offload_source_table: "OffloadSourceTableInterface", + messages: OffloadMessages, ): if source_data_client.is_incremental_append_capable(): original_offload_type = original_metadata.offload_type diff --git a/src/goe/offload/offload_source_data.py b/src/goe/offload/offload_source_data.py index 8a60ba54..46457234 100755 --- a/src/goe/offload/offload_source_data.py +++ b/src/goe/offload/offload_source_data.py @@ -2210,10 +2210,8 @@ def is_valid_common_boundary(check_partition, retained_partitions_ascending): def more_human_readable_python_hwm(python_hwm): # convert to str to remove datatype info and chop off any redundant trailing fractional seconds - str_fn = ( - lambda x: re.sub(r"\.000000$", "", str(x)) - if type(x) is datetime64 - else str(x) + str_fn = lambda x: ( + re.sub(r"\.000000$", "", str(x)) if type(x) is datetime64 else str(x) ) return str([str_fn(hv) for hv in python_hwm]) diff --git a/src/goe/offload/option_validation.py b/src/goe/offload/option_validation.py index bc400e34..74d865b0 100644 --- a/src/goe/offload/option_validation.py +++ b/src/goe/offload/option_validation.py @@ -14,8 +14,9 @@ from optparse import OptionValueError import re +from typing import TYPE_CHECKING -from goe.exceptions import OffloadOptionError +from goe.exceptions import 
OffloadException, OffloadOptionError from goe.offload import offload_constants from goe.offload.predicate_offload import GenericPredicate from goe.offload.offload_source_table import ( @@ -23,8 +24,18 @@ OFFLOAD_PARTITION_TYPE_RANGE, OFFLOAD_PARTITION_TYPE_LIST, ) +from goe.persistence.orchestration_metadata import ( + INCREMENTAL_PREDICATE_TYPE_PREDICATE, + INCREMENTAL_PREDICATE_TYPE_LIST, + INCREMENTAL_PREDICATE_TYPE_LIST_AS_RANGE, + INCREMENTAL_PREDICATE_TYPE_RANGE, +) from goe.util.misc_functions import is_pos_int +if TYPE_CHECKING: + from goe.config.orchestration_config import OrchestrationConfig + from goe.offload.offload_source_table import OffloadSourceTableInterface + def active_data_append_options( opts, @@ -80,6 +91,63 @@ def check_opt_is_posint( ) +def check_ipa_predicate_type_option_conflicts( + options, exc_cls=OffloadException, rdbms_table: "OffloadSourceTableInterface" = None +): + ipa_predicate_type = getattr(options, "ipa_predicate_type", None) + active_lpa_opts = active_data_append_options( + options, + partition_type=OFFLOAD_PARTITION_TYPE_LIST, + ignore_partition_names_opt=True, + ) + active_rpa_opts = active_data_append_options( + options, + partition_type=OFFLOAD_PARTITION_TYPE_RANGE, + ignore_partition_names_opt=True, + ) + if ipa_predicate_type in [ + INCREMENTAL_PREDICATE_TYPE_RANGE, + INCREMENTAL_PREDICATE_TYPE_LIST_AS_RANGE, + ]: + if active_lpa_opts: + raise exc_cls( + "LIST %s with %s: %s" + % ( + offload_constants.IPA_PREDICATE_TYPE_FILTER_EXCEPTION_TEXT, + ipa_predicate_type, + ", ".join(active_lpa_opts), + ) + ) + if rdbms_table and active_rpa_opts: + # If we have access to an RDBMS table then we can check if the partition column data types are valid for IPA + unsupported_types = rdbms_table.unsupported_partition_data_types( + partition_type_override=OFFLOAD_PARTITION_TYPE_RANGE + ) + if unsupported_types: + raise exc_cls( + "RANGE %s with partition data types: %s" + % ( + offload_constants.IPA_PREDICATE_TYPE_FILTER_EXCEPTION_TEXT, + ", ".join(unsupported_types), + ) + ) + elif ipa_predicate_type == INCREMENTAL_PREDICATE_TYPE_LIST: + if active_rpa_opts: + raise exc_cls( + "RANGE %s with %s: %s" + % ( + offload_constants.IPA_PREDICATE_TYPE_FILTER_EXCEPTION_TEXT, + ipa_predicate_type, + ", ".join(active_rpa_opts), + ) + ) + elif ipa_predicate_type == INCREMENTAL_PREDICATE_TYPE_PREDICATE: + if not options.offload_predicate: + raise exc_cls( + offload_constants.IPA_PREDICATE_TYPE_REQUIRES_PREDICATE_EXCEPTION_TEXT + ) + + def normalise_data_sampling_options(options): if hasattr(options, "data_sample_pct"): if isinstance(options.data_sample_pct, str) and re.search( @@ -124,7 +192,7 @@ def normalise_offload_predicate_options(options): ) -def normalise_stats_options(options, target_backend): +def normalise_stats_options(options, target_backend: str): if options.offload_stats_method not in [ offload_constants.OFFLOAD_STATS_METHOD_NATIVE, offload_constants.OFFLOAD_STATS_METHOD_HISTORY, diff --git a/src/goe/offload/oracle/oracle_frontend_api.py b/src/goe/offload/oracle/oracle_frontend_api.py index c6ca655a..84d04a5d 100644 --- a/src/goe/offload/oracle/oracle_frontend_api.py +++ b/src/goe/offload/oracle/oracle_frontend_api.py @@ -260,7 +260,7 @@ def _create_table_sql_text( and CREATE_TABLE_ORACLE_PARTITION_SPEC in table_properties ) raise NotImplementedError( - f"Create table partitioning pending implementation" + "Create table partitioning pending implementation" ) sql = ( @@ -358,6 +358,7 @@ def _execute_plsql_function( arg_list=None, log_level=None, 
not_when_dry_running=False, + commit=False, ): """Wrapper over cx_Oracle callfunc to additionally get and close a cursor""" logger.debug("Calling SQL function %s" % sql_fn) @@ -391,6 +392,8 @@ def _execute_plsql_function( return None finally: self._close_cursor() + if commit: + self._db_conn.commit() def _execute_query_fetch_x( self, @@ -836,6 +839,7 @@ def execute_function( arg_list=None, log_level=VERBOSE, not_when_dry_running=False, + commit=False, ): return self._execute_plsql_function( sql_fn, @@ -844,6 +848,7 @@ def execute_function( arg_list=arg_list, log_level=log_level, not_when_dry_running=not_when_dry_running, + commit=commit, ) def fetchmany_takes_fetch_size(self): diff --git a/src/goe/offload/oracle/oracle_offload_transport_rdbms_api.py b/src/goe/offload/oracle/oracle_offload_transport_rdbms_api.py index 34f4a586..461b5f43 100644 --- a/src/goe/offload/oracle/oracle_offload_transport_rdbms_api.py +++ b/src/goe/offload/oracle/oracle_offload_transport_rdbms_api.py @@ -329,7 +329,6 @@ def get_rdbms_query_cast( for_spark: Spark fails to process TS WITH LOCAL TIME ZONE and INTERVALs so we convert to character in the DB. for_qi: Allows different behaviour for Query Import. """ - self.debug("get_rdbms_query_cast()") ff_scale = self._get_ts_ff_scale(max_ts_scale) cast_expression = column_expression if rdbms_column.data_type == ORACLE_TYPE_TIMESTAMP_LOCAL_TZ: diff --git a/src/goe/offload/snowflake/snowflake_backend_api.py b/src/goe/offload/snowflake/snowflake_backend_api.py index 8a2f61ff..9f54f347 100644 --- a/src/goe/offload/snowflake/snowflake_backend_api.py +++ b/src/goe/offload/snowflake/snowflake_backend_api.py @@ -1949,7 +1949,7 @@ def synthetic_partition_numbers_are_string(self): def table_distribution(self, db_name, table_name): return None - def table_exists(self, db_name, table_name): + def table_exists(self, db_name: str, table_name: str) -> bool: """Return True/False if a table exists using SHOW TABLES. DESCRIBE TABLE/SHOW TABLES are much faster than using INFORMATION_SCHEMA but both have problems: SHOW TABLES is has case-insensitive LIKE and IN clauses plus throws exceptions if the schema doesn't exist. @@ -1988,6 +1988,12 @@ def table_exists(self, db_name, table_name): raise return False + def table_has_rows(self, db_name: str, table_name: str) -> bool: + """Return bool depending whether the table has rows or not.""" + sql = f"SELECT 1 FROM {self.enclose_object_reference(db_name, table_name)} LIMIT 1" + row = self.execute_query_fetch_one(sql, log_level=VVERBOSE) + return bool(row) + def target_version(self): """Return version of the backend SQL engine in x.y.z format that can be used by GOEVersion(). This is different to backend_version() even though it appears similar in function. diff --git a/src/goe/offload/spark/dataproc_offload_transport.py b/src/goe/offload/spark/dataproc_offload_transport.py index c0bde7c1..2c7382f6 100644 --- a/src/goe/offload/spark/dataproc_offload_transport.py +++ b/src/goe/offload/spark/dataproc_offload_transport.py @@ -13,7 +13,7 @@ # limitations under the License. 
import re -from typing import Union +from typing import Union, TYPE_CHECKING from goe.config import orchestration_defaults from goe.offload.factory.offload_transport_rdbms_api_factory import ( @@ -24,7 +24,6 @@ from goe.offload.offload_transport import ( OffloadTransportException, OffloadTransportSpark, - FRONTEND_TRACE_MODULE, MISSING_ROWS_SPARK_WARNING, OFFLOAD_TRANSPORT_METHOD_SPARK_BATCHES_GCLOUD, OFFLOAD_TRANSPORT_METHOD_SPARK_DATAPROC_GCLOUD, @@ -36,6 +35,12 @@ from goe.orchestration import command_steps from goe.util.misc_functions import write_temp_file +if TYPE_CHECKING: + from goe.config.orchestration_config import OrchestrationConfig + from goe.offload.backend_table import BackendTableInterface + from goe.offload.offload_messages import OffloadMessages + from goe.offload.offload_source_table import OffloadSourceTableInterface + GCLOUD_PROPERTY_SEPARATOR = ",GSEP," @@ -45,11 +50,11 @@ class OffloadTransportSparkBatchesGcloud(OffloadTransportSpark): def __init__( self, - offload_source_table, - offload_target_table, + offload_source_table: "OffloadSourceTableInterface", + offload_target_table: "BackendTableInterface", offload_operation, - offload_options, - messages, + offload_options: "OrchestrationConfig", + messages: "OffloadMessages", dfs_client, rdbms_columns_override=None, ): diff --git a/src/goe/offload/teradata/teradata_frontend_api.py b/src/goe/offload/teradata/teradata_frontend_api.py index 618333ca..7a15e8ae 100644 --- a/src/goe/offload/teradata/teradata_frontend_api.py +++ b/src/goe/offload/teradata/teradata_frontend_api.py @@ -613,6 +613,7 @@ def execute_function( arg_list=None, log_level=VERBOSE, not_when_dry_running=False, + commit=False, ): raise NotImplementedError("execute_function() is not implemented for Teradata") diff --git a/src/goe/offload/teradata/teradata_offload_transport_rdbms_api.py b/src/goe/offload/teradata/teradata_offload_transport_rdbms_api.py index f24f9ec9..ac446faa 100644 --- a/src/goe/offload/teradata/teradata_offload_transport_rdbms_api.py +++ b/src/goe/offload/teradata/teradata_offload_transport_rdbms_api.py @@ -199,7 +199,6 @@ def get_rdbms_query_cast( nan_values_as_null=False, ): """Returns an expression suitable for reading a specific column from the RDBMS table""" - self.debug("get_rdbms_query_cast()") ff_scale = self._get_ts_ff_scale(max_ts_scale) cast_expression = column_expression if rdbms_column.data_type == TERADATA_TYPE_TIMESTAMP_TZ: diff --git a/src/goe/persistence/oracle/oracle_orchestration_repo_client.py b/src/goe/persistence/oracle/oracle_orchestration_repo_client.py index 7d768db2..31173564 100644 --- a/src/goe/persistence/oracle/oracle_orchestration_repo_client.py +++ b/src/goe/persistence/oracle/oracle_orchestration_repo_client.py @@ -122,6 +122,7 @@ def _drop_metadata(self, frontend_owner: str, frontend_name: str): "offload_repo.delete_offload_metadata", arg_list=[frontend_owner, frontend_name], not_when_dry_running=True, + commit=True, ) def _get_metadata(self, frontend_owner: str, frontend_name: str) -> dict: @@ -266,6 +267,7 @@ def _set_metadata( "offload_repo.save_offload_metadata", arg_list=[frontend_owner, frontend_name, ora_metadata], not_when_dry_running=True, + commit=True, ) # FrontendApi logging won't show metadata values due to being in an Oracle type. So we log it here for # benefit of support. 
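The repository client calls above pass commit=True when saving or deleting offload metadata, matching the commit parameter this commit adds to execute_function() across the frontend APIs. A small sketch of the pattern with a generic DB-API 2.0 connection; the function and argument names are illustrative, not the real GOE signatures:

def execute_function_with_commit(connection, call_function, commit=False):
    """Run a frontend function and, when asked, make its changes durable immediately."""
    result = call_function(connection)
    if commit:
        # Metadata writes should not have to wait for a later, unrelated commit.
        connection.commit()
    return result

Passing commit=False keeps the previous behaviour of leaving transaction control to the caller.
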
diff --git a/tests/integration/offload/test_backend_table.py b/tests/integration/offload/test_backend_table.py index 106faf04..b297b05b 100644 --- a/tests/integration/offload/test_backend_table.py +++ b/tests/integration/offload/test_backend_table.py @@ -432,6 +432,9 @@ def _test_get_partition_columns(self): else: self.assertFalse(cols) + def _test_has_rows(self): + self.assertTrue(self.api.has_rows()) + def _test_get_staging_table_location(self): try: self.api.get_staging_table_location() @@ -617,6 +620,7 @@ def _run_all_tests(self): self._test_get_default_location() self._test_get_partition_columns() self._test_get_staging_table_location() + self._test_has_rows() self._test_result_cache_area_exists() self._test_setup_result_cache_area() self._test_setup_staging_area() diff --git a/tests/integration/scenarios/setup_functions.py b/tests/integration/scenarios/setup_functions.py index 5844db27..2ec51ac3 100644 --- a/tests/integration/scenarios/setup_functions.py +++ b/tests/integration/scenarios/setup_functions.py @@ -12,16 +12,37 @@ # See the License for the specific language governing permissions and # limitations under the License. +from typing import TYPE_CHECKING + from goe.offload import offload_transport from goe.offload.offload_functions import convert_backend_identifier_case from goe.offload.offload_messages import VERBOSE +if TYPE_CHECKING: + from goe.config.orchestration_config import OrchestrationConfig + from goe.persistence.orchestration_repo_client import ( + OrchestrationRepoClientInterface, + ) + from tests.testlib.test_framework.backend_testing_api import ( + BackendTestingApiInterface, + ) + from tests.testlib.test_framework.frontend_testing_api import ( + FrontendTestingApiInterface, + ) + from tests.testlib.test_framework.offload_test_messages import OffloadTestMessages + def drop_backend_test_table( - options, backend_api, test_messages, db, table_name, drop_any=False, view=False + config: "OrchestrationConfig", + backend_api: "BackendTestingApiInterface", + test_messages: "OffloadTestMessages", + db: str, + table_name: str, + drop_any=False, + view=False, ): """Convert the db and table name to the correct case before issuing the drop.""" - db, table_name = convert_backend_identifier_case(options, db, table_name) + db, table_name = convert_backend_identifier_case(config, db, table_name) if not backend_api.database_exists(db): test_messages.log( "drop_backend_test_table(%s, %s) DB does not exist" % (db, table_name), @@ -48,17 +69,30 @@ def drop_backend_test_table( backend_api.drop_table(db, table_name, sync=True) -def drop_backend_test_load_table(options, backend_api, test_messages, db, table_name): +def drop_backend_test_load_table( + config: "OrchestrationConfig", + backend_api: "BackendTestingApiInterface", + test_messages: "OffloadTestMessages", + db: str, + table_name: str, +): if backend_api and not backend_api.load_db_transport_supported(): return - drop_backend_test_table(options, backend_api, test_messages, db, table_name) + drop_backend_test_table(config, backend_api, test_messages, db, table_name) + + +def drop_offload_metadata( + repo_client: "OrchestrationRepoClientInterface", schema: str, table_name: str +): + """Simple wrapper over drop_offload_metadata() in case we need to catch exceptions in the future.""" + repo_client.drop_offload_metadata(schema, table_name) def gen_drop_sales_based_fact_partition_ddls( - schema, - table_name, + schema: str, + table_name: str, hv_string_list, - frontend_api, + frontend_api: "FrontendTestingApiInterface", 
truncate_instead_of_drop=False, dropping_oldest=None, ) -> list: @@ -80,7 +114,10 @@ def gen_drop_sales_based_fact_partition_ddls( def gen_truncate_sales_based_fact_partition_ddls( - schema, table_name, hv_string_list, frontend_api + schema: str, + table_name: str, + hv_string_list, + frontend_api: "FrontendTestingApiInterface", ): """hv_string_list in format YYYY-MM-DD""" return gen_drop_sales_based_fact_partition_ddls( @@ -89,7 +126,10 @@ def gen_truncate_sales_based_fact_partition_ddls( def get_sales_based_fact_partition_list( - schema, table_name, hv_string_list, frontend_api + schema: str, + table_name: str, + hv_string_list, + frontend_api: "FrontendTestingApiInterface", ) -> list: """Return a list of partitions matching a date high value string, used for SALES based tests hv_string_list in format YYYY-MM-DD @@ -105,14 +145,16 @@ def get_sales_based_fact_partition_list( return partitions -def no_query_import_transport_method(options, no_table_centric_sqoop=False): - if not options: +def no_query_import_transport_method( + config: "OrchestrationConfig", no_table_centric_sqoop=False +): + if not config: return offload_transport.OFFLOAD_TRANSPORT_METHOD_QUERY_IMPORT - if offload_transport.is_spark_thrift_available(options, None): + if offload_transport.is_spark_thrift_available(config, None): return offload_transport.OFFLOAD_TRANSPORT_METHOD_SPARK_THRIFT - elif offload_transport.is_spark_submit_available(options, None): + elif offload_transport.is_spark_submit_available(config, None): return offload_transport.OFFLOAD_TRANSPORT_METHOD_SPARK_SUBMIT - elif offload_transport.is_sqoop_available(None, options): + elif offload_transport.is_sqoop_available(None, config): if no_table_centric_sqoop: return offload_transport.OFFLOAD_TRANSPORT_METHOD_SQOOP_BY_QUERY else: diff --git a/tests/integration/scenarios/test_ddl_file.py b/tests/integration/scenarios/test_ddl_file.py index d82aa263..c28532c2 100644 --- a/tests/integration/scenarios/test_ddl_file.py +++ b/tests/integration/scenarios/test_ddl_file.py @@ -222,6 +222,9 @@ def test_ddl_file_new_table_local_fs(config, schema, data_db): config, schema, data_db, test_table, ddl_file_prefix, backend_api, messages ) + # Connections are being left open, explicitly close them. + frontend_api.close() + def test_ddl_file_existing_table_local_fs(config, schema, data_db): """Test requesting a DDL file to local FS for a previously offloaded table.""" @@ -248,6 +251,9 @@ def test_ddl_file_existing_table_local_fs(config, schema, data_db): config, schema, data_db, test_table, ddl_file_prefix, backend_api, messages ) + # Connections are being left open, explicitly close them. + frontend_api.close() + def test_ddl_file_new_table_cloud_storage(config, schema, data_db): """Test requesting a DDL file to cloud storage for a new table.""" @@ -292,3 +298,6 @@ def test_ddl_file_new_table_cloud_storage(config, schema, data_db): messages, dfs_client=dfs_client, ) + + # Connections are being left open, explicitly close them. 
+ frontend_api.close() diff --git a/tests/integration/scenarios/test_offload_basic.py b/tests/integration/scenarios/test_offload_basic.py index daa2909e..712dd111 100644 --- a/tests/integration/scenarios/test_offload_basic.py +++ b/tests/integration/scenarios/test_offload_basic.py @@ -25,11 +25,14 @@ data_db_name, load_db_name, ) +from goe.offload.offload_messages import FORCED_EXCEPTION_TEXT from goe.offload.offload_metadata_functions import ( INCREMENTAL_PREDICATE_TYPE_LIST, INCREMENTAL_PREDICATE_TYPE_RANGE, ) from goe.offload.offload_source_data import MAX_QUERY_OPTIMISTIC_PRUNE_CLAUSE +from goe.orchestration import command_steps +from goe.orchestration.command_steps import step_title from goe.persistence.factory.orchestration_repo_client_factory import ( orchestration_repo_client_factory, ) @@ -50,6 +53,7 @@ from tests.integration.scenarios.setup_functions import ( drop_backend_test_load_table, drop_backend_test_table, + drop_offload_metadata, gen_truncate_sales_based_fact_partition_ddls, partition_columns_if_supported, ) @@ -66,7 +70,9 @@ OFFLOAD_DIM = "STORY_DIM" +OFFLOAD_DIM2 = "STORY_EXISTS_DIM" OFFLOAD_FACT = "STORY_FACT" +OFFLOAD_FACT2 = "STORY_EXISTS_FACT" @pytest.fixture @@ -271,7 +277,8 @@ def test_offload_basic_dim(config, schema, data_db): config, messages, trace_action=f"repo_client({id})" ) - backend_name = convert_backend_identifier_case(config, OFFLOAD_DIM) + test_table = OFFLOAD_DIM + backend_name = convert_backend_identifier_case(config, test_table) copy_stats_available = backend_api.table_stats_set_supported() # Setup @@ -280,37 +287,38 @@ def test_offload_basic_dim(config, schema, data_db): backend_api, config, messages, - frontend_sqls=frontend_api.standard_dimension_frontend_ddl(schema, OFFLOAD_DIM), + frontend_sqls=frontend_api.standard_dimension_frontend_ddl(schema, test_table), python_fns=[ lambda: drop_backend_test_table( - config, backend_api, messages, data_db, OFFLOAD_DIM + config, backend_api, messages, data_db, test_table ), lambda: drop_backend_test_load_table( - config, backend_api, messages, load_db, OFFLOAD_DIM + config, backend_api, messages, load_db, test_table ), + lambda: drop_offload_metadata(repo_client, schema, test_table), ], ) # Frontend API is not used for anything else so let's close it. frontend_api.close() - assert not backend_table_exists(config, backend_api, messages, data_db, OFFLOAD_DIM) - assert not backend_table_exists(config, backend_api, messages, load_db, OFFLOAD_DIM) + assert not backend_table_exists(config, backend_api, messages, data_db, test_table) + assert not backend_table_exists(config, backend_api, messages, load_db, test_table) # Basic verification mode offload of a simple dimension. options = { - "owner_table": schema + "." + OFFLOAD_DIM, + "owner_table": schema + "." + test_table, "reset_backend_table": True, "execute": False, } run_offload(options, config, messages) assert not backend_table_exists( - config, backend_api, messages, data_db, OFFLOAD_DIM + config, backend_api, messages, data_db, test_table ), "Backend table should NOT exist" # Basic offload of a simple dimension. options = { - "owner_table": schema + "." + OFFLOAD_DIM, + "owner_table": schema + "." 
+ test_table, "offload_stats_method": ( offload_constants.OFFLOAD_STATS_METHOD_COPY if copy_stats_available @@ -326,22 +334,22 @@ def test_offload_basic_dim(config, schema, data_db): run_offload(options, config, messages) assert backend_table_exists( - config, backend_api, messages, load_db, OFFLOAD_DIM + config, backend_api, messages, load_db, test_table ), "Backend load table should exist" assert standard_dimension_assertion( - config, backend_api, messages, repo_client, schema, data_db, OFFLOAD_DIM + config, backend_api, messages, repo_client, schema, data_db, test_table ) # Attempt to re-offload, expect to fail. options = { - "owner_table": schema + "." + OFFLOAD_DIM, + "owner_table": schema + "." + test_table, "execute": True, } run_offload(options, config, messages, expected_status=False) # Reset offload the dimension adding backend partitioning (if supported). options = { - "owner_table": schema + "." + OFFLOAD_DIM, + "owner_table": schema + "." + test_table, "offload_partition_lower_value": 0, "offload_partition_upper_value": 1000, "reset_backend_table": True, @@ -365,13 +373,13 @@ def test_offload_basic_dim(config, schema, data_db): run_offload(options, config, messages) assert backend_table_exists( - config, backend_api, messages, data_db, OFFLOAD_DIM + config, backend_api, messages, data_db, test_table ), "Backend table should exist" assert not backend_table_exists( - config, backend_api, messages, load_db, OFFLOAD_DIM + config, backend_api, messages, load_db, test_table ), "Backend load table should NOT exist" assert standard_dimension_assertion( - config, backend_api, messages, repo_client, schema, data_db, OFFLOAD_DIM + config, backend_api, messages, repo_client, schema, data_db, test_table ) assert offload_basic_dim_assertion(backend_api, messages, data_db, backend_name) @@ -388,7 +396,8 @@ def test_offload_basic_fact(config, schema, data_db): config, messages, trace_action=f"repo_client({id})" ) - backend_name = convert_backend_identifier_case(config, OFFLOAD_FACT) + test_table = OFFLOAD_FACT + backend_name = convert_backend_identifier_case(config, test_table) # Setup run_setup( @@ -397,20 +406,23 @@ def test_offload_basic_fact(config, schema, data_db): config, messages, frontend_sqls=frontend_api.sales_based_fact_create_ddl( - schema, OFFLOAD_FACT, simple_partition_names=True - ), - python_fns=lambda: drop_backend_test_table( - config, backend_api, messages, data_db, OFFLOAD_FACT + schema, test_table, simple_partition_names=True ), + python_fns=[ + lambda: drop_backend_test_table( + config, backend_api, messages, data_db, test_table + ), + lambda: drop_offload_metadata(repo_client, schema, test_table), + ], ) assert not backend_table_exists( - config, backend_api, messages, data_db, OFFLOAD_FACT + config, backend_api, messages, data_db, test_table ), "The backend table should NOT exist" # Non-Execute offload of first partition with basic options. options = { - "owner_table": schema + "." + OFFLOAD_FACT, + "owner_table": schema + "." + test_table, "older_than_date": test_constants.SALES_BASED_FACT_HV_1, "ipa_predicate_type": INCREMENTAL_PREDICATE_TYPE_RANGE, "reset_backend_table": True, @@ -419,12 +431,12 @@ def test_offload_basic_fact(config, schema, data_db): run_offload(options, config, messages) assert not backend_table_exists( - config, backend_api, messages, data_db, OFFLOAD_FACT + config, backend_api, messages, data_db, test_table ), "The backend table should NOT exist" # Offload of RANGE requesting LIST. options = { - "owner_table": schema + "." 
+ OFFLOAD_FACT, + "owner_table": schema + "." + test_table, "older_than_date": test_constants.SALES_BASED_FACT_HV_1, "ipa_predicate_type": INCREMENTAL_PREDICATE_TYPE_LIST, "reset_backend_table": True, @@ -438,13 +450,13 @@ def test_offload_basic_fact(config, schema, data_db): ) assert not backend_table_exists( - config, backend_api, messages, data_db, OFFLOAD_FACT + config, backend_api, messages, data_db, test_table ), "The backend table should NOT exist" if config.db_type != offload_constants.DBTYPE_TERADATA: # Offloads only empty partitions. Ensure 0 rows in backend. options = { - "owner_table": schema + "." + OFFLOAD_FACT, + "owner_table": schema + "." + test_table, "older_than_date": test_constants.SALES_BASED_FACT_PRE_HV, "reset_backend_table": True, "create_backend_db": True, @@ -453,11 +465,10 @@ def test_offload_basic_fact(config, schema, data_db): run_offload(options, config, messages) assert backend_table_exists( - config, backend_api, messages, data_db, OFFLOAD_FACT + config, backend_api, messages, data_db, test_table ), "Backend table should exist" assert ( - backend_table_count(config, backend_api, messages, data_db, OFFLOAD_FACT) - == 0 + backend_table_count(config, backend_api, messages, data_db, test_table) == 0 ), "Backend table should be empty" # Non-Execute offload of first partition with advanced options. @@ -467,7 +478,7 @@ def test_offload_basic_fact(config, schema, data_db): else offload_constants.OFFLOAD_STATS_METHOD_NATIVE ) options = { - "owner_table": schema + "." + OFFLOAD_FACT, + "owner_table": schema + "." + test_table, "older_than_date": test_constants.SALES_BASED_FACT_HV_1, "ipa_predicate_type": INCREMENTAL_PREDICATE_TYPE_RANGE, "integer_2_columns_csv": "channel_id", @@ -510,7 +521,7 @@ def test_offload_basic_fact(config, schema, data_db): repo_client, schema, data_db, - OFFLOAD_FACT, + test_table, test_constants.SALES_BASED_FACT_HV_1, check_backend_rowcount=True, ) @@ -520,7 +531,7 @@ def test_offload_basic_fact(config, schema, data_db): # Incremental Offload of Fact - Non-Execute. options = { - "owner_table": schema + "." + OFFLOAD_FACT, + "owner_table": schema + "." + test_table, "older_than_date": test_constants.SALES_BASED_FACT_HV_2, "execute": False, } @@ -532,7 +543,7 @@ def test_offload_basic_fact(config, schema, data_db): # Offloads next partition from fact table. options = { - "owner_table": schema + "." + OFFLOAD_FACT, + "owner_table": schema + "." + test_table, "older_than_date": test_constants.SALES_BASED_FACT_HV_2, "execute": True, } @@ -545,7 +556,7 @@ def test_offload_basic_fact(config, schema, data_db): repo_client, schema, data_db, - OFFLOAD_FACT, + test_table, test_constants.SALES_BASED_FACT_HV_2, ) @@ -559,13 +570,13 @@ def test_offload_basic_fact(config, schema, data_db): repo_client, schema, data_db, - OFFLOAD_FACT, + test_table, test_constants.SALES_BASED_FACT_HV_2, ) # Offloads next partition with dodgy settings, offload will override these with sensible options. options = { - "owner_table": schema + "." + OFFLOAD_FACT, + "owner_table": schema + "." 
+ test_table, "older_than_date": test_constants.SALES_BASED_FACT_HV_3, "integer_1_columns_csv": "cust_id,channel_id,prod_id", "offload_partition_granularity": 100, @@ -586,7 +597,7 @@ def test_offload_basic_fact(config, schema, data_db): repo_client, schema, data_db, - OFFLOAD_FACT, + test_table, test_constants.SALES_BASED_FACT_HV_3, ) assert offload_basic_fact_2nd_incr_assertion( @@ -600,13 +611,13 @@ def test_offload_basic_fact(config, schema, data_db): config, messages, frontend_sqls=gen_truncate_sales_based_fact_partition_ddls( - schema, OFFLOAD_FACT, [test_constants.SALES_BASED_FACT_HV_4], frontend_api + schema, test_table, [test_constants.SALES_BASED_FACT_HV_4], frontend_api ), ) # Offloads next partition from fact table after all offloaded partitions have been truncated. options = { - "owner_table": schema + "." + OFFLOAD_FACT, + "owner_table": schema + "." + test_table, "older_than_date": test_constants.SALES_BASED_FACT_HV_4, "execute": True, } @@ -622,9 +633,148 @@ def test_offload_basic_fact(config, schema, data_db): repo_client, schema, data_db, - OFFLOAD_FACT, + test_table, test_constants.SALES_BASED_FACT_HV_4, ) # Connections are being left open, explicitly close them. frontend_api.close() + + +def test_offload_dim_to_existing_table(config, schema, data_db): + id = "test_offload_dim_to_existing_table" + messages = get_test_messages(config, id) + backend_api = get_backend_testing_api(config, messages) + frontend_api = get_frontend_testing_api(config, messages, trace_action=id) + repo_client = orchestration_repo_client_factory( + config, messages, trace_action=f"repo_client({id})" + ) + + test_table = OFFLOAD_DIM2 + + # Setup + run_setup( + frontend_api, + backend_api, + config, + messages, + frontend_sqls=frontend_api.standard_dimension_frontend_ddl(schema, test_table), + python_fns=[ + lambda: drop_backend_test_table( + config, backend_api, messages, data_db, test_table + ), + lambda: drop_offload_metadata(repo_client, schema, test_table), + ], + ) + + # Offload the table to create the backend table but exit before doing anything else. + options = { + "owner_table": schema + "." + test_table, + "error_after_step": step_title(command_steps.STEP_CREATE_TABLE), + "reset_backend_table": True, + "execute": True, + } + run_offload( + options, + config, + messages, + expected_exception_string=FORCED_EXCEPTION_TEXT, + ) + + assert ( + backend_table_count(config, backend_api, messages, data_db, test_table) == 0 + ), "Backend table should be empty" + + # Now we can attempt to offload to a pre-created empty backend table, this should succeed. + options = { + "owner_table": schema + "." + test_table, + "execute": True, + } + run_offload( + options, + config, + messages, + ) + + # If we try the offload again it should fail because the table has contents. + run_offload( + options, + config, + messages, + expected_status=False, + ) + + # Connections are being left open, explicitly close them. 
+ frontend_api.close() + + +def test_offload_fact_to_existing_table(config, schema, data_db): + id = "test_offload_fact_to_existing_table" + messages = get_test_messages(config, id) + backend_api = get_backend_testing_api(config, messages) + frontend_api = get_frontend_testing_api(config, messages, trace_action=id) + repo_client = orchestration_repo_client_factory( + config, messages, trace_action=f"repo_client({id})" + ) + + test_table = OFFLOAD_FACT2 + + # Setup + run_setup( + frontend_api, + backend_api, + config, + messages, + frontend_sqls=frontend_api.sales_based_fact_create_ddl( + schema, test_table, simple_partition_names=True + ), + python_fns=[ + lambda: drop_backend_test_table( + config, backend_api, messages, data_db, test_table + ), + lambda: drop_offload_metadata(repo_client, schema, test_table), + ], + ) + + # Offload the table to create the backend table but exit before doing anything else. + options = { + "owner_table": schema + "." + test_table, + "older_than_date": test_constants.SALES_BASED_FACT_HV_1, + "ipa_predicate_type": INCREMENTAL_PREDICATE_TYPE_RANGE, + "error_after_step": step_title(command_steps.STEP_CREATE_TABLE), + "reset_backend_table": True, + "execute": True, + } + run_offload( + options, + config, + messages, + expected_exception_string=FORCED_EXCEPTION_TEXT, + ) + assert ( + backend_table_count(config, backend_api, messages, data_db, test_table) == 0 + ), "Backend table should be empty" + + # Now we can attempt to offload to a pre-created empty backend table, this should succeed. + options = { + "owner_table": schema + "." + test_table, + "older_than_date": test_constants.SALES_BASED_FACT_HV_1, + "ipa_predicate_type": INCREMENTAL_PREDICATE_TYPE_RANGE, + "execute": True, + } + run_offload( + options, + config, + messages, + ) + + # If we try the offload again it should fail because the table has metadata. + run_offload( + options, + config, + messages, + expected_status=False, + ) + + # Connections are being left open, explicitly close them. 
+ frontend_api.close() diff --git a/tests/testlib/test_framework/backend_testing_api.py b/tests/testlib/test_framework/backend_testing_api.py index 304fe97c..b75a5af8 100644 --- a/tests/testlib/test_framework/backend_testing_api.py +++ b/tests/testlib/test_framework/backend_testing_api.py @@ -24,7 +24,6 @@ import subprocess from subprocess import PIPE, STDOUT import sys -import time from goe.offload.column_metadata import ( CanonicalColumn, diff --git a/tests/testlib/test_framework/test_functions.py b/tests/testlib/test_framework/test_functions.py index 63f43a42..bfcd1ad8 100644 --- a/tests/testlib/test_framework/test_functions.py +++ b/tests/testlib/test_framework/test_functions.py @@ -40,15 +40,6 @@ from tests.testlib.test_framework.offload_test_messages import OffloadTestMessages -def get_backend_db_table_name_from_metadata(hybrid_schema, hybrid_view, repo_client): - """Use metadata to get correct case for db name/table, returned as a tuple""" - hybrid_metadata = repo_client.get_offload_metadata(hybrid_schema, hybrid_view) - assert ( - hybrid_metadata - ), f"Missing hybrid metadata for: {hybrid_schema}.{hybrid_view}" - return hybrid_metadata.backend_owner, hybrid_metadata.backend_table - - def get_backend_testing_api(config, messages, no_caching=True): return backend_testing_api_factory( config.target, config, messages, dry_run=False, no_caching=no_caching @@ -67,10 +58,6 @@ def get_test_messages(config, test_id, execution_id=None): return OffloadTestMessages(messages) -def get_data_db_for_schema(schema, config): - return convert_backend_identifier_case(config, data_db_name(schema, config)) - - def get_lines_from_log( search_text, search_from_text="", max_matches=None, file_name_override=None ) -> list: @@ -127,133 +114,6 @@ def log(line: str, detail: int = normal, ansi_code=None): offload_log(line, detail=detail, ansi_code=ansi_code, redis_publish=False) -def test_data_host_compare_no_hybrid_schema( - test, - frontend_schema, - frontend_table_name, - backend_schema, - backend_table_name, - frontend_api, - backend_api, - column_csv=None, -): - """Compare data in a CSV of columns or all columns of a table when there is no hybrid schema. - We load frontend and backend data into Python sets and use minus operator. - Because of variations in data types returned by the assorted frontend/backend clients all - date based columns are converted to strings in SQL. 
- """ - - def fix_numeric_variations(v, column): - """Convert any values like '.123' or '-.123' to '0.123' or '-0.123'""" - if column.is_number_based() and isinstance(v, str): - if v.startswith("-."): - return "-0.{}".format(v[2:]) - elif v.startswith("."): - return "0.{}".format(v[1:]) - elif v and v.lower() == "nan": - return "NaN" - elif v and v.lower() == "inf": - return "Inf" - elif v and v.lower() == "-inf": - return "-Inf" - else: - return v - else: - return v - - def preprocess_data(data, columns): - new_data = [ - fix_numeric_variations(d, col) - for row in data - for d, col in zip(row, columns) - ] - return set(new_data) - - fe_owner_table = frontend_api.enclose_object_reference( - frontend_schema, frontend_table_name - ) - be_owner_table = backend_api.enclose_object_reference( - backend_schema, backend_table_name - ) - fe_columns = frontend_api.get_columns(frontend_schema, frontend_table_name) - fe_id_column = match_table_column("ID", fe_columns) - be_columns = backend_api.get_columns(backend_schema, backend_table_name) - be_id_column = match_table_column("ID", be_columns) - - if column_csv: - # We've been asked to verify specific columns - fe_columns = [match_table_column(_, fe_columns) for _ in column_csv.split()] - - # Validate the columns one at a time otherwise it is too hard to unpick which ones have problems - for validation_column in fe_columns: - if validation_column.is_nan_capable(): - # TODO For the moment it is proving too difficult to validate float/double data - # The results coming back from different systems are sometimes rounded, sometimes in scientific - # notation. Plus NaN/Inf/-Inf handling is problematic. For now I've excluded from validation. - continue - - log("Checking {}".format(validation_column.name), detail=verbose) - fe_validation_columns = [validation_column] - be_validation_columns = [match_table_column(validation_column.name, be_columns)] - if validation_column.name.upper() != "ID": - # Always include ID column to help us locate issues - fe_validation_columns = [fe_id_column] + fe_validation_columns - be_validation_columns = [be_id_column] + be_validation_columns - - fe_projection = frontend_api.host_compare_sql_projection(fe_validation_columns) - be_projection = backend_api.host_compare_sql_projection(be_validation_columns) - frontend_sql = f"SELECT {fe_projection} FROM {fe_owner_table}" - backend_sql = f"SELECT {be_projection} FROM {be_owner_table}" - frontend_data = preprocess_data( - frontend_api.execute_query_fetch_all(frontend_sql, log_level=VERBOSE), - fe_validation_columns, - ) - backend_data = preprocess_data( - backend_api.execute_query_fetch_all(backend_sql, log_level=VERBOSE), - be_validation_columns, - ) - base_minus_backend = list(frontend_data - backend_data) - backend_minus_base = list(backend_data - frontend_data) - if base_minus_backend != [] or backend_minus_base != []: - # Extra logging to help diagnose mismatches - log( - "Base minus backend count: %s" % len(base_minus_backend), detail=verbose - ) - log( - "Backend minus base count: %s" % len(backend_minus_base), detail=verbose - ) - log( - "Base minus backend (first 10 rows only): %s" - % str(sorted(base_minus_backend)[:11]), - detail=vverbose, - ) - log( - "Backend minus base (first 10 rows only): %s" - % str(sorted(backend_minus_base)[:11]), - detail=vverbose, - ) - test.assertEqual( - base_minus_backend, - [], - "Extra " - + frontend_schema - + " results (cf " - + backend_schema - + ") for SQL:\n" - + frontend_sql, - ) - test.assertEqual( - backend_minus_base, - [], - 
"Extra " - + backend_schema - + " results (cf " - + frontend_schema - + ") for SQL:\n" - + backend_sql, - ) - - def text_in_events(messages, message_token): return bool(message_token in messages.get_events()) diff --git a/tests/unit/offload/test_backend_api.py b/tests/unit/offload/test_backend_api.py index 326834f5..8a2deeb8 100644 --- a/tests/unit/offload/test_backend_api.py +++ b/tests/unit/offload/test_backend_api.py @@ -1162,6 +1162,12 @@ def _test_table_exists(self): self.assertTrue(self.api.table_exists(self.db, self.table) in (True, False)) self.assertFalse(self.api.table_exists("not_a_db", "not_a_table")) + def _test_table_has_rows(self): + if self.connect_to_backend: + self.assertTrue( + self.api.table_has_rows(self.db, self.table) in (True, False) + ) + def _test_target_version(self): if self.connect_to_backend: # Some backends do not expose a version so we cannot assert on this fn @@ -1375,6 +1381,7 @@ def _run_all_tests(self): self._test_supported_backend_data_types() self._test_supported_partition_function_data_types() self._test_table_exists() + self._test_table_has_rows() self._test_target_version() self._test_to_backend_literal() self._test_transform_encrypt_data_type() From cc73276aa7982494d795ab3558f020df1e2bd88a Mon Sep 17 00:00:00 2001 From: nj1973 Date: Wed, 17 Apr 2024 10:47:25 +0000 Subject: [PATCH 09/28] feat: Decouple table creation and data loading --- src/goe/goe.py | 2 +- src/goe/offload/column_metadata.py | 4 +- src/goe/offload/offload.py | 68 ----------------------- src/goe/offload/offload_messages.py | 77 --------------------------- src/goe/offload/operation/ddl_file.py | 2 - tests/unit/test_functions.py | 19 +++++++ 6 files changed, 23 insertions(+), 149 deletions(-) diff --git a/src/goe/goe.py b/src/goe/goe.py index f2993002..3940ecea 100644 --- a/src/goe/goe.py +++ b/src/goe/goe.py @@ -86,12 +86,12 @@ canonical_columns_from_columns_csv, offload_source_to_canonical_mappings, ) +from goe.offload.operation.table_structure_checks import check_table_structure from goe.offload.operation.transport import ( offload_data_to_target, ) from goe.offload.offload import ( active_data_append_options, - check_table_structure, create_ddl_file_step, create_final_backend_table_step, drop_backend_table_step, diff --git a/src/goe/offload/column_metadata.py b/src/goe/offload/column_metadata.py index 8b0a20c6..2a575c7b 100644 --- a/src/goe/offload/column_metadata.py +++ b/src/goe/offload/column_metadata.py @@ -229,7 +229,9 @@ def match_partition_column_by_source(source_column_name, column_list): def match_table_column( search_name: str, column_list: list ) -> Optional["ColumnMetadataInterface"]: - """Looks for, and returns, a column with name search_name in a list of table columns.""" + """Looks for, and returns, a column with name search_name in a list of table columns. 
+ + Case insensitive matching.""" assert search_name assert isinstance(column_list, list) if column_list: diff --git a/src/goe/offload/offload.py b/src/goe/offload/offload.py index 3c59c7a0..08117725 100644 --- a/src/goe/offload/offload.py +++ b/src/goe/offload/offload.py @@ -19,7 +19,6 @@ from datetime import datetime, timedelta from optparse import SUPPRESS_HELP -from textwrap import dedent from typing import TYPE_CHECKING from goe.config import config_descriptions, orchestration_defaults @@ -28,9 +27,6 @@ from goe.data_governance.hadoop_data_governance_constants import ( DATA_GOVERNANCE_GOE_OBJECT_TYPE_BASE_TABLE, ) -from goe.offload.column_metadata import ( - get_column_names, -) from goe.offload.factory.offload_source_table_factory import OffloadSourceTable from goe.offload import offload_constants from goe.offload.offload_messages import OffloadMessages, VVERBOSE @@ -58,7 +54,6 @@ INCREMENTAL_PREDICATE_TYPE_LIST_AS_RANGE, INCREMENTAL_PREDICATE_TYPES_WITH_PREDICATE_IN_HV, ) -from goe.util.misc_functions import format_list_for_logging if TYPE_CHECKING: from goe.config.orchestration_config import OrchestrationConfig @@ -71,69 +66,6 @@ ) -OFFLOAD_SCHEMA_CHECK_EXCEPTION_TEXT = "Column mismatch detected between the source and backend table. Resolve before offloading" - - -def check_table_structure(frontend_table, backend_table, messages: OffloadMessages): - """Compare frontend and backend columns by name and throw an exception if there is a mismatch. - - Ideally we would use SchemaSyncAnalyzer for this but circular dependencies prevent that for the time being. - FIXME revisit this in the future to see if we can hook into SchemaSyncAnalyzer for comparison, see GOE-1307 - """ - frontend_cols = frontend_table.get_column_names(conv_fn=str.upper) - backend_cols = get_column_names( - backend_table.get_non_synthetic_columns(), conv_fn=str.upper - ) - new_frontend_cols = sorted([_ for _ in frontend_cols if _ not in backend_cols]) - missing_frontend_cols = sorted([_ for _ in backend_cols if _ not in frontend_cols]) - if new_frontend_cols and not missing_frontend_cols: - # There are extra columns in the source and no dropped columns, we can recommend Schema Sync - messages.warning( - dedent( - """\ - New columns detected in the source table. Use Schema Sync to resolve. 
- Recommended schema_sync command to add columns to {}: - schema_sync --include {}.{} -x - """ - ).format( - backend_table.backend_db_name(), - frontend_table.owner, - frontend_table.table_name, - ), - ansi_code="red", - ) - raise OffloadException( - "{}: {}.{}".format( - OFFLOAD_SCHEMA_CHECK_EXCEPTION_TEXT, - frontend_table.owner, - frontend_table.table_name, - ) - ) - elif missing_frontend_cols: - # There are extra columns in the source but also dropped columns, Schema Sync cannot be used - column_table = [ - (frontend_table.frontend_db_name(), backend_table.backend_db_name()) - ] - column_table.extend([(_, "-") for _ in new_frontend_cols]) - column_table.extend([("-", _) for _ in missing_frontend_cols]) - messages.warning( - dedent( - """\ - The following column mismatches were detected between the source and backend table: - {} - """ - ).format(format_list_for_logging(column_table, underline_char="-")), - ansi_code="red", - ) - raise OffloadException( - "{}: {}.{}".format( - OFFLOAD_SCHEMA_CHECK_EXCEPTION_TEXT, - frontend_table.owner, - frontend_table.table_name, - ) - ) - - def create_ddl_file_step( offload_target_table: "BackendTableInterface", offload_operation: "OffloadOperation", diff --git a/src/goe/offload/offload_messages.py b/src/goe/offload/offload_messages.py index 179e9c91..160000f4 100755 --- a/src/goe/offload/offload_messages.py +++ b/src/goe/offload/offload_messages.py @@ -1,5 +1,3 @@ -#! /usr/bin/env python3 - # Copyright 2016 The GOE Authors. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -701,84 +699,9 @@ def set_func(marker, messages_func, logger_func): ) -def to_message_level(level): - """Convert 'level' (i.e. in a form of "Normal") to Offload 'message_level'""" - str_levels = dict( - list(zip(("quiet", "normal", "verbose", "vverbose"), list(range(-1, 3)))) - ) - messages_level = None - - if isinstance(level, str): - messages_level = str_levels.get(level.lower(), None) - elif isinstance(level, int): - messages_level = level if level in list(str_levels.values()) else None - - if messages_level is None: - raise OffloadMessagesException("Invalid OffloadMessages level: %s" % level) - - return messages_level - - def step_title_to_step_id(step_title): """Helper function to share logic between this module and goe.py""" if step_title: return step_title.replace(" ", "_").lower() else: return None - - -if __name__ == "__main__": - # GOE - from goe.util.misc_functions import set_goelib_logging - - log_level = sys.argv[-1:][0].upper() - if log_level not in ("DEBUG", "INFO", "WARNING", "CRITICAL", "ERROR"): - log_level = "CRITICAL" - - set_goelib_logging(log_level) - - log_dir = "/tmp" - log_id = "goe_test_log" - - print("NORMAL OffloadMessages") - print("=====================================") - messages = OffloadMessages(NORMAL) - messages.init_log(log_dir, log_id) - messages.log("A normal message") - messages.log("A verbose message you won't see on screen", VERBOSE) - messages.log("Nor this", VVERBOSE) - messages.log("More normality") - messages.step_delta("Normal Stuff", timedelta(0, 160, 615919)) - messages.log("Honk honk", ansi_code="red") - messages.step_delta("Bad Stuff", timedelta(0, 130, 651717)) - messages.warning("Honk honk", ansi_code="red") - messages.step_delta("Bad Stuff", timedelta(0, 9101, 651717)) - messages.log_step_deltas() - messages.close_log() - print("=====================================") - - print("As above but QUIET") - print("=====================================") - messages = OffloadMessages(QUIET) - 
messages.init_log(log_dir, log_id) - messages.log("A normal message") - messages.log("A verbose message you won't see on screen", VERBOSE) - messages.log("Nor this", VVERBOSE) - messages.log("More normality") - messages.log("Honk honk", ansi_code="red") - messages.warning("Honk honk", ansi_code="red") - messages.close_log() - print("=====================================") - - print("VERBOSE OffloadMessages") - print("=====================================") - messages = OffloadMessages(VERBOSE) - messages.init_log(log_dir, log_id) - messages.log("A normal message") - messages.log("A verbose message you will see", VERBOSE) - messages.log("But not this", VVERBOSE) - messages.log("More normality") - messages.log("Honk honk", ansi_code="red") - messages.warning("Honk honk", ansi_code="red") - messages.close_log() - print("=====================================") diff --git a/src/goe/offload/operation/ddl_file.py b/src/goe/offload/operation/ddl_file.py index 58145f95..7af91c78 100644 --- a/src/goe/offload/operation/ddl_file.py +++ b/src/goe/offload/operation/ddl_file.py @@ -1,5 +1,3 @@ -#! /usr/bin/env python3 - # Copyright 2024 The GOE Authors. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tests/unit/test_functions.py b/tests/unit/test_functions.py index c94cb1c3..25abf3cc 100644 --- a/tests/unit/test_functions.py +++ b/tests/unit/test_functions.py @@ -13,12 +13,14 @@ # limitations under the License. import os +from typing import TYPE_CHECKING from unittest import mock import numpy from goe.config.orchestration_config import OrchestrationConfig from goe.offload.column_metadata import ColumnPartitionInfo +from goe.offload.factory.backend_table_factory import backend_table_factory from goe.offload.offload_source_table import RdbmsPartition from goe.offload.oracle.oracle_column import ( OracleColumn, @@ -27,6 +29,9 @@ ) from goe.offload.oracle.oracle_offload_source_table import OracleSourceTable +if TYPE_CHECKING: + from goe.offload.backend_table import BackendTableInterface + FAKE_COMMON_ENV = { "DB_NAME_PREFIX": "x", @@ -335,6 +340,20 @@ def build_mock_offload_operation(): return fake_operation +def build_fake_backend_table(config, messages) -> "BackendTableInterface": + """Return a fake BackendTable.""" + test_table_object = backend_table_factory( + "no_user", + "no_table", + config.target, + config, + messages, + dry_run=True, + do_not_connect=True, + ) + return test_table_object + + def build_fake_oracle_table(config, messages) -> OracleSourceTable: """Return a fake OracleSourceTable partitioned by RANGE with 4 partitions.""" test_table_object = OracleSourceTable( From 3abf5b3d49cfb9b7b8a6e5474578bd92acfbf421 Mon Sep 17 00:00:00 2001 From: nj1973 Date: Wed, 17 Apr 2024 10:49:06 +0000 Subject: [PATCH 10/28] feat: Decouple table creation and data loading --- .../operation/table_structure_checks.py | 123 ++++++ .../operation/test_table_structure_checks.py | 355 ++++++++++++++++++ 2 files changed, 478 insertions(+) create mode 100644 src/goe/offload/operation/table_structure_checks.py create mode 100644 tests/unit/offload/operation/test_table_structure_checks.py diff --git a/src/goe/offload/operation/table_structure_checks.py b/src/goe/offload/operation/table_structure_checks.py new file mode 100644 index 00000000..81e97bb5 --- /dev/null +++ b/src/goe/offload/operation/table_structure_checks.py @@ -0,0 +1,123 @@ +# Copyright 2024 The GOE Authors. All rights reserved. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from textwrap import dedent
+from typing import TYPE_CHECKING
+
+from goe.exceptions import OffloadException
+from goe.offload.column_metadata import (
+    get_column_names,
+    match_table_column,
+)
+from goe.offload.offload_messages import OffloadMessages
+from goe.util.misc_functions import format_list_for_logging
+
+if TYPE_CHECKING:
+    from goe.offload.backend_table import BackendTableInterface
+    from goe.offload.offload_source_table import OffloadSourceTableInterface
+
+
+OFFLOAD_SCHEMA_CHECK_EXCEPTION_TEXT = "Column mismatch detected between the source and backend table. Resolve before offloading"
+
+
+def check_table_structure(
+    frontend_table: "OffloadSourceTableInterface",
+    backend_table: "BackendTableInterface",
+    messages: OffloadMessages,
+):
+    """Check that frontend and backend columns are compatible to allow Offload to proceed.
+
+    Throws an exception if there is a mismatch.
+
+    Checks:
+    - Check names match, case insensitive.
+    - Check data types are compatible via canonical classes, e.g. is_numeric().
+    - Check data types are compatible across valid remappings, e.g. frontend date to backend string.
+    """
+    frontend_cols = frontend_table.columns
+    backend_cols = backend_table.get_columns()
+
+    # Check case insensitive names match.
+    new_frontend_cols, missing_frontend_cols = check_table_columns_by_name(
+        frontend_cols, backend_cols
+    )
+    if new_frontend_cols or missing_frontend_cols:
+        column_table = [
+            (frontend_table.frontend_db_name(), backend_table.backend_db_name())
+        ]
+        column_table.extend([(_, "-") for _ in new_frontend_cols])
+        column_table.extend([("-", _) for _ in missing_frontend_cols])
+        messages.warning(
+            dedent(
+                """\
+                The following column mismatches were detected between the source and backend table:
+                {}
+                """
+            ).format(format_list_for_logging(column_table, underline_char="-")),
+            ansi_code="red",
+        )
+        raise OffloadException(
+            "{}: {}.{}".format(
+                OFFLOAD_SCHEMA_CHECK_EXCEPTION_TEXT,
+                frontend_table.owner,
+                frontend_table.table_name,
+            )
+        )
+
+    # Check data types are compatible via canonical classes.
+    # TODO this code does not satisfy the above comment yet.
+    # check_table_columns_by_type(frontend_table, backend_table)
+
+
+def check_table_columns_by_name(frontend_cols: list, backend_cols: list) -> tuple:
+    """Check case insensitive names match.
+
+    Returns a tuple of lists of column names:
+    (new_frontend_names: list[str], missing_frontend_names: list[str])
+    """
+    frontend_names = get_column_names(frontend_cols, conv_fn=str.upper)
+    backend_names = get_column_names(backend_cols, conv_fn=str.upper)
+    new_frontend_names = sorted([_ for _ in frontend_names if _ not in backend_names])
+    missing_frontend_names = sorted(
+        [_ for _ in backend_names if _ not in frontend_names]
+    )
+    return new_frontend_names, missing_frontend_names
+
+
+def check_table_columns_by_type(
+    frontend_table: "OffloadSourceTableInterface",
+    backend_table: "BackendTableInterface",
+) -> dict:
+    """Check data types are compatible via canonical classes.
+
+    Returns:
+        A dict of frontend column names that are incompatible with the canonical type of the backend column.
+    """
+    invalid_combinations = {}
+    target_canonical_cols = backend_table.get_canonical_columns()
+    for rdbms_col in frontend_table.columns:
+        target_canonical_col = match_table_column(rdbms_col.name, target_canonical_cols)
+        if (
+            (rdbms_col.is_number_based() and target_canonical_col.is_number_based())
+            or (rdbms_col.is_string_based() and target_canonical_col.is_string_based())
+            or (rdbms_col.is_date_based() and target_canonical_col.is_date_based())
+        ):
+            # The types are close enough.
+            continue
+        if frontend_table.valid_canonical_override(rdbms_col, target_canonical_col):
+            # The types are a valid offload combination.
+            continue
+        # If we get here then we have an invalid combination
+        invalid_combinations[rdbms_col.name] = target_canonical_col.data_type
+    return invalid_combinations
diff --git a/tests/unit/offload/operation/test_table_structure_checks.py b/tests/unit/offload/operation/test_table_structure_checks.py
new file mode 100644
index 00000000..33711b1f
--- /dev/null
+++ b/tests/unit/offload/operation/test_table_structure_checks.py
@@ -0,0 +1,355 @@
+# Copyright 2024 The GOE Authors. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +from typing import TYPE_CHECKING + +import pytest + +from goe.offload.bigquery import bigquery_column +from goe.offload.column_metadata import ( + CanonicalColumn, + GOE_TYPE_DATE, + GOE_TYPE_DECIMAL, + GOE_TYPE_INTEGER_1, + GOE_TYPE_TIME, + GOE_TYPE_TIMESTAMP, + GOE_TYPE_TIMESTAMP_TZ, + GOE_TYPE_VARIABLE_STRING, +) +from goe.offload.offload_messages import OffloadMessages +from goe.offload.operation import table_structure_checks as module_under_test +from goe.offload.oracle import oracle_column + +from tests.unit.test_functions import ( + build_fake_backend_table, + build_fake_oracle_table, + build_mock_options, + FAKE_ORACLE_BQ_ENV, +) + +if TYPE_CHECKING: + from goe.config.orchestration_config import OrchestrationConfig + + +@pytest.fixture(scope="module") +def ora_bq_config() -> "OrchestrationConfig": + return build_mock_options(FAKE_ORACLE_BQ_ENV) + + +@pytest.fixture(scope="module") +def messages(): + return OffloadMessages() + + +@pytest.fixture +def bigquery_table(ora_bq_config, messages): + return build_fake_backend_table(ora_bq_config, messages) + + +@pytest.fixture +def oracle_table(ora_bq_config, messages): + return build_fake_oracle_table(ora_bq_config, messages) + + +@pytest.mark.parametrize( + "frontend_columns,backend_columns,expected_extra_frontend_names,expected_missing_frontend_names", + [ + # Happy path, no expected mismatches. + ( + [ + CanonicalColumn("col_1", GOE_TYPE_INTEGER_1), + CanonicalColumn("col_2", GOE_TYPE_INTEGER_1), + ], + [ + CanonicalColumn("col_1", GOE_TYPE_INTEGER_1), + CanonicalColumn("col_2", GOE_TYPE_INTEGER_1), + ], + [], + [], + ), + # Happy path with different case, no expected mismatches. + ( + [ + CanonicalColumn("Col_1", GOE_TYPE_INTEGER_1), + CanonicalColumn("COL_2", GOE_TYPE_INTEGER_1), + ], + [ + CanonicalColumn("col_1", GOE_TYPE_INTEGER_1), + CanonicalColumn("cOl_2", GOE_TYPE_INTEGER_1), + ], + [], + [], + ), + # Missing backend column. + ( + [ + CanonicalColumn("col_1", GOE_TYPE_INTEGER_1), + CanonicalColumn("col_2", GOE_TYPE_INTEGER_1), + ], + [ + CanonicalColumn("col_1", GOE_TYPE_INTEGER_1), + ], + ["COL_2"], + [], + ), + # Missing frontend column. + ( + [ + CanonicalColumn("col_1", GOE_TYPE_INTEGER_1), + ], + [ + CanonicalColumn("col_1", GOE_TYPE_INTEGER_1), + CanonicalColumn("col_2", GOE_TYPE_INTEGER_1), + ], + [], + ["COL_2"], + ), + # Missing backend columns. + ( + [ + CanonicalColumn("col_1", GOE_TYPE_INTEGER_1), + CanonicalColumn("col_2", GOE_TYPE_INTEGER_1), + CanonicalColumn("col_3", GOE_TYPE_INTEGER_1), + ], + [ + CanonicalColumn("col_1", GOE_TYPE_INTEGER_1), + ], + ["COL_2", "COL_3"], + [], + ), + ], +) +def test_check_table_columns_by_name( + frontend_columns: list, + backend_columns: list, + expected_extra_frontend_names: list, + expected_missing_frontend_names: list, +): + extra_frontend_names, missing_frontend_names = ( + module_under_test.check_table_columns_by_name(frontend_columns, backend_columns) + ) + assert extra_frontend_names == expected_extra_frontend_names + assert missing_frontend_names == expected_missing_frontend_names + + +@pytest.mark.parametrize( + "frontend_columns,backend_columns,expected_return_dict", + [ + # Happy path, no expected mismatches. 
+ ( + [ + oracle_column.OracleColumn("COL_N1", oracle_column.ORACLE_TYPE_NUMBER), + oracle_column.OracleColumn("COL_N2", oracle_column.ORACLE_TYPE_NUMBER), + oracle_column.OracleColumn( + "COL_S1", oracle_column.ORACLE_TYPE_VARCHAR2 + ), + oracle_column.OracleColumn("COL_D1", oracle_column.ORACLE_TYPE_DATE), + oracle_column.OracleColumn("COL_D2", oracle_column.ORACLE_TYPE_DATE), + oracle_column.OracleColumn("COL_D3", oracle_column.ORACLE_TYPE_DATE), + oracle_column.OracleColumn( + "COL_T1", oracle_column.ORACLE_TYPE_TIMESTAMP + ), + oracle_column.OracleColumn( + "COL_T2", oracle_column.ORACLE_TYPE_TIMESTAMP + ), + oracle_column.OracleColumn( + "COL_T3", oracle_column.ORACLE_TYPE_TIMESTAMP + ), + ], + [ + bigquery_column.BigQueryColumn( + "COL_N1", bigquery_column.BIGQUERY_TYPE_NUMERIC + ), + bigquery_column.BigQueryColumn( + "COL_N2", bigquery_column.BIGQUERY_TYPE_INT64 + ), + bigquery_column.BigQueryColumn( + "COL_S1", bigquery_column.BIGQUERY_TYPE_STRING + ), + bigquery_column.BigQueryColumn( + "COL_D1", bigquery_column.BIGQUERY_TYPE_DATE + ), + bigquery_column.BigQueryColumn( + "COL_D2", bigquery_column.BIGQUERY_TYPE_DATETIME + ), + bigquery_column.BigQueryColumn( + "COL_D3", bigquery_column.BIGQUERY_TYPE_TIMESTAMP + ), + bigquery_column.BigQueryColumn( + "COL_T1", bigquery_column.BIGQUERY_TYPE_DATE + ), + bigquery_column.BigQueryColumn( + "COL_T2", bigquery_column.BIGQUERY_TYPE_DATETIME + ), + bigquery_column.BigQueryColumn( + "COL_T3", bigquery_column.BIGQUERY_TYPE_TIMESTAMP + ), + ], + {}, + ), + # Dates to strings. + ( + [ + oracle_column.OracleColumn("COL_D1", oracle_column.ORACLE_TYPE_DATE), + oracle_column.OracleColumn( + "COL_T1", oracle_column.ORACLE_TYPE_TIMESTAMP + ), + ], + [ + bigquery_column.BigQueryColumn( + "COL_D1", bigquery_column.BIGQUERY_TYPE_STRING + ), + bigquery_column.BigQueryColumn( + "COL_T1", bigquery_column.BIGQUERY_TYPE_STRING + ), + ], + {}, + ), + # Numbers to strings is not currently supported. + ( + [ + oracle_column.OracleColumn("COL_N1", oracle_column.ORACLE_TYPE_NUMBER), + oracle_column.OracleColumn("COL_N2", oracle_column.ORACLE_TYPE_NUMBER), + ], + [ + bigquery_column.BigQueryColumn( + "COL_N1", bigquery_column.BIGQUERY_TYPE_NUMERIC + ), + bigquery_column.BigQueryColumn( + "COL_N2", bigquery_column.BIGQUERY_TYPE_STRING + ), + ], + { + "COL_N2": GOE_TYPE_VARIABLE_STRING, + }, + ), + # Strings to numbers is not currently supported. + ( + [ + oracle_column.OracleColumn( + "COL_S1", oracle_column.ORACLE_TYPE_VARCHAR2 + ), + oracle_column.OracleColumn( + "COL_S2", oracle_column.ORACLE_TYPE_VARCHAR2 + ), + oracle_column.OracleColumn( + "COL_S3", oracle_column.ORACLE_TYPE_NVARCHAR2 + ), + oracle_column.OracleColumn("COL_S4", oracle_column.ORACLE_TYPE_CLOB), + ], + [ + bigquery_column.BigQueryColumn( + "COL_S1", bigquery_column.BIGQUERY_TYPE_STRING + ), + bigquery_column.BigQueryColumn( + "COL_S2", bigquery_column.BIGQUERY_TYPE_NUMERIC + ), + bigquery_column.BigQueryColumn( + "COL_S3", bigquery_column.BIGQUERY_TYPE_BIGNUMERIC + ), + bigquery_column.BigQueryColumn( + "COL_S4", bigquery_column.BIGQUERY_TYPE_NUMERIC + ), + ], + { + "COL_S2": GOE_TYPE_DECIMAL, + "COL_S3": GOE_TYPE_DECIMAL, + "COL_S4": GOE_TYPE_DECIMAL, + }, + ), + # Strings to dates is not currently supported. 
+ ( + [ + oracle_column.OracleColumn( + "COL_S1", oracle_column.ORACLE_TYPE_VARCHAR2 + ), + oracle_column.OracleColumn( + "COL_S2", oracle_column.ORACLE_TYPE_VARCHAR2 + ), + oracle_column.OracleColumn( + "COL_S3", oracle_column.ORACLE_TYPE_NVARCHAR2 + ), + oracle_column.OracleColumn("COL_S4", oracle_column.ORACLE_TYPE_CLOB), + ], + [ + bigquery_column.BigQueryColumn( + "COL_S1", bigquery_column.BIGQUERY_TYPE_STRING + ), + bigquery_column.BigQueryColumn( + "COL_S2", bigquery_column.BIGQUERY_TYPE_DATE + ), + bigquery_column.BigQueryColumn( + "COL_S3", bigquery_column.BIGQUERY_TYPE_DATETIME + ), + bigquery_column.BigQueryColumn( + "COL_S4", bigquery_column.BIGQUERY_TYPE_TIMESTAMP + ), + ], + { + "COL_S2": GOE_TYPE_DATE, + "COL_S3": GOE_TYPE_TIMESTAMP, + "COL_S4": GOE_TYPE_TIMESTAMP_TZ, + }, + ), + # Backend TIME is not supported. + ( + [ + oracle_column.OracleColumn("COL_N1", oracle_column.ORACLE_TYPE_NUMBER), + oracle_column.OracleColumn("COL_N2", oracle_column.ORACLE_TYPE_NUMBER), + oracle_column.OracleColumn( + "COL_S1", oracle_column.ORACLE_TYPE_VARCHAR2 + ), + oracle_column.OracleColumn("COL_D1", oracle_column.ORACLE_TYPE_DATE), + oracle_column.OracleColumn( + "COL_T1", oracle_column.ORACLE_TYPE_TIMESTAMP + ), + ], + [ + bigquery_column.BigQueryColumn( + "COL_N1", bigquery_column.BIGQUERY_TYPE_NUMERIC + ), + bigquery_column.BigQueryColumn( + "COL_N2", bigquery_column.BIGQUERY_TYPE_TIME + ), + bigquery_column.BigQueryColumn( + "COL_S1", bigquery_column.BIGQUERY_TYPE_TIME + ), + bigquery_column.BigQueryColumn( + "COL_D1", bigquery_column.BIGQUERY_TYPE_TIME + ), + bigquery_column.BigQueryColumn( + "COL_T1", bigquery_column.BIGQUERY_TYPE_TIME + ), + ], + { + "COL_N2": GOE_TYPE_TIME, + "COL_S1": GOE_TYPE_TIME, + "COL_D1": GOE_TYPE_TIME, + "COL_T1": GOE_TYPE_TIME, + }, + ), + ], +) +def test_check_table_columns_by_type_oracle_to_bigquery( + frontend_columns: list, + backend_columns: list, + expected_return_dict: dict, + oracle_table, + bigquery_table, +): + oracle_table._columns = frontend_columns + oracle_table._columns_with_partition_info = frontend_columns + bigquery_table._columns = backend_columns + result = module_under_test.check_table_columns_by_type(oracle_table, bigquery_table) + assert result == expected_return_dict From ae7b2f1879f62a30667ec97b9d8682f009e3b045 Mon Sep 17 00:00:00 2001 From: nj1973 Date: Wed, 17 Apr 2024 15:32:06 +0000 Subject: [PATCH 11/28] feat: Decouple table creation and data loading --- .../operation/table_structure_checks.py | 92 +++++++++++++++---- .../operation/test_table_structure_checks.py | 86 +++++++++++++---- 2 files changed, 144 insertions(+), 34 deletions(-) diff --git a/src/goe/offload/operation/table_structure_checks.py b/src/goe/offload/operation/table_structure_checks.py index 81e97bb5..261c936f 100644 --- a/src/goe/offload/operation/table_structure_checks.py +++ b/src/goe/offload/operation/table_structure_checks.py @@ -53,19 +53,12 @@ def check_table_structure( frontend_cols, backend_cols ) if new_frontend_cols or missing_frontend_cols: - column_table = [ - (frontend_table.frontend_db_name(), backend_table.backend_db_name()) - ] - column_table.extend([(_, "-") for _ in new_frontend_cols]) - column_table.extend([("-", _) for _ in missing_frontend_cols]) - messages.warning( - dedent( - """\ - The following column mismatches were detected between the source and backend table: - {} - """ - ).format(format_list_for_logging(column_table, underline_char="-")), - ansi_code="red", + check_table_columns_by_name_logging( + frontend_table, + backend_table, + 
new_frontend_cols, + missing_frontend_cols, + messages, ) raise OffloadException( "{}: {}.{}".format( @@ -76,8 +69,18 @@ def check_table_structure( ) # Check data types are compatible via canonical classes. - # TODO this code does not satisfy the above comment yet. - # check_table_columns_by_type(frontend_table, backend_table) + invalid_combinations = check_table_columns_by_type(frontend_table, backend_table) + if invalid_combinations: + check_table_columns_by_type_logging( + frontend_table, backend_table, invalid_combinations, messages + ) + raise OffloadException( + "{}: {}.{}".format( + OFFLOAD_SCHEMA_CHECK_EXCEPTION_TEXT, + frontend_table.owner, + frontend_table.table_name, + ) + ) def check_table_columns_by_name(frontend_cols: list, backend_cols: list) -> tuple: @@ -95,6 +98,28 @@ def check_table_columns_by_name(frontend_cols: list, backend_cols: list) -> tupl return new_frontend_names, missing_frontend_names +def check_table_columns_by_name_logging( + frontend_table: "OffloadSourceTableInterface", + backend_table: "BackendTableInterface", + new_frontend_cols: list, + missing_frontend_cols: list, + messages: OffloadMessages, +): + if not new_frontend_cols and not missing_frontend_cols: + return + column_table = [ + (frontend_table.frontend_db_name(), backend_table.backend_db_name()) + ] + column_table.extend([(_, "-") for _ in new_frontend_cols]) + column_table.extend([("-", _) for _ in missing_frontend_cols]) + messages.warning( + "The following column mismatches were detected between the source and backend table:\n{}".format( + format_list_for_logging(column_table, underline_char="-") + ), + ansi_code="red", + ) + + def check_table_columns_by_type( frontend_table: "OffloadSourceTableInterface", backend_table: "BackendTableInterface", @@ -102,7 +127,7 @@ def check_table_columns_by_type( """Check data types are compatible via canonical classes. Returns: - A dict of frontend column names that are incompatible with the canonical type of the backend column. + A dict of frontend column names with the incompatible backend data type. """ invalid_combinations = {} target_canonical_cols = backend_table.get_canonical_columns() @@ -119,5 +144,38 @@ def check_table_columns_by_type( # The types are a valid offload combination. 
continue # If we get here then we have an invalid combination - invalid_combinations[rdbms_col.name] = target_canonical_col.data_type + backend_col = match_table_column(rdbms_col.name, backend_table.get_columns()) + invalid_combinations[rdbms_col.name] = backend_col.data_type return invalid_combinations + + +def check_table_columns_by_type_logging( + frontend_table: "OffloadSourceTableInterface", + backend_table: "BackendTableInterface", + invalid_combinations: dict, + messages: OffloadMessages, +): + if not invalid_combinations: + return + column_table = [ + ( + f"{frontend_table.frontend_db_name()} Column", + f"{frontend_table.frontend_db_name()} Type", + f"{backend_table.backend_db_name()} Type", + ) + ] + for col_name, backend_type in invalid_combinations.items(): + frontend_col = frontend_table.get_column(col_name) + column_table.append( + ( + col_name, + frontend_col.data_type, + backend_type, + ) + ) + messages.warning( + "The following column mismatches were detected between the source and backend table:\n{}".format( + format_list_for_logging(column_table, underline_char="-") + ), + ansi_code="red", + ) diff --git a/tests/unit/offload/operation/test_table_structure_checks.py b/tests/unit/offload/operation/test_table_structure_checks.py index 33711b1f..ce2f2445 100644 --- a/tests/unit/offload/operation/test_table_structure_checks.py +++ b/tests/unit/offload/operation/test_table_structure_checks.py @@ -13,19 +13,14 @@ # limitations under the License. from typing import TYPE_CHECKING +from unittest import mock import pytest from goe.offload.bigquery import bigquery_column from goe.offload.column_metadata import ( CanonicalColumn, - GOE_TYPE_DATE, - GOE_TYPE_DECIMAL, GOE_TYPE_INTEGER_1, - GOE_TYPE_TIME, - GOE_TYPE_TIMESTAMP, - GOE_TYPE_TIMESTAMP_TZ, - GOE_TYPE_VARIABLE_STRING, ) from goe.offload.offload_messages import OffloadMessages from goe.offload.operation import table_structure_checks as module_under_test @@ -143,6 +138,41 @@ def test_check_table_columns_by_name( assert missing_frontend_names == expected_missing_frontend_names +@pytest.mark.parametrize( + "extra_frontend_names,missing_frontend_names", + [ + ( + [], + [], + ), + # Missing backend column. + ( + ["COL_2"], + [], + ), + # Missing frontend column. + ( + [], + ["COL_2"], + ), + # Missing backend columns. + ( + ["COL_2", "COL_3"], + [], + ), + ], +) +def test_check_table_columns_by_name_logging( + extra_frontend_names: list, missing_frontend_names: list, messages +): + fake_table = mock.MagicMock() + fake_table.frontend_db_name = lambda: "System A" + fake_table.backend_db_name = lambda: "System B" + module_under_test.check_table_columns_by_name_logging( + fake_table, fake_table, extra_frontend_names, missing_frontend_names, messages + ) + + @pytest.mark.parametrize( "frontend_columns,backend_columns,expected_return_dict", [ @@ -231,7 +261,7 @@ def test_check_table_columns_by_name( ), ], { - "COL_N2": GOE_TYPE_VARIABLE_STRING, + "COL_N2": bigquery_column.BIGQUERY_TYPE_STRING, }, ), # Strings to numbers is not currently supported. @@ -263,9 +293,9 @@ def test_check_table_columns_by_name( ), ], { - "COL_S2": GOE_TYPE_DECIMAL, - "COL_S3": GOE_TYPE_DECIMAL, - "COL_S4": GOE_TYPE_DECIMAL, + "COL_S2": bigquery_column.BIGQUERY_TYPE_NUMERIC, + "COL_S3": bigquery_column.BIGQUERY_TYPE_BIGNUMERIC, + "COL_S4": bigquery_column.BIGQUERY_TYPE_NUMERIC, }, ), # Strings to dates is not currently supported. 
@@ -297,9 +327,9 @@ def test_check_table_columns_by_name( ), ], { - "COL_S2": GOE_TYPE_DATE, - "COL_S3": GOE_TYPE_TIMESTAMP, - "COL_S4": GOE_TYPE_TIMESTAMP_TZ, + "COL_S2": bigquery_column.BIGQUERY_TYPE_DATE, + "COL_S3": bigquery_column.BIGQUERY_TYPE_DATETIME, + "COL_S4": bigquery_column.BIGQUERY_TYPE_TIMESTAMP, }, ), # Backend TIME is not supported. @@ -333,10 +363,10 @@ def test_check_table_columns_by_name( ), ], { - "COL_N2": GOE_TYPE_TIME, - "COL_S1": GOE_TYPE_TIME, - "COL_D1": GOE_TYPE_TIME, - "COL_T1": GOE_TYPE_TIME, + "COL_N2": bigquery_column.BIGQUERY_TYPE_TIME, + "COL_S1": bigquery_column.BIGQUERY_TYPE_TIME, + "COL_D1": bigquery_column.BIGQUERY_TYPE_TIME, + "COL_T1": bigquery_column.BIGQUERY_TYPE_TIME, }, ), ], @@ -353,3 +383,25 @@ def test_check_table_columns_by_type_oracle_to_bigquery( bigquery_table._columns = backend_columns result = module_under_test.check_table_columns_by_type(oracle_table, bigquery_table) assert result == expected_return_dict + + +@pytest.mark.parametrize( + "invalid_combinations", + [ + {}, + { + "COL_N2": bigquery_column.BIGQUERY_TYPE_TIME, + "COL_S1": bigquery_column.BIGQUERY_TYPE_TIME, + "COL_D1": bigquery_column.BIGQUERY_TYPE_TIME, + "COL_T1": bigquery_column.BIGQUERY_TYPE_TIME, + }, + ], +) +def test_check_table_columns_by_type_logging(invalid_combinations: dict, messages): + fake_table = mock.MagicMock() + fake_table.frontend_db_name = lambda: "System A" + fake_table.backend_db_name = lambda: "System B" + fake_table.get_column = lambda x: CanonicalColumn("COL-NAME", GOE_TYPE_INTEGER_1) + module_under_test.check_table_columns_by_type_logging( + fake_table, fake_table, invalid_combinations, messages + ) From 51f791a572cf298ef0d2869d487d42cf01677298 Mon Sep 17 00:00:00 2001 From: abb9979 Date: Wed, 17 Apr 2024 15:49:34 +0000 Subject: [PATCH 12/28] chore: repo upgrade for 1.0.4 --- .../source/sql/create_offload_repo_104.sql | 45 +++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 sql/oracle/source/sql/create_offload_repo_104.sql diff --git a/sql/oracle/source/sql/create_offload_repo_104.sql b/sql/oracle/source/sql/create_offload_repo_104.sql new file mode 100644 index 00000000..e62eb879 --- /dev/null +++ b/sql/oracle/source/sql/create_offload_repo_104.sql @@ -0,0 +1,45 @@ +/* +# Copyright 2016 The GOE Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +*/ + +define goe_offload_repo_version = '1.0.4' +define goe_offload_repo_comments = "GOE repo upgrades for &goe_offload_repo_version." + +PROMPT Installing GOE repository &goe_offload_repo_version.... 
+ +-- New seed data +-- ----------------------------------------------------------------------------------------------- + +DECLARE + PROCEDURE add_command_step ( p_code IN command_step.code%TYPE, + p_title IN command_step.title%TYPE ) IS + BEGIN + INSERT INTO command_step + (id, code, title, create_time) + VALUES + (command_step_seq.NEXTVAL, p_code, p_title, SYSTIMESTAMP); + END add_command_step; +BEGIN + add_command_step('DDL_FILE', 'Create DDL file'); +END; +/ + +-------------------------------------------------------------------------------------------------- +@@upgrade_offload_repo_version.sql + +PROMPT GOE repository &goe_offload_repo_version. installed. + +undefine goe_offload_repo_version +undefine goe_offload_repo_comments From ab447831d04fb4346a7e613ba435828be6852ef2 Mon Sep 17 00:00:00 2001 From: abb9979 Date: Wed, 17 Apr 2024 15:49:55 +0000 Subject: [PATCH 13/28] chore: repo upgrade for 1.0.4 --- sql/oracle/source/sql/install_offload_repo.sql | 1 + sql/oracle/source/sql/upgrade_offload_repo_deltas.sql | 1 + 2 files changed, 2 insertions(+) diff --git a/sql/oracle/source/sql/install_offload_repo.sql b/sql/oracle/source/sql/install_offload_repo.sql index 7a070341..503193b8 100644 --- a/sql/oracle/source/sql/install_offload_repo.sql +++ b/sql/oracle/source/sql/install_offload_repo.sql @@ -21,5 +21,6 @@ prompt Installing GOE repository... alter session set current_schema = &goe_db_repo_user; -- Start offload repo version files... @@create_offload_repo_100.sql +@@create_offload_repo_104.sql -- End offload repo version files. @@install_offload_repo_code.sql diff --git a/sql/oracle/source/sql/upgrade_offload_repo_deltas.sql b/sql/oracle/source/sql/upgrade_offload_repo_deltas.sql index 96ae2e70..514d3957 100644 --- a/sql/oracle/source/sql/upgrade_offload_repo_deltas.sql +++ b/sql/oracle/source/sql/upgrade_offload_repo_deltas.sql @@ -60,6 +60,7 @@ begin -- Follow this pattern for each repo version file in sequence... check_version(v_current_version, '1.0.0'); + check_version(v_current_version, '1.0.4'); end; / From 477a0317c12d732f8fc8a073f9ed3888a1e1097c Mon Sep 17 00:00:00 2001 From: nj1973 Date: Thu, 18 Apr 2024 09:03:29 +0000 Subject: [PATCH 14/28] feat: Decouple table creation and data loading --- src/goe/offload/operation/table_structure_checks.py | 2 +- .../integration/persistence/test_orchestration_metadata.py | 2 ++ tests/integration/scenarios/test_ddl_file.py | 7 +++++-- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/src/goe/offload/operation/table_structure_checks.py b/src/goe/offload/operation/table_structure_checks.py index 261c936f..cc8d1371 100644 --- a/src/goe/offload/operation/table_structure_checks.py +++ b/src/goe/offload/operation/table_structure_checks.py @@ -46,7 +46,7 @@ def check_table_structure( - Check data types are compatible across valid remappings, e.g. frontend date to backend string. """ frontend_cols = frontend_table.columns - backend_cols = backend_table.get_columns() + backend_cols = backend_table.get_non_synthetic_columns() # Check case insensitive names match. 
new_frontend_cols, missing_frontend_cols = check_table_columns_by_name( diff --git a/tests/integration/persistence/test_orchestration_metadata.py b/tests/integration/persistence/test_orchestration_metadata.py index 7bedc3a8..b8caafee 100644 --- a/tests/integration/persistence/test_orchestration_metadata.py +++ b/tests/integration/persistence/test_orchestration_metadata.py @@ -175,6 +175,8 @@ def _convert_to_test_metadata( test_metadata = source_metadata test_metadata["OFFLOADED_OWNER"] = new_owner test_metadata["OFFLOADED_TABLE"] = new_name + # There is a unique key on backend table too: + test_metadata["HADOOP_TABLE"] = new_name for k, v in test_metadata.items(): if v is None: # Fill in the blanks with dummy data. diff --git a/tests/integration/scenarios/test_ddl_file.py b/tests/integration/scenarios/test_ddl_file.py index c28532c2..bf1c76ec 100644 --- a/tests/integration/scenarios/test_ddl_file.py +++ b/tests/integration/scenarios/test_ddl_file.py @@ -73,11 +73,14 @@ def data_db(schema, config): def step_assertions(offload_messages): """Check that we didn't run Offload steps that come after the DDL file is produced.""" assert ( - command_steps.step_title(command_steps.STEP_CREATE_TABLE) - in offload_messages.steps + command_steps.step_title(command_steps.STEP_DDL_FILE) in offload_messages.steps ) # After creating the DDL file Offload should stop, therefore # we should never see data staged or loaded. + assert ( + command_steps.step_title(command_steps.STEP_CREATE_TABLE) + not in offload_messages.steps + ), f"We ran an offload step that shouldn't be run: {command_steps.step_title(command_steps.STEP_CREATE_TABLE)}" assert ( command_steps.step_title(command_steps.STEP_STAGING_TRANSPORT) not in offload_messages.steps From 0dd77ebc6fd9245a9bdf9d871b19d1da03c30ad5 Mon Sep 17 00:00:00 2001 From: nj1973 Date: Thu, 25 Apr 2024 16:13:51 +0000 Subject: [PATCH 15/28] feat: Decouple table creation and data loading --- src/goe/__init__.py | 17 +++ src/goe/config/config_descriptions.py | 24 ---- src/goe/config/option_descriptions.py | 40 +++++++ src/goe/goe.py | 23 +++- src/goe/listener/schemas/orchestration.py | 11 +- src/goe/offload/offload.py | 6 +- src/goe/offload/offload_constants.py | 6 +- src/goe/offload/operation/ddl_file.py | 12 +- src/goe/scripts/agg_validate.py | 4 +- .../scenarios/test_offload_basic.py | 109 +++++++++++++++++- .../oracle/oracle_frontend_testing_api.py | 95 +++++++++------ .../teradata/teradata_frontend_testing_api.py | 88 ++++++++------ tests/unit/offload/operation/test_ddl_file.py | 6 + 13 files changed, 327 insertions(+), 114 deletions(-) delete mode 100644 src/goe/config/config_descriptions.py create mode 100644 src/goe/config/option_descriptions.py diff --git a/src/goe/__init__.py b/src/goe/__init__.py index e69de29b..2b106b99 100644 --- a/src/goe/__init__.py +++ b/src/goe/__init__.py @@ -0,0 +1,17 @@ +# Copyright 2024 The GOE Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import importlib.metadata + +__version__ = importlib.metadata.version("goe") diff --git a/src/goe/config/config_descriptions.py b/src/goe/config/config_descriptions.py deleted file mode 100644 index ec0f1d2c..00000000 --- a/src/goe/config/config_descriptions.py +++ /dev/null @@ -1,24 +0,0 @@ -#! /usr/bin/env python3 - -# Copyright 2016 The GOE Authors. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" config_descriptions: Library of constants defining descriptions for configuration attributes - In the future we expect to refactor all option processing, including descriptions, and this module will - hopefully become redundant at that time. -""" - -DATA_SAMPLE_PARALLELISM = "Degree of parallelism to use when sampling RDBMS data for columns with no precision/scale properties. Values of 0 or 1 will execute the query without parallelism" - -VERIFY_PARALLELISM = "Degree of parallelism to use for the RDBMS query executed when validating an offload. Values of 0 or 1 will execute the query without parallelism. Values > 1 will force a parallel query of the given degree. If unset, the RDBMS query will fall back to using the behavior specified by RDBMS defaults" diff --git a/src/goe/config/option_descriptions.py b/src/goe/config/option_descriptions.py new file mode 100644 index 00000000..50e7f6d4 --- /dev/null +++ b/src/goe/config/option_descriptions.py @@ -0,0 +1,40 @@ +#! /usr/bin/env python3 + +# Copyright 2016 The GOE Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" option_descriptions: Library of constants defining descriptions for options. + In the future we expect to refactor all option processing, including descriptions, and this module will + may become redundant at that time. +""" + +DATA_SAMPLE_PARALLELISM = ( + "Degree of parallelism to use when sampling RDBMS data for columns with no precision/scale properties. " + "Values of 0 or 1 will execute the query without parallelism" +) + +RESET_BACKEND_TABLE = ( + "Remove backend data table. Use with caution - this will delete previously offloaded data for this table!", +) + +REUSE_BACKEND_TABLE = ( + "Allow Offload to re-use an empty backend table when there is already Offload metadata. " + "This may be useful if a backend table had data removed by an administrator and a re-offlaod is required" +) + +VERIFY_PARALLELISM = ( + "Degree of parallelism to use for the RDBMS query executed when validating an offload. " + "Values of 0 or 1 will execute the query without parallelism. 
Values > 1 will force a parallel query of the given degree. " + "If unset, the RDBMS query will fall back to using the behavior specified by RDBMS defaults" +) diff --git a/src/goe/goe.py b/src/goe/goe.py index 3940ecea..1f78f055 100644 --- a/src/goe/goe.py +++ b/src/goe/goe.py @@ -26,7 +26,7 @@ import orjson -from goe.config import orchestration_defaults +from goe.config import option_descriptions, orchestration_defaults from goe.config.config_validation_functions import normalise_size_option from goe.exceptions import OffloadException, OffloadOptionError from goe.filesystem.goe_dfs import ( @@ -243,6 +243,7 @@ "purge_backend_table", "reset_backend_table", "reset_hybrid_view", + "reuse_backend_table", "sort_columns_csv", "sqoop_additional_options", "sqoop_mapreduce_map_memory_mb", @@ -1662,7 +1663,14 @@ def defaults_for_existing_table( """ existing_metadata = self.get_hybrid_metadata() - if not existing_metadata: + if existing_metadata: + if not offload_target_table.has_rows() and not self.reuse_backend_table: + # If the table is empty but has metadata then we need to abort unless using --reuse-backend-table. + raise OffloadException( + offload_constants.METADATA_EMPTY_TABLE_EXCEPTION_TEMPLATE + % (self.owner.upper(), self.table_name.upper()) + ) + else: if offload_target_table.has_rows(): # If the table has rows but no metadata then we need to abort. raise OffloadException( @@ -2263,6 +2271,7 @@ def from_options( purge_backend_table=options.purge_backend_table, reset_backend_table=options.reset_backend_table, reset_hybrid_view=options.reset_hybrid_view, + reuse_backend_table=options.reuse_backend_table, skip=options.skip, sort_columns_csv=options.sort_columns_csv, sqoop_additional_options=options.sqoop_additional_options, @@ -2459,6 +2468,7 @@ def from_dict( ), reset_backend_table=operation_dict.get("reset_backend_table", False), reset_hybrid_view=operation_dict.get("reset_hybrid_view", False), + reuse_backend_table=operation_dict.get("reuse_backend_table", False), skip=operation_dict.get("skip", orchestration_defaults.skip_default()), sort_columns_csv=operation_dict.get( "sort_columns_csv", orchestration_defaults.sort_columns_default() @@ -3237,7 +3247,7 @@ def get_options(usage=None, operation_name=None): dest="reset_backend_table", action="store_true", default=False, - help="Remove backend data table. 
Use with caution - this will delete previously offloaded data for this table!", + help=option_descriptions.RESET_BACKEND_TABLE, ) opt.add_option( "--reset-hybrid-view", @@ -3246,6 +3256,13 @@ def get_options(usage=None, operation_name=None): default=False, help="Reset Incremental Partition Append or Predicate-Based Offload predicates in the hybrid view.", ) + opt.add_option( + "--reuse-backend-table", + dest="reuse_backend_table", + action="store_true", + default=False, + help=option_descriptions.REUSE_BACKEND_TABLE, + ) opt.add_option( "--purge", dest="purge_backend_table", diff --git a/src/goe/listener/schemas/orchestration.py b/src/goe/listener/schemas/orchestration.py index 8873fcfe..1cd6b760 100644 --- a/src/goe/listener/schemas/orchestration.py +++ b/src/goe/listener/schemas/orchestration.py @@ -24,6 +24,7 @@ from pydantic import UUID4, Field, Json, PositiveInt, validator # GOE +from goe.config import option_descriptions import goe.config.orchestration_defaults as defaults from goe.listener.schemas.base import BaseSchema, TotaledResults from goe.orchestration.execution_id import ExecutionId @@ -725,9 +726,7 @@ def set_offload_transport_validation_polling_interval(cls, value): reset_backend_table: Optional[bool] = Field( default=False, title="Reset backend table", - description=( - "Remove backend data table. Use with caution - this will delete previously offloaded data for this table!" - ), + description=option_descriptions.RESET_BACKEND_TABLE, cli=("--reset-backend-table"), ) reset_hybrid_view: Optional[bool] = Field( @@ -738,6 +737,12 @@ def set_offload_transport_validation_polling_interval(cls, value): ), cli=("--reset-hybrid-view"), ) + reuse_backend_table: Optional[bool] = Field( + default=False, + title="Reuse backend table", + description=option_descriptions.REUSE_BACKEND_TABLE, + cli=("--reuse-backend-table"), + ) skip: Optional[str] = Field( default=None, title="Skip Steps", diff --git a/src/goe/offload/offload.py b/src/goe/offload/offload.py index 08117725..7938acdc 100644 --- a/src/goe/offload/offload.py +++ b/src/goe/offload/offload.py @@ -21,7 +21,7 @@ from optparse import SUPPRESS_HELP from typing import TYPE_CHECKING -from goe.config import config_descriptions, orchestration_defaults +from goe.config import option_descriptions, orchestration_defaults from goe.exceptions import OffloadException from goe.filesystem.goe_dfs import VALID_OFFLOAD_FS_SCHEMES from goe.data_governance.hadoop_data_governance_constants import ( @@ -476,7 +476,7 @@ def get_offload_options(opt): type=int, dest="data_sample_parallelism", default=orchestration_defaults.data_sample_parallelism_default(), - help=config_descriptions.DATA_SAMPLE_PARALLELISM, + help=option_descriptions.DATA_SAMPLE_PARALLELISM, ) opt.add_option( "--ddl-file", @@ -695,5 +695,5 @@ def get_offload_options(opt): dest="verify_parallelism", type=int, default=orchestration_defaults.verify_parallelism_default(), - help=config_descriptions.VERIFY_PARALLELISM, + help=option_descriptions.VERIFY_PARALLELISM, ) diff --git a/src/goe/offload/offload_constants.py b/src/goe/offload/offload_constants.py index 7cab05e2..d91ea645 100644 --- a/src/goe/offload/offload_constants.py +++ b/src/goe/offload/offload_constants.py @@ -133,8 +133,12 @@ IPA_PREDICATE_TYPE_FIRST_OFFLOAD_EXCEPTION_TEXT = ( "--offload-predicate-type is not valid for a first time predicate-based offload" ) -IPA_PREDICATE_TYPE_REQUIRES_PREDICATE_EXCEPTION_TEXT = "Missing --offload-predicate option. 
This option is mandatory to offload tables with an INCREMENTAL_PREDICATE_TYPE configuration of PREDICATE" +IPA_PREDICATE_TYPE_REQUIRES_PREDICATE_EXCEPTION_TEXT = ( + "Missing --offload-predicate option. " + "This option is mandatory to offload tables with an INCREMENTAL_PREDICATE_TYPE configuration of PREDICATE" +) MISSING_METADATA_EXCEPTION_TEMPLATE = "Missing metadata for table %s.%s. Offload with --reset-backend-table to overwrite table data" +METADATA_EMPTY_TABLE_EXCEPTION_TEMPLATE = "Empty table %s.%s has metadata. Offload with --reuse-backend-table to populate this table" OFFLOAD_TYPE_CHANGE_FOR_LIST_EXCEPTION_TEXT = "Switching to offload type INCREMENTAL for LIST partitioned table requires --equal-to-values/--partition-names" OFFLOAD_TYPE_CHANGE_FOR_LIST_MESSAGE_TEXT = ( "Switching to INCREMENTAL for LIST partitioned table" diff --git a/src/goe/offload/operation/ddl_file.py b/src/goe/offload/operation/ddl_file.py index 7af91c78..3bc00551 100644 --- a/src/goe/offload/operation/ddl_file.py +++ b/src/goe/offload/operation/ddl_file.py @@ -16,6 +16,7 @@ import os from typing import TYPE_CHECKING +from goe import __version__ as package_version from goe.exceptions import OffloadOptionError from goe.filesystem.goe_dfs import get_scheme_from_location_uri from goe.filesystem.goe_dfs_factory import get_dfs_from_options @@ -31,6 +32,7 @@ DDL_FILE_HEADER = "Table DDL generated by GOE" DDL_FILE_HEADER_TEMPLATE = f"""-- {DDL_FILE_HEADER} -- Time: {{}} +-- Version: {{}} """ @@ -92,6 +94,12 @@ def normalise_ddl_file( validate_ddl_file(offload_operation.ddl_file) +def ddl_file_header() -> str: + return DDL_FILE_HEADER_TEMPLATE.format( + datetime.datetime.now().replace(microsecond=0).isoformat(), package_version + ) + + def write_ddl_to_ddl_file( ddl_file: str, ddl: list, @@ -101,9 +109,7 @@ def write_ddl_to_ddl_file( """Take a list of DDL strings and write them to a file""" assert ddl_file ddl_str = "\n".join(ddl) - header = DDL_FILE_HEADER_TEMPLATE.format( - datetime.datetime.now().replace(microsecond=0).isoformat() - ) + header = ddl_file_header() ddl_file_contents = f"{header}\n{ddl_str}" if ":" in ddl_file: # Cloud storage. 
diff --git a/src/goe/scripts/agg_validate.py b/src/goe/scripts/agg_validate.py index 013b0a20..3ff8cd61 100755 --- a/src/goe/scripts/agg_validate.py +++ b/src/goe/scripts/agg_validate.py @@ -24,7 +24,7 @@ import re import sys -from goe.config import config_descriptions, orchestration_defaults +from goe.config import option_descriptions, orchestration_defaults from goe.config.orchestration_config import OrchestrationConfig from goe.offload.offload_validation import ( CrossDbValidator, @@ -237,7 +237,7 @@ def parse_args(): "--frontend-parallelism", default=orchestration_defaults.verify_parallelism_default(), type=int, - help=config_descriptions.VERIFY_PARALLELISM, + help=option_descriptions.VERIFY_PARALLELISM, ) parser.add_option( "--skip-boundary-check", diff --git a/tests/integration/scenarios/test_offload_basic.py b/tests/integration/scenarios/test_offload_basic.py index 712dd111..eca70674 100644 --- a/tests/integration/scenarios/test_offload_basic.py +++ b/tests/integration/scenarios/test_offload_basic.py @@ -71,6 +71,7 @@ OFFLOAD_DIM = "STORY_DIM" OFFLOAD_DIM2 = "STORY_EXISTS_DIM" +OFFLOAD_DIM3 = "STORY_EXISTS_META_DIM" OFFLOAD_FACT = "STORY_FACT" OFFLOAD_FACT2 = "STORY_EXISTS_FACT" @@ -641,8 +642,8 @@ def test_offload_basic_fact(config, schema, data_db): frontend_api.close() -def test_offload_dim_to_existing_table(config, schema, data_db): - id = "test_offload_dim_to_existing_table" +def test_offload_dim_to_existing_table_no_metadata(config, schema, data_db): + id = "test_offload_dim_to_existing_table_no_metadata" messages = get_test_messages(config, id) backend_api = get_backend_testing_api(config, messages) frontend_api = get_frontend_testing_api(config, messages, trace_action=id) @@ -708,8 +709,108 @@ def test_offload_dim_to_existing_table(config, schema, data_db): frontend_api.close() -def test_offload_fact_to_existing_table(config, schema, data_db): - id = "test_offload_fact_to_existing_table" +def test_offload_dim_to_existing_table_with_metadata(config, schema, data_db): + id = "test_offload_dim_to_existing_table_with_metadata" + messages = get_test_messages(config, id) + backend_api = get_backend_testing_api(config, messages) + frontend_api = get_frontend_testing_api(config, messages, trace_action=id) + repo_client = orchestration_repo_client_factory( + config, messages, trace_action=f"repo_client({id})" + ) + + test_table = OFFLOAD_DIM3 + + # Setup + run_setup( + frontend_api, + backend_api, + config, + messages, + frontend_sqls=frontend_api.standard_dimension_frontend_ddl( + schema, test_table, empty=True + ), + python_fns=[ + lambda: drop_backend_test_table( + config, backend_api, messages, data_db, test_table + ), + lambda: drop_offload_metadata(repo_client, schema, test_table), + ], + ) + + # Offload the empty table. + options = { + "owner_table": schema + "." + test_table, + "reset_backend_table": True, + "create_backend_db": True, + "execute": True, + } + run_offload(options, config, messages) + + assert backend_table_exists( + config, backend_api, messages, data_db, test_table + ), "Backend table should exist" + assert ( + backend_table_count(config, backend_api, messages, data_db, test_table) == 0 + ), "Backend table should be empty" + + # Recreate the table but this time with data. + # Do not drop the metadata. + run_setup( + frontend_api, + backend_api, + config, + messages, + frontend_sqls=frontend_api.standard_dimension_frontend_ddl( + schema, + test_table, + ), + ) + + # Attempt to offload to the empty table - expect this to fail. 
+ options = { + "owner_table": schema + "." + test_table, + "execute": True, + } + run_offload( + options, + config, + messages, + expected_exception_string=( + offload_constants.METADATA_EMPTY_TABLE_EXCEPTION_TEMPLATE + % (schema.upper(), test_table.upper()) + ), + ) + + assert ( + backend_table_count(config, backend_api, messages, data_db, test_table) == 0 + ), "Backend table should be empty" + + # Offload to the empty table. + options = { + "owner_table": schema + "." + test_table, + "reuse_backend_table": True, + "execute": True, + } + run_offload(options, config, messages) + + assert ( + backend_table_count(config, backend_api, messages, data_db, test_table) > 0 + ), "Backend table should NOT be empty" + + # Re-try should do nothing, even with reuse option. + options = { + "owner_table": schema + "." + test_table, + "reuse_backend_table": True, + "execute": True, + } + run_offload(options, config, messages, expected_status=False) + + # Connections are being left open, explicitly close them. + frontend_api.close() + + +def test_offload_fact_to_existing_table_no_metadata(config, schema, data_db): + id = "test_offload_fact_to_existing_table_no_metadata" messages = get_test_messages(config, id) backend_api = get_backend_testing_api(config, messages) frontend_api = get_frontend_testing_api(config, messages, trace_action=id) diff --git a/tests/testlib/test_framework/oracle/oracle_frontend_testing_api.py b/tests/testlib/test_framework/oracle/oracle_frontend_testing_api.py index 4a641f53..4fef2a2b 100644 --- a/tests/testlib/test_framework/oracle/oracle_frontend_testing_api.py +++ b/tests/testlib/test_framework/oracle/oracle_frontend_testing_api.py @@ -2281,49 +2281,70 @@ def select_grant_exists( return bool(row[0] == "NO") def standard_dimension_frontend_ddl( - self, schema: str, table_name: str, extra_col_tuples: Optional[list] = None + self, + schema: str, + table_name: str, + extra_col_tuples: Optional[list] = None, + empty: bool = False, ) -> list: extra_cols = "" if extra_col_tuples: extra_cols = "," + ",".join( "{} AS {}".format(_[0], _[1]) for _ in extra_col_tuples ) - subquery = dedent( - f"""\ - SELECT CAST(1 AS NUMBER(15)) AS id - , CAST(2 AS NUMBER(4)) AS prod_id - , CAST(20121031 AS NUMBER(8)) AS txn_day - , DATE'2012-10-31' AS txn_date - , CAST(TIMESTAMP'2012-10-31 01:15:00' AS TIMESTAMP(3)) AS txn_time - , CAST(17.5 AS NUMBER(10,2)) AS txn_rate - , CAST('ABC' AS VARCHAR(50)) AS txn_desc - , CAST('ABC' AS CHAR(3)) AS txn_code - {extra_cols} - FROM dual - UNION ALL - SELECT CAST(2 AS NUMBER(15)) AS id - , CAST(3 AS NUMBER(4)) AS prod_id - , CAST(20121031 AS NUMBER(8)) AS txn_day - , DATE'2012-10-31' AS txn_date - , CAST(TIMESTAMP'2012-10-31 02:15:00' AS TIMESTAMP(3)) AS txn_time - , CAST(20 AS NUMBER(10,2)) AS txn_rate - , CAST('DEF' AS VARCHAR(50)) AS txn_desc - , CAST('DEF' AS CHAR(3)) AS txn_code - {extra_cols} - FROM dual - UNION ALL - SELECT CAST(3 AS NUMBER(15)) AS id - , CAST(4 AS NUMBER(4)) AS prod_id - , CAST(20121031 AS NUMBER(8)) AS txn_day - , DATE'2012-10-31' AS txn_date - , CAST(TIMESTAMP'2012-10-31 03:15:00' AS TIMESTAMP(3)) AS txn_time - , CAST(10.55 AS NUMBER(10,2)) AS txn_rate - , CAST('GHI' AS VARCHAR(50)) AS txn_desc - , CAST('GHI' AS CHAR(3)) AS txn_code - {extra_cols} - FROM dual - """ - ) + if empty: + subquery = dedent( + f"""\ + SELECT CAST(1 AS NUMBER(15)) AS id + , CAST(2 AS NUMBER(4)) AS prod_id + , CAST(20121031 AS NUMBER(8)) AS txn_day + , DATE'2012-10-31' AS txn_date + , CAST(TIMESTAMP'2012-10-31 01:15:00' AS TIMESTAMP(3)) AS txn_time + , CAST(17.5 
AS NUMBER(10,2)) AS txn_rate + , CAST('ABC' AS VARCHAR(50)) AS txn_desc + , CAST('ABC' AS CHAR(3)) AS txn_code + {extra_cols} + FROM dual + WHERE 1 = 2 + """ + ) + else: + subquery = dedent( + f"""\ + SELECT CAST(1 AS NUMBER(15)) AS id + , CAST(2 AS NUMBER(4)) AS prod_id + , CAST(20121031 AS NUMBER(8)) AS txn_day + , DATE'2012-10-31' AS txn_date + , CAST(TIMESTAMP'2012-10-31 01:15:00' AS TIMESTAMP(3)) AS txn_time + , CAST(17.5 AS NUMBER(10,2)) AS txn_rate + , CAST('ABC' AS VARCHAR(50)) AS txn_desc + , CAST('ABC' AS CHAR(3)) AS txn_code + {extra_cols} + FROM dual + UNION ALL + SELECT CAST(2 AS NUMBER(15)) AS id + , CAST(3 AS NUMBER(4)) AS prod_id + , CAST(20121031 AS NUMBER(8)) AS txn_day + , DATE'2012-10-31' AS txn_date + , CAST(TIMESTAMP'2012-10-31 02:15:00' AS TIMESTAMP(3)) AS txn_time + , CAST(20 AS NUMBER(10,2)) AS txn_rate + , CAST('DEF' AS VARCHAR(50)) AS txn_desc + , CAST('DEF' AS CHAR(3)) AS txn_code + {extra_cols} + FROM dual + UNION ALL + SELECT CAST(3 AS NUMBER(15)) AS id + , CAST(4 AS NUMBER(4)) AS prod_id + , CAST(20121031 AS NUMBER(8)) AS txn_day + , DATE'2012-10-31' AS txn_date + , CAST(TIMESTAMP'2012-10-31 03:15:00' AS TIMESTAMP(3)) AS txn_time + , CAST(10.55 AS NUMBER(10,2)) AS txn_rate + , CAST('GHI' AS VARCHAR(50)) AS txn_desc + , CAST('GHI' AS CHAR(3)) AS txn_code + {extra_cols} + FROM dual + """ + ) return self.gen_ctas_from_subquery( schema, table_name, subquery, with_stats_collection=True ) diff --git a/tests/testlib/test_framework/teradata/teradata_frontend_testing_api.py b/tests/testlib/test_framework/teradata/teradata_frontend_testing_api.py index a08d0304..4d89742f 100644 --- a/tests/testlib/test_framework/teradata/teradata_frontend_testing_api.py +++ b/tests/testlib/test_framework/teradata/teradata_frontend_testing_api.py @@ -2108,46 +2108,66 @@ def select_grant_exists( return bool(row[0] == "NO") def standard_dimension_frontend_ddl( - self, schema: str, table_name: str, extra_col_tuples: Optional[list] = None + self, + schema: str, + table_name: str, + extra_col_tuples: Optional[list] = None, + empty: bool = False, ) -> list: extra_cols = "" if extra_col_tuples: extra_cols = "," + ",".join( "{} AS {}".format(_[0], _[1]) for _ in extra_col_tuples ) - subquery = dedent( - f"""\ - SELECT CAST(1 AS NUMBER(15)) AS id - , CAST(2 AS NUMBER(4)) AS prod_id - , CAST(20120931 AS NUMBER(8)) AS txn_day - , DATE'2012-10-31' AS txn_date - , CAST(TIMESTAMP'2012-10-31 01:15:00' AS TIMESTAMP(3)) AS txn_time - , CAST(17.5 AS NUMBER(10,2)) AS txn_rate - , CAST('ABC' AS VARCHAR(50)) AS txn_desc - , CAST('ABC' AS CHAR(3)) AS txn_code - {extra_cols} - UNION ALL - SELECT CAST(2 AS NUMBER(15)) AS id - , CAST(3 AS NUMBER(4)) AS prod_id - , CAST(20121031 AS NUMBER(8)) AS txn_day - , DATE'2012-10-31' AS txn_date - , CAST(TIMESTAMP'2012-10-31 02:15:00' AS TIMESTAMP(3)) AS txn_time - , CAST(20 AS NUMBER(10,2)) AS txn_rate - , CAST('DEF' AS VARCHAR(50)) AS txn_desc - , CAST('DEF' AS CHAR(3)) AS txn_code - {extra_cols} - UNION ALL - SELECT CAST(3 AS NUMBER(15)) AS id - , CAST(4 AS NUMBER(4)) AS prod_id - , CAST(20121031 AS NUMBER(8)) AS txn_day - , DATE'2012-10-31' AS txn_date - , CAST(TIMESTAMP'2012-10-31 03:15:00' AS TIMESTAMP(3)) AS txn_time - , CAST(10.55 AS NUMBER(10,2)) AS txn_rate - , CAST('GHI' AS VARCHAR(50)) AS txn_desc - , CAST('GHI' AS CHAR(3)) AS txn_code - {extra_cols} - """ - ) + if empty: + subquery = dedent( + f"""\ + SELECT CAST(1 AS NUMBER(15)) AS id + , CAST(2 AS NUMBER(4)) AS prod_id + , CAST(20120931 AS NUMBER(8)) AS txn_day + , DATE'2012-10-31' AS txn_date + , 
CAST(TIMESTAMP'2012-10-31 01:15:00' AS TIMESTAMP(3)) AS txn_time + , CAST(17.5 AS NUMBER(10,2)) AS txn_rate + , CAST('ABC' AS VARCHAR(50)) AS txn_desc + , CAST('ABC' AS CHAR(3)) AS txn_code + {extra_cols} + WHERE 1 = 2 + """ + ) + else: + subquery = dedent( + f"""\ + SELECT CAST(1 AS NUMBER(15)) AS id + , CAST(2 AS NUMBER(4)) AS prod_id + , CAST(20120931 AS NUMBER(8)) AS txn_day + , DATE'2012-10-31' AS txn_date + , CAST(TIMESTAMP'2012-10-31 01:15:00' AS TIMESTAMP(3)) AS txn_time + , CAST(17.5 AS NUMBER(10,2)) AS txn_rate + , CAST('ABC' AS VARCHAR(50)) AS txn_desc + , CAST('ABC' AS CHAR(3)) AS txn_code + {extra_cols} + UNION ALL + SELECT CAST(2 AS NUMBER(15)) AS id + , CAST(3 AS NUMBER(4)) AS prod_id + , CAST(20121031 AS NUMBER(8)) AS txn_day + , DATE'2012-10-31' AS txn_date + , CAST(TIMESTAMP'2012-10-31 02:15:00' AS TIMESTAMP(3)) AS txn_time + , CAST(20 AS NUMBER(10,2)) AS txn_rate + , CAST('DEF' AS VARCHAR(50)) AS txn_desc + , CAST('DEF' AS CHAR(3)) AS txn_code + {extra_cols} + UNION ALL + SELECT CAST(3 AS NUMBER(15)) AS id + , CAST(4 AS NUMBER(4)) AS prod_id + , CAST(20121031 AS NUMBER(8)) AS txn_day + , DATE'2012-10-31' AS txn_date + , CAST(TIMESTAMP'2012-10-31 03:15:00' AS TIMESTAMP(3)) AS txn_time + , CAST(10.55 AS NUMBER(10,2)) AS txn_rate + , CAST('GHI' AS VARCHAR(50)) AS txn_desc + , CAST('GHI' AS CHAR(3)) AS txn_code + {extra_cols} + """ + ) return self.gen_ctas_from_subquery( schema, table_name, subquery, with_stats_collection=True ) diff --git a/tests/unit/offload/operation/test_ddl_file.py b/tests/unit/offload/operation/test_ddl_file.py index ce3fab3e..32cd4b3e 100644 --- a/tests/unit/offload/operation/test_ddl_file.py +++ b/tests/unit/offload/operation/test_ddl_file.py @@ -63,6 +63,12 @@ def test_normalise_ddl_file_auto(config: "OrchestrationConfig"): assert isinstance(fake_operation.ddl_file, str) +def test_ddl_file_header(): + h = module_under_test.ddl_file_header() + assert h + assert isinstance(h, str) + + @pytest.mark.parametrize( "path,expect_exception", [ From d32131ea97c91c46e80e6103143ddb0f2d84ff0e Mon Sep 17 00:00:00 2001 From: nj1973 Date: Fri, 10 May 2024 14:10:20 +0000 Subject: [PATCH 16/28] feat: Decouple table creation and data loading --- src/goe/config/option_descriptions.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/goe/config/option_descriptions.py b/src/goe/config/option_descriptions.py index 50e7f6d4..67db30a7 100644 --- a/src/goe/config/option_descriptions.py +++ b/src/goe/config/option_descriptions.py @@ -24,9 +24,7 @@ "Values of 0 or 1 will execute the query without parallelism" ) -RESET_BACKEND_TABLE = ( - "Remove backend data table. Use with caution - this will delete previously offloaded data for this table!", -) +RESET_BACKEND_TABLE = "Remove backend data table. Use with caution - this will delete previously offloaded data for this table!" REUSE_BACKEND_TABLE = ( "Allow Offload to re-use an empty backend table when there is already Offload metadata. 
" From 0674398f174fbde579cafad8506cab8a569aae17 Mon Sep 17 00:00:00 2001 From: nj1973 Date: Fri, 10 May 2024 16:15:12 +0000 Subject: [PATCH 17/28] feat: Decouple table creation and data loading --- src/goe/config/option_descriptions.py | 2 +- src/goe/offload/offload_constants.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/goe/config/option_descriptions.py b/src/goe/config/option_descriptions.py index 67db30a7..6f1fb99e 100644 --- a/src/goe/config/option_descriptions.py +++ b/src/goe/config/option_descriptions.py @@ -28,7 +28,7 @@ REUSE_BACKEND_TABLE = ( "Allow Offload to re-use an empty backend table when there is already Offload metadata. " - "This may be useful if a backend table had data removed by an administrator and a re-offlaod is required" + "This may be useful if a backend table had data removed by an administrator and a re-offload is required" ) VERIFY_PARALLELISM = ( diff --git a/src/goe/offload/offload_constants.py b/src/goe/offload/offload_constants.py index d91ea645..f65323a3 100644 --- a/src/goe/offload/offload_constants.py +++ b/src/goe/offload/offload_constants.py @@ -149,7 +149,7 @@ ) TOTAL_ROWS_OFFLOADED_LOG_TEXT = "Total rows offloaded" DDL_FILE_EXECUTE_MESSAGE_TEXT = ( - "Switching command to non-exectute mode due to --ddl-file option" + "Switching command to non-execute mode due to --ddl-file option" ) DDL_FILE_WRITE_MESSAGE_TEMPLATE = "Table DDL has been written to file: {}" From 35913219f42d22c776145ce43030b6db249959df Mon Sep 17 00:00:00 2001 From: nj1973 Date: Fri, 10 May 2024 16:28:58 +0000 Subject: [PATCH 18/28] feat: Decouple table creation and data loading --- src/goe/offload/operation/ddl_file.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/goe/offload/operation/ddl_file.py b/src/goe/offload/operation/ddl_file.py index 3bc00551..33d54162 100644 --- a/src/goe/offload/operation/ddl_file.py +++ b/src/goe/offload/operation/ddl_file.py @@ -110,7 +110,7 @@ def write_ddl_to_ddl_file( assert ddl_file ddl_str = "\n".join(ddl) header = ddl_file_header() - ddl_file_contents = f"{header}\n{ddl_str}" + ddl_file_contents = f"{header}\n{ddl_str}\n" if ":" in ddl_file: # Cloud storage. # dry_run=False below because, even in preview mode we need to write the file. 
From 2fe928ed37660bd785f254b16087b9e74064d141 Mon Sep 17 00:00:00 2001 From: nj1973 Date: Thu, 16 May 2024 17:55:43 +0000 Subject: [PATCH 19/28] feat: Add backend schema DDL to DDL file --- src/goe/offload/backend_api.py | 15 ++----- src/goe/offload/backend_table.py | 35 +++++++++++----- .../offload/bigquery/bigquery_backend_api.py | 40 +++++++++++-------- .../bigquery/bigquery_backend_table.py | 13 +++--- src/goe/offload/hadoop/hadoop_backend_api.py | 31 +++----------- .../offload/hadoop/hadoop_backend_table.py | 18 ++++++--- src/goe/offload/hadoop/hive_backend_api.py | 3 ++ src/goe/offload/hadoop/impala_backend_api.py | 3 ++ .../offload/microsoft/synapse_backend_api.py | 12 ++++-- .../microsoft/synapse_backend_table.py | 10 +++-- src/goe/offload/offload.py | 8 +++- .../snowflake/snowflake_backend_api.py | 12 ++++-- .../snowflake/snowflake_backend_table.py | 31 +++++++++----- tests/unit/offload/test_backend_api.py | 29 +++++++------- 14 files changed, 147 insertions(+), 113 deletions(-) diff --git a/src/goe/offload/backend_api.py b/src/goe/offload/backend_api.py index 62598b12..c4beb1ea 100644 --- a/src/goe/offload/backend_api.py +++ b/src/goe/offload/backend_api.py @@ -1154,23 +1154,15 @@ def compute_stats( """ @abstractmethod - def create_database(self, db_name, comment=None, properties=None): + def create_database( + self, db_name, comment=None, properties=None, with_terminator=False + ): """Create a backend database or equivalent container for tables (such as dataset or schema). properties: An optional dictionary to pass information to different backends, e.g.: properties={"location": "us-west"} Enables a BackendTable implementation to pass implementation specifics to BackendApi """ - @abstractmethod - def create_sequence_table(self, db_name, table_name): - """Create an empty options.sequence_table_name appropriate for the backend - options.sequence_table_name is tricky to deal with because we can leave the db part out, - e.g. both of these are valid: - options.sequence_table_name = 'udf_db.my_sequence_table - options.sequence_table_name = 'my_sequence_table - This function needs to cater for that - """ - @abstractmethod def create_table( self, @@ -1185,6 +1177,7 @@ def create_table( sort_column_names=None, without_db_name=False, sync=None, + with_terminator=False, ): """Create a table and cater for backend specific details, we need to pass all parameters even if they don't apply to certain backends but some may be ignored depending on the system involved: diff --git a/src/goe/offload/backend_table.py b/src/goe/offload/backend_table.py index a1381c9b..fa12bc68 100644 --- a/src/goe/offload/backend_table.py +++ b/src/goe/offload/backend_table.py @@ -368,7 +368,7 @@ def _check_partition_info_range_start_end(self, canonical_column, backend_column % (canonical_column.partition_info.range_end, range_max) ) - def _create_db(self, db_name, comment=None, properties=None): + def _create_db(self, db_name, comment=None, properties=None, with_terminator=False): """Call through to relevant BackendApi to create a final database/dataset/schema. location can mean different things to different backends. 
""" @@ -383,23 +383,32 @@ def _create_db(self, db_name, comment=None, properties=None): return [] else: return self._db_api.create_database( - db_name, comment=comment, properties=properties + db_name, + comment=comment, + properties=properties, + with_terminator=with_terminator, ) - def _create_final_db(self, location=None): + def _create_final_db(self, location=None, with_terminator=False): comment = BACKEND_DB_COMMENT_TEMPLATE.format( db_name_type="Offload", db_name_label=self.db_name_label() ) return self._create_db( - self.db_name, comment=comment, properties={"location": location} + self.db_name, + comment=comment, + properties={"location": location}, + with_terminator=with_terminator, ) - def _create_load_db(self, location=None): + def _create_load_db(self, location=None, with_terminator=False): comment = BACKEND_DB_COMMENT_TEMPLATE.format( db_name_type="Offload load", db_name_label=self.db_name_label() ) return self._create_db( - self._load_db_name, comment=comment, properties={"location": location} + self._load_db_name, + comment=comment, + properties={"location": location}, + with_terminator=with_terminator, ) def _create_result_cache_db(self, location=None): @@ -1072,7 +1081,7 @@ def _partition_key_out_of_range_message(self, column): def _recreate_load_table(self, staging_file): """Drop and create the staging/load table and any supporting filesystem directory""" self._drop_load_table() - self._create_load_table(staging_file) + return self._create_load_table(staging_file) def _result_cache_db_exists(self): return self._db_api.database_exists(self._result_cache_db_name) @@ -1507,7 +1516,7 @@ def _validate_staged_data_query_options(self): # enforced private methods @abstractmethod - def _create_load_table(self, staging_file): + def _create_load_table(self, staging_file, with_terminator=False) -> list: pass @abstractmethod @@ -2354,7 +2363,7 @@ def _warning(self, msg): # Final table enforced methods/properties @abstractmethod - def create_db(self): + def create_db(self, with_terminator=False) -> list: pass @abstractmethod @@ -2415,7 +2424,8 @@ def compute_final_table_stats_step( optional=True, ) - def create_backend_db_step(self): + def create_backend_db_step(self) -> list: + executed_commands = [] if self.create_database_supported() and self._user_requested_create_backend_db: ( pre_register_data_gov_fn, @@ -2431,8 +2441,11 @@ def create_backend_db_step(self): ), ) pre_register_data_gov_fn() - self._offload_step(command_steps.STEP_CREATE_DB, lambda: self.create_db()) + executed_commands: list = self._offload_step( + command_steps.STEP_CREATE_DB, lambda: self.create_db() + ) post_register_data_gov_fn() + return executed_commands def create_backend_table_step(self, goe_object_type) -> list: ( diff --git a/src/goe/offload/bigquery/bigquery_backend_api.py b/src/goe/offload/bigquery/bigquery_backend_api.py index a35a7c35..6f70d013 100644 --- a/src/goe/offload/bigquery/bigquery_backend_api.py +++ b/src/goe/offload/bigquery/bigquery_backend_api.py @@ -946,8 +946,11 @@ def compute_stats( ): raise NotImplementedError("Compute statistics does not apply for BigQuery") - def create_database(self, db_name, comment=None, properties=None): - """Use the BigQuery API to create a dataset. + def create_database( + self, db_name, comment=None, properties=None, with_terminator=False + ): + """Create a BigQuery dataset using SQL. + properties: Allows properties["location"] to specify a BigQuery location, e.g. 
"us-west" """ assert db_name @@ -961,22 +964,21 @@ def create_database(self, db_name, comment=None, properties=None): "Dataset already exists, not attempting to create: %s" % db_name, detail=VVERBOSE, ) - new_dataset = bigquery.Dataset(self._bq_dataset_id(db_name)) - log_cmd = "create_dataset(%s" % self._bq_dataset_id(db_name) + return [] + sql = "CREATE SCHEMA {}".format( + self.enclose_identifier(self._bq_dataset_id(db_name)) + ) + options = [] if comment: - log_cmd += ", description='%s'" % comment - new_dataset.description = comment + options.append(f"description='{comment}'") if properties and properties.get("location"): - log_cmd += ", location=%s" % properties["location"] - new_dataset.location = properties["location"] - log_cmd += ")" - self._log("BigQuery call: %s" % log_cmd, detail=VERBOSE) - if not self._dry_run: - self._client.create_dataset(new_dataset) - return [log_cmd] - - def create_sequence_table(self, db_name, table_name): - raise NotImplementedError("Sequence table does not apply for BigQuery") + options.append("location='{}'".format(properties["location"])) + if options: + sql += " OPTIONS({})".format(",".join(options)) + if with_terminator: + sql += ";" + cmds = self.execute_ddl(sql) + return cmds def create_table( self, @@ -991,8 +993,10 @@ def create_table( sort_column_names=None, without_db_name=False, sync=None, + with_terminator=False, ): - """Create a BigQuery table + """Create a BigQuery table. + sort_column_names: Only applicable for partitioned tables storage_format: Only used for external table otherwise FILE_STORAGE_FORMAT_BIGTABLE location: Only used for external table @@ -1013,6 +1017,8 @@ def create_table( table_properties=table_properties, sort_column_names=sort_column_names, ) + if with_terminator: + sql += ";" cmds = self.execute_ddl(sql, sync=sync) # Check table was created with KMS encryption if requested diff --git a/src/goe/offload/bigquery/bigquery_backend_table.py b/src/goe/offload/bigquery/bigquery_backend_table.py index 9059418a..5788f573 100644 --- a/src/goe/offload/bigquery/bigquery_backend_table.py +++ b/src/goe/offload/bigquery/bigquery_backend_table.py @@ -122,7 +122,7 @@ def __init__( # PRIVATE METHODS ########################################################################### - def _create_load_table(self, staging_file): + def _create_load_table(self, staging_file, with_terminator=False) -> list: """Create the staging/load table in BigQuery Defining a URI for location is tricky. Initially we had *.avro but Avro files written by Spark ThriftServer didn't match. Restrictions on the URI: @@ -146,7 +146,7 @@ def _create_load_table(self, staging_file): """ no_columns = no_partition_cols = [] load_table_location = "%s/part*" % (self.get_staging_table_location()) - self._db_api.create_table( + return self._db_api.create_table( self._load_db_name, self._load_table_name, no_columns, @@ -154,6 +154,7 @@ def _create_load_table(self, staging_file): storage_format=staging_file.file_format, location=load_table_location, external=True, + with_terminator=with_terminator, ) def _drop_load_table(self, sync=None): @@ -473,7 +474,7 @@ def compute_final_table_stats(self, incremental_stats, materialized_join=False): """Do nothing on BigQuery""" pass - def create_backend_table(self) -> list: + def create_backend_table(self, with_terminator=False) -> list: """Create a table in BigQuery based on object state. Creating a new table may change our world view so the function drops state if in execute mode. 
If dry_run then we leave state in place to allow other operations to preview. @@ -485,14 +486,16 @@ def create_backend_table(self) -> list: self.get_columns(), partition_column_names, sort_column_names=self._sort_columns, + with_terminator=with_terminator, ) if not self._dry_run: self._drop_state() return cmds - def create_db(self): + def create_db(self, with_terminator=False) -> list: return self._create_final_db( - location=self._orchestration_config.bigquery_dataset_location + location=self._orchestration_config.bigquery_dataset_location, + with_terminator=with_terminator, ) def default_udf_db_name(self): diff --git a/src/goe/offload/hadoop/hadoop_backend_api.py b/src/goe/offload/hadoop/hadoop_backend_api.py index ec43e5fe..03bcd62e 100644 --- a/src/goe/offload/hadoop/hadoop_backend_api.py +++ b/src/goe/offload/hadoop/hadoop_backend_api.py @@ -176,8 +176,6 @@ IMPALA_PROFILE_LOG_LENGTH = 1024 * 32 -SEQUENCE_COLUMN_NAME = "n" - ########################################################################### # GLOBAL FUNCTIONS @@ -776,7 +774,9 @@ def close(self): # which I (NJ) cannot fully track down. Instead I drop state so we'll start afresh if required. self.drop_state() - def create_database(self, db_name, comment=None, properties=None): + def create_database( + self, db_name, comment=None, properties=None, with_terminator=False + ): """Create a Hadoop database. properties: Allows properties["location"] to specify a DFS location """ @@ -791,32 +791,11 @@ def create_database(self, db_name, comment=None, properties=None): sql += " COMMENT '%s'" % comment if properties and properties.get("location"): sql += " LOCATION '%s'" % properties["location"] + if with_terminator: + sql += ";" return self.execute_ddl(sql) - def create_sequence_table(self, db_name, table_name): - """Create an empty options.sequence_table_name appropriate for the backend. - options.sequence_table_name is tricky to deal with because we can leave the db part out, - e.g. both of these are valid: - options.sequence_table_name = 'udf_db.my_sequence_table. - options.sequence_table_name = 'my_sequence_table. - This function needs to cater for that. 
- """ - assert table_name - without_db_name = bool(not db_name) - sequence_table_columns = [ - HadoopColumn(SEQUENCE_COLUMN_NAME, data_type=HADOOP_TYPE_INT) - ] - return self.create_table( - db_name, - table_name, - sequence_table_columns, - None, - FILE_STORAGE_FORMAT_PARQUET, - without_db_name=without_db_name, - sync=True, - ) - def create_view( self, db_name, diff --git a/src/goe/offload/hadoop/hadoop_backend_table.py b/src/goe/offload/hadoop/hadoop_backend_table.py index d64e7972..c77390bc 100644 --- a/src/goe/offload/hadoop/hadoop_backend_table.py +++ b/src/goe/offload/hadoop/hadoop_backend_table.py @@ -343,7 +343,7 @@ def _copy_load_table_avro_schema(self, staging_file): self._avro_schema_hdfs_path, data=avro_schema_str, overwrite=True ) - def _create_load_table(self, staging_file): + def _create_load_table(self, staging_file, with_terminator=False) -> list: """Create the staging/load table and supporting HDFS directory""" self._recreate_load_table_dir(include_remove=False) no_partition_cols = [] @@ -356,7 +356,7 @@ def _create_load_table(self, staging_file): table_properties = { "avro.schema.url": "%s%s" % (schema_fs_prefix, self._avro_schema_hdfs_path) } - self._db_api.create_table( + return self._db_api.create_table( self._load_db_name, self._load_table_name, self.convert_canonical_columns_to_backend( @@ -368,6 +368,7 @@ def _create_load_table(self, staging_file): external=True, table_properties=table_properties, sync=True, + with_terminator=with_terminator, ) def _create_new_backend_table(self, sort_column_names=None): @@ -650,12 +651,14 @@ def _tzoffset_to_timestamp_sql_expression(self, col_name): def cleanup_staging_area(self): self._drop_load_table(sync=True) - def create_backend_table(self) -> list: + def create_backend_table(self, with_terminator=False) -> list: """Create a table in the backend based on object state. Creating a new table may change our world view so the function drops state if in execute mode. If dry_run then we leave state in place to allow other operations to preview. 
""" - cmds = self._create_new_backend_table(sort_column_names=self._sort_columns) + cmds = self._create_new_backend_table( + sort_column_names=self._sort_columns, with_terminator=with_terminator + ) if not self._dry_run: # The CREATE TABLE above may have changed our world view so let's reset what we already know self._drop_state() @@ -816,8 +819,11 @@ def validate_type_conversions(self, staging_columns: list): # PUBLIC METHODS ########################################################################### - def create_db(self): - return self._create_final_db(location=self._get_data_db_hdfs_dir()) + def create_db(self, with_terminator=False) -> list: + return self._create_final_db( + location=self._get_data_db_hdfs_dir(), + with_terminator=with_terminator, + ) def default_udf_db_name(self): """By default we support UDF_DB but on Hadoop we use 'default' as a fall back""" diff --git a/src/goe/offload/hadoop/hive_backend_api.py b/src/goe/offload/hadoop/hive_backend_api.py index dab365ee..6b3cb2d2 100644 --- a/src/goe/offload/hadoop/hive_backend_api.py +++ b/src/goe/offload/hadoop/hive_backend_api.py @@ -375,6 +375,7 @@ def create_table( sort_column_names=None, without_db_name=False, sync=None, + with_terminator=False, ): """Create a table using HiveQL See abstract method for more description @@ -453,6 +454,8 @@ def create_table( "location_clause": location_clause, "table_prop_clause": table_prop_clause, } + if with_terminator: + sql += ";" return self.execute_ddl(sql, sync=sync) diff --git a/src/goe/offload/hadoop/impala_backend_api.py b/src/goe/offload/hadoop/impala_backend_api.py index 38c9a443..5888ae91 100644 --- a/src/goe/offload/hadoop/impala_backend_api.py +++ b/src/goe/offload/hadoop/impala_backend_api.py @@ -625,6 +625,7 @@ def create_table( sort_column_names=None, without_db_name=False, sync=None, + with_terminator=False, ): """Create a table using Impala SQL See abstract method for more description @@ -714,6 +715,8 @@ def create_table( "location_clause": location_clause, "table_prop_clause": table_prop_clause, } + if with_terminator: + sql += ";" return self.execute_ddl(sql, sync=sync) def create_udf( diff --git a/src/goe/offload/microsoft/synapse_backend_api.py b/src/goe/offload/microsoft/synapse_backend_api.py index 62490866..08436453 100644 --- a/src/goe/offload/microsoft/synapse_backend_api.py +++ b/src/goe/offload/microsoft/synapse_backend_api.py @@ -1079,7 +1079,9 @@ def compute_stats( ) return self.execute_ddl(sqls) if sqls else sqls - def create_database(self, db_name, comment=None, properties=None): + def create_database( + self, db_name, comment=None, properties=None, with_terminator=False + ): """Create a Synapse schema which is a database in GOE terminology. 
properties: not applicable comment: not applicable @@ -1095,11 +1097,10 @@ def create_database(self, db_name, comment=None, properties=None): detail=VVERBOSE, ) sql = "CREATE SCHEMA %s" % self.enclose_identifier(db_name) + if with_terminator: + sql += ";" return self.execute_ddl(sql) - def create_sequence_table(self, db_name, table_name): - raise NotImplementedError("Sequence table does not apply for Synapse") - def create_table( self, db_name, @@ -1113,6 +1114,7 @@ def create_table( sort_column_names=None, without_db_name=False, sync=None, + with_terminator=False, ): """Create an Azure Synapse SQL table See abstract method for more description @@ -1141,6 +1143,8 @@ def add_colation(column): table_properties=table_properties, sort_column_names=sort_column_names, ) + if with_terminator: + sql += ";" return self.execute_ddl(sql, sync=sync) def create_view( diff --git a/src/goe/offload/microsoft/synapse_backend_table.py b/src/goe/offload/microsoft/synapse_backend_table.py index 8888b748..13231289 100644 --- a/src/goe/offload/microsoft/synapse_backend_table.py +++ b/src/goe/offload/microsoft/synapse_backend_table.py @@ -141,7 +141,7 @@ def _compute_load_table_statistics(self): self._load_db_name, self._load_table_name, for_columns=True ) - def _create_load_table(self, staging_file): + def _create_load_table(self, staging_file, with_terminator=False) -> list: no_partition_cols = [] self._db_api.create_table( self._load_db_name, @@ -162,6 +162,7 @@ def _create_load_table(self, staging_file): ), }, sync=True, + with_terminator=with_terminator, ) def _drop_load_table(self, sync=None): @@ -433,7 +434,7 @@ def cleanup_staging_area(self): def compute_final_table_stats(self, incremental_stats, materialized_join=False): return self._db_api.compute_stats(self.db_name, self.table_name) - def create_backend_table(self) -> list: + def create_backend_table(self, with_terminator=False) -> list: """Create a table in Synapse based on object state. For efficiency, we compute backend stats immediately after table creation to initialise empty stats objects on each column. These will be updated using a single table level command after the final load. 
@@ -454,6 +455,7 @@ def create_backend_table(self) -> list: no_partition_columns, table_properties=table_properties, sort_column_names=self._sort_columns, + with_terminator=with_terminator, ) if ( self._offload_stats_method @@ -469,8 +471,8 @@ def create_backend_table(self) -> list: self._drop_state() return cmds - def create_db(self): - cmds = self._create_final_db() + def create_db(self, with_terminator=False): + cmds = self._create_final_db(with_terminator=with_terminator) return cmds def derive_unicode_string_columns(self, as_csv=False): diff --git a/src/goe/offload/offload.py b/src/goe/offload/offload.py index 7938acdc..19a6257b 100644 --- a/src/goe/offload/offload.py +++ b/src/goe/offload/offload.py @@ -77,7 +77,13 @@ def create_ddl_file_step( return def step_fn(): - ddl = offload_target_table.create_backend_table() + ddl = [] + if ( + offload_operation.create_backend_db + and offload_target_table.create_database_supported() + ): + ddl.extend(offload_target_table.create_db(with_terminator=True)) + ddl.extend(offload_target_table.create_backend_table(with_terminator=True)) write_ddl_to_ddl_file(offload_operation.ddl_file, ddl, config, messages) messages.offload_step(command_steps.STEP_DDL_FILE, step_fn, execute=False) diff --git a/src/goe/offload/snowflake/snowflake_backend_api.py b/src/goe/offload/snowflake/snowflake_backend_api.py index 9f54f347..37b05450 100644 --- a/src/goe/offload/snowflake/snowflake_backend_api.py +++ b/src/goe/offload/snowflake/snowflake_backend_api.py @@ -692,7 +692,9 @@ def compute_stats( """No manual controls of stats on Snowflake""" raise NotImplementedError("Compute statistics does not apply for Snowflake") - def create_database(self, db_name, comment=None, properties=None): + def create_database( + self, db_name, comment=None, properties=None, with_terminator=False + ): """Create a Snowflake schema which is a database in GOE terminology. properties["transient"]: Can be used to create a transient schema if value is truthy. 
""" @@ -719,11 +721,10 @@ def create_database(self, db_name, comment=None, properties=None): retention_clause, comment_clause, ) + if with_terminator: + sql += ";" return self.execute_ddl(sql) - def create_sequence_table(self, db_name, table_name): - raise NotImplementedError("Sequence table does not apply for Snowflake") - def create_table( self, db_name, @@ -737,6 +738,7 @@ def create_table( sort_column_names=None, without_db_name=False, sync=None, + with_terminator=False, ): """Create a Snowflake table partition_column_names: Not supported on Snowflake @@ -788,6 +790,8 @@ def create_table( "sort_by_clause": sort_by_clause, } ) + if with_terminator: + sql += ";" return self.execute_ddl(sql, sync=sync) def create_view( diff --git a/src/goe/offload/snowflake/snowflake_backend_table.py b/src/goe/offload/snowflake/snowflake_backend_table.py index 9a415a56..010e8c3e 100644 --- a/src/goe/offload/snowflake/snowflake_backend_table.py +++ b/src/goe/offload/snowflake/snowflake_backend_table.py @@ -125,7 +125,7 @@ def _cast_verification_query_options(self): """ return {"BINARY_INPUT_FORMAT": "BASE64"} - def _create_load_table(self, staging_file): + def _create_load_table(self, staging_file, with_terminator=False) -> list: raise NotImplementedError(self._not_implemented_message("Load table")) def _drop_load_table(self, sync=None): @@ -192,17 +192,22 @@ def _format_staging_object_name(self): } ) - def _gen_create_file_format_sql_text(self, file_format_name, staging_format): + def _gen_create_file_format_sql_text( + self, file_format_name, staging_format, with_terminator=False + ): null_if = ( " NULL_IF = 'null'" if staging_format == FILE_STORAGE_FORMAT_AVRO else "" ) - return "CREATE FILE FORMAT %s TYPE = %s%s" % ( + sql = "CREATE FILE FORMAT %s TYPE = %s%s" % ( self._db_api.enclose_object_reference(self.db_name, file_format_name), staging_format, null_if, ) + if with_terminator: + sql += ";" + return sql - def _gen_create_stage_sql_text(self): + def _gen_create_stage_sql_text(self, with_terminator=False): stage_url = self._get_dfs_client().gen_uri( self._offload_fs_scheme_override(), self._orchestration_config.offload_fs_container, @@ -210,7 +215,7 @@ def _gen_create_stage_sql_text(self): backend_db=self.db_name, container_override=self._offload_fs_container_override(), ) - return ( + sql = ( dedent( """\ CREATE STAGE %s @@ -225,6 +230,9 @@ def _gen_create_stage_sql_text(self): self._db_api.enclose_identifier(self._snowflake_integration), ) ) + if with_terminator: + sql += ";" + return sql def _gen_synthetic_partition_column_object(self, synthetic_name, partition_info): raise NotImplementedError( @@ -492,7 +500,7 @@ def compute_final_table_stats(self, incremental_stats, materialized_join=False): """We cannot influence stats on Snowflake and this should never be called due to capability setting""" pass - def create_backend_table(self) -> list: + def create_backend_table(self, with_terminator=False) -> list: """Create a table in Snowflake based on object state. Creating a new table may change our world view so the function drops state if in execute mode. If dry_run then we leave state in place to allow other operations to preview. 
@@ -504,12 +512,13 @@ def create_backend_table(self) -> list: self.get_columns(), no_partition_columns, sort_column_names=self._sort_columns, + with_terminator=with_terminator, ) if not self._dry_run: self._drop_state() return cmds - def create_db(self): + def create_db(self, with_terminator=False) -> list: """On Snowflake we create a SCHEMA, STAGE and all FILE FORMATs""" cmds = self._create_final_db() # Create STAGE and FILE FORMAT @@ -521,7 +530,9 @@ def create_db(self): detail=VERBOSE, ) else: - sqls.append(self._gen_create_stage_sql_text()) + sqls.append( + self._gen_create_stage_sql_text(with_terminator=with_terminator) + ) for staging_format in self._db_api.valid_staging_formats(): file_format = add_suffix_in_same_case( @@ -536,7 +547,9 @@ def create_db(self): ) else: sqls.append( - self._gen_create_file_format_sql_text(file_format, staging_format) + self._gen_create_file_format_sql_text( + file_format, staging_format, with_terminator=with_terminator + ) ) # TODO NJ@2020-11-12 Prepare result cache area for Hybrid Queries, temporary solution that will be removed diff --git a/tests/unit/offload/test_backend_api.py b/tests/unit/offload/test_backend_api.py index 8a2deeb8..ac809372 100644 --- a/tests/unit/offload/test_backend_api.py +++ b/tests/unit/offload/test_backend_api.py @@ -174,21 +174,20 @@ def _test_compute_stats(self): pass def _test_create_database(self): - if self.connect_to_backend: - try: - self.assertIsInstance(self.api.create_database(self.db), list) - self.assertIsInstance( - self.api.create_database(self.db, comment="Some comment"), list - ) - self.assertIsInstance( - self.api.create_database( - self.db, - properties={"location": "/some/place", "transient": True}, - ), - list, - ) - except NotImplementedError: - pass + try: + self.assertIsInstance(self.api.create_database(self.db), list) + self.assertIsInstance( + self.api.create_database(self.db, comment="Some comment"), list + ) + self.assertIsInstance( + self.api.create_database( + self.db, + properties={"location": "/some/place", "transient": True}, + ), + list, + ) + except NotImplementedError: + pass def _test_create_table(self): column_list = [ From b806a76f058a71128570bdbc2c28036d00c445c8 Mon Sep 17 00:00:00 2001 From: nj1973 Date: Fri, 17 May 2024 13:38:40 +0000 Subject: [PATCH 20/28] feat: Better message when heap table already has data --- src/goe/goe.py | 4 ++++ src/goe/offload/offload_constants.py | 1 + tests/integration/scenarios/test_offload_basic.py | 7 ++++++- 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/src/goe/goe.py b/src/goe/goe.py index 1f78f055..0db6510e 100644 --- a/src/goe/goe.py +++ b/src/goe/goe.py @@ -2670,6 +2670,10 @@ def offload_operation_logic( if source_data_client.nothing_to_offload(): return False else: + messages.notice( + offload_constants.TARGET_HAS_DATA_MESSAGE_TEMPLATE + % (offload_target_table.db_name, offload_target_table.table_name) + ) return False return True diff --git a/src/goe/offload/offload_constants.py b/src/goe/offload/offload_constants.py index f65323a3..328ddf54 100644 --- a/src/goe/offload/offload_constants.py +++ b/src/goe/offload/offload_constants.py @@ -147,6 +147,7 @@ RESET_HYBRID_VIEW_EXCEPTION_TEXT = ( "Offload data identification options required with --reset-hybrid-view" ) +TARGET_HAS_DATA_MESSAGE_TEMPLATE = "Target table %s.%s already has data. 
Offload with --reset-backend-table to overwrite table data" TOTAL_ROWS_OFFLOADED_LOG_TEXT = "Total rows offloaded" DDL_FILE_EXECUTE_MESSAGE_TEXT = ( "Switching command to non-execute mode due to --ddl-file option" diff --git a/tests/integration/scenarios/test_offload_basic.py b/tests/integration/scenarios/test_offload_basic.py index eca70674..126d6f40 100644 --- a/tests/integration/scenarios/test_offload_basic.py +++ b/tests/integration/scenarios/test_offload_basic.py @@ -45,6 +45,7 @@ sales_based_fact_assertion, standard_dimension_assertion, text_in_events, + text_in_messages, ) from tests.integration.scenarios.scenario_runner import ( run_offload, @@ -346,7 +347,11 @@ def test_offload_basic_dim(config, schema, data_db): "owner_table": schema + "." + test_table, "execute": True, } - run_offload(options, config, messages, expected_status=False) + offload_messages = run_offload(options, config, messages, expected_status=False) + assert text_in_messages( + offload_messages, + offload_constants.TARGET_HAS_DATA_MESSAGE_TEMPLATE % (data_db, backend_name), + ) # Reset offload the dimension adding backend partitioning (if supported). options = { From 06a8c594a951c3c36f8a5ce010a1e67a01037274 Mon Sep 17 00:00:00 2001 From: nj1973 Date: Fri, 17 May 2024 13:58:16 +0000 Subject: [PATCH 21/28] feat: Better message when heap table already has data --- src/goe/goe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/goe/goe.py b/src/goe/goe.py index 0db6510e..9d9314a5 100644 --- a/src/goe/goe.py +++ b/src/goe/goe.py @@ -1679,7 +1679,7 @@ def defaults_for_existing_table( ) # If the table is empty then we allow the offload to continue. messages.log( - f"Allowing Offload to populate exists empty table: {offload_target_table.db_name}.{offload_target_table.table_name}", + f"Allowing Offload to populate existing empty table: {offload_target_table.db_name}.{offload_target_table.table_name}", detail=VERBOSE, ) return None From 593dcfb0b9d047c1fe90a07e2ff1760bfc87d2b0 Mon Sep 17 00:00:00 2001 From: nj1973 Date: Tue, 21 May 2024 10:21:18 +0000 Subject: [PATCH 22/28] feat: Add backend load database DDL to DDL file --- src/goe/offload/backend_table.py | 10 +++++++++- src/goe/offload/offload.py | 1 + 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/src/goe/offload/backend_table.py b/src/goe/offload/backend_table.py index fa12bc68..812b80e1 100644 --- a/src/goe/offload/backend_table.py +++ b/src/goe/offload/backend_table.py @@ -400,7 +400,7 @@ def _create_final_db(self, location=None, with_terminator=False): with_terminator=with_terminator, ) - def _create_load_db(self, location=None, with_terminator=False): + def _create_load_db(self, location=None, with_terminator=False) -> list: comment = BACKEND_DB_COMMENT_TEMPLATE.format( db_name_type="Offload load", db_name_label=self.db_name_label() ) @@ -1802,6 +1802,14 @@ def create_conversion_view(self, column_tuples): sync=True, ) + def create_load_db(self, location=None, with_terminator=False) -> list: + if self._db_api.load_db_transport_supported(): + return self._create_load_db( + location=location, with_terminator=with_terminator + ) + else: + return [] + def db_exists(self): return self._db_api.database_exists(self.db_name) diff --git a/src/goe/offload/offload.py b/src/goe/offload/offload.py index 19a6257b..f370f4e8 100644 --- a/src/goe/offload/offload.py +++ b/src/goe/offload/offload.py @@ -83,6 +83,7 @@ def step_fn(): and offload_target_table.create_database_supported() ): 
ddl.extend(offload_target_table.create_db(with_terminator=True)) + ddl.extend(offload_target_table.create_load_db(with_terminator=True)) ddl.extend(offload_target_table.create_backend_table(with_terminator=True)) write_ddl_to_ddl_file(offload_operation.ddl_file, ddl, config, messages) From c260ce32d158565c1d84a58e79c902e4aedcf1ff Mon Sep 17 00:00:00 2001 From: nj1973 Date: Wed, 22 May 2024 10:39:37 +0000 Subject: [PATCH 23/28] feat: Remove trailing spaces from backend table DDL --- src/goe/offload/backend_api.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/goe/offload/backend_api.py b/src/goe/offload/backend_api.py index c4beb1ea..8b3c2a12 100644 --- a/src/goe/offload/backend_api.py +++ b/src/goe/offload/backend_api.py @@ -351,10 +351,9 @@ def _create_table_columns_clause_common(self, column_list, external=False): for _ in column_list ] max_name = max(len(_[0]) for _ in sql_cols) - max_type = max(len(_[1]) for _ in sql_cols) - col_template = f"%-{max_name}s %-{max_type}s %s" + col_template = f"%-{max_name}s %s%s" return " " + "\n, ".join( - [col_template % (_[0], _[1], _[2]) for _ in sql_cols] + [col_template % (_[0], _[1], f" {_[2]}" if _[2] else "") for _ in sql_cols] ) def _create_table_column_nn_clause_common( From a7f895e297236277c30bef138c93c3e5e2f44480 Mon Sep 17 00:00:00 2001 From: nj1973 Date: Thu, 23 May 2024 12:36:34 +0000 Subject: [PATCH 24/28] feat: Decouple table creation and data loading --- src/goe/goe.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/goe/goe.py b/src/goe/goe.py index 9d9314a5..faaba00e 100644 --- a/src/goe/goe.py +++ b/src/goe/goe.py @@ -1006,6 +1006,11 @@ def normalise_options(options, normalise_owner_table=True): "Invalid value for LOG_LEVEL: %s" % options.log_level ) + if options.reset_backend_table and options.reuse_backend_table: + raise OptionValueError( + "Conflicting options --reset-backend-table with --reuse-backend-table cannot be used together" + ) + if options.reset_backend_table and not options.force: options.force = True From 9f4b04df384793ae57521cb931a1ac2fbbc09ceb Mon Sep 17 00:00:00 2001 From: nj1973 Date: Thu, 23 May 2024 16:17:57 +0000 Subject: [PATCH 25/28] feat: Change BigQUery external table creation to be with SQL and not an API call --- .../offload/bigquery/bigquery_backend_api.py | 64 ++++++++----------- 1 file changed, 25 insertions(+), 39 deletions(-) diff --git a/src/goe/offload/bigquery/bigquery_backend_api.py b/src/goe/offload/bigquery/bigquery_backend_api.py index 6f70d013..55197a10 100644 --- a/src/goe/offload/bigquery/bigquery_backend_api.py +++ b/src/goe/offload/bigquery/bigquery_backend_api.py @@ -269,52 +269,34 @@ def _check_kms_key_name(self, kms_key_name: str, key_type="job"): ) def _create_external_table( - self, db_name, table_name, column_list, storage_format, location=None - ): - """Create a BigQuery external table using the API""" + self, + db_name, + table_name, + storage_format, + location=None, + with_terminator=False, + ) -> list: + """Create a BigQuery external table using SQL""" assert db_name assert table_name - if storage_format not in [ - FILE_STORAGE_FORMAT_AVRO, - FILE_STORAGE_FORMAT_PARQUET, - ]: - assert column_list - assert valid_column_list(column_list), ( - "Incorrectly formed column_list: %s" % column_list - ) assert storage_format in ( FILE_STORAGE_FORMAT_AVRO, - FILE_STORAGE_FORMAT_BIGTABLE, FILE_STORAGE_FORMAT_PARQUET, - ) + ), f"Unsupported staging format: {storage_format}" assert location - if storage_format in [FILE_STORAGE_FORMAT_AVRO, 
FILE_STORAGE_FORMAT_PARQUET]: - # In BigQuery Avro/Parquet external table are to be created over existing files, no schema can be specified - column_spec = None - else: - column_spec = [ - bigquery.SchemaField( - _.name, - _.format_data_type(), - mode="NULLABLE" if _.nullable else "REQUIRED", - ) - for _ in column_list - ] - - new_table = bigquery.Table( - self._bq_table_id(db_name, table_name), schema=column_spec + sql = """CREATE EXTERNAL TABLE {db_table} +OPTIONS (format ='{format}', + uris = ['{location}'], + description = 'GOE staging table'); +""".format( + db_table=self.enclose_object_reference(db_name, table_name), + format=storage_format, + location=location, ) - - external_config = bigquery.ExternalConfig(storage_format) - external_config.source_uris = [location] - new_table.external_data_configuration = external_config - - log_cmd = pprint.pformat(new_table.to_api_repr()) - self._log("BigQuery call: %s" % log_cmd, detail=VERBOSE) - if not self._dry_run: - created_table = self._client.create_table(new_table) - return [log_cmd] + if with_terminator: + sql += ";" + return self.execute_ddl(sql) def _create_table_properties_clause(self, table_properties): """Build OPTIONS clause for CREATE TABLE statements from table_properties dict. Add kms key details (if set)""" @@ -1005,7 +987,11 @@ def create_table( """ if external: return self._create_external_table( - db_name, table_name, column_list, storage_format, location=location + db_name, + table_name, + storage_format, + location=location, + with_terminator=with_terminator, ) # Normal (non-external) table From b56eac6006d239c5788f0d99b9cf8a7071eb2147 Mon Sep 17 00:00:00 2001 From: nj1973 Date: Fri, 5 Jul 2024 16:11:23 +0000 Subject: [PATCH 26/28] fix: Fix bug where load db dataset location is excluded from DDL file --- src/goe/offload/backend_table.py | 7 +++---- src/goe/offload/bigquery/bigquery_backend_table.py | 10 ++++++++++ src/goe/offload/hadoop/hadoop_backend_table.py | 10 ++++++++++ 3 files changed, 23 insertions(+), 4 deletions(-) diff --git a/src/goe/offload/backend_table.py b/src/goe/offload/backend_table.py index 812b80e1..5ad6e3ea 100644 --- a/src/goe/offload/backend_table.py +++ b/src/goe/offload/backend_table.py @@ -1802,11 +1802,10 @@ def create_conversion_view(self, column_tuples): sync=True, ) - def create_load_db(self, location=None, with_terminator=False) -> list: + def create_load_db(self, with_terminator=False) -> list: + """Generic code to create a load database, individual backends may have overrides.""" if self._db_api.load_db_transport_supported(): - return self._create_load_db( - location=location, with_terminator=with_terminator - ) + return self._create_load_db(with_terminator=with_terminator) else: return [] diff --git a/src/goe/offload/bigquery/bigquery_backend_table.py b/src/goe/offload/bigquery/bigquery_backend_table.py index 5788f573..9ad0fecf 100644 --- a/src/goe/offload/bigquery/bigquery_backend_table.py +++ b/src/goe/offload/bigquery/bigquery_backend_table.py @@ -498,6 +498,16 @@ def create_db(self, with_terminator=False) -> list: with_terminator=with_terminator, ) + def create_load_db(self, with_terminator=False) -> list: + """Create a load database.""" + if self._db_api.load_db_transport_supported(): + return self._create_load_db( + location=self._orchestration_config.bigquery_dataset_location, + with_terminator=with_terminator, + ) + else: + return [] + def default_udf_db_name(self): """By default we support UDF_DB but on BigQuery we use the data db as a fall back""" return self._udf_db or 
self.db_name diff --git a/src/goe/offload/hadoop/hadoop_backend_table.py b/src/goe/offload/hadoop/hadoop_backend_table.py index c77390bc..79fc4c3e 100644 --- a/src/goe/offload/hadoop/hadoop_backend_table.py +++ b/src/goe/offload/hadoop/hadoop_backend_table.py @@ -825,6 +825,16 @@ def create_db(self, with_terminator=False) -> list: with_terminator=with_terminator, ) + def create_load_db(self, with_terminator=False) -> list: + """Create a load database.""" + if self._db_api.load_db_transport_supported(): + return self._create_load_db( + location=self._get_load_db_hdfs_dir(), + with_terminator=with_terminator, + ) + else: + return [] + def default_udf_db_name(self): """By default we support UDF_DB but on Hadoop we use 'default' as a fall back""" return self._udf_db or "default" From c308100d754627ff93ce81664bde31e2da3b6eb1 Mon Sep 17 00:00:00 2001 From: nj1973 Date: Fri, 5 Jul 2024 16:13:16 +0000 Subject: [PATCH 27/28] chore: Typo --- src/goe/goe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/goe/goe.py b/src/goe/goe.py index faaba00e..0956757b 100644 --- a/src/goe/goe.py +++ b/src/goe/goe.py @@ -1008,7 +1008,7 @@ def normalise_options(options, normalise_owner_table=True): if options.reset_backend_table and options.reuse_backend_table: raise OptionValueError( - "Conflicting options --reset-backend-table with --reuse-backend-table cannot be used together" + "Conflicting options --reset-backend-table and --reuse-backend-table cannot be used together" ) if options.reset_backend_table and not options.force: From fa4f30545832cf7a49e336f6ced45655de40f6ea Mon Sep 17 00:00:00 2001 From: nj1973 Date: Mon, 8 Jul 2024 14:46:23 +0000 Subject: [PATCH 28/28] fix: Fix merge issues --- tests/integration/offload/test_data_type_controls.py | 1 + tests/integration/scenarios/test_offload_basic.py | 1 + .../integration/scenarios/test_offload_transport_oracle_iot.py | 3 +++ 3 files changed, 5 insertions(+) diff --git a/tests/integration/offload/test_data_type_controls.py b/tests/integration/offload/test_data_type_controls.py index a1f464fd..82db2f76 100644 --- a/tests/integration/offload/test_data_type_controls.py +++ b/tests/integration/offload/test_data_type_controls.py @@ -195,6 +195,7 @@ def test_data_type_mapping_offload(config, schema, data_db): "owner_table": schema + "." + DIM_NAME, "create_backend_db": True, "reset_backend_table": True, + "execute": True, } offload_modifiers = frontend_api.goe_type_mapping_offload_options( max_decimal_precision, max_decimal_scale, max_decimal_integral_magnitude diff --git a/tests/integration/scenarios/test_offload_basic.py b/tests/integration/scenarios/test_offload_basic.py index e89a6b7e..caae1dbd 100644 --- a/tests/integration/scenarios/test_offload_basic.py +++ b/tests/integration/scenarios/test_offload_basic.py @@ -938,6 +938,7 @@ def test_offload_log_path_gcs(config, schema, data_db): "owner_table": schema + "." 
+ table_name, "reset_backend_table": True, "create_backend_db": True, + "execute": True, } run_offload( options, diff --git a/tests/integration/scenarios/test_offload_transport_oracle_iot.py b/tests/integration/scenarios/test_offload_transport_oracle_iot.py index 752693b3..ce24bb46 100644 --- a/tests/integration/scenarios/test_offload_transport_oracle_iot.py +++ b/tests/integration/scenarios/test_offload_transport_oracle_iot.py @@ -179,6 +179,7 @@ def iot_num_dim_tests( "offload_transport_method": transport_method, "reset_backend_table": True, "create_backend_db": True, + "execute": True, } log_test_marker(messages, test_id) run_offload(options, config, messages) @@ -234,6 +235,7 @@ def iot_str_dim_tests( "offload_transport_method": transport_method, "reset_backend_table": True, "create_backend_db": True, + "execute": True, } log_test_marker(messages, test_id) run_offload(options, config, messages) @@ -289,6 +291,7 @@ def iot_ts_dim_tests( "offload_transport_method": transport_method, "reset_backend_table": True, "create_backend_db": True, + "execute": True, } log_test_marker(messages, test_id) run_offload(options, config, messages)
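
The net effect of the --ddl-file and with_terminator changes in this series is that a non-execute offload can collect schema, load-schema and table DDL as terminated statements and write them to a file instead of running them. The short sketch below is illustrative only: the function names are hypothetical stand-ins for the real create_db/create_load_db/create_backend_table and write_ddl_to_ddl_file calls shown in the diffs above, but it demonstrates the collect-then-write pattern under those assumptions.

# Illustrative sketch only. Stand-in functions approximate the DDL-file flow
# wired up in this series; they are not the real goe APIs.
from typing import List


def fake_create_db(with_terminator: bool = False) -> List[str]:
    # A backend table object returns the SQL it ran (or would run in dry-run mode).
    sql = "CREATE SCHEMA SH_H"
    return [sql + ";" if with_terminator else sql]


def fake_create_backend_table(with_terminator: bool = False) -> List[str]:
    sql = "CREATE TABLE SH_H.SALES (PROD_ID NUMERIC, TIME_ID TIMESTAMP)"
    return [sql + ";" if with_terminator else sql]


def write_ddl_to_file(path: str, ddl: List[str]) -> None:
    # Mirrors the write-to-file step: one terminated statement per block.
    with open(path, "w") as f:
        f.write("\n\n".join(ddl) + "\n")


if __name__ == "__main__":
    ddl: List[str] = []
    ddl.extend(fake_create_db(with_terminator=True))
    ddl.extend(fake_create_backend_table(with_terminator=True))
    write_ddl_to_file("offload_sh_sales.sql", ddl)

Appending the terminator only when with_terminator is set presumably keeps the unterminated form compatible with the existing backend execute paths, while the DDL file receives statements a user can run as a standalone script.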