diff --git a/import-automation/executor/Dockerfile b/import-automation/executor/Dockerfile index b473cea7fc..c62d6e043b 100644 --- a/import-automation/executor/Dockerfile +++ b/import-automation/executor/Dockerfile @@ -23,13 +23,8 @@ ENV JAVA_HOME=/usr/local/openjdk-17 COPY --from=openjdk:17-slim $JAVA_HOME $JAVA_HOME ENV PATH="${JAVA_HOME}/bin:${PATH}" -WORKDIR /workspace - -ADD requirements.txt /workspace/requirements.txt -RUN pip install -r /workspace/requirements.txt - RUN git clone https://github.com/datacommonsorg/data.git -RUN wget https://github.com/datacommonsorg/import/releases/download/0.1-alpha.1k/datacommons-import-tool-0.1-alpha.1-jar-with-dependencies.jar -COPY app/. /workspace/app/ - -CMD gunicorn --timeout 0 --workers 5 -b :$PORT app.main:FLASK_APP +WORKDIR /data/import-automation/executor +RUN wget https://storage.googleapis.com/datacommons_public/import_tools/datacommons-import-tool-0.1-alpha.1-jar-with-dependencies.jar +RUN pip install -r requirements.txt +ENTRYPOINT ["python", "main.py"] diff --git a/import-automation/executor/app/configs.py b/import-automation/executor/app/configs.py index 60e615a997..154b559ac8 100644 --- a/import-automation/executor/app/configs.py +++ b/import-automation/executor/app/configs.py @@ -84,6 +84,8 @@ class ExecutorConfig: dashboard_oauth_client_id: str = '' # Oauth Client ID used to authenticate with the proxy. importer_oauth_client_id: str = '' + # URL for the import executor container image. + importer_docker_image: str = 'gcr.io/datcom-ci/dc-import-executor:stable' # Access token of the account used to authenticate with GitHub. This is not # the account password. See # https://docs.github.com/en/github/authenticating-to-github/creating-a-personal-access-token. @@ -105,12 +107,18 @@ class ExecutorConfig: requirements_filename: str = 'requirements.txt' # ID of the location where Cloud Scheduler is hosted. scheduler_location: str = 'us-central1' + # Location of the local git data repo. + local_repo_dir: str = '/data' # Maximum time a user script can run for in seconds. user_script_timeout: float = 3600 # Arguments for the user script user_script_args: List[str] = () # Environment variables for the user script user_script_env: dict = None + # Invoke validations before upload. + invoke_import_validation: bool = False + # Import validation config file. + validation_config_file: str = 'tools/import_validation/validation_config.json' # Maximum time venv creation can take in seconds. venv_create_timeout: float = 3600 # Maximum time downloading a file can take in seconds. @@ -121,8 +129,10 @@ class ExecutorConfig: email_account: str = '' # The corresponding password, app password, or access token. email_token: str = '' - # Disbale email alert notifications. + # Disable email alert notifications. disable_email_notifications: bool = False + # Skip uploading the data to GCS (for local testing). + skip_gcs_upload: bool = False # Maximum time a blocking call to the importer to # perform an import can take in seconds. importer_import_timeout: float = 20 * 60 @@ -130,8 +140,8 @@ class ExecutorConfig: # delete an import can take in seconds. 
     importer_delete_timeout: float = 10 * 60
     # Executor type depends on where the executor runs
-    # Suppports one of: "GKE", "GAE"
-    executor_type: str = 'GAE'
+    # Supports one of: "GKE", "GAE", "CLOUD_RUN"
+    executor_type: str = 'CLOUD_RUN'
 
     def get_data_refresh_config(self):
         """Returns the config used for Cloud Scheduler data refresh jobs."""
diff --git a/import-automation/executor/app/executor/cloud_run.py b/import-automation/executor/app/executor/cloud_run.py
index 3895a4ace2..0452d3cd21 100644
--- a/import-automation/executor/app/executor/cloud_run.py
+++ b/import-automation/executor/app/executor/cloud_run.py
@@ -24,15 +24,12 @@
 from absl import logging
 from google.api_core.exceptions import NotFound
 from google.cloud import run_v2
+from google.protobuf import duration_pb2
 
 
-def create_or_update_cloud_run_job(
-    project_id: str,
-    location: str,
-    job_id: str,
-    image: str,
-    env_vars: dict,
-) -> run_v2.Job:
+def create_or_update_cloud_run_job(project_id: str, location: str, job_id: str,
+                                   image: str, env_vars: dict, args: list,
+                                   resources: dict,
+                                   timeout: int = 3600) -> run_v2.Job:
     """Creates a new cloud run job or updates an existing one.
 
     If the jobs exists, the container is updated with new image and environment
@@ -45,6 +42,9 @@ def create_or_update_cloud_run_job(
       job_id: Name of the job
       image: Container image URL such as 'gcr.io/your-project/your-image:latest'
       env_vars: dict of environment variables as {'VAR': ''}
+      args: list of command line arguments
+      resources: cpu/memory resources
+      timeout: duration in seconds (defaults to 3600)
 
     Returns:
       Job created as a dict.
@@ -59,17 +59,23 @@ def create_or_update_cloud_run_job(
     for var, value in env_vars.items():
         env.append(run_v2.EnvVar(name=var, value=value))
-    container = run_v2.Container(image=image, env=env)
-    exe_template = run_v2.ExecutionTemplate(template=run_v2.TaskTemplate(
-        containers=[container]))
+    res = run_v2.types.ResourceRequirements(limits=resources)
+    container = run_v2.Container(image=image, env=env, resources=res, args=args)
+    # Labels allow filtering of automated import cloud run jobs, used in log-based metrics.
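+    # Each task retries up to 2 times and is bounded by the given timeout (in seconds).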
+ exe_template = run_v2.ExecutionTemplate( + labels={"datacommons_cloud_run_job_type": "auto_import_job"}, + template=run_v2.TaskTemplate( + containers=[container], + max_retries=2, + timeout=duration_pb2.Duration(seconds=timeout))) new_job = run_v2.Job(template=exe_template) - logging.info(f"Creating job {job_name}: {new_job}") + logging.info(f"Creating job: {job_name}") # Look for existing job to update job = None try: job = client.get_job(request=run_v2.GetJobRequest(name=job_name)) - logging.info(f"Found existing job {job_name}: {job}") + logging.info(f"Found existing job: {job_name}") except NotFound: logging.info(f"No existing job, creating new job: {job_name}") @@ -85,11 +91,11 @@ def create_or_update_cloud_run_job( # Update existing Cloud Run job # Overrides container settings including image, env job.template.template.containers = new_job.template.template.containers - logging.info(f"Updating job {job_name}: {job}") + logging.info(f"Updating job: {job_name}") update_request = run_v2.UpdateJobRequest(job=job) update_operation = client.update_job(request=update_request) result = update_operation.result() # Blocks until update completes - logging.info(f"Job updated {job_name}: {result}") + logging.info(f"Job updated: {job_name}") return result diff --git a/import-automation/executor/app/executor/cloud_run_simple_import.py b/import-automation/executor/app/executor/cloud_run_simple_import.py index 380e3a3b19..51736d4f24 100644 --- a/import-automation/executor/app/executor/cloud_run_simple_import.py +++ b/import-automation/executor/app/executor/cloud_run_simple_import.py @@ -188,8 +188,11 @@ def cloud_run_simple_import_job( logging.info( f'Setting up simple import cloud run {project_id}:{job_id} for' f' {config_file} with output: {gcs_output_dir}, env: {env_vars}') + resources = {} + args = [] job = cloud_run.create_or_update_cloud_run_job(project_id, location, job_id, - image, env_vars) + image, env_vars, args, + resources) if not job: logging.error( f'Failed to setup cloud run job {job_id} for {config_file}') diff --git a/import-automation/executor/app/executor/cloud_scheduler.py b/import-automation/executor/app/executor/cloud_scheduler.py index e89ec592b8..68e70afd8f 100644 --- a/import-automation/executor/app/executor/cloud_scheduler.py +++ b/import-automation/executor/app/executor/cloud_scheduler.py @@ -26,6 +26,7 @@ from google.protobuf import json_format from google.api_core.exceptions import AlreadyExists, NotFound +CLOUD_RUN_SERVICE_ACCOUNT = os.getenv('CLOUD_SCHEDULER_CALLER_SA') GKE_SERVICE_DOMAIN = os.getenv('GKE_SERVICE_DOMAIN', 'importautomation.datacommons.org') GKE_CALLER_SERVICE_ACCOUNT = os.getenv('CLOUD_SCHEDULER_CALLER_SA') @@ -50,15 +51,38 @@ def _base_job_request(absolute_import_name, schedule: str): # 30m is the max allowed deadline 'seconds': 60 * 30 } - # <'http_request'|'appengine_job_request'>: {...} + # <'gke_job_request'|'appengine_job_request'|'cloud_run_job_request'>: {...} } -def http_job_request(absolute_import_name, - schedule, - json_encoded_job_body: str, - gke_caller_service_account: str = "", - gke_oauth_audience: str = "") -> Dict: +def cloud_run_job_request(absolute_import_name, schedule, + cloud_run_job_url: str, + cloud_run_service_account: str) -> Dict: + """Cloud Scheduler request that targets jobs in CLOUD_RUN.""" + json_encoded_job_body = json.dumps({}).encode("utf-8") + job = _base_job_request(absolute_import_name, schedule) + job_name = absolute_import_name.split(':')[1] + job['name'] = f'{job_name}' + job['http_target'] = { + 'uri': 
f'https://{cloud_run_job_url}', + 'http_method': 'POST', + 'headers': { + 'Content-Type': 'application/json', + }, + 'body': json_encoded_job_body, + 'oauth_token': { + 'service_account_email': f'{cloud_run_service_account}', + 'scope': 'https://www.googleapis.com/auth/cloud-platform' + } + } + return job + + +def gke_job_request(absolute_import_name, + schedule, + json_encoded_job_body: str, + gke_caller_service_account: str = "", + gke_oauth_audience: str = "") -> Dict: """Cloud Scheduler request that targets executors launched in GKE.""" # If the service account and oauth audience are provided as diff --git a/import-automation/executor/app/executor/import_executor.py b/import-automation/executor/app/executor/import_executor.py index 8396f95465..33740da080 100644 --- a/import-automation/executor/app/executor/import_executor.py +++ b/import-automation/executor/app/executor/import_executor.py @@ -17,15 +17,29 @@ """ import dataclasses +import glob import json import logging import os +import sys import subprocess import tempfile import time import traceback from typing import Callable, Dict, Iterable, List, Optional, Tuple +REPO_DIR = os.path.dirname( + os.path.dirname( + os.path.dirname( + os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))) +sys.path.append(os.path.join(REPO_DIR, 'tools', 'import_differ')) +sys.path.append(os.path.join(REPO_DIR, 'tools', 'import_validation')) +sys.path.append(os.path.join(REPO_DIR, 'util')) + +import file_util + +from import_differ import ImportDiffer +from import_validation import ImportValidation from app import configs from app import utils from app.executor import cloud_run_simple_import @@ -34,6 +48,7 @@ from app.service import file_uploader from app.service import github_api from app.service import import_service +from google.cloud import storage # Email address for status messages. _DEBUG_EMAIL_ADDR = 'datacommons-debug+imports@google.com' @@ -240,7 +255,6 @@ def _execute_imports_on_commit_helper( with tempfile.TemporaryDirectory() as tmpdir: repo_dir = self.github.download_repo( tmpdir, commit_sha, self.config.repo_download_timeout) - logging.info(f'Downloaded repo: {repo_dir}') imports_to_execute = import_target.find_imports_to_execute( @@ -318,6 +332,100 @@ def _import_one( ) raise exc + def _invoke_import_validation(self, repo_dir: str, relative_import_dir: str, + absolute_import_dir: str, + import_spec: dict) -> None: + """ + Performs validations on import data. + """ + import_inputs = import_spec.get('import_inputs', []) + for import_input in import_inputs: + mcf_path = import_input['node_mcf'] + if not mcf_path: + # TODO: Generate node mcf using dc-import tool + logging.error( + 'Empty node_mcf in manifest, skipping validation.') + current_data_path = os.path.join(absolute_import_dir, mcf_path) + previous_data_path = os.path.join(absolute_import_dir, + 'previous_data.mcf') + summary_stats = os.path.join(absolute_import_dir, + 'summary_report.csv') + validation_output_path = os.path.join(absolute_import_dir, + 'validation') + config_file = import_spec.get('validation_config_file', '') + if config_file: + config_file_path = os.path.join(absolute_import_dir, + config_file) + else: + config_file_path = os.path.join( + repo_dir, self.config.validation_config_file) + logging.info(f'Validation config file: {config_file_path}') + + # Download previous import data. 
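+            # The latest released version is read from latest_version.txt under the import's GCS folder.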
+ bucket = storage.Client(self.config.gcs_project_id).bucket( + self.config.storage_prod_bucket_name) + folder = relative_import_dir + '/' + import_spec['import_name'] + '/' + blob = bucket.blob(folder + 'latest_version.txt') + if not blob: + logging.error( + f'Not able to download latest_version.txt from {folder}, skipping validation.' + ) + return + latest_version = blob.download_as_text() + blob = bucket.blob(folder + latest_version + '/' + mcf_path) + if not blob: + logging.error( + f'Not able to download previous import from {latest_version}, skipping validation.' + ) + return + # blob.download_to_filename(previous_data_path) + + # Invoke differ script. + differ = ImportDiffer(current_data_path, previous_data_path, + validation_output_path) + differ.run_differ() + + # Invoke validation script. + validation_output = os.path.join(validation_output_path, + 'validation_output.csv') + differ_output = os.path.join(validation_output_path, + 'point_analysis_summary.csv') + validation = ImportValidation(config_file_path, differ_output, + summary_stats, validation_output) + validation.run_validations() + + def _invoke_import_job(self, absolute_import_dir: str, import_spec: dict, + version: str, interpreter_path: str, + process: subprocess.CompletedProcess) -> None: + script_paths = import_spec.get('scripts') + for path in script_paths: + script_path = os.path.join(absolute_import_dir, path) + simple_job = cloud_run_simple_import.get_simple_import_job_id( + import_spec, script_path) + if simple_job: + # Running simple import as cloud run job. + cloud_run_simple_import.cloud_run_simple_import_job( + import_spec=import_spec, + config_file=script_path, + env=self.config.user_script_env, + version=version, + image=import_spec.get('image'), + ) + else: + # Run import script locally. + script_interpreter = _get_script_interpreter( + script_path, interpreter_path) + process = _run_user_script( + interpreter_path=script_interpreter, + script_path=script_path, + timeout=self.config.user_script_timeout, + args=self.config.user_script_args, + cwd=absolute_import_dir, + env=self.config.user_script_env, + ) + _log_process(process=process) + process.check_returncode() + def _import_one_helper( self, repo_dir: str, @@ -341,7 +449,8 @@ def _import_one_helper( requirements_path = os.path.join(absolute_import_dir, self.config.requirements_filename) central_requirements_path = os.path.join( - repo_dir, self.config.requirements_filename) + repo_dir, 'import-automation', 'executor', + self.config.requirements_filename) interpreter_path, process = _create_venv( (central_requirements_path, requirements_path), tmpdir, @@ -351,35 +460,23 @@ def _import_one_helper( _log_process(process=process) process.check_returncode() - script_paths = import_spec.get('scripts') - for path in script_paths: - script_path = os.path.join(absolute_import_dir, path) - simple_job = cloud_run_simple_import.get_simple_import_job_id( - import_spec, script_path) - if simple_job: - # Running simple import as cloud run job. - cloud_run_simple_import.cloud_run_simple_import_job( - import_spec=import_spec, - config_file=script_path, - env=self.config.user_script_env, - version=version, - image=import_spec.get('image'), - ) - else: - # Run import script locally. 
- script_interpreter = _get_script_interpreter( - script_path, interpreter_path) - process = _run_user_script( - interpreter_path=script_interpreter, - script_path=script_path, - timeout=self.config.user_script_timeout, - args=self.config.user_script_args, - cwd=absolute_import_dir, - env=self.config.user_script_env, - name=import_name, - ) - _log_process(process=process) - process.check_returncode() + self._invoke_import_job(absolute_import_dir=absolute_import_dir, + import_spec=import_spec, + version=version, + interpreter_path=interpreter_path, + process=process) + + if self.config.invoke_import_validation: + logging.info("Invoking import validations") + self._invoke_import_validation( + repo_dir=repo_dir, + relative_import_dir=relative_import_dir, + absolute_import_dir=absolute_import_dir, + import_spec=import_spec) + + if self.config.skip_gcs_upload: + logging.info("Skipping GCS upload") + return inputs = self._upload_import_inputs( import_dir=absolute_import_dir, @@ -388,6 +485,14 @@ def _import_one_helper( import_spec=import_spec, ) + validation_output_path = os.path.join(absolute_import_dir, 'validation') + for filepath in glob.iglob(f'{validation_output_path}/*.csv'): + dest = f'{relative_import_dir}/{import_name}/{version}/validation/{os.path.basename(filepath)}' + self.uploader.upload_file( + src=filepath, + dest=dest, + ) + if self.importer: self.importer.delete_previous_output(relative_import_dir, import_spec) @@ -438,13 +543,32 @@ def _upload_import_inputs( for import_input in import_inputs: for input_type in self.config.import_input_types: path = import_input.get(input_type) - if path: - dest = f'{output_dir}/{version}/{os.path.basename(path)}' - self._upload_file_helper( - src=os.path.join(import_dir, path), - dest=dest, - ) - setattr(uploaded, input_type, dest) + if not path: + continue + for file in file_util.file_get_matching( + os.path.join(import_dir, path)): + if file: + dest = f'{output_dir}/{version}/{os.path.basename(file)}' + self._upload_file_helper( + src=file, + dest=dest, + ) + uploaded_dest = f'{output_dir}/{version}/{os.path.basename(path)}' + setattr(uploaded, input_type, uploaded_dest) + + # Upload any files downloaded from source + source_files = [ + os.path.join(import_dir, file) + for file in import_spec.get('source_files', []) + ] + source_files = file_util.file_get_matching(source_files) + for file in source_files: + dest = f'{output_dir}/{version}/source_files/{os.path.basename(file)}' + self._upload_file_helper( + src=file, + dest=dest, + ) + self.uploader.upload_string( version, os.path.join(output_dir, self.config.storage_version_filename)) @@ -467,15 +591,15 @@ def _import_metadata_mcf_helper(self, import_spec: dict) -> str: Args: import_spec: Specification of the import as a dict. - + Returns: import_metadata_mcf node. 
""" node = _IMPORT_METADATA_MCF_TEMPLATE.format_map({ "import_name": import_spec.get('import_name'), - "last_data_refresh_date": _clean_date(utils.pacific_time()) + "last_data_refresh_date": _clean_date(utils.utctime()) }) - next_data_refresh_date = utils.next_pacific_date( + next_data_refresh_date = utils.next_utc_date( import_spec.get('cron_schedule')) if next_data_refresh_date: node += f'nextDataRefreshDate: "{next_data_refresh_date}"\n' diff --git a/import-automation/executor/app/executor/scheduler_job_manager.py b/import-automation/executor/app/executor/scheduler_job_manager.py index e1f7e775e7..e22afb9e3e 100644 --- a/import-automation/executor/app/executor/scheduler_job_manager.py +++ b/import-automation/executor/app/executor/scheduler_job_manager.py @@ -33,6 +33,7 @@ import traceback import tempfile from typing import Dict +import cloud_run from app import configs from app.service import github_api @@ -40,6 +41,9 @@ from app.executor import import_executor from app.executor import cloud_scheduler +_GKE_SERVICE_ACCOUNT_KEY: str = 'gke_service_account' +_GKE_OAUTH_AUDIENCE_KEY: str = 'gke_oauth_audience' + def schedule_on_commit(github: github_api.GitHubRepoAPI, config: configs.ExecutorConfig, commit_sha: str): @@ -66,18 +70,13 @@ def schedule_on_commit(github: github_api.GitHubRepoAPI, scheduled = [] for relative_dir, spec in imports_to_execute: - schedule = spec.get('cron_schedule') - if not schedule: - manifest_path = os.path.join(relative_dir, - config.manifest_filename) - raise KeyError(f'cron_schedule not found in {manifest_path}') try: absolute_import_name = import_target.get_absolute_import_name( relative_dir, spec['import_name']) logging.info('Scheduling a data update job for %s', absolute_import_name) - job = _create_or_update_import_schedule(absolute_import_name, - schedule, config) + job = create_or_update_import_schedule(absolute_import_name, + spec, config, {}) scheduled.append(job) except Exception: raise import_executor.ExecutionError( @@ -87,26 +86,70 @@ def schedule_on_commit(github: github_api.GitHubRepoAPI, 'No issues') -def _create_or_update_import_schedule(absolute_import_name, schedule: str, - config: configs.ExecutorConfig): +def create_or_update_import_schedule(absolute_import_name: str, + import_spec: dict, + config: configs.ExecutorConfig, + scheduler_config_dict: Dict): """Create/Update the import schedule for 1 import.""" - # Note: this is the content of what is passed to /update API - # inside each cronjob http calls. - json_encoded_job_body = json.dumps({ - 'absolute_import_name': absolute_import_name, - 'configs': config.get_data_refresh_config() - }).encode() + schedule = import_spec.get('cron_schedule') + if not schedule: + raise KeyError( + f'cron_schedule not found in manifest for {absolute_import_name}') + resources = {"cpu": "2", "memory": "4G"} # default resources. + if 'resource_limits' in import_spec: + resources.update(import_spec['resource_limits']) + timeout = config.user_script_timeout + if 'user_script_timeout' in import_spec: + timeout = import_spec['user_script_timeout'] if config.executor_type == "GKE": - req = cloud_scheduler.http_job_request(absolute_import_name, schedule, - json_encoded_job_body) + # Note: this is the content of what is passed to /update API + # inside each cronjob http calls. + json_encoded_job_body = json.dumps({ + 'absolute_import_name': absolute_import_name, + 'configs': config.get_data_refresh_config() + }).encode('utf-8') + # Before proceeding, ensure that the configs read from GCS have the expected fields. 
+ assert _GKE_SERVICE_ACCOUNT_KEY in scheduler_config_dict + assert _GKE_OAUTH_AUDIENCE_KEY in scheduler_config_dict + service_account_key = scheduler_config_dict[_GKE_SERVICE_ACCOUNT_KEY] + oauth_audience_key = scheduler_config_dict[_GKE_OAUTH_AUDIENCE_KEY] + req = cloud_scheduler.gke_job_request(absolute_import_name, schedule, + json_encoded_job_body, + service_account_key, + oauth_audience_key) elif config.executor_type == "GAE": + json_encoded_job_body = json.dumps({ + 'absolute_import_name': absolute_import_name, + 'configs': config.get_data_refresh_config() + }).encode('utf-8') req = cloud_scheduler.appengine_job_request(absolute_import_name, schedule, json_encoded_job_body) + elif config.executor_type == "CLOUD_RUN": + docker_image = config.importer_docker_image + job_name = absolute_import_name.split(':')[1] + + json_encoded_config = json.dumps(config.get_data_refresh_config()) + args = [ + f'--import_name={absolute_import_name}', + f'--import_config={json_encoded_config}' + ] + env_vars = {} + job = cloud_run.create_or_update_cloud_run_job( + config.gcp_project_id, config.scheduler_location, job_name, + docker_image, env_vars, args, resources, timeout) + job_id = job.name.rsplit('/', 1)[1] + if not job: + logging.error( + f'Failed to setup cloud run job for {absolute_import_name}') + cloud_run_job_url = f'{config.scheduler_location}-run.googleapis.com/apis/run.googleapis.com/v1/namespaces/{config.gcp_project_id}/jobs/{job_id}:run' + req = cloud_scheduler.cloud_run_job_request( + absolute_import_name, schedule, cloud_run_job_url, + scheduler_config_dict[_GKE_SERVICE_ACCOUNT_KEY]) else: raise Exception( - "Invalid executor_type %s, expects one of ('GKE', 'GAE')", + "Invalid executor_type %s, expects one of ('GKE', 'GAE', 'CLOUD_RUN')", config.executor_type) return cloud_scheduler.create_or_update_job(config.gcp_project_id, diff --git a/import-automation/executor/app/utils.py b/import-automation/executor/app/utils.py index 6d320c62cb..fe1a04d10b 100644 --- a/import-automation/executor/app/utils.py +++ b/import-automation/executor/app/utils.py @@ -40,9 +40,9 @@ def pacific_time(): return datetime.datetime.now(pytz.timezone(_PACIFIC_TIME)).isoformat() -def next_pacific_date(cron_expression: str, from_time: str = None) -> str: - """Returns the next date from today in ISO8601 with timezone - America/Los_Angeles, given a cron schedule. +def next_utc_date(cron_expression: str, from_time: str = None) -> str: + """Returns the next date from today in ISO8601 with timezone UTC+0, + given a cron schedule. Args: cron_expression: Expression for cron schedule. @@ -53,7 +53,7 @@ def next_pacific_date(cron_expression: str, from_time: str = None) -> str: """ try: if not from_time: - from_time = datetime.datetime.now(pytz.timezone(_PACIFIC_TIME)) + from_time = datetime.datetime.now(datetime.timezone.utc) iter = croniter(cron_expression, from_time) return iter.get_next(datetime.datetime).date().isoformat() except Exception as e: diff --git a/import-automation/executor/cloudbuild.yaml b/import-automation/executor/cloudbuild.yaml new file mode 100644 index 0000000000..0d185b7a76 --- /dev/null +++ b/import-automation/executor/cloudbuild.yaml @@ -0,0 +1,36 @@ +# Builds the docker image of import executor, verifies using integration test, +# and pushes it to artifact registry. +# +# Run it using: +# gcloud builds submit --config=cloudbuild.yaml --substitutions=_DOCKER_IMAGE="us-docker.pkg.dev/datcom-ci/gcr.io/dc-import-executor" . 
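+#
+# Note: COMMIT_SHA is populated automatically for builds started by a Cloud Build trigger.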
+
+steps:
+  # Docker Build
+  - name: 'gcr.io/cloud-builders/docker'
+    args: ['build', '-t', '${_DOCKER_IMAGE}:${COMMIT_SHA}', '-t', '${_DOCKER_IMAGE}:latest', '.']
+    dir: 'import-automation/executor'
+
+  # Docker push to Google Artifact Registry
+  - name: 'gcr.io/cloud-builders/docker'
+    args: ['push', '${_DOCKER_IMAGE}', '--all-tags']
+
+  # Install dependencies
+  - name: python:3.11.11
+    entrypoint: pip
+    args: ['install', '-r', 'requirements.txt', '--user']
+    dir: 'import-automation/executor'
+
+  # Run import executor integration test
+  - name: python:3.11.11
+    entrypoint: python
+    args: ['verify_import.py']
+    dir: 'import-automation/executor'
+
+  # Tag image as stable and push
+  - name: 'gcr.io/cloud-builders/docker'
+    entrypoint: 'bash'
+    args:
+      - '-c'
+      - |
+        docker tag ${_DOCKER_IMAGE}:${COMMIT_SHA} ${_DOCKER_IMAGE}:stable \
+        && docker push ${_DOCKER_IMAGE}:stable
diff --git a/import-automation/executor/main.py b/import-automation/executor/main.py
new file mode 100644
index 0000000000..696c27f25c
--- /dev/null
+++ b/import-automation/executor/main.py
@@ -0,0 +1,91 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Import executor entrypoint.
+"""
+import logging
+import json
+import os
+import time
+
+from absl import flags
+from absl import app
+
+from app import configs
+from app.executor import import_executor
+from app.service import file_uploader
+from app.service import github_api
+from app.service import email_notifier
+
+FLAGS = flags.FLAGS
+flags.DEFINE_string('import_name', '', 'Absolute import name.')
+flags.DEFINE_string('import_config', '', 'Import executor configuration.')
+
+CLOUD_RUN_JOB_NAME = os.getenv("CLOUD_RUN_JOB")
+# The `log_type` label helps filter log lines, which is useful for creating
+# log-based metrics. Each log type has a similar set of fields for easier parsing.
+LOG_TYPE_LABEL = "log_type"
+# log_type for capturing status of auto import cloud run jobs.
+# Required fields - log_type, message, status, latency_secs.
+AUTO_IMPORT_JOB_STATUS_LOG_TYPE = "auto-import-job-status"
+
+
+def scheduled_updates(absolute_import_name: str, import_config: str):
+    """
+    Invokes import update workflow.
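+
+    Args:
+        absolute_import_name: Absolute import name of the form
+            '<path/to/import/dir>:<import_name>'.
+        import_config: JSON-encoded ExecutorConfig fields for this run.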
+ """ + start_time = time.time() + logging.info(absolute_import_name) + cfg = json.loads(import_config) + config = configs.ExecutorConfig(**cfg) + executor = import_executor.ImportExecutor( + uploader=file_uploader.GCSFileUploader( + project_id=config.gcs_project_id, + bucket_name=config.storage_prod_bucket_name), + github=github_api.GitHubRepoAPI( + repo_owner_username=config.github_repo_owner_username, + repo_name=config.github_repo_name, + auth_username=config.github_auth_username, + auth_access_token=config.github_auth_access_token), + config=config, + notifier=email_notifier.EmailNotifier(config.email_account, + config.email_token), + local_repo_dir=config.local_repo_dir) + result = executor.execute_imports_on_update(absolute_import_name) + logging.info(result) + elapsed_time_secs = int(time.time() - start_time) + message = (f"Cloud Run Job [{CLOUD_RUN_JOB_NAME}] completed with status= " + f"[{result.status}] in [{elapsed_time_secs}] seconds.)") + # With Python logging lib, json is interpreted as text (populates textPayload field). + # Using print to populate json as structured logs (populate jsonPayload field). + # Ref: https://cloud.google.com/functions/docs/monitoring/logging#writing_structured_logs + print( + json.dumps({ + LOG_TYPE_LABEL: AUTO_IMPORT_JOB_STATUS_LOG_TYPE, + "message": message, + "severity": "INFO" if result.status == 'succeeded' else "ERROR", + "status": result.status, + "latency_secs": elapsed_time_secs, + })) + if result.status == 'failed': + return 1 + return 0 + + +def main(_): + return scheduled_updates(FLAGS.import_name, FLAGS.import_config) + + +if __name__ == '__main__': + app.run(main) diff --git a/import-automation/executor/requirements.txt b/import-automation/executor/requirements.txt index 956b49e547..92692ffb3a 100644 --- a/import-automation/executor/requirements.txt +++ b/import-automation/executor/requirements.txt @@ -1,14 +1,40 @@ -requests -protobuf +# Requirements for Python scripts in this repo that have automation enabled! 
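+# This file is also included by the top-level requirements_all.txt (via -r).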
+ +absl-py +arcgis2geojson +beautifulsoup4 +chardet +croniter +dataclasses==0.6 +datacommons +frozendict +func-timeout==4.3.5 +geojson==2.5.0 google-auth -google-cloud-logging +google-cloud-bigquery +google-cloud-datastore google-cloud-run google-cloud-storage -google-cloud-datastore -google-cloud-scheduler==2.10.0 -gspread -flask +google-cloud-logging +google-cloud-scheduler +gspread==5.12.0 gunicorn +lxml==4.9.1 +numpy==1.26.4 +openpyxl>=3.1.0 +pandas +protobuf +psutil +pylint +pytest pytz -absl-py -croniter +ratelimit +requests==2.27.1 +requests_cache +retry==0.9.2 +shapely==1.8.5 +urllib3==1.26.8 +xarray==0.19.0 +xlrd +xlsxwriter==3.2.0 +zipp diff --git a/import-automation/executor/schedule_update_import.py b/import-automation/executor/schedule_update_import.py index b815faca52..9b1253af03 100644 --- a/import-automation/executor/schedule_update_import.py +++ b/import-automation/executor/schedule_update_import.py @@ -25,6 +25,7 @@ from app.executor import import_target from app.executor import import_executor from app.executor import cloud_scheduler +from app.executor import scheduler_job_manager from app.executor import validation from app.service import email_notifier from app.service import file_uploader @@ -32,8 +33,6 @@ from google.cloud import storage _CONFIG_OVERRIDE_FILE: str = 'config_override.json' -_GKE_SERVICE_ACCOUNT_KEY: str = 'gke_service_account' -_GKE_OAUTH_AUDIENCE_KEY: str = 'gke_oauth_audience' _FLAGS = flags.FLAGS @@ -62,8 +61,8 @@ logging.basicConfig(level=logging.INFO) -def _get_cron_schedule(repo_dir: str, absolute_import_path: str, - manifest_filename: str): +def _get_import_spec(repo_dir: str, absolute_import_path: str, + manifest_filename: str): # Retain the path to the import (ignoring the name of the import). path = absolute_import_path.split(":")[0] @@ -79,7 +78,7 @@ def _get_cron_schedule(repo_dir: str, absolute_import_path: str, for spec in manifest['import_specifications']: if absolute_import_path.endswith(':' + spec['import_name']): - return spec['cron_schedule'] + return spec # If we are here, the the import name was not found in the manifest. raise Exception( @@ -240,34 +239,6 @@ def update(cfg: configs.ExecutorConfig, return executor.execute_imports_on_update(absolute_import_path) -def schedule(cfg: configs.ExecutorConfig, - absolute_import_name: str, - repo_dir: str, - gke_service_account: str = "", - gke_oauth_audience: str = "") -> Dict: - # This is the content of what is passed to /update API - # inside each cronjob http calls from Cloud Scheduler. - json_encoded_job_body = json.dumps({ - 'absolute_import_name': absolute_import_name, - 'configs': cfg.get_data_refresh_config() - }).encode("utf-8") - - # Retrieve the cron schedule. - cron_schedule = _get_cron_schedule(repo_dir, absolute_import_name, - cfg.manifest_filename) - - # Create an HTTP Job Request. - req = cloud_scheduler.http_job_request( - absolute_import_name, - cron_schedule, - json_encoded_job_body, - gke_caller_service_account=gke_service_account, - gke_oauth_audience=gke_oauth_audience) - - return cloud_scheduler.create_or_update_job(cfg.gcp_project_id, - cfg.scheduler_location, req) - - def main(_): mode = _FLAGS.mode absolute_import_path = _FLAGS.absolute_import_path @@ -335,19 +306,14 @@ def main(_): _print_fileupload_results(cfg, absolute_import_path) elif mode == 'schedule': - # Before proceeding, ensure that the configs read from GCS have the expected fields. 
- assert _GKE_SERVICE_ACCOUNT_KEY in scheduler_config_dict - assert _GKE_OAUTH_AUDIENCE_KEY in scheduler_config_dict - logging.info("*************************************************") logging.info("***** Beginning Schedule Operation **************") logging.info("*************************************************") - res = schedule( - cfg, - absolute_import_path, - repo_dir, - gke_service_account=scheduler_config_dict[_GKE_SERVICE_ACCOUNT_KEY], - gke_oauth_audience=scheduler_config_dict[_GKE_OAUTH_AUDIENCE_KEY]) + # Retrieve the cron schedule. + import_spec = _get_import_spec(repo_dir, absolute_import_path, + cfg.manifest_filename) + res = scheduler_job_manager.create_or_update_import_schedule( + absolute_import_path, import_spec, cfg, scheduler_config_dict) logging.info("*************************************************") logging.info("*********** Schedule Operation Complete. ********") logging.info("*************************************************") diff --git a/import-automation/executor/test/cloud_scheduler_test.py b/import-automation/executor/test/cloud_scheduler_test.py index 468249bedb..952ed3a1fb 100644 --- a/import-automation/executor/test/cloud_scheduler_test.py +++ b/import-automation/executor/test/cloud_scheduler_test.py @@ -60,15 +60,15 @@ def test_appengine_job_request(self): } assert DeepDiff(got, want) == {} - def test_http_job_request(self): + def test_gke_job_request(self): absolute_import_name = "scripts/preprocess:A" schedule = "0 5 * * *" json_encoded_job_body = '{"k":"v"}' cloud_scheduler.GKE_CALLER_SERVICE_ACCOUNT = 'account' cloud_scheduler.GKE_OAUTH_AUDIENCE = 'audience' - got = cloud_scheduler.http_job_request(absolute_import_name, schedule, - json_encoded_job_body) + got = cloud_scheduler.gke_job_request(absolute_import_name, schedule, + json_encoded_job_body) want = { 'name': 'scripts_preprocess_A_GKE', 'description': 'scripts/preprocess:A', @@ -97,3 +97,41 @@ def test_http_job_request(self): } } assert DeepDiff(got, want) == {} + + def test_cloud_run_job_request(self): + absolute_import_name = "scripts/preprocess:A" + schedule = "0 5 * * *" + + cloud_run_service_account = 'service_account' + cloud_run_job_url = 'run.googleapis.com/run' + got = cloud_scheduler.cloud_run_job_request(absolute_import_name, + schedule, cloud_run_job_url, + cloud_run_service_account) + want = { + 'name': 'A', + 'description': 'scripts/preprocess:A', + 'schedule': "0 5 * * *", + 'time_zone': 'Etc/UTC', + 'retry_config': { + 'retry_count': 2, + 'min_backoff_duration': { + 'seconds': 60 * 60 + } + }, + 'attempt_deadline': { + 'seconds': 60 * 30 + }, + 'http_target': { + 'uri': 'https://run.googleapis.com/run', + 'http_method': 'POST', + 'headers': { + 'Content-Type': 'application/json', + }, + 'body': b'{}', + 'oauth_token': { + 'service_account_email': 'service_account', + 'scope': 'https://www.googleapis.com/auth/cloud-platform' + } + } + } + assert DeepDiff(got, want) == {} diff --git a/import-automation/executor/test/utils_test.py b/import-automation/executor/test/utils_test.py index cc29475398..55e9655e64 100644 --- a/import-automation/executor/test/utils_test.py +++ b/import-automation/executor/test/utils_test.py @@ -47,15 +47,14 @@ def test_pacific_time_to_datetime_then_back(self): time_datetime = datetime.datetime.fromisoformat(time_iso) self.assertEqual(time_iso, time_datetime.isoformat()) - def test_next_pacific_date(self): - """Tests next_pacific_date.""" + def test_next_utc_date(self): + """Tests next_utc_date.""" # At 00:00 on Friday. 
cron_expression = '0 0 * * FRI' # Friday. from_time = datetime.datetime(2024, 12, 13) - self.assertEqual( - app.utils.next_pacific_date(cron_expression, from_time), - '2024-12-20') + self.assertEqual(app.utils.next_utc_date(cron_expression, from_time), + '2024-12-20') def test_download_file(self): """Response does not have a Content-Disposition header.""" diff --git a/import-automation/executor/verify_import.py b/import-automation/executor/verify_import.py new file mode 100644 index 0000000000..274c0a3006 --- /dev/null +++ b/import-automation/executor/verify_import.py @@ -0,0 +1,70 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Integration test for import executor image rollout. +Runs a test import as a Cloud Run job and verifies output from GCS. +""" +import logging +import os + +from app.executor import cloud_run +from app.executor import file_io +from test import utils + +_PROJECT_ID = 'datcom-ci' +_JOB_ID = 'dc-import-prober' +_LOCATION = 'us-central1' +_GCS_BUCKET = 'datcom-ci-test' +_IMPORT_NAME = 'scripts/us_fed/treasury_constant_maturity_rates:USFed_ConstantMaturityRates' + + +def run_test(): + logging.basicConfig(level=logging.INFO) + # Execute the cloud run job. + logging.info('Running cloud run job: %s', _JOB_ID) + job = cloud_run.execute_cloud_run_job(_PROJECT_ID, _LOCATION, _JOB_ID) + if not job: + logging.error('Failed to execute cloud run job: %s', _JOB_ID) + raise (AssertionError(f'Failed to execute cloud run job {_JOB_ID}')) + logging.info('Completed run: %s', _JOB_ID) + + # Download output data from GCS. + gcs_path = 'gs://' + _GCS_BUCKET + '/' + _IMPORT_NAME.replace(':', + '/') + '/' + file_path = gcs_path + 'latest_version.txt' + logging.info('Downloading data from GCS: %s', file_path) + blob = file_io.file_get_gcs_blob(file_path, True) + if not blob: + logging.error('Failed to get data from: %s', file_path) + raise (AssertionError(f'Failed to get data from {file_path}')) + timestamp = blob.download_as_text() + file_path = gcs_path + timestamp + '/' + 'treasury_constant_maturity_rates.mcf' + logging.info('Downloading data from GCS: %s', file_path) + blob = file_io.file_get_gcs_blob(file_path, True) + if not blob: + logging.error('Failed to get data from: %s', file_path) + raise (AssertionError(f'Failed to get data from {file_path}')) + + # Compare output data with expected data. + output_file = 'prober_output.mcf' + blob.download_to_filename(output_file) + golden_data_file = os.path.join('test', 'data', + 'treasury_constant_maturity_rates.mcf') + if not utils.compare_lines(golden_data_file, output_file, 50): + raise (AssertionError('Prober failure due to data mismatch')) + logging.info("Verified mcf file content for import: %s", _IMPORT_NAME) + + +if __name__ == '__main__': + run_test() diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index f8ddfe17d7..0000000000 --- a/requirements.txt +++ /dev/null @@ -1,32 +0,0 @@ -# Requirements for Python scripts in this repo that have automation enabled! 
- -absl-py -arcgis2geojson -dataclasses==0.6 -datacommons==1.4.3 -frozendict -func-timeout==4.3.5 -geojson==2.5.0 -google-cloud-bigquery -google-cloud-run -google-cloud-storage>=2.7.0 -google-cloud-logging==3.4.0 -google-cloud-scheduler==2.10.0 -gspread==5.12.0 -lxml==4.9.1 -numpy==1.26.4 -openpyxl>=3.1.0 -pandas -pylint -pytest -requests==2.27.1 -requests_cache -retry==0.9.2 -shapely==1.8.5 -urllib3==1.26.8 -xarray==0.19.0 -xlrd -zipp -beautifulsoup4 -ratelimit -xlsxwriter==3.2.0 diff --git a/requirements_all.txt b/requirements_all.txt index 975edc670b..0a63c11075 100644 --- a/requirements_all.txt +++ b/requirements_all.txt @@ -1,44 +1,22 @@ # Requirements for all Python code in this repo, except for import-automation -absl-py -arcgis2geojson +# Add new modules to one of: +# - import-automation/executor/requirements.txt: if the script is used in import automation. +# - script-folder/requirements.txt: the module is only needed for that script. +# - requirements_all.txt (here): anything not related to import automation. + +-r import-automation/executor/requirements.txt + chembl-webresource-client>=0.10.2 -dataclasses==0.6 -datacommons==1.4.3 deepdiff==6.3.0 earthengine-api flask_restful==0.3.9 -frozendict -func-timeout==4.3.5 -geojson==2.5.0 geopandas==0.8.1 geopy -google-cloud-run -google-cloud-bigquery -google-cloud-storage>=2.7.0 -google-cloud-logging==3.4.0 -google-cloud-scheduler==2.10.0 -gspread==5.12.0 -lxml==4.9.1 matplotlib==3.3.0 netCDF4==1.6.4 -numpy==1.26.4 -openpyxl>=3.1.0 -pandas -pylint -pytest rasterio rdp==0.8 -requests==2.27.1 -retry==0.9.2 s2sphere==0.2.5 -shapely==1.8.5 tabula-py -urllib3==1.26.8 -xarray==0.19.0 -xlrd yapf -zipp -beautifulsoup4 -ratelimit -xlsxwriter==3.2.0 diff --git a/run_tests.sh b/run_tests.sh index 8a527a545d..0ddb8d035e 100755 --- a/run_tests.sh +++ b/run_tests.sh @@ -17,7 +17,7 @@ set -e # Array of top-level folders with Python code. -PYTHON_FOLDERS="util/ scripts/ import-automation/executor" +PYTHON_FOLDERS="util/ scripts/ import-automation/executor tools/" # Flag used signal if Python requirements have already been installed. 
PYTHON_REQUIREMENTS_INSTALLED=false diff --git a/scripts/us_census/pep/us_pep_sex/process.py b/scripts/us_census/pep/us_pep_sex/process.py index b28b918241..7d8771212a 100644 --- a/scripts/us_census/pep/us_pep_sex/process.py +++ b/scripts/us_census/pep/us_pep_sex/process.py @@ -535,7 +535,7 @@ def _state_2010_2020(file_path: str) -> pd.DataFrame: df = df[(df['Year'] != 'April2010Census') & (df['Year'] != 'April2010Estimate') & - (df['Year'] != 'April2020')] + (df['Year'] != 'April2020') & (df['Year'] != '2020')] df['Measurement_Method'] = 'dcAggregate/CensusPEPSurvey_PartialAggregate' return df except Exception as e: @@ -563,17 +563,21 @@ def _state_latest(file_path: str) -> pd.DataFrame: 'July2020Male', 'July2020Female', ] - # Adding year-specific columns dynamically till current year - current_year = dt.now().year + # Adding year-specific columns dynamically till latest year + df = pd.read_excel(file_path, engine='openpyxl') + + fourth_row_list = df.iloc[2].tolist() + max_year = np.nanmax(fourth_row_list) if any( + not pd.isna(i) for i in fourth_row_list) else np.nan + + df = pd.read_excel(file_path, skiprows=5, skipfooter=7, header=None) + current_year = int(max_year) + 1 for year in range(2021, current_year): if current_year < 2030: base_columns.append(f'July{year}Total') base_columns.append(f'July{year}Male') base_columns.append(f'July{year}Female') - # Load the data with no column names initially - df = pd.read_excel(file_path, skiprows=5, skipfooter=7, header=None) - # Assign dynamic column names df.columns = base_columns @@ -1103,35 +1107,17 @@ def add_future_year_urls(): # A set to track downloaded URLs for unique {YEAR} and URLs without {i} downloaded_year_urls = set() - # This method will generate URLs for the years 2024 to 2029 - for future_year in range(2023, 2030): - if dt.now().year > future_year: - YEAR = future_year - # Loop through URLs - for url in urls_to_scan: - if "{i}" in url: # This URL contains the {i} variable, so we loop through i from 01 to 56 - for i in range(1, 57): # Loop i from 01 to 56 - formatted_i = f"{i:02}" # Ensure i is always 2 digits (01, 02, ..., 56) - url_to_check = url.format(YEAR=YEAR, i=formatted_i) - - try: - check_url = requests.head(url_to_check, - allow_redirects=True) - if check_url.status_code == 200: - _FILES_TO_DOWNLOAD.append( - {"download_path": url_to_check}) - - except requests.exceptions.RequestException as e: - logging.fatal( - f"URL is not accessible {url_to_check} due to {e}" - ) - - else: # This URL does not contain {i}, so we only need to process it once per year - url_to_check = url.format(YEAR=YEAR) - - # If the URL has already been processed for this year, skip it - if url_to_check in downloaded_year_urls: - continue # Skip this URL if it's already processed + # Loop through years in reverse order from 2030 to 2023 + for future_year in range(2030, 2022, -1): # From 2030 to 2023 + + YEAR = future_year + # Loop through URLs + for url in urls_to_scan: + if "{i}" in url: # This URL contains the {i} variable, so we loop through i from 01 to 56 + for i in range(1, 57): # Loop i from 01 to 56 + formatted_i = f"{i:02}" # Ensure i is always 2 digits (01, 02, ..., 56) + url_to_check = url.format(YEAR=YEAR, i=formatted_i) + logging.info(f"checking url: {url_to_check}") try: check_url = requests.head(url_to_check, @@ -1139,18 +1125,36 @@ def add_future_year_urls(): if check_url.status_code == 200: _FILES_TO_DOWNLOAD.append( {"download_path": url_to_check}) - downloaded_year_urls.add( - url_to_check) # Mark this URL as processed - - 
else: - logging.fatal( - f"URL returned status code {check_url.status_code}: {url_to_check}" - ) except requests.exceptions.RequestException as e: - logging.fatal( + logging.error( f"URL is not accessible {url_to_check} due to {e}") + else: # This URL does not contain {i}, so we only need to process it once per year + url_to_check = url.format(YEAR=YEAR) + logging.info(f"checking url: {url_to_check}") + # If the URL has already been processed for this year, skip it + if url_to_check in downloaded_year_urls: + continue # Skip this URL if it's already processed + + try: + check_url = requests.head(url_to_check, + allow_redirects=True) + if check_url.status_code == 200: + _FILES_TO_DOWNLOAD.append( + {"download_path": url_to_check}) + downloaded_year_urls.add( + url_to_check) # Mark this URL as processed + + else: + logging.error( + f"URL returned status code {check_url.status_code}: {url_to_check}" + ) + + except requests.exceptions.RequestException as e: + logging.error( + f"URL is not accessible {url_to_check} due to {e}") + def download_files(): """ diff --git a/scripts/us_fed/treasury_constant_maturity_rates/manifest.json b/scripts/us_fed/treasury_constant_maturity_rates/manifest.json index bd7e23601a..4e6bda784c 100644 --- a/scripts/us_fed/treasury_constant_maturity_rates/manifest.json +++ b/scripts/us_fed/treasury_constant_maturity_rates/manifest.json @@ -17,7 +17,9 @@ "node_mcf": "treasury_constant_maturity_rates.mcf" } ], - "cron_schedule": "15 3 * * *" + "cron_schedule": "15 3 * * *", + "resource_limits": {"cpu": "1", "memory": "4G"}, + "user_script_timeout": 1800 } ] -} \ No newline at end of file +} diff --git a/tools/import_differ/README.md b/tools/import_differ/README.md new file mode 100644 index 0000000000..b1532600f3 --- /dev/null +++ b/tools/import_differ/README.md @@ -0,0 +1,32 @@ +# Import Differ + +This utility generates a diff (point and series analysis) of two versions of the same dataset for import analysis. + +**Usage** +``` +python import_differ.py --current_data= --previous_data= +``` + +Parameter description: +- current\_data: Path to the current MCF data (single mcf file or folder/* on local/GCS supported). +- previous\_data: Path to the previous MCF data (single mcf file or folder/* on local/GCS supported). +- output\_location: Path to the output data folder. Default value: results. +- groupby\_columns: Columns to group data for diff analysis in the order var,place,time etc. Default value: “variableMeasured,observationAbout,observationDate,measureMethod,unit”. +- value\_columns: Columns with statvar value for diff analysis. Default value: "value,scalingFactor". + +**Output** + +Summary output generated is of the form below showing counts of differences for each variable. + +| |variableMeasured|added|deleted|modified|same|total| +|---|---|---|---|---|---|---| +|0|dcid:var1|1|0|0|0|1| +|1|dcid:var2|0|2|1|1|4| +|2|dcid:var3|0|0|1|0|1| +|3|dcid:var4|0|2|0|0|2| + +Detailed diff output is written to files for further analysis. Sample result files can be found under folder 'test/results'. 
+- point\_analysis\_summary.csv: diff summary for point analysis
+- point\_analysis\_results.csv: detailed results for point analysis
+- series\_analysis\_summary.csv: diff summary for series analysis
+- series\_analysis\_results.csv: detailed results for series analysis
diff --git a/tools/import_differ/__init__.py b/tools/import_differ/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/tools/import_differ/differ_utils.py b/tools/import_differ/differ_utils.py
new file mode 100644
index 0000000000..40a8bc6a00
--- /dev/null
+++ b/tools/import_differ/differ_utils.py
@@ -0,0 +1,94 @@
+import glob
+import os
+import pandas as pd
+import re
+
+from absl import logging
+from google.cloud.storage import Client
+
+
+def load_mcf_file(file: str) -> pd.DataFrame:
+    """ Reads an MCF text file and returns it as a dataframe."""
+    mcf_file = open(file, 'r', encoding='utf-8')
+    mcf_contents = mcf_file.read()
+    mcf_file.close()
+    # nodes separated by a blank line
+    mcf_nodes_text = mcf_contents.split('\n\n')
+    # lines separated as property: constraint
+    mcf_line = re.compile(r'^(\w+)\s*:\s*(.*)$')
+    mcf_nodes = []
+    for node in mcf_nodes_text:
+        current_mcf_node = {}
+        for line in node.split('\n'):
+            parsed_line = mcf_line.match(line)
+            if parsed_line is not None:
+                current_mcf_node[parsed_line.group(1)] = parsed_line.group(2)
+        if current_mcf_node:
+            if current_mcf_node['typeOf'] == 'dcid:StatVarObservation':
+                mcf_nodes.append(current_mcf_node)
+            else:
+                logging.warning(
+                    f'Ignoring node of type:{current_mcf_node["typeOf"]}')
+    df = pd.DataFrame(mcf_nodes)
+    return df
+
+
+def load_mcf_files(path: str) -> pd.DataFrame:
+    """ Loads all sharded mcf files in the given directory and
+    returns a single combined dataframe."""
+    df_list = []
+    filenames = glob.glob(path + '.mcf')
+    for filename in filenames:
+        df = load_mcf_file(filename)
+        df_list.append(df)
+    result = pd.concat(df_list, ignore_index=True)
+    return result
+
+
+def write_data(df: pd.DataFrame, path: str, file: str):
+    """ Writes a dataframe to a CSV file with the given path."""
+    out_file = open(os.path.join(path, file), mode='w', encoding='utf-8')
+    df.to_csv(out_file, index=False, mode='w')
+    out_file.close()
+
+
+def load_data(path: str, tmp_dir: str) -> pd.DataFrame:
+    """ Loads data from the given path and returns as a dataframe.
+    Args:
+      path: local or gcs path (single file or folder/* format)
+      tmp_dir: destination folder
+    Returns:
+      dataframe with the input data
+    """
+    if path.startswith('gs://'):
+        path = get_gcs_data(path, tmp_dir)
+
+    if path.endswith('*'):
+        return load_mcf_files(path)
+    else:
+        return load_mcf_file(path)
+
+
+def get_gcs_data(uri: str, tmp_dir: str) -> str:
+    """ Downloads files from GCS and copies them to local.
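+    Blobs are copied into tmp_dir with '/' in their names replaced by '_'.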
+ Args: + uri: single file path or folder/* format + tmp_dir: destination folder + Returns: + path to the output file/folder + """ + + client = Client() + bucket = client.get_bucket(uri.split('/')[2]) + if uri.endswith('*'): + blobs = client.list_blobs(bucket) + for blob in blobs: + path = os.path.join(tmp_dir, blob.name.replace('/', '_')) + blob.download_to_filename(path) + return os.path.join(tmp_dir, '*') + else: + file_name = uri.split('/')[3] + blob = bucket.get_blob(file_name) + path = os.path.join(tmp_dir, blob.name.replace('/', '_')) + blob.download_to_filename(path) + return path diff --git a/tools/import_differ/import_differ.py b/tools/import_differ/import_differ.py new file mode 100644 index 0000000000..21e659ba74 --- /dev/null +++ b/tools/import_differ/import_differ.py @@ -0,0 +1,267 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Utility to generate a dataset diff for import analysis.""" + +import os +import pandas as pd +import random + +from absl import app +from absl import flags +from absl import logging + +import differ_utils + +SAMPLE_COUNT = 3 +GROUPBY_COLUMNS = 'variableMeasured,observationAbout,observationDate,measurementMethod,unit,observationPeriod' +VALUE_COLUMNS = 'value,scalingFactor' + +FLAGS = flags.FLAGS +flags.DEFINE_string( + 'current_data', '', 'Path to the current MCF data \ + (single mcf file or folder/* on local/GCS supported).') +flags.DEFINE_string( + 'previous_data', '', 'Path to the previous MCF data \ + (single mcf file or folder/* on local/GCS supported).') +flags.DEFINE_string('output_location', 'results', \ + 'Path to the output data folder.') + +flags.DEFINE_string( + 'groupby_columns', GROUPBY_COLUMNS, + 'Columns to group data for diff analysis in the order (var,place,time etc.).' +) +flags.DEFINE_string('value_columns', VALUE_COLUMNS, + 'Columns with statvar value for diff analysis.') + + +class ImportDiffer: + """ + Utility to generate a diff (point and series analysis) + of two versions of the same dataset for import analysis. + + Usage: + $ python import_differ.py --current_data= --previous_data= + + Summary output generated is of the form below showing + counts of differences for each variable. + + variableMeasured added deleted modified same total + 0 dcid:var1 1 0 0 0 1 + 1 dcid:var2 0 2 1 1 4 + 2 dcid:var3 0 0 1 0 1 + 3 dcid:var4 0 2 0 0 2 + + Detailed diff output is written to files for further analysis. 
+ - point_analysis_summary.csv: diff summry for point analysis + - point_analysis_results.csv: detailed results for point analysis + - series_analysis_summary.csv: diff summry for series analysis + - series_analysis_results.csv: detailed results for series analysis + + """ + + def __init__(self, + current_data, + previous_data, + output_location, + groupby_columns=GROUPBY_COLUMNS, + value_columns=VALUE_COLUMNS): + self.current_data = current_data + self.previous_data = previous_data + self.output_location = output_location + self.groupby_columns = groupby_columns.split(',') + self.value_columns = value_columns.split(',') + self.variable_column = self.groupby_columns[0] + self.place_column = self.groupby_columns[1] + self.time_column = self.groupby_columns[2] + self.diff_column = 'diff_result' + + def _cleanup_data(self, df: pd.DataFrame): + for column in ['added', 'deleted', 'modified', 'same']: + df[column] = df[column] if column in df.columns else 0 + df[column] = df[column].fillna(0).astype(int) + + def _get_samples(self, row): + years = sorted(row[self.time_column]) + if len(years) > SAMPLE_COUNT: + return [years[0]] + random.sample(years[1:-1], + SAMPLE_COUNT - 2) + [years[-1]] + else: + return years + + # Processes two dataset files to identify changes. + def process_data(self, previous_df: pd.DataFrame, + current_df: pd.DataFrame) -> pd.DataFrame: + """ + Process previous and current datasets to generate + the intermediate data for point and series analysis. + Args: + current_df: dataframe with current (new) data + previous_df: dataframe with previous (old) data + Returns: + intermediate merged data for analysis + """ + cur_df_columns = current_df.columns.values.tolist() + self.groupby_columns = [ + i for i in self.groupby_columns if i in cur_df_columns + ] + self.value_columns = [ + i for i in self.value_columns if i in cur_df_columns + ] + df1 = previous_df.loc[:, self.groupby_columns + self.value_columns] + df2 = current_df.loc[:, self.groupby_columns + self.value_columns] + df1['_value_combined'] = df1[self.value_columns]\ + .apply(lambda row: '_'.join(row.values.astype(str)), axis=1) + df2['_value_combined'] = df2[self.value_columns]\ + .apply(lambda row: '_'.join(row.values.astype(str)), axis=1) + df1.drop(columns=self.value_columns, inplace=True) + df2.drop(columns=self.value_columns, inplace=True) + # Perform outer join operation to identify differences. + result = pd.merge(df1, + df2, + on=self.groupby_columns, + how='outer', + indicator=self.diff_column) + result[self.diff_column] = result.apply( + lambda row: 'added' if row[self.diff_column] == 'right_only' \ + else 'deleted' if row[self.diff_column] == 'left_only' \ + else 'modified' if row['_value_combined_x'] != row['_value_combined_y'] \ + else 'same', axis=1) + return result + + def point_analysis(self, + in_data: pd.DataFrame) -> (pd.DataFrame, pd.DataFrame): + """ + Performs point diff analysis to identify data point changes. 
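+        Counts of added/deleted/modified/same data points are reported per
+        variable, along with sampled places and dates for each category.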
+ Args: + in_data: intermediate data generated by processing previous/current data + Returns: + summary and results from the analysis + """ + column_list = [ + self.variable_column, self.place_column, self.time_column, + self.diff_column + ] + result = in_data.loc[:, column_list] + result = result.groupby( + [self.variable_column, self.diff_column], + observed=True, + as_index=False)[[self.place_column, + self.time_column]].agg(lambda x: x.tolist()) + result['size'] = result.apply(lambda row: len(row[self.place_column]), + axis=1) + result[self.place_column] = result.apply(lambda row: random.sample( + row[self.place_column], + min(SAMPLE_COUNT, len(row[self.place_column]))), + axis=1) + result[self.time_column] = result.apply(self._get_samples, axis=1) + summary = result.pivot( + index=self.variable_column, columns=self.diff_column, values='size')\ + .reset_index().rename_axis(None, axis=1) + self._cleanup_data(summary) + summary['total'] = summary.apply(lambda row: row['added'] + row[ + 'deleted'] + row['modified'] + row['same'], + axis=1) + return summary, result + + def series_analysis(self, + in_data: pd.DataFrame) -> (pd.DataFrame, pd.DataFrame): + """ + Performs series diff analysis to identify time series changes. + Args: + in_data: intermediate data generated by processing previous/current data + Returns: + summary and results from the analysis + """ + column_list = [ + self.variable_column, self.place_column, self.diff_column + ] + result = in_data.loc[:, column_list] + result = result.groupby(column_list, as_index=False).size() + result = result.pivot( + index=[self.variable_column, self.place_column], columns=self.diff_column, values='size')\ + .reset_index().rename_axis(None, axis=1) + self._cleanup_data(result) + result[self.diff_column] = result.apply(lambda row: 'added' if row['added'] > 0 \ + and row['deleted'] + row['modified'] + row['same'] == 0 \ + else 'deleted' if row['deleted'] > 0 and row['added'] + row['modified'] + row['same'] == 0 \ + else 'modified' if row['deleted'] > 0 or row['added'] > 0 or row['modified'] > 0 \ + else 'same', axis=1) + result = result[column_list] + result = result.groupby( + [self.variable_column, self.diff_column], + observed=True, + as_index=False)[self.place_column].agg(lambda x: x.tolist()) + result['size'] = result.apply(lambda row: len(row[self.place_column]), + axis=1) + result[self.place_column] = result.apply(lambda row: random.sample( + row[self.place_column], + min(SAMPLE_COUNT, len(row[self.place_column]))), + axis=1) + summary = result.pivot( + index=self.variable_column, columns=self.diff_column, values='size')\ + .reset_index().rename_axis(None, axis=1) + self._cleanup_data(summary) + summary['total'] = summary.apply(lambda row: row['added'] + row[ + 'deleted'] + row['modified'] + row['same'], + axis=1) + return summary, result + + def run_differ(self): + if not os.path.exists(self.output_location): + os.makedirs(self.output_location) + logging.info('Loading data...') + current_df = differ_utils.load_data(self.current_data, + self.output_location) + previous_df = differ_utils.load_data(self.previous_data, + self.output_location) + + logging.info('Processing data...') + in_data = self.process_data(previous_df, current_df) + + logging.info('Point analysis:') + summary, result = self.point_analysis(in_data) + result.sort_values(by=[self.diff_column, self.variable_column], + inplace=True) + print(summary.head(10)) + print(result.head(10)) + differ_utils.write_data(summary, self.output_location, + 'point_analysis_summary.csv') + 
differ_utils.write_data(result, self.output_location, + 'point_analysis_results.csv') + + logging.info('Series analysis:') + summary, result = self.series_analysis(in_data) + result.sort_values(by=[self.diff_column, self.variable_column], + inplace=True) + print(summary.head(10)) + print(result.head(10)) + differ_utils.write_data(summary, self.output_location, + 'series_analysis_summary.csv') + differ_utils.write_data(result, self.output_location, + 'series_analysis_results.csv') + + logging.info('Differ output written to folder: %s', + self.output_location) + + +def main(_): + '''Runs the differ.''' + differ = ImportDiffer(FLAGS.current_data, FLAGS.previous_data, + FLAGS.output_location, FLAGS.groupby_columns, + FLAGS.value_columns) + differ.run_differ() + + +if __name__ == '__main__': + app.run(main) diff --git a/tools/import_differ/import_differ_test.py b/tools/import_differ/import_differ_test.py new file mode 100644 index 0000000000..60137119d8 --- /dev/null +++ b/tools/import_differ/import_differ_test.py @@ -0,0 +1,56 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import pandas as pd +import unittest + +from pandas.testing import assert_frame_equal +from import_differ import ImportDiffer + +import differ_utils + +module_dir = os.path.dirname(__file__) + + +class TestImportDiffer(unittest.TestCase): + ''' + Test Class to compare expected output in test/ directory to the + output generated by ImportDiffer class + ''' + + def test_diff_analysis(self): + groupby_columns = 'variableMeasured,observationAbout,observationDate' + value_columns = 'value' + current_data = os.path.join(module_dir, 'test', 'current.mcf') + previous_data = os.path.join(module_dir, 'test', 'previous.mcf') + output_location = os.path.join(module_dir, 'test') + + differ = ImportDiffer(current_data, previous_data, output_location, + groupby_columns, value_columns) + current = differ_utils.load_mcf_file(current_data) + previous = differ_utils.load_mcf_file(previous_data) + + in_data = differ.process_data(previous, current) + summary, result = differ.point_analysis(in_data) + result = pd.read_csv(os.path.join(module_dir, 'test', 'result1.csv')) + assert_frame_equal(summary, result) + + summary, result = differ.series_analysis(in_data) + result = pd.read_csv(os.path.join(module_dir, 'test', 'result2.csv')) + assert_frame_equal(summary, result) + + +if __name__ == '__main__': + unittest.main() diff --git a/tools/import_differ/test/current.mcf b/tools/import_differ/test/current.mcf new file mode 100644 index 0000000000..2e994a7a45 --- /dev/null +++ b/tools/import_differ/test/current.mcf @@ -0,0 +1,35 @@ +Node: cpcb_air_quality/E17/944d9e6d-ec38-4e61-175a-9bbabfd35f97 +observationDate: "2024-09-24T12:00:00" +unit: dcid:MicrogramsPerCubicMeter +observationAbout: dcid:cpcpAq/Secretariat_Amaravati___APPCB +variableMeasured: dcid:Max_Concentration_AirPollutant_Ozone +value: 53.0 +typeOf: dcid:StatVarObservation +dcid: "dc/o/bhdp3vy7dee0d" + +Node: 
cpcb_air_quality/E18/944d9e6d-ec38-4e61-175a-9bbabfd35f97 +observationDate: "2024-09-24T12:00:00" +unit: dcid:MicrogramsPerCubicMeter +observationAbout: dcid:cpcpAq/Secretariat_Amaravati___APPCB +variableMeasured: dcid:Mean_Concentration_AirPollutant_Ozone +value: 28.0 +typeOf: dcid:StatVarObservation +dcid: "dc/o/8e11gqvkt183b" + +Node: cpcb_air_quality/E15/944d9e6d-ec38-4e61-175a-9bbabfd35f97 +observationDate: "2024-09-24T12:00:00" +unit: dcid:MicrogramsPerCubicMeter +observationAbout: dcid:cpcpAq/Secretariat_Amaravati___IMD +variableMeasured: dcid:Mean_Concentration_AirPollutant_CO +value: 42.0 +typeOf: dcid:StatVarObservation +dcid: "dc/o/h1sjhdxycwwmc" + +Node: cpcb_air_quality/E15/944d9e6d-ec38-4e61-175a-9bbabfd35f97 +observationDate: "2024-09-25T12:00:00" +unit: dcid:MicrogramsPerCubicMeter +observationAbout: dcid:cpcpAq/Secretariat_Amaravati___IMD +variableMeasured: dcid:Mean_Concentration_AirPollutant_CO +value: 40.0 +typeOf: dcid:StatVarObservation +dcid: "dc/o/h1sjhdxycwwmc" diff --git a/tools/import_differ/test/previous.mcf b/tools/import_differ/test/previous.mcf new file mode 100644 index 0000000000..ce9fcb31d1 --- /dev/null +++ b/tools/import_differ/test/previous.mcf @@ -0,0 +1,62 @@ +Node: cpcb_air_quality/E18/944d9e6d-ec38-4e61-175a-9bbabfd35f97 +observationDate: "2024-09-24T12:00:00" +unit: dcid:MicrogramsPerCubicMeter +observationAbout: dcid:cpcpAq/Secretariat_Amaravati___APPCB +variableMeasured: dcid:Mean_Concentration_AirPollutant_Ozone +value: 29.0 +typeOf: dcid:StatVarObservation +dcid: "dc/o/8e11gqvkt183b" + +Node: cpcb_air_quality/E16/944d9e6d-ec38-4e61-175a-9bbabfd35f97 +observationDate: "2024-09-24T12:00:00" +unit: dcid:MicrogramsPerCubicMeter +observationAbout: dcid:cpcpAq/Secretariat_Amaravati___APPCB +variableMeasured: dcid:Min_Concentration_AirPollutant_Ozone +value: 18.0 +typeOf: dcid:StatVarObservation +dcid: "dc/o/z8j7g5sw11klh" + +Node: cpcb_air_quality/E16/944d9e6d-ec38-4e61-175a-9bbabfd35f97 +observationDate: "2024-09-24T12:00:00" +unit: dcid:MicrogramsPerCubicMeter +observationAbout: dcid:cpcpAq/Secretariat_Amaravati___IMD +variableMeasured: dcid:Min_Concentration_AirPollutant_Ozone +value: 18.0 +typeOf: dcid:StatVarObservation +dcid: "dc/o/z8j7g5sw11klh" + +Node: cpcb_air_quality/E15/944d9e6d-ec38-4e61-175a-9bbabfd35f97 +observationDate: "2024-09-24T12:00:00" +unit: dcid:MicrogramsPerCubicMeter +observationAbout: dcid:cpcpAq/Secretariat_Amaravati___APPCB +variableMeasured: dcid:Mean_Concentration_AirPollutant_CO +value: 41.0 +typeOf: dcid:StatVarObservation +dcid: "dc/o/h1sjhdxycwwmc" + +Node: cpcb_air_quality/E15/944d9e6d-ec38-4e61-175a-9bbabfd35f97 +observationDate: "2024-09-25T12:00:00" +unit: dcid:MicrogramsPerCubicMeter +observationAbout: dcid:cpcpAq/Secretariat_Amaravati___APPCB +variableMeasured: dcid:Mean_Concentration_AirPollutant_CO +value: 40.0 +typeOf: dcid:StatVarObservation +dcid: "dc/o/h1sjhdxycwwmc" + +Node: cpcb_air_quality/E15/944d9e6d-ec38-4e61-175a-9bbabfd35f97 +observationDate: "2024-09-24T12:00:00" +unit: dcid:MicrogramsPerCubicMeter +observationAbout: dcid:cpcpAq/Secretariat_Amaravati___IMD +variableMeasured: dcid:Mean_Concentration_AirPollutant_CO +value: 41.0 +typeOf: dcid:StatVarObservation +dcid: "dc/o/h1sjhdxycwwmc" + +Node: cpcb_air_quality/E15/944d9e6d-ec38-4e61-175a-9bbabfd35f97 +observationDate: "2024-09-25T12:00:00" +unit: dcid:MicrogramsPerCubicMeter +observationAbout: dcid:cpcpAq/Secretariat_Amaravati___IMD +variableMeasured: dcid:Mean_Concentration_AirPollutant_CO +value: 40.0 +typeOf: dcid:StatVarObservation +dcid: 
"dc/o/h1sjhdxycwwmc" diff --git a/tools/import_differ/test/result1.csv b/tools/import_differ/test/result1.csv new file mode 100644 index 0000000000..4d344b5639 --- /dev/null +++ b/tools/import_differ/test/result1.csv @@ -0,0 +1,5 @@ +variableMeasured,added,deleted,modified,same,total +dcid:Max_Concentration_AirPollutant_Ozone,1,0,0,0,1 +dcid:Mean_Concentration_AirPollutant_CO,0,2,1,1,4 +dcid:Mean_Concentration_AirPollutant_Ozone,0,0,1,0,1 +dcid:Min_Concentration_AirPollutant_Ozone,0,2,0,0,2 diff --git a/tools/import_differ/test/result2.csv b/tools/import_differ/test/result2.csv new file mode 100644 index 0000000000..4f3b954643 --- /dev/null +++ b/tools/import_differ/test/result2.csv @@ -0,0 +1,5 @@ +variableMeasured,added,deleted,modified,same,total +dcid:Max_Concentration_AirPollutant_Ozone,1,0,0,0,1 +dcid:Mean_Concentration_AirPollutant_CO,0,1,1,0,2 +dcid:Mean_Concentration_AirPollutant_Ozone,0,0,1,0,1 +dcid:Min_Concentration_AirPollutant_Ozone,0,2,0,0,2 diff --git a/tools/import_differ/test/results/point_analysis_results.csv b/tools/import_differ/test/results/point_analysis_results.csv new file mode 100644 index 0000000000..80feb425a5 --- /dev/null +++ b/tools/import_differ/test/results/point_analysis_results.csv @@ -0,0 +1,7 @@ +variableMeasured,_diff_result,observationAbout,observationDate,size +dcid:Max_Concentration_AirPollutant_Ozone,added,['dcid:cpcpAq/Secretariat_Amaravati___APPCB'],"['""2024-09-24T12:00:00""']",1 +dcid:Mean_Concentration_AirPollutant_CO,deleted,"['dcid:cpcpAq/Secretariat_Amaravati___APPCB', 'dcid:cpcpAq/Secretariat_Amaravati___APPCB']","['""2024-09-24T12:00:00""', '""2024-09-25T12:00:00""']",2 +dcid:Min_Concentration_AirPollutant_Ozone,deleted,"['dcid:cpcpAq/Secretariat_Amaravati___APPCB', 'dcid:cpcpAq/Secretariat_Amaravati___IMD']","['""2024-09-24T12:00:00""', '""2024-09-24T12:00:00""']",2 +dcid:Mean_Concentration_AirPollutant_CO,modified,['dcid:cpcpAq/Secretariat_Amaravati___IMD'],"['""2024-09-24T12:00:00""']",1 +dcid:Mean_Concentration_AirPollutant_Ozone,modified,['dcid:cpcpAq/Secretariat_Amaravati___APPCB'],"['""2024-09-24T12:00:00""']",1 +dcid:Mean_Concentration_AirPollutant_CO,same,['dcid:cpcpAq/Secretariat_Amaravati___IMD'],"['""2024-09-25T12:00:00""']",1 diff --git a/tools/import_differ/test/results/point_analysis_summary.csv b/tools/import_differ/test/results/point_analysis_summary.csv new file mode 100644 index 0000000000..4d344b5639 --- /dev/null +++ b/tools/import_differ/test/results/point_analysis_summary.csv @@ -0,0 +1,5 @@ +variableMeasured,added,deleted,modified,same,total +dcid:Max_Concentration_AirPollutant_Ozone,1,0,0,0,1 +dcid:Mean_Concentration_AirPollutant_CO,0,2,1,1,4 +dcid:Mean_Concentration_AirPollutant_Ozone,0,0,1,0,1 +dcid:Min_Concentration_AirPollutant_Ozone,0,2,0,0,2 diff --git a/tools/import_differ/test/results/series_analysis_results.csv b/tools/import_differ/test/results/series_analysis_results.csv new file mode 100644 index 0000000000..b776dbd2f5 --- /dev/null +++ b/tools/import_differ/test/results/series_analysis_results.csv @@ -0,0 +1,6 @@ +variableMeasured,_diff_result,observationAbout,size +dcid:Max_Concentration_AirPollutant_Ozone,added,['dcid:cpcpAq/Secretariat_Amaravati___APPCB'],1 +dcid:Mean_Concentration_AirPollutant_CO,deleted,['dcid:cpcpAq/Secretariat_Amaravati___APPCB'],1 +dcid:Min_Concentration_AirPollutant_Ozone,deleted,"['dcid:cpcpAq/Secretariat_Amaravati___IMD', 'dcid:cpcpAq/Secretariat_Amaravati___APPCB']",2 +dcid:Mean_Concentration_AirPollutant_CO,modified,['dcid:cpcpAq/Secretariat_Amaravati___IMD'],1 
+dcid:Mean_Concentration_AirPollutant_Ozone,modified,['dcid:cpcpAq/Secretariat_Amaravati___APPCB'],1
diff --git a/tools/import_differ/test/results/series_analysis_summary.csv b/tools/import_differ/test/results/series_analysis_summary.csv
new file mode 100644
index 0000000000..4f3b954643
--- /dev/null
+++ b/tools/import_differ/test/results/series_analysis_summary.csv
@@ -0,0 +1,5 @@
+variableMeasured,added,deleted,modified,same,total
+dcid:Max_Concentration_AirPollutant_Ozone,1,0,0,0,1
+dcid:Mean_Concentration_AirPollutant_CO,0,1,1,0,2
+dcid:Mean_Concentration_AirPollutant_Ozone,0,0,1,0,1
+dcid:Min_Concentration_AirPollutant_Ozone,0,2,0,0,2
diff --git a/tools/import_validation/import_validation.py b/tools/import_validation/import_validation.py
new file mode 100644
index 0000000000..8dee351aeb
--- /dev/null
+++ b/tools/import_validation/import_validation.py
@@ -0,0 +1,140 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Class to perform validations for import automation."""
+
+from absl import app
+from absl import flags
+from absl import logging
+from enum import Enum
+import pandas as pd
+import os
+import json
+
+FLAGS = flags.FLAGS
+flags.DEFINE_string('config_file', 'validation_config.json',
+                    'Path to the validation config file.')
+flags.DEFINE_string('differ_output_location', '.',
+                    'Path to the differ output data folder.')
+flags.DEFINE_string('stats_summary_location', '.',
+                    'Path to the stats summary report folder.')
+flags.DEFINE_string('validation_output_location', '.',
+                    'Path to the validation output folder.')
+
+POINT_ANALYSIS_FILE = 'point_analysis_summary.csv'
+STATS_SUMMARY_FILE = 'summary_report.csv'
+VALIDATION_OUTPUT_FILE = 'validation_output.csv'
+
+Validation = Enum('Validation', [
+    ('MODIFIED_COUNT', 1),
+    ('UNMODIFIED_COUNT', 2),
+    ('ADDED_COUNT', 3),
+    ('DELETED_COUNT', 4),
+    ('LATEST_DATA', 5),
+])
+
+
+class ValidationResult:
+    """Describes the result of the validation of an import."""
+
+    def __init__(self, status, name, message):
+        # Status of the execution: PASSED or FAILED.
+        self.status = status
+        # Name of the validation executed.
+        self.name = name
+        # Description of the result/error message.
+        self.message = message
+
+
+class ImportValidation:
+    """
+    Class to perform validations for import automation.
+
+    Usage:
+    $ python import_validation.py --config_file= \
+        --differ_output_location= --stats_summary_location=
+
+    Each import can provide a configuration (JSON) to select which validation
+    checks are performed. Validation results are written to an output file.
+    Sample config and output files can be found in the test folder.
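+
+    For illustration, the config used by the unit test
+    (test/test_config.json) enables these checks:
+
+    [
+      { "validation": "DELETED_COUNT", "threshold": 1 },
+      { "validation": "MODIFIED_COUNT" },
+      { "validation": "ADDED_COUNT" },
+      { "validation": "UNMODIFIED_COUNT" }
+    ]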
+ """ + + def __init__(self, config_file: str, differ_output: str, stats_summary: str, + validation_output: str): + logging.info('Reading config from %s', config_file) + self.differ_results = pd.read_csv(differ_output) + self.validation_map = { + Validation.MODIFIED_COUNT: self._modified_count_validation, + Validation.ADDED_COUNT: self._added_count_validation, + Validation.DELETED_COUNT: self._deleted_count_validation, + Validation.UNMODIFIED_COUNT: self._unmodified_count_validation + } + self.validation_output = validation_output + self.validation_result = [] + with open(config_file, encoding='utf-8') as fd: + self.validation_config = json.load(fd) + + def _latest_data_validation(self, config: dict): + logging.info('Not yet implemented') + + # Checks if the number of deleted data points are below a threshold. + def _deleted_count_validation(self, config: dict): + if self.differ_results['deleted'].sum() > config['threshold']: + raise AssertionError(f'Validation failed: {config["validation"]}') + + # Checks if number of modified points for each stat var are same. + def _modified_count_validation(self, config: dict): + if self.differ_results['modified'].nunique() > 1: + raise AssertionError(f'Validation failed: {config["validation"]}') + + # Checks if number of added points for each stat var are same. + def _added_count_validation(self, config: dict): + if self.differ_results['added'].nunique() > 1: + raise AssertionError(f'Validation failed: {config["validation"]}') + + # Checks if number of unmodified points for each stat var are same. + def _unmodified_count_validation(self, config: dict): + if self.differ_results['same'].nunique() > 1: + raise AssertionError(f'Validation failed: {config["validation"]}') + + def _run_validation(self, config) -> ValidationResult: + try: + self.validation_map[Validation[config['validation']]](config) + logging.info('Validation passed: %s', config['validation']) + return ValidationResult('PASSED', config['validation'], '') + except AssertionError as exc: + logging.error(repr(exc)) + return ValidationResult('FAILED', config['validation'], repr(exc)) + + def run_validations(self): + output_file = open(self.validation_output, mode='w', encoding='utf-8') + output_file.write('test,status,message\n') + for config in self.validation_config: + result = self._run_validation(config) + output_file.write( + f'{result.name},{result.status},{result.message}\n') + self.validation_result.append(result) + output_file.close() + + +def main(_): + validation = ImportValidation( + FLAGS.config_file, + os.path.join(FLAGS.differ_output_location, POINT_ANALAYSIS_FILE), + os.path.join(FLAGS.stats_summary_location, STATS_SUMMARY_FILE), + os.paht.join(FLAGS.validation_output_location, VALIDATION_OUTPUT_FILE)) + validation.run_validations() + + +if __name__ == '__main__': + app.run(main) diff --git a/tools/import_validation/import_validation_test.py b/tools/import_validation/import_validation_test.py new file mode 100644 index 0000000000..a33ca72d66 --- /dev/null +++ b/tools/import_validation/import_validation_test.py @@ -0,0 +1,45 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import pandas as pd
+import unittest
+
+from pandas.testing import assert_frame_equal
+from import_validation import ImportValidation
+
+module_dir = os.path.dirname(__file__)
+
+
+class TestValidation(unittest.TestCase):
+    '''
+    Test class to compare the expected output in the test/ directory with the
+    output generated by the ImportValidation class.
+    '''
+
+    def test_validation(self):
+        result_file = os.path.join(module_dir, 'test', 'test_output.csv')
+        config_file = os.path.join(module_dir, 'test', 'test_config.json')
+        differ_output = os.path.join(module_dir, 'test', 'differ_output.csv')
+        output_file = os.path.join(module_dir, 'validation_output.csv')
+
+        validation = ImportValidation(config_file, differ_output, '',
+                                      output_file)
+        validation.run_validations()
+
+        expected = pd.read_csv(result_file)
+        actual = pd.read_csv(output_file)
+        assert_frame_equal(actual, expected)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tools/import_validation/test/differ_output.csv b/tools/import_validation/test/differ_output.csv
new file mode 100644
index 0000000000..280df7aedb
--- /dev/null
+++ b/tools/import_validation/test/differ_output.csv
@@ -0,0 +1,5 @@
+variableMeasured,added,deleted,modified,same,total
+dcid:var1,1,0,0,3,4
+dcid:var2,1,0,0,2,4
+dcid:var3,1,0,0,3,4
+dcid:var4,1,0,0,3,4
diff --git a/tools/import_validation/test/test_config.json b/tools/import_validation/test/test_config.json
new file mode 100644
index 0000000000..21daba858b
--- /dev/null
+++ b/tools/import_validation/test/test_config.json
@@ -0,0 +1,15 @@
+[
+    {
+        "validation": "DELETED_COUNT",
+        "threshold": 1
+    },
+    {
+        "validation": "MODIFIED_COUNT"
+    },
+    {
+        "validation": "ADDED_COUNT"
+    },
+    {
+        "validation": "UNMODIFIED_COUNT"
+    }
+]
diff --git a/tools/import_validation/test/test_output.csv b/tools/import_validation/test/test_output.csv
new file mode 100644
index 0000000000..a8327651b2
--- /dev/null
+++ b/tools/import_validation/test/test_output.csv
@@ -0,0 +1,5 @@
+test,status,message
+DELETED_COUNT,PASSED,
+MODIFIED_COUNT,PASSED,
+ADDED_COUNT,PASSED,
+UNMODIFIED_COUNT,FAILED,AssertionError('Validation failed: UNMODIFIED_COUNT')
\ No newline at end of file
diff --git a/tools/import_validation/validation_config.json b/tools/import_validation/validation_config.json
new file mode 100644
index 0000000000..21daba858b
--- /dev/null
+++ b/tools/import_validation/validation_config.json
@@ -0,0 +1,15 @@
+[
+    {
+        "validation": "DELETED_COUNT",
+        "threshold": 1
+    },
+    {
+        "validation": "MODIFIED_COUNT"
+    },
+    {
+        "validation": "ADDED_COUNT"
+    },
+    {
+        "validation": "UNMODIFIED_COUNT"
+    }
+]
diff --git a/tools/statvar_importer/config_flags.py b/tools/statvar_importer/config_flags.py
new file mode 100644
index 0000000000..39c37ddca6
--- /dev/null
+++ b/tools/statvar_importer/config_flags.py
@@ -0,0 +1,544 @@
+# Copyright 2022 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the 'License');
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Class to store configuration parameters as a dictionary.""" + +import ast +from collections import OrderedDict +import collections.abc +import os +import sys +from typing import Union + +from absl import app +from absl import flags +from absl import logging + +_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(_SCRIPT_DIR) +sys.path.append(os.path.dirname(_SCRIPT_DIR)) +sys.path.append(os.path.dirname(os.path.dirname(_SCRIPT_DIR))) +sys.path.append( + os.path.join(os.path.dirname(os.path.dirname(_SCRIPT_DIR)), 'util')) + +import file_util +from config_map import ConfigMap +from mcf_file_util import get_numeric_value + +_FLAGS = flags.FLAGS + +flags.DEFINE_string('config_file', '', 'File with configuration parameters.') +flags.DEFINE_list('data_url', '', 'URLs to download the data from.') +flags.DEFINE_string('shard_input_by_column', '', + 'Shard input data by unique values in column.') +flags.DEFINE_integer( + 'shard_prefix_length', + sys.maxsize, + 'Shard input data by value prefix of given length.', +) +flags.DEFINE_list( + 'pv_map', [], + 'Comma separated list of namespace:file with property values.') +flags.DEFINE_list('input_data', [], + 'Comma separated list of data files to be processed.') +flags.DEFINE_string('input_encoding', 'utf-8', 'Encoding for input_data files.') +flags.DEFINE_list( + 'input_xls_sheets', + [], + 'Comma separated list of sheet names within input_data xls files to be processed.', +) +flags.DEFINE_integer('input_rows', sys.maxsize, + 'Number of rows per input file to process.') +flags.DEFINE_integer('input_columns', sys.maxsize, + 'Number of columns in input file to process.') +flags.DEFINE_integer( + 'skip_rows', 0, 'Number of rows to skip at the begining of the input file.') +flags.DEFINE_integer( + 'header_rows', + -1, + 'Number of header rows with property-value mappings for columns. If -1,' + ' will lookup PVs for all rows.', +) +flags.DEFINE_integer( + 'header_columns', + -1, + 'Number of header columns with property-value mappings for rows. 
If -1,' + ' will lookup PVs for all columns.', +) +flags.DEFINE_string( + 'aggregate_duplicate_svobs', + None, + 'Aggregate SVObs with same place, date by one of the following: sum, min or' + ' max.', +) +flags.DEFINE_bool('schemaless', False, 'Allow schemaless StatVars.') +flags.DEFINE_string('output_path', '', + 'File prefix for output mcf, csv and tmcf.') +flags.DEFINE_string( + 'existing_statvar_mcf', + '', + 'StatVar MCF files for any existing stat var nodes to be resused.', +) +flags.DEFINE_string( + 'existing_schema_mcf', + '', + 'StatVar MCF files for any existing schema nodes to be resused.', +) +flags.DEFINE_integer('parallelism', 0, 'Number of parallel processes to use.') +flags.DEFINE_integer('pprof_port', 0, 'HTTP port for pprof server.') +flags.DEFINE_bool('debug', False, 'Enable debug messages.') +flags.DEFINE_integer('log_level', logging.INFO, + 'Log level messages to be shown.') + +# Flags for place name resolution +flags.DEFINE_string('dc_api_key', '', + 'DataCommons v2 API key used for APIs such as v2/resolve') +flags.DEFINE_string('maps_api_key', '', + 'Maps API key for place lookup by name.') +flags.DEFINE_list('places_csv', [], + 'CSV file with place names and dcids to match.') +flags.DEFINE_string( + 'places_resolved_csv', + '', + 'CSV file with resolved place names and dcids to match.', +) +flags.DEFINE_list('place_type', [], 'List of places types for name reoslution.') +flags.DEFINE_list('places_within', [], + 'List of places types for name reoslution.') +flags.DEFINE_string( + 'statvar_dcid_remap_csv', + '', + 'CSV file with existing DCIDs for generated statvars.', +) +flags.DEFINE_string('output_counters', '', 'CSV file with counters.') + +flags.DEFINE_bool( + 'resume', + False, + 'Resume processing to create output files not yet generated.', +) + +# Flags for spell checks +_DEFAULT_SPELL_ALLOWLIST = os.path.join(_SCRIPT_DIR, 'words_allowlist.txt') +flags.DEFINE_bool('spell_check', True, 'Run schema spell checker') +flags.DEFINE_string('sanity_check_output', '', 'File with list of spell errors') +flags.DEFINE_string('spell_check_allow_list', _DEFAULT_SPELL_ALLOWLIST, + 'File with words to be allowed') +flags.DEFINE_string('spell_check_config', '', 'File with words to be allowed') +flags.DEFINE_bool('spell_check_text', False, + 'if True, spell check quoted text values only.') +flags.DEFINE_list('spell_check_ignore_props', None, + 'List of properties to ignore for spell check.') + +# Flags for pvmap generation +flags.DEFINE_bool('generate_pvmap', True, 'Generate PVmap') +flags.DEFINE_string('google_genai_key', '', 'Google API key for GenAI prompt.') +flags.DEFINE_string('sample_pvmap', os.path.join(_SCRIPT_DIR, + 'sample_pvmap.csv'), + 'Sample PVmap for gen AI.') +flags.DEFINE_string('sample_statvars', + os.path.join(_SCRIPT_DIR, 'sample_statvars.mcf'), + 'Sample statvars MCF for GenAI prompt.') +flags.DEFINE_string('data_context', '', + 'Text file with metadata descriptions for data.') +flags.DEFINE_bool('generate_statvar_name', False, + 'Generate names for Statvars.') +flags.DEFINE_bool('llm_generate_statvar_name', False, + 'Generate names for Statvars.') + + +def get_default_config() -> dict: + """Returns the default config as dictionary of config parameters and values.""" + return { + # 'config parameter in snake_case': value + 'ignore_numeric_commas': + True, # Numbers may have commas + 'input_reference_column': + '#input', + 'input_min_columns_per_row': + 2, + 'input_data': + _FLAGS.input_data, + 'data_url': + _FLAGS.data_url, + 'input_encoding': + 
_FLAGS.input_encoding, + 'input_xls': + _FLAGS.input_xls, + 'pv_map_drop_undefined_nodes': + (False), # Don't drop undefined PVs in the column PV Map. + 'duplicate_svobs_key': + '#ErrorDuplicateSVObs', + 'duplicate_statvars_key': + '#ErrorDuplicateStatVar', + 'drop_statvars_without_svobs': + 1, + # Aggregate values for duplicate SVObs with the same statvar, place, date + # and units with one of the following functions: + # sum: Add all values. + # min: Set the minimum value. + # max: Set the maximum value. + # Internal property in PV map to aggregate values for a specific statvar. + 'aggregate_key': + '#Aggregate', + # Aggregation type duplicate SVObs for all statvars. + 'aggregate_duplicate_svobs': + _FLAGS.aggregate_duplicate_svobs, + 'merged_pvs_property': + '#MergedSVObs', + 'multi_value_properties': [ + 'name', 'alternateName', 'measurementDenominator' + ], + # Enable schemaless StatVars, + # If True, allow statvars with capitalized property names. + # Those properties are commented out when generating MCF but used for + # statvar dcid. + 'schemaless': + _FLAGS.schemaless, + # Whether to lookup DC API and drop undefined PVs in statvars. + 'schemaless_statvar_comment_undefined_pvs': + False, + 'default_statvar_pvs': + OrderedDict({ + 'typeOf': 'dcs:StatisticalVariable', + 'measurementQualifier': '', + 'statType': 'dcs:measuredValue', + 'measuredProperty': 'dcs:count', + 'populationType': '', + 'memberOf': '', + 'name': '', + 'nameWithLanguage': '', + 'alternateName': '', + 'description': '', + 'descriptionUrl': '', + }), + 'statvar_dcid_ignore_properties': [ + 'description', 'name', 'nameWithLanguage', 'descriptionUrl', + 'alternateName' + ], + 'statvar_dcid_ignore_values': ['measuredValue', 'StatisticalVariable'], + 'default_svobs_pvs': + OrderedDict({ + 'typeOf': 'dcs:StatVarObservation', + 'observationDate': '', + 'observationAbout': '', + 'value': '', + 'observationPeriod': '', + 'measurementMethod': '', + 'unit': '', + 'scalingFactor': '', + 'variableMeasured': '', + 'measurementResult': '', + '#Aggregate': '', + }), + 'required_statvar_properties': [ + 'measuredProperty', + 'populationType', + ], + 'required_statvarobs_properties': [ + 'variableMeasured', + 'observationAbout', + 'observationDate', + 'value', + ], + # Settings to compare StatVars with existing statvars to reuse dcids. + 'existing_statvar_mcf': + _FLAGS.existing_statvar_mcf, + 'existing_schema_mcf': + _FLAGS.existing_schema_mcf, + 'statvar_fingerprint_ignore_props': [ + 'Node', + 'dcid', + 'name', + 'nameWithLanguage', + 'alternateName', + 'description', + 'descriptionUrl', + 'provenance', + 'memberOf', + 'member', + 'relevantVariable', + ], + 'statvar_fingerprint_include_props': [], + # File with generated DCIDs remapped to existing dcids. + # This is used for schemaless statvars that can't be matched with + # existing statvars using property:value + 'statvar_dcid_remap_csv': + _FLAGS.statvar_dcid_remap_csv, + # Use numeric data in any column as a value. + # It may still be dropped if no SVObs can be constructed out of it. + # If False, SVObs is only emitted for PVs that have a map for 'value', + # for example, 'MyColumn': { 'value': '@Data' } + 'use_all_numeric_data_values': + False, + # Number format in input. + 'number_decimal': + '.', # decimal character + 'number_separator': + ', ', # separators stripped. + # Word separator, used to split words into phrases for PV map lookups. + 'word_delimiter': + ' ', + # Enable merged cells that inherit PVs from previous column. 
+ 'merged_cells': + True, + # List of default PVS maps to lookup column values if there is no map for a + # column name. + 'default_pv_maps': ['GLOBAL'], + # Row and column indices with content to be looked up in pv_maps. + 'mapped_rows': + 0, + 'mapped_columns': [], + 'show_counters_every_n': + 0, + 'show_counters_every_sec': + 30, + # Settings for place name resolution + 'dc_api_key': + _FLAGS.dc_api_key, + 'maps_api_key': + _FLAGS.maps_api_key, + 'resolve_places': + False, + 'places_csv': + _FLAGS.places_csv, + 'places_resolved_csv': + _FLAGS.places_resolved_csv, + 'place_type': + _FLAGS.place_type, + 'places_within': + _FLAGS.places_within, + + # Filter settings + 'filter_data_min_value': + None, + 'filter_data_max_value': + None, + 'filter_data_max_change_ratio': + None, + 'filter_data_max_yearly_change_ratio': + None, + + # Output options + 'output_path': + _FLAGS.output_path, + 'generate_statvar_mcf': + True, # Generate MCF file with all statvars + 'generate_csv': + True, # Generate CSV with SVObs + 'output_csv_mode': + 'w', # Overwrite output CSV file. + 'output_columns': [], # Emit all SVObs PVs into output csv + 'generate_tmcf': + True, # Generate tMCF for CSV columns + 'skip_constant_csv_columns': + (True), # Skip emitting columns with constant values in the csv + 'output_only_new_statvars': + True, # Drop existing statvars from output + 'output_precision_digits': + 5, # Round floating values to 5 decimal digits. + 'generate_schema_mcf': + True, + 'generate_provisional_schema': + True, + # Settings for DC API. + 'dc_api_root': + 'http://autopush.api.datacommons.org', + 'dc_api_use_cache': + False, + 'dc_api_batch_size': + 100, + # Settings from flags + 'pv_map': + _FLAGS.pv_map, + 'input_rows': + _FLAGS.input_rows, + 'input_columns': + _FLAGS.input_columns, + 'skip_rows': + _FLAGS.skip_rows, + 'ignore_rows': [0], + 'header_rows': + _FLAGS.header_rows, + 'header_columns': + _FLAGS.header_columns, + 'process_rows': [0], + 'parallelism': + _FLAGS.parallelism, + 'output_counters': + _FLAGS.output_counters, + + # Settings for spell checks + 'spell_check': + _FLAGS.spell_check, + 'spell_allowlist': + _FLAGS.spell_check_allow_list, + 'spell_allow_words': [], + 'output_sanity_check': + _FLAGS.sanity_check_output, + 'spell_check_text_only': + _FLAGS.spell_check_text, + 'spell_check_ignore_props': + _FLAGS.spell_check_ignore_props, + 'debug': + _FLAGS.debug, + 'log_level': + _FLAGS.log_level, + + # Settings for PV Map generator + 'generate_pvmap': + _FLAGS.generate_pvmap, + 'google_api_key': + _FLAGS.google_genai_key, + 'sample_pvmap': + _FLAGS.sample_pvmap, + 'sample_statvars': + _FLAGS.sample_statvars, + 'data_context': + _FLAGS.data_context, + 'llm_data_annotation': + _FLAGS.generate_pvmap, + + # Settings for statvar name generator + 'generate_statvar_name': + _FLAGS.generate_statvar_name, # Generate names for StatVars + 'llm_generate_statvar_name': + _FLAGS.llm_generate_statvar_name, + } + + +def init_config_from_flags(filename: str = None) -> ConfigMap: + """Returns a Config object with parameters loaded from a file. + + Args: + filename: name of the file to load. + + Returns: + Config object with all the parameters loaded into the config_dict. + """ + config_dict = dict(get_default_config()) + if isinstance(filename, dict): + config_dict.update(filename) + filename = None + elif isinstance(filename, ConfigMap): + config_dict.update(filename.get_configs()) + elif isinstance(filename, str): + file_config = {} + # Check if filename is a file. 
+ config_files = file_util.file_get_matching(filename) + if config_files: + # Load config from file. + file_config = file_util.file_load_py_dict(config_files) + elif ':' in filename: + # Try parsing config as a string + file_config = _parse_dict(filename) + if file_config: + update_config(file_config, config_dict) + else: + logging.error(f'Unknown config {filename}, ignored') + _set_verbosity(config_dict) + config = ConfigMap(config_dict=config_dict) + return config + + +def _set_verbosity(config: dict): + """Set logging verbosity by the config.""" + if config.get('debug'): + logging.set_verbosity(1) + if config.get('log_level'): + logging.set_verbosity(config.get('log_level')) + logging.info(f'Logging verbosity {logging.get_verbosity()}') + + +def set_config_value(param: str, value: str, config: dict): + """Set the config value for the param with the original type.""" + if param is None: + return + if isinstance(config, ConfigMap): + config = config.get_configs() + orig_value = config.get(param) + if orig_value is not None: + value = get_value_type(value, orig_value) + config[param] = value + + +def update_config(new_config: dict, config: dict) -> dict: + """Add values from the new_config into config and return the updated dict.""" + for key, value in new_config.items(): + set_config_value(key, value, config) + return config + + +def get_value_type(value: str, default_value): + """Returns value in the type of value_type.""" + if value is None: + return value + value_type = type(default_value) + if value_type is list: + # Convert value to list + if isinstance(value, list): + return value + default_element = '' + if len(default_value) > 0: + default_element = default_value[0] + if isinstance(value, str): + value = value.strip() + if value: + if value[0] == '[': + value = value[1:] + if value[-1] == ']': + value = value[:-1] + return [ + get_value_type(v.strip(), default_element) + for v in str(value).split(',') + ] + if value_type is str: + return str(value).strip() + if value_type is int or value_type is float: + return get_numeric_value(value) + if value_type is bool: + if isinstance(value, bool): + return value + if isinstance(value, str): + match value.lower(): + case 'true': + return True + case 'false': + return False + case '': + return False + return get_numeric_value(value) > 0 + if value_type is dict or value_type is OrderedDict: + if isinstance(value, str): + logging.info(f'Converting {value} to dict') + value = value.strip() + if value and value[0] == '{': + value = _parse_dict(value) + elif '=' in value: + # Dict is a list of key=value, pairs. + pv = {} + for prop_value in value.split(','): + prop, val = prop_value.split('=', 1) + prop = prop.strip() + pv[prop] = val.strip() + value = pv + return value + + +def _parse_dict(dict_str: str) -> dict: + """Returns a dict parsed from text string.""" + try: + return ast.literal_eval(dict_str) + except (NameError, ValueError) as e: + logging.error(f'Unable to parse dict {dict_str}') + return {} diff --git a/tools/statvar_importer/eval_functions.py b/tools/statvar_importer/eval_functions.py new file mode 100644 index 0000000000..7e3336442e --- /dev/null +++ b/tools/statvar_importer/eval_functions.py @@ -0,0 +1,125 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Utility functions for eval statements with PropertyValueMapper. + +The functions can be invoked within '#Eval' in the pv_map.py. +For Example, for format values in 'DateTime' column into ISO-8601 format: +'DateTime': { + '#Eval': 'observationDate=format_date("{Data}")', + } +""" + +import datetime +from datetime import datetime +import os +import re +import sys + +from absl import logging +import dateutil +from dateutil import parser +from dateutil.relativedelta import relativedelta + +# String utility functions + + +def format_date(date_str: str, format_str: str = '%Y-%m-%d') -> str: + """Parse the date string and return formated date string. + + Args: + date_str: Input date string to be parsed. + format_str: output format for date + + Returns: + date formatted by the format_str. + In case of parse error, returns the original date_str. + Raises + NameError in case of any exceptions in parsing. + This will cause any Eval using it to fail. + """ + try: + return dateutil.parser.parse(date_str).strftime(format_str) + except dateutil.parser._parser.ParserError: + return '' + + +def str_to_camel_case(input_string: str, + strip_re: str = r'[^A-Za-z_0-9]') -> str: + """Returns the string in CamelCase without spaces and special characters. + + Example: "Abc-def(HG-123)" -> "AbcDefHG". + + Args: + input_string: string to be converted to CamelCase + strip_chars: regular expression of characters to be removed. + + Returns: + string with non-alpha characters removed and remaining words capitalized. + """ + if not str: + return '' + if not isinstance(input_string, str): + input_string = str(input_string) + # Replace any non-alpha characters with space + clean_str = re.sub(strip_re, ' ', input_string) + clean_str = clean_str.strip() + # Split by space and capitalize first letter, preserving any other capitals + return ''.join( + [w[0].upper() + w[1:] for w in clean_str.split(' ') if len(w) > 0]) + + +EVAL_GLOBALS = { + # Date time functions + 'dateutil_parser_parse': dateutil.parser.parse, + 'format_date': format_date, + 'datetime': datetime, + 'datetime_strptime': datetime.strptime, + 'relativedelta': relativedelta, + # String functions + 'str_to_camel_case': str_to_camel_case, + # Regex functions + 're': re, + 're_sub': re.sub, +} + + +def evaluate_statement(eval_str: str, + variables: dict = {}, + functions: dict = EVAL_GLOBALS) -> (str, str): + """Returns the tuple: (variable, result) after evaluating statement in eval. + + Args: + eval_str: string with statement to be evaluated of the form: + 'variable=statement' if the variable is not specified, an empty string is + retured as variable. + variables: dictionary of variables and values to be used in statement. + functions: dictionary of global functoins that can be invoked within + statement. + + Returns: + tuple of the (variable , result) after evaluating the statement. 
+ in case of exception during eval, None is returned as result + """ + variable = '' + statement = eval_str + if '=' in eval_str: + variable, statement = eval_str.split('=', 1) + variable = variable.strip() + try: + result = eval(statement, functions, variables) + except (SyntaxError, NameError, ValueError, TypeError) as e: + logging.debug( + f'Failed to evaluate: {variable}={statement}, {e} in {variables}') + result = None + return (variable, result) diff --git a/tools/statvar_importer/eval_functions_test.py b/tools/statvar_importer/eval_functions_test.py new file mode 100644 index 0000000000..2e4c442c4c --- /dev/null +++ b/tools/statvar_importer/eval_functions_test.py @@ -0,0 +1,90 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Unit tests for eval_functions.py.""" + +import os +import sys +import tempfile +import unittest + +_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(_SCRIPT_DIR) +sys.path.append(os.path.dirname(_SCRIPT_DIR)) + +import eval_functions + +# module_dir_ is the path to where this test is running from. +_module_dir_ = os.path.dirname(__file__) + + +class TestEvalFunctions(unittest.TestCase): + + def test_evaluate_statement(self): + self.assertEqual( + ('num', 3), + eval_functions.evaluate_statement('num=1+Number', {'Number': 2}), + ) + self.assertEqual( + ('', 4), eval_functions.evaluate_statement('2*Number', + {'Number': 2})) + # Verify None is returned on error in statement + self.assertEqual( + ('name', None), + eval_functions.evaluate_statement( + 'name=1+Data', + {'Data': '2'} # string should raise TypeError + ), + ) + # Missing variable value for Data raises NameError + self.assertEqual(('name', None), + eval_functions.evaluate_statement('name=1+Data')) + + def test_format_date(self): + self.assertEqual('2023-01-31', + eval_functions.format_date('Jan 31, 2023')) + self.assertEqual( + ('month', '2022-01'), + eval_functions.evaluate_statement( + 'month=format_date(Data, "%Y-%m")', {'Data': '2022, Jan 1st'}), + ) + self.assertEqual( + ('', '2022-12-31'), + eval_functions.evaluate_statement( + 'format_date(Data)', {'Data': 'Dec 31st, 2022, 10:00am'}), + ) + self.assertEqual( + ('', ''), + eval_functions.evaluate_statement('format_date("SunMonTue")'), + ) + + def test_str_to_camel_case(self): + self.assertEqual('CamelCase123', + eval_functions.str_to_camel_case(' camel-case 123 ')) + self.assertEqual( + ('name', '10MyDCID'), + eval_functions.evaluate_statement('name=str_to_camel_case(Data)', + {'Data': '1.0 my DCID'}), + ) + self.assertEqual( + ('', 'SnakeCaseString'), + eval_functions.evaluate_statement('str_to_camel_case(Data)', + {'Data': 'snake(case.) 
string'}), + ) + self.assertEqual( + ('', 'String_Value1'), + eval_functions.evaluate_statement( + 'str_to_camel_case(Data, r"[^A-Za-z0-9_]")', + {'Data': 'string_ value(1)'}, + ), + ) diff --git a/tools/statvar_importer/property_value_mapper.py b/tools/statvar_importer/property_value_mapper.py new file mode 100644 index 0000000000..51bad68fdf --- /dev/null +++ b/tools/statvar_importer/property_value_mapper.py @@ -0,0 +1,614 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Utility class to store property:value mappings for data strings.""" + +import csv +import os +import re +import sys + +from absl import app +from absl import flags +from absl import logging +from collections import OrderedDict + +_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(_SCRIPT_DIR) +sys.path.append(os.path.dirname(_SCRIPT_DIR)) +sys.path.append(os.path.dirname(os.path.dirname(_SCRIPT_DIR))) +sys.path.append( + os.path.join(os.path.dirname(os.path.dirname(_SCRIPT_DIR)), 'util')) + +import config_flags +import eval_functions +import file_util + +import property_value_utils as pv_utils + +from config_map import ConfigMap, read_py_dict_from_file +from counters import Counters, CounterOptions + + +class PropertyValueMapper: + """Class to map strings to set of property values. + + Supports multiple maps with a namespace or context string. Stores string to + property:value maps as a dictionary: _pv_map = { + + 'GLOBAL': { + '': { + '': '' + '': '' + ... + }, + ... + }, + '' : { + '': { + '': '' + ... + }, + ... + }, + } + + The first level keys in _pv_map are namespaces that are column-headers or + 'GLOBAL'. + When looking up PVs for an input string, such as a column header or a cell + value, + first the namespace column-header is tried. + If there are no values then other namespacs such as 'GLOBAL are tried. + + within the PV can have a reference to another property. + Such reference are replaced with that property's value after + all PVs for a data cell have been collected. + + The references are indicated with the syntax '{Variable}' or '@Variable'. + where 'Variable' is expected to be another property in the cell's PVs. + + Internal properties that require special processing begin with '#', such as: + '#Regex': refers to a regular expression with names match groups + to be applied on a cell value + '#Format': a format string to be processed with other parameters + '#Eval': a python statement to be evaluated. It could have some computations + of the form = where the '' is evaluated and + assigned to property or to 'Data'. + + The cell value is mapped to the following default properties: + 'Data': the string value in the cell + 'Number': the numeric value if the cell is a number. 
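+
+    For illustration (hypothetical map content), a GLOBAL entry that maps the
+    input string 'Male' to a statvar property could look like:
+
+        _pv_map = {
+            'GLOBAL': {
+                'Male': {'gender': 'dcs:Male'},
+            },
+        }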
+ """ + + def __init__( + self, + pv_map_files: list = [], + config_dict: dict = None, + counters_dict: dict = None, + ): + self._config = ConfigMap(config_dict=config_dict) + self._counters = Counters( + counters_dict=counters_dict, + options=CounterOptions(debug=self._config.get('debug', False)), + ) + # Map from a namespace to dictionary of string-> { p:v} + self._pv_map = OrderedDict({'GLOBAL': {}}) + self._num_pv_map_keys = 0 + self._max_words_in_keys = 0 + for filename in pv_map_files: + namespace = 'GLOBAL' + if not file_util.file_get_matching(filename): + if ':' in filename: + namespace, filename = filename.split(':', 1) + self.load_pvs_from_file(filename, namespace) + logging.level_debug() and logging.debug( + f'Loaded PV map {self._pv_map} with max words {self._max_words_in_keys}' + ) + + def load_pvs_from_file(self, filename: str, namespace: str = 'GLOBAL'): + """Loads a map of the form 'string -> { P: V }' from a file. + + File is a python dictionary or a JSON file with python equivalents such as + True(true), False(false), None(null). + + Args: + filename: file containing the dictionary of string to dictionary of PVs + namespace: the namespace key for the dictionary to be loaded against. the + namespace is the first level key in the _pv_map. + """ + # Append new PVs to existing map. + pv_map_input = {} + if file_util.file_is_csv(filename): + # Load rows into a dict of prop,value + # if the first col is a config key, next column is its value + logging.info( + f'Loading PV maps for {namespace} from csv file: {filename}') + with file_util.FileIO(filename) as csvfile: + csv_reader = csv.reader(csvfile, + skipinitialspace=True, + escapechar='\\') + for row in csv_reader: + # Drop trailing empty columns in the row + last_col = len(row) - 1 + while last_col >= 0 and row[last_col].strip() == '': + last_col -= 1 + row = row[:last_col + 1] + if not row: + continue + key = row[0].strip() + if key in self._config.get_configs(): + # Add value to the config with same type as original. + value = ','.join(row[1:]) + config_flags.set_config_value(key, value, self._config) + else: + # Row is a pv map + pvs_list = row[1:] + if len(pvs_list) == 1: + # PVs list has no property, just a value. + # Use the namespace as the property + pvs_list = [namespace] + pvs_list.append(row[1]) + if len(pvs_list) % 2 != 0: + raise RuntimeError( + f'Invalid list of property value: {row} in {filename}' + ) + # Get property,values from the columns + pvs = {} + for i in range(0, len(pvs_list), 2): + prop = pvs_list[i].strip() + if not prop: + continue + value = pvs_list[i + 1].strip() + if value == '""': + value = '' + # Remove extra quotes around schema values. + # if value and value[0] == '"' and value[-1] == '"': + # value = value[1:-1].strip() + if value and value[0] != '[' and prop[0] != '#': + # Add quotes around text strings + # with spaces without commas. + # if re.search('[^,] +', value): + # value = f'"{value}"' + if value[0] == "'" and value[-1] == "'": + # Replace single quote with double quotes + # To distinguish quote as delimiter vs value in CSVs + # single quote is used instead of double quote in CSV values. + value[0] = '"' + value[-1] = '"' + #pvs[prop] = value + normalize = True + if '#' in prop or '=' in value: + # Value is a formula. e value as a string. 
+ normalize = False + pv_utils.add_key_value(prop, + value, + pvs, + self._config.get( + 'multi_value_properties', + {}), + normalize=normalize) + pv_map_input[key] = pvs + else: + logging.info( + f'Loading PV maps for {namespace} from dictionary file: {filename}' + ) + pv_map_input = read_py_dict_from_file(filename) + self.load_pvs_dict(pv_map_input, namespace) + + def load_pvs_dict(self, pv_map_input: dict, namespace: str = 'GLOBAL'): + if namespace not in self._pv_map: + self._pv_map[namespace] = {} + pv_map = self._pv_map[namespace] + word_delimiter = self._config.get('word_delimiter', ' ') + num_keys_added = 0 + for key, pvs_input in pv_map_input.items(): + if key not in pv_map: + pv_map[key] = {} + pvs_dict = pv_map[key] + if isinstance(pvs_input, str): + pvs_input = {namespace: pvs_input} + for p, v in pvs_input.items(): + num_keys_added += 1 + pv_utils.add_key_value( + p, + v, + pvs_dict, + self._config.get('multi_value_properties', {}), + ) + # Track the max number of words in any of the keys. + # This is used when splitting input-string for lookups. + num_words_key = len(pv_utils.get_words(key, word_delimiter)) + self._max_words_in_keys = max(self._max_words_in_keys, + num_words_key) + logging.level_debug() and logging.log( + 2, f'Setting PVMap[{key}] = {pvs_dict}') + + self._num_pv_map_keys += num_keys_added + logging.info( + f'Loaded {num_keys_added} property-value mappings for "{namespace}"' + ) + logging.level_debug() and logging.debug( + f'Loaded pv map {namespace}:{pv_map_input}') + + def get_pv_map(self) -> dict: + """Returns the dictionary mapping input-strings to property:values.""" + return self._pv_map + + def process_pvs_for_data(self, key: str, pvs: dict) -> bool: + """Processes property:value and returns true if processed successfully. + + Processes values for actionable props such as '#Regex', '#Eval', '#Format'. + Args: pvs (input/output) dictionary of property:values Properties such as + '#Regex', '#Eval', '#Format' are processed and resulting properties are + updated into pvs. + + Returns: + True if any property:values were processed and pvs dict was updated. + """ + logging.level_debug() and logging.log( + 2, f'Processing data PVs:{key}:{pvs}') + data_key = self._config.get('data_key', 'Data') + data = pvs.get(data_key, key) + is_modified = False + + # Process regular expression and add named group matches to the PV. + # Regex PV is of the form: '#Regex': '(?P[0-9]+) *- *(?P[0-9])' + # Parses 'Data': '10 - 20' to generate PVs: + # { 'Start': '10', 'End': '20' } + regex_key = self._config.get('regex_key', '#Regex') + if regex_key in pvs and data: + re_pattern = pvs[regex_key] + re_matches = re.finditer(re_pattern, data) + regex_pvs = {} + for match in re_matches: + regex_pvs.update(match.groupdict()) + logging.level_debug() and logging.log( + 2, + f'Processed regex: {re_pattern} on {key}:{data} to get {regex_pvs}' + ) + if regex_pvs: + self._counters.add_counter('processed-regex', 1, re_pattern) + pv_utils.pvs_update( + regex_pvs, pvs, + self._config.get('multi_value_properties', {})) + pvs.pop(regex_key) + is_modified = True + + # Format the data substituting properties with values. 
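+        # Illustrative (hypothetical) map entry: with pvs containing
+        # {'Start': '10', 'End': '20'}, the entry
+        #     '#Format': 'age=[{Start} {End} Years]'
+        # would add 'age': '[10 20 Years]' to the PVs via str.format below.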
+ format_key = self._config.get('format_key', '#Format') + if format_key in pvs: + format_str = pvs[format_key] + (format_prop, strf) = _get_variable_expr(format_str, data_key) + try: + format_data = strf.format(**pvs) + logging.level_debug() and logging.log( + 2, + f'Processed format {format_prop}={strf} on {key}:{data} to get' + f' {format_data}') + except (KeyError, ValueError) as e: + format_data = format_str + self._counters.add_counter('error-process-format', 1, + format_str) + logging.level_debug() and logging.log( + 2, + f'Failed to format {format_prop}={strf} on {key}:{data} with' + f' {pvs}, {e}') + if format_prop != data_key and format_data != format_str: + pvs[format_prop] = format_data + self._counters.add_counter('processed-format', 1, format_str) + pvs.pop(format_key) + is_modified = True + + # Evaluate the expression properties as local variables. + eval_key = self._config.get('eval_key', '#Eval') + if eval_key in pvs: + eval_str = pvs[eval_key] + eval_prop, eval_data = eval_functions.evaluate_statement( + eval_str, + pvs, + self._config.get('eval_globals', eval_functions.EVAL_GLOBALS), + ) + logging.level_debug() and logging.log( + 2, + f'Processed eval {eval_str} with {pvs} to get {eval_prop}:{eval_data}' + ) + if not eval_prop: + eval_prop = data_key + if eval_data and eval_data != eval_str: + pvs[eval_prop] = eval_data + self._counters.add_counter('processed-eval', 1, eval_str) + pvs.pop(eval_key) + is_modified = True + logging.level_debug() and logging.log( + 2, f'Processed data PVs:{is_modified}:{key}:{pvs}') + return is_modified + + def get_pvs_for_key(self, key: str, namespace: str = 'GLOBAL') -> dict: + """Return a dict of property-values that are mapped to the given key + within the dictionary for the namespace. + + Args: + key: input string to be looked up + namespace: the top level dictionary key to get the map within which + input-string is looked up. + + Returns: + dictionary of property:values for the input string. + """ + pvs = None + logging.level_debug() and logging.log( + 3, f'Search PVs for {namespace}:{key}') + if namespace in self._pv_map: + pvs = self._pv_map[namespace].get(key, None) + else: + # Check if key is unique and exists in any other map. + dicts_with_key = [] + pvs = {} + namespaces = self._config.get('default_pv_maps', ['GLOBAL']) + for namespace in namespaces: + logging.level_debug() and logging.log( + 3, f'Search PVs for {namespace}:{key}') + if namespace in self._pv_map.keys(): + pv_map = self._pv_map[namespace] + if key in pv_map: + dicts_with_key.append(namespace) + pv_utils.pvs_update( + pv_map[key], pvs, + self._config.get('multi_value_properties', {})) + if len(dicts_with_key) > 1: + logging.warning( + f'Duplicate key {key} in property maps: {dicts_with_key}') + self._counters.add_counter( + f'warning-multiple-property-key', + 1, + f'{key}:' + ','.join(dicts_with_key), + ) + if not pvs: + logging.level_debug() and logging.log( + 3, f'Missing key {key} in property maps') + self._counters.add_counter(f'warning-missing-property-key', 1, key) + return pvs + logging.level_debug() and logging.debug(f'Got PVs for {key}:{pvs}') + return pvs + + def get_pvs_for_key_variants(self, + key: str, + namespace: str = 'GLOBAL') -> list: + """Return a dict of property-values that are mapped to the given key + or its variantes with case lower case. + Args: + key: input string to be looked up + namespace: the top level dictionary key to get the map within which + input-string is looked up. 
+ + Returns: + a list of dictionary of property:values for the input string. + """ + if not key: + return None + pvs = self.get_pvs_for_key(key, namespace) + if not pvs: + # Check if GLOBAL map has key namespace:column-key + pvs = self.get_pvs_for_key(f'{namespace}:{key}') + if not pvs: + pvs = self.get_pvs_for_key(key.lower(), namespace) + if pvs: + pvs_list = [pvs] + pvs_list.append({self._config.get('pv_lookup_key', 'Key'): key}) + return pvs_list + # Check for keys with extra characters removed. + key_filtered = re.sub('[^A-Za-z0-9_%$-]+', ' ', key).strip() + if key_filtered != key: + return self.get_pvs_for_key_variants(key_filtered, namespace) + return None + + def _is_key_in_value(self, key: str, value: str) -> bool: + """Returns True if key is a substring of the value string. + + Only substrings separated by the word boundary are considered. + """ + if self._config.get('match_substring_word_boundary', True): + # Match substring around word boundaries. + while value: + pos = value.find(key) + if pos < 0: + return False + if (pos == 0 or not value[pos - 1].isalpha()) and ( + pos + len(key) <= len(value) or + not value[pos + len(key)].isalpha()): + return True + value = value[pos:] + return False + # key_pat = f'\\b{key}\\b' + # try: + # if re.search(key_pat, value, flags=re.IGNORECASE): + # return True + # else: + # return False + # except re.error as e: + # logging.error( + # f'Failed re.search({key_pat}, {value}) with exception: {e}' + # ) + # return False + + # Simple substring without word boundary checks. + if key.lower() in value.lower(): + return True + return False + + def get_pvs_for_key_substring(self, + value: str, + namespace: str = 'GLOBAL') -> dict: + """Return a dict of property-values for any key is a substring of value + + Args: + value: input string to be mapped to property:values + namespace: column header or context for the value string used as the key + for the first level dictionary in the pv_map. + + Returns: + List of dictionary of property:values that apply to the input string + after collecting all PVs for any key that is a substring of the value. + """ + # Get a list of namespaces to lookup. + # If none given, lookup in all namespaces. + namespaces = [] + if namespace and namespace in self._pv_map: + namespaces.append(namespace) + else: + namespaces = list(self._pv_map.keys()) + pvs_list = [] + keys_list = [] + for n in namespaces: + # Lookup keys from shortest to longest. + # Caller will merge PVs in the reverse order. + pv_map = self._pv_map[n] + sorted_keys = sorted(pv_map.keys(), key=len, reverse=True) + for key in sorted_keys: + if self._is_key_in_value(key, value): + pvs_list.append(pv_map[key]) + keys_list.append(key) + logging.level_debug() and logging.log( + 3, f'Got PVs for {key} in {value}: {pvs_list}') + value = value.replace(key, ' ') + logging.level_debug() and logging.log( + 2, + f'Returning pvs for substrings of {value} from {keys_list}:{pvs_list}' + ) + return pvs_list + + def get_all_pvs_for_value(self, + value: str, + namespace: str = 'GLOBAL', + max_fragment_size: int = None) -> list: + """Return a list of property:value dictionaries for an input string. + + Args: + value: input string to be mapped to property:values + namespace: context for the input string such as the column header. + max_fragment_size: the maximum number of words into which value can be + fragmented when looking for matching keys in the pv_map. + + Returns: + a list of dictionary of property:values. 
+ """ + logging.level_debug() and logging.log( + 1, f'Looking up PVs for {namespace}:{value}') + pvs = self.get_pvs_for_key_variants(value, namespace) + if pvs: + return pvs + # Split the value into n-grams and lookup PVs for each fragment. + word_delimiter = self._config.get('word_delimiter', ' ') + if not word_delimiter: + # Splitting of words is disabled. Don't match substrings. + return None + word_joiner = pv_utils.get_delimiter_char(word_delimiter) + words = pv_utils.get_words(value, word_delimiter) + if len(words) <= 1: + return None + max_fragment_words = len(words) - 1 + if not max_fragment_size: + max_fragment_size = self._max_words_in_keys + max_fragment_words = min(max_fragment_words, max_fragment_size) + + num_grams = (len(words) - max_fragment_size)**2 + if self._num_pv_map_keys < num_grams: + # Fewer keys than n-grams in input. + # Get PVs for keys in pv_map that are a substring of the input value. + return self.get_pvs_for_key_substring(value, namespace) + # Fewer n-grams than number of keys in map. + # Check if any input n-gram matches a key. + logging.level_debug() and logging.log( + 3, f'Looking up PVs for {max_fragment_words} words in {words}') + for num_words in range(max_fragment_words, 0, -1): + for start_index in range(0, len(words) - num_words + 1): + sub_value = word_joiner.join(words[start_index:start_index + + num_words]) + sub_pvs = self.get_pvs_for_key_variants(sub_value, namespace) + if sub_pvs: + # Got PVs for a fragment. + # Also lookup remaining fragments before and after this. + pvs_list = [] + before_value = word_delimiter.join(words[0:start_index]) + after_value = word_delimiter.join(words[start_index + + num_words:]) + logging.level_debug() and logging.log( + 3, + f'Got PVs for {start_index}:{num_words} in' + f' {words}:{sub_value}:{sub_pvs}, lookup pvs for {before_value},' + f' {after_value}', + ) + before_pvs = self.get_all_pvs_for_value( + # before_value, namespace, max_fragment_size=None) + before_value, + namespace, + max_fragment_size=num_words, + ) + after_pvs = self.get_all_pvs_for_value( + # after_value, namespace, max_fragment_size=None) + after_value, + namespace, + max_fragment_size=num_words, + ) + if before_pvs: + pvs_list.extend(before_pvs) + pvs_list.extend(sub_pvs) + if after_pvs: + pvs_list.extend(after_pvs) + logging.level_debug() and logging.log( + 2, f'Got PVs for fragments {before_value}:{before_pvs},' + f' {sub_value}:{sub_pvs}, {after_value}:{after_pvs}') + return pvs_list + return None + + +# Local utility functions +def _get_variable_expr(stmt: str, default_var: str = 'Data') -> (str, str): + """Parses a statement of the form = and returns variable, expr.""" + if '=' in stmt: + (var, expr) = stmt.split('=', 1) + return (var.strip(), expr) + return (default_var, stmt) + + +# PVMap utility functions +def load_pv_map(file: str) -> dict: + """Returns a PV map loaded from a file.""" + pvmap = PropertyValueMapper() + for file in file_util.file_get_matching(file): + pvmap.load_pvs_from_file(file) + pvs = pvmap.get_pv_map() + # Return the pvmap for the first namespace + if pvs: + return pvs[list(pvs.keys())[0]] + return {} + + +def write_pv_map(pvmap: dict, file: str) -> str: + """Write the PV map into a file.""" + if file_util.file_is_csv(file): + # Write pvmap as csv file with rows as : key,prop1,value1,prop2,value2 + with file_util.FileIO(file, 'w') as csv_file: + csv_writer = csv.writer(csv_file) + # Set CSV header as 'key, prop, value' + csv_writer.writerow(['key', 'property', 'value']) + # Write each pvmap node as a row. 
+ for key, pvs in pvmap.items(): + row = [key] + for prop, value in pvs.items(): + row.append(prop) + row.append(value) + csv_writer.writerow(row) + else: + file_util.file_write_py_dict(pvmap, file) + logging.info(f'Wrote {len(pvmap)} rows of PVs into {file}') diff --git a/tools/statvar_importer/property_value_mapper_test.py b/tools/statvar_importer/property_value_mapper_test.py new file mode 100644 index 0000000000..1cc441f667 --- /dev/null +++ b/tools/statvar_importer/property_value_mapper_test.py @@ -0,0 +1,112 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Unit tests for property_value_mapper.py.""" + +import unittest + +import os +import sys + +from absl import app +from absl import logging +from property_value_mapper import PropertyValueMapper + +_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(_SCRIPT_DIR) +sys.path.append(os.path.dirname(_SCRIPT_DIR)) +sys.path.append(os.path.dirname(os.path.dirname(_SCRIPT_DIR))) +sys.path.append( + os.path.join(os.path.dirname(os.path.dirname(_SCRIPT_DIR)), 'util')) + + +class PropertyValueMapperTest(unittest.TestCase): + + def test_load_pvmap(self): + pv_mapper = PropertyValueMapper(pv_map_files=[ + os.path.join(_SCRIPT_DIR, 'test_data/sample_pv_map.py') + ]) + + # Verify PVmap has key 'GLOBAL' + pv_map = pv_mapper.get_pv_map() + self.assertTrue('GLOBAL' in pv_map) + self.assertTrue(len(pv_map['GLOBAL']) > 0) + + # Lookup PV Map for known key + pvs = pv_mapper.get_pvs_for_key('Males') + self.assertEqual(pvs, {'gender': 'dcs:Male'}) + + # Lookup PV Map for case mismatched key fails + pvs = pv_mapper.get_pvs_for_key('males') + self.assertEqual(pvs, None) + + # Load PVMap for a different namespace: Variable + pv_mapper.load_pvs_from_file( + os.path.join(_SCRIPT_DIR, 'test_data/sample_pv_map.csv'), + 'Variable') + self.assertTrue('Variable' in pv_mapper.get_pv_map()) + + # Lookup PVMap for 'Variable' column + pvs = pv_mapper.get_pvs_for_key('total', 'Variable') + self.assertEqual(pvs, {'populationType': 'dcs:Person'}) + # Verify keys from Variable are not retruned for GLOBAL + pvs = pv_mapper.get_pvs_for_key('total') + self.assertEqual(pvs, None) + + def test_pvmap_get_all_pvs(self): + pv_mapper = PropertyValueMapper(pv_map_files=[ + os.path.join(_SCRIPT_DIR, 'test_data/sample_pv_map.py'), + os.path.join(_SCRIPT_DIR, 'test_data/sample_pv_map.csv'), + ]) + self.assertEqual(len(pv_mapper.get_pv_map()), 1) + + # Verify matches for words in long key not in pv_map + pvs = pv_mapper.get_all_pvs_for_value('Total Males') + expected_pvs = [ + # PVs for 'total' + { + 'populationType': 'dcs:Person' + }, + { + 'Key': 'Total' + }, + # PVs for Male + { + 'gender': 'dcs:Male' + }, + { + 'Key': 'Males' + } + ] + self.assertEqual(pvs, expected_pvs) + + def test_process_pvs(self): + pv_mapper = PropertyValueMapper(pv_map_files=[ + os.path.join(_SCRIPT_DIR, 'test_data/sample_pv_map.py'), + os.path.join(_SCRIPT_DIR, 'test_data/sample_pv_map.csv'), + ]) + + pvs = pv_mapper.get_pvs_for_key('Person Age') + 
self.assertEqual( + pvs, { + '#Regex': '(?P[0-9]+)-(?P[0-9]+)', + 'age': 'dcid:{@StartAge}To{@EndAge}Years' + }) + # Verify processing of regex for range + self.assertTrue(pv_mapper.process_pvs_for_data('10-20', pvs)) + self.assertEqual( + pvs, { + 'EndAge': '20', + 'StartAge': '10', + 'age': 'dcid:{@StartAge}To{@EndAge}Years' + }) diff --git a/tools/statvar_importer/property_value_utils.py b/tools/statvar_importer/property_value_utils.py new file mode 100644 index 0000000000..a08bd0e65a --- /dev/null +++ b/tools/statvar_importer/property_value_utils.py @@ -0,0 +1,156 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Utility functions for proerty:values.""" + +import os +import re +import sys + +from typing import Union + +_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(_SCRIPT_DIR) +sys.path.append(os.path.dirname(_SCRIPT_DIR)) +sys.path.append(os.path.dirname(os.path.dirname(_SCRIPT_DIR))) +sys.path.append( + os.path.join(os.path.dirname(os.path.dirname(_SCRIPT_DIR)), 'util')) + +from mcf_file_util import get_value_list, add_pv_to_node, strip_namespace + + +def is_valid_property(prop: str, schemaless: bool = False) -> bool: + """Returns True if the property begins with a letter, lowercase. + + If schemaless is true, property can begin with uppercase as well. + """ + if prop and isinstance(prop, str) and prop[0].isalpha(): + if schemaless or prop[0].islower(): + return True + return False + + +def is_valid_value(value: str) -> bool: + """Returns True if the value is valid without any references.""" + if value is None: + return False + if isinstance(value, str): + # Check there are no unresolved references. + if not value or value == '""': + return False + if '@' in value: + # Quoted strings can have @<2-letter-lang> suffix. + if not re.search('@[a-z]{2}"$', value): + return False + if '{' in value and '}' in value: + return False + return True + + +def is_schema_node(value: str) -> bool: + """Returns True if the value is a schema node reference.""" + if not value or not isinstance(value, str): + return False + if not value[0].isalpha() and value[0] != '[': + # Numbers or quoted strings are not schema nodes. + return False + # Check if string has any non alpha or non numeric codes + non_alnum_chars = [ + c for c in strip_namespace(value) + if not c.isalnum() and c not in ['_', '/', '[', ']', '.'] + ] + if non_alnum_chars: + return False + return True + + +def has_namespace(value: str) -> bool: + """Returns True if the value has a namespace of letters followed by ':'.""" + if not value or not isinstance(value, str): + return False + len_value = len(value) + pos = 0 + while pos < len_value: + if not value[pos].isalpha(): + break + pos += 1 + if pos < len_value and value[pos] == ':': + return True + return False + + +def add_key_value( + key: str, + value: str, + pvs: dict, + multi_value_keys: set = {}, + overwrite: bool = True, + normalize: bool = True, +) -> dict: + """Adds a key:value to the dict. 
+ + If the key already exists, adds value to a list if key is a multi_value key, + else replaces the value if overwrite is True. + """ + append_value = False + if key in multi_value_keys: + append_value = True + if not append_value and not overwrite and key in pvs: + # Do not add value if one exists and overwrite and append is disabled. + return pvs + return add_pv_to_node(key, + value, + pvs, + append_value=append_value, + normalize=normalize) + + +def get_value_as_list(value: str) -> Union[str, list]: + """Returns the value as a list or string.""" + if isinstance(value, list): + return value + if isinstance(value, str) and value: + if "," in value: + # Get a list of unique values + values = set() + values.update(get_value_list(value)) + value_list = list(values) + if len(value_list) == 1: + return value_list[0] + return value_list + return value + + +def pvs_update(new_pvs: dict, pvs: dict, multi_value_keys: set = {}) -> dict: + """Add the key:value pairs from the new_pvs into the pvs dictionary.""" + for prop, value in new_pvs.items(): + add_key_value(prop, value, pvs, multi_value_keys) + return pvs + + +def get_words(value: str, word_delimiter: str) -> list: + """Returns the list of non-empty words separated by the delimiter.""" + return [w for w in re.split(word_delimiter, value) if w] + + +def get_delimiter_char(re_delimiter: str) -> str: + """Returns a single delimiter character that can be used to join words + + from the first character in the delimiter regex. + """ + if re_delimiter: + if '|' in re_delimiter: + return re_delimiter.split('|')[0] + if re_delimiter[0] == '[': + return re_delimiter[1] + return ' ' diff --git a/tools/statvar_importer/test_data/sample_pv_map.csv b/tools/statvar_importer/test_data/sample_pv_map.csv new file mode 100644 index 0000000000..109debe6a0 --- /dev/null +++ b/tools/statvar_importer/test_data/sample_pv_map.csv @@ -0,0 +1,5 @@ +key,property,value +#Sample PVMap for test,lines begining with #,ignored +total,populationType,dcs:Person, +woman,gender,dcs:Female,age,[18 - Years] +man,gender,dcs:Male,age,[18 - Years] diff --git a/tools/statvar_importer/test_data/sample_pv_map.py b/tools/statvar_importer/test_data/sample_pv_map.py new file mode 100644 index 0000000000..1930c0cf44 --- /dev/null +++ b/tools/statvar_importer/test_data/sample_pv_map.py @@ -0,0 +1,71 @@ +{ + # Sample column map. + # Key is a substring of a row or column header. + # Value is a dictionary of property-value tuples to be applied to + # all elements in the row or column. + # If keys are overlapping, the longest key as a substring of a column is used. + # A column name can map to multiple keys for different parts of the string + # and all property-values for matching keys will be applied. + # + # Values can have references in the syntax "{variable}". + # The variable is replaced with the value from the final set of PVs. + # + # There are special references: + # {Number}: refers to the numeric value in a cell. + # {Data}: refers to other values in a cell that is not mapped to any PVs. + # : Use properties starting with a Capital letter to create + # local variables that are not emitted in the final output, but are place + # holders for replacements. + + # Columns with StatVarObservations should map "value" to "@Number". + + # Place + # Applied to all data values in the row. + "Fips Code": { + "observationAbout": "dcid:geoId/{@Number}" + }, + + # Time of observation + # Applied to all data values in the row. 
+ "Year": { + "observationDate": "@Number", + }, + + # Extract age bucket from a range of values. + "Person Age": { + "#Regex": "(?P[0-9]+)-(?P[0-9]+)", + "age": "dcid:{@StartAge}To{@EndAge}Years", + }, + + # Race: Mapping for values in Column Person Race + "WH": { + "race": "dcs:WhiteAlone", + }, + "A-PI": { + "race": "dcs:AsianOrPacificIslander", + }, + + # Population count observations fom column: "Total Persons". + # key can be normalized to lower case as well. + "total persons": { + "value": "@Number", + "populationType": "dcs:Person", + "measuredProperty": "dcs:count", + }, + + # Another observation for column: Fraction of population + "fraction": { + "populationType": "dcs:Person", + # "measuredProperty" : "dcs:count", # Is the default value for SVObs. + "measurementDenominator": "dcid:Count_Person", + "value": "@Number", + }, + + # Extract PVs from section headers + 'Males': { + 'gender': "dcs:Male", + }, + 'Females': { + 'gender': "dcs:Female", + }, +} diff --git a/util/config_map.py b/util/config_map.py index 1ea640107e..dc75566d0d 100644 --- a/util/config_map.py +++ b/util/config_map.py @@ -55,28 +55,32 @@ ''' import ast +from collections import OrderedDict import collections.abc import pprint import sys +from typing import Union from absl import logging -from collections import OrderedDict -from typing import Union +import file_util class ConfigMap: - '''Class to store config mapping of named parameters to values as a dictionary.''' - - def __init__(self, - config_dict: dict = None, - filename: str = None, - config_string: str = None): - '''Create a Config Map object. - Args: - config_dict: dictionary with key:values to be loaded into the config map. - filename: override the dictionary with key:values from the file. - config_string: string of dictionary parameters to override key:values. - ''' + """Class to store config mapping of named parameters to values as a dictionary.""" + + def __init__( + self, + config_dict: dict = None, + filename: str = None, + config_string: str = None, + ): + """Create a Config Map object. + + Args: + config_dict: dictionary with key:values to be loaded into the config map. + filename: override the dictionary with key:values from the file. + config_string: string of dictionary parameters to override key:values. + """ self._config_dict = dict() # Add configs from input args. if config_dict: @@ -89,181 +93,191 @@ def __init__(self, logging.debug(f'Loaded ConfigMap: {self.get_configs()}') def load_config_file(self, filename: str) -> dict: - '''Load configs from a file overwriting any existing parameter with a new value. + """Load configs from a file overwriting any existing parameter with a new value. - Args: - filename: a py or json file with a dictionary of parameter:value mappings. + Args: + filename: a py or json file with a dictionary of parameter:value + mappings. - Returns: - dictionary with all config parameters after updates from the file. - ''' + Returns: + dictionary with all config parameters after updates from the file. + """ if filename: self.add_configs(read_py_dict_from_file(filename)) return self._config_dict def load_config_string(self, config_params_str: str) -> dict: - '''Loads a JSON config dictionary overriding existing configs. + """Loads a JSON config dictionary overriding existing configs. - Args: - config_params_str: JSON string with a dictionary of parameter:value mappings. + Args: + config_params_str: JSON string with a dictionary of parameter:value + mappings. 
- Returns: - dictionary with all config parameters after updates. - ''' + Returns: + dictionary with all config parameters after updates. + """ if config_params_str: param_dict = ast.literal_eval(config_params_str) self.add_configs(param_dict) return self._config_dict def add_configs(self, configs: dict) -> dict: - '''Add new or replace existing config parameters - - Nested parameters with dict, or list values are replaced. - Use update_config() for a deep-update of nested parameters. + """Add new or replace existing config parameters + + Nested parameters with dict, or list values are replaced. + Use update_config() for a deep-update of nested parameters. + + For example, assume config-dict has a nested dict: + with an config dict set as follows: self._config_dict = { + 'int-param': 10, + 'nested-dict1': { + 'param1': 123, + } + } + add_config({ 'nested-dict1': { 'param2': abc }) + will return { + 'int-param': 10, + 'nested-dict1': { + 'param2': abc, # older key:values from nested-dict removed. + } + } - For example, assume config-dict has a nested dict: - with an config dict set as follows: self._config_dict = { - 'int-param': 10, - 'nested-dict1': { - 'param1': 123, - } - } - add_config({ 'nested-dict1': { 'param2': abc }) - will return { - 'int-param': 10, - 'nested-dict1': { - 'param2': abc, # older key:values from nested-dict removed. - } - } - - Args: - configs: dictionary with new parameter:value mappings - that are updated into existing dict. - Nested dict objects within the dict are replaced. + Args: + configs: dictionary with new parameter:value mappings that are updated + into existing dict. Nested dict objects within the dict are replaced. - Returns: - dictionary with all parameter:value mappings. - ''' + Returns: + dictionary with all parameter:value mappings. + """ if configs: self._config_dict.update(configs) return self._config_dict def update_config(self, configs: dict) -> dict: - '''Does a deep update of the dict updating nested dicts as well. - For example, assume config-dict has a nested dict: - self._config_dict = { - 'nested-dict1': { - 'param1': 123, - 'nested-dict2': { - 'param2': 345, - } - } + """Does a deep update of the dict updating nested dicts as well. + + For example, assume config-dict has a nested dict: + + self._config_dict = { + 'nested-dict1': { + 'param1': 123, + 'nested-dict2': { + 'param2': 345, } + } + } + + update_config(configs={ + 'nested-dict1': { + 'param1': 321, + 'param1-2': 456, + 'nested-dict2': { + 'param2-1': 789, + }, + }) + + will result in an updated config_dict: + { + 'nested-dict1': { + 'param1': 321, # updated + 'param1-2': 456, # added + 'nested-dict2': { + 'param2': 345, # original + 'param2-1': 789, # added + }, + } + + Args: + configs: dictionary with additional parameter:value mappings. - update_config(configs={ - 'nested-dict1': { - 'param1': 321, - 'param1-2': 456, - 'nested-dict2': { - 'param2-1': 789, - }, - }) - - will result in an updated config_dict: - { - 'nested-dict1': { - 'param1': 321, # updated - 'param1-2': 456, # added - 'nested-dict2': { - 'param2': 345, # original - 'param2-1': 789, # added - }, - } - - Args: - configs: dictionary with additional parameter:value mappings. - - Returns: - dictionary with all parameter:value mappings. - ''' + Returns: + dictionary with all parameter:value mappings. + """ return _deep_update(self._config_dict, configs) def get(self, parameter: str, default_value=None) -> Union[str, int, float, list, dict]: - '''Return the value of a named config parameter. 
+ """Return the value of a named config parameter. - Args: - parameter: name of the parameter to lookup - default_value: Default value to be returned if the parameter doesn't exist. + Args: + parameter: name of the parameter to lookup + default_value: Default value to be returned if the parameter doesn't + exist. - Returns: - value of the parameter in the config dict if it exists or the default_value. - ''' + Returns: + value of the parameter in the config dict if it exists or the + default_value. + """ return self._config_dict.get(parameter, default_value) def get_configs(self) -> dict: - '''Return a reference to the config dictionary. + """Return a reference to the config dictionary. - Any modifications to the dict is reflected within this object as well. - ''' + Any modifications to the dict is reflected within this object as well. + """ return self._config_dict def set_config(self, parameter: str, value): - '''Set the value for a parameter overwriting one if it already exists - Args: - parameter: Name of the parameter - value: Value to be set. - ''' + """Set the value for a parameter overwriting one if it already exists + + Args: + parameter: Name of the parameter + value: Value to be set. + """ self._config_dict[parameter] = value def get_config_str(self) -> str: - '''Returns the config dictionary as a pretty string.''' + """Returns the config dictionary as a pretty string.""" return pprint.pformat(self._config_dict, indent=4) def write_config(filename: str): - '''Write the config dictionary into a file. + """Write the config dictionary into a file. - Args: - filename: name of the file to write. - ''' + Args: + filename: name of the file to write. + """ with open(filename, 'w') as file: file.write(self.get_config_str()) def get_config_map_from_file(filename: str) -> ConfigMap: - '''Returns a ConfigMap object with parameters loaded from a file. + """Returns a ConfigMap object with parameters loaded from a file. - Args: - filename: name of the file to load. + Args: + filename: name of the file to load. - Returns: - ConfigMap object with all the parameters loaded into the config_dict. - ''' + Returns: + ConfigMap object with all the parameters loaded into the config_dict. + """ return ConfigMap(filename=filename) def _deep_update(src: dict, add_dict: dict) -> dict: - '''Deep update of parameters in add_dict into src. + """Deep update of parameters in add_dict into src. - Args: - src: source dictionary into which new parameters are added. - add_dict: dictionary with new parameters to be added. + Args: + src: source dictionary into which new parameters are added. + add_dict: dictionary with new parameters to be added. - Returns: - src dictionary with updated parameters. + Returns: + src dictionary with updated parameters. - Note: - Assumes the new dictionary has same type(dict/list) for updated parameters. - ''' + Note: + Assumes the new dictionary has same type(dict/list) for updated parameters. + """ for k, v in add_dict.items(): if isinstance(v, collections.abc.Mapping): src[k] = _deep_update(src.get(k, {}), v) elif isinstance(v, list): # TODO: deep update of list + if k not in src: + src[k] = list() src[k].extend(v) elif isinstance(v, set): # TODO: deep update of set + if k not in src: + src[k] = set() src[k].update(v) else: src[k] = v @@ -271,38 +285,31 @@ def _deep_update(src: dict, add_dict: dict) -> dict: def read_py_dict_from_file(filename: str) -> dict: - '''Read a python dict from a file. - - Args: - filename: JSON or a python file containing dict of parameter to value mappings. 
- The file can have comments and extra commas at the end. - Example: '{ 'abc': 123, 'def': 'lmn' } - Note: It assumes bools are in Python: True, False and None is used for 'null'. - - Returns: - dictionary loaded from the file. - - Raises: - exceptions on parsing errors string dict from literal_eval() - ''' - logging.info(f'Reading python dict from {filename}...') - with open(filename) as file: - dict_str = file.read() - - # Load the map assuming a python dictionary. - # Can also be used with JSON with trailing commas and comments. - param_dict = ast.literal_eval(dict_str) + """Read a python dict from a file. + + Args: + filename: JSON or a python file containing dict of parameter to value + mappings. The file can have comments and extra commas at the end. + Example: '{ 'abc': 123, 'def': 'lmn' } + Note: It assumes bools are in Python: True, False and None is used for + 'null'. + + Returns: + dictionary loaded from the file. + + Raises: + exceptions on parsing errors string dict from literal_eval() + """ + param_dict = file_util.file_load_py_dict(filename) logging.debug(f'Loaded {filename} into dict {param_dict}') return param_dict def write_py_dict_to_file(py_dict: dict, filename: str): - '''Write a python dict into a file. + """Write a python dict into a file. - Args: - py_dict: Dictionary to save into the file. - filename: file to write into. - ''' - logging.info(f'Writing python dict into {filename}') - with open(filename, 'w') as file: - file.write(pprint.pformat(py_dict, indent=4)) + Args: + py_dict: Dictionary to save into the file. + filename: file to write into. + """ + file_util.file_write_py_dict(py_dict, filename) diff --git a/util/counters.py b/util/counters.py index 65e43c4801..6f7a0a3285 100644 --- a/util/counters.py +++ b/util/counters.py @@ -13,6 +13,8 @@ # limitations under the License. '''Class for dictionary of named counters.''' +import os +import psutil import sys import time @@ -89,7 +91,7 @@ def __init__(self, def __del__(self): '''Log the counters.''' - self._update_processing_rate() + self._update_periodic_counters() logging.info(self.get_counters_string()) def add_counter(self, @@ -212,7 +214,7 @@ def print_counters(self, file=sys.stderr): Args: file: file handle to emit counters string. ''' - self._update_processing_rate() + self._update_periodic_counters() print(self.get_counters_string(), file=file) def print_counters_periodically(self): @@ -234,7 +236,7 @@ def reset_start_time(self): def set_prefix(self, prefix: str): '''Set the prefix for the counter names. Also resets the start_time and processing rate counters.''' - self._update_processing_rate() + self._update_periodic_counters() self._prefix = prefix self.reset_start_time() logging.info(self.get_counters_string()) @@ -251,6 +253,11 @@ def _get_counter_name(self, name: str, debug_context: str = None): name = name + f'_{debug_context}' return name + def _update_periodic_counters(self): + '''Update periodic counters.''' + self._update_processing_rate() + self._update_process_counters() + def _update_processing_rate(self): '''Update the processing rate and remaining time. 
Uses the option: 'processed' to get the counter for processing rate @@ -271,3 +278,13 @@ def _update_processing_rate(self): if totals: self.set_counter('process_remaining_time', max(0, (totals - num_processed)) / rate) + + def _update_process_counters(self): + '''Update process counters for memory and time.''' + process = psutil.Process(os.getpid()) + mem = process.memory_info() + self.max_counter('process-mem-rss', mem.rss) + self.max_counter('process-mem', mem.vms) + cpu_times = process.cpu_times() + self.set_counter('process-time-user-secs', cpu_times.user) + self.set_counter('process-time-sys-secs', cpu_times.system) diff --git a/util/dc_api_wrapper.py b/util/dc_api_wrapper.py index b0f12dfb8b..9024fa123f 100644 --- a/util/dc_api_wrapper.py +++ b/util/dc_api_wrapper.py @@ -11,17 +11,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -'''Wrapper utilities for data commons API.''' +"""Wrapper utilities for data commons API.""" -import sys +from collections import OrderedDict import os -import datacommons as dc -import requests_cache +import sys import time import urllib from absl import logging -from collections import OrderedDict +import datacommons as dc +import requests_cache _SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) sys.path.append(_SCRIPT_DIR) @@ -39,28 +39,33 @@ _DC_API_PATH_RESOLVE_COORD = '/v1/recon/resolve/coordinate' -def dc_api_wrapper(function, - args: dict, - retries: int = 3, - retry_secs: int = 1, - use_cache: bool = False, - api_root: str = None): - '''Wrapper for a DC APi call with retries and caching. - Returns the result from the DC APi call function. - In case of errors, retries the function with a delay a fixed number of times. - - Args: - function: The DataCommons API function. - args: dictionary with any the keyword arguments for the DataCommons API function. - retries: Number of retries in case of HTTP errors. - retry_sec: Interval in seconds between retries for which caller is blocked. - use_cache: If True, uses request cache for faster response. - api_root: The API server to use. Default is 'http://api.datacommons.org'. - To use autopush with more recent data, set it to 'http://autopush.api.datacommons.org' - - Returns: - The response from the DataCommons API call. - ''' +def dc_api_wrapper( + function, + args: dict, + retries: int = 3, + retry_secs: int = 1, + use_cache: bool = False, + api_root: str = None, +): + """Wrapper for a DC APi call with retries and caching. + + Returns the result from the DC APi call function. In case of errors, retries + the function with a delay a fixed number of times. + + Args: + function: The DataCommons API function. + args: dictionary with any the keyword arguments for the DataCommons API + function. + retries: Number of retries in case of HTTP errors. + retry_sec: Interval in seconds between retries for which caller is blocked. + use_cache: If True, uses request cache for faster response. + api_root: The API server to use. Default is 'http://api.datacommons.org'. To + use autopush with more recent data, set it to + 'http://autopush.api.datacommons.org' + + Returns: + The response from the DataCommons API call. 
+ """ if api_root: dc.utils._API_ROOT = api_root logging.debug(f'Setting DC API root to {api_root} for {function}') @@ -80,42 +85,54 @@ def dc_api_wrapper(function, for attempt in range(retries): try: logging.debug( - f'Invoking DC API {function}, #{attempt} with {args}, retries={retries}' - ) + f'Invoking DC API {function}, #{attempt} with {args},' + f' retries={retries}') response = function(**args) logging.debug( f'Got API response {response} for {function}, {args}') return response - except KeyError: - # Exception in case of API error. + except KeyError as e: + # Exception in case of missing dcid. Don't retry. + logging.error(f'Got exception for api: {function}, {e}') return None - except urllib.error.URLError: + except (urllib.error.URLError, urllib.error.HTTPError, + ValueError) as e: # Exception when server is overloaded, retry after a delay if attempt >= retries: + logging.error( + f'Got exception for api: {function}, {e}, no more retries' + ) raise urllib.error.URLError else: logging.debug( - f'Retrying API {function} after {retry_secs}...') + f'Got exception {e}, retrying API {function} after' + f' {retry_secs}...') time.sleep(retry_secs) return None -def dc_api_batched_wrapper(function, - dcids: list, - args: dict, - config: dict = None) -> dict: - '''A wrapper for DC API on dcids with batching support. +def dc_api_batched_wrapper( + function, + dcids: list, + args: dict, + dcid_arg_kw: str = 'dcid', + headers: dict = {}, + config: dict = None, +) -> dict: + """A wrapper for DC API on dcids with batching support. + Returns the dictionary result for the function call across all arguments. - It batches the dcids to make multiple calls to the DC API and merges all results. + It batches the dcids to make multiple calls to the DC API and merges all + results. Args: function: DC API to be invoked. It should have dcids as one of the arguments and should return a dictionary with dcid as the key. - dcids: List of dcids to be invoked with the function. - The namespace is stripped from the dcid before the call to the DC API. + dcids: List of dcids to be invoked with the function. The namespace is + stripped from the dcid before the call to the DC API. args: Additional arguments for the function call. - config: dictionary of DC API configuration settings. - The supported settings are: + config: dictionary of DC API configuration settings. The supported settings + are: dc_api_batch_size: Number of dcids to invoke per API call. dc_api_retries: Number of times an API can be retried. dc_api_retry_sec: Interval in seconds between retries. @@ -124,7 +141,7 @@ def dc_api_batched_wrapper(function, Returns: Merged function return values across all dcids. - ''' + """ if not config: config = {} api_result = {} @@ -132,8 +149,8 @@ def dc_api_batched_wrapper(function, num_dcids = len(dcids) api_batch_size = config.get('dc_api_batch_size', dc.utils._MAX_LIMIT) logging.debug( - f'Calling DC API {function} on {len(dcids)} dcids in batches of {api_batch_size} with args: {args}...' - ) + f'Calling DC API {function} on {len(dcids)} dcids in batches of' + f' {api_batch_size} with args: {args}...') while index < num_dcids: # dcids in batches. 
dcids_batch = [ @@ -141,11 +158,14 @@ def dc_api_batched_wrapper(function, ] index += api_batch_size args['dcids'] = dcids_batch - batch_result = dc_api_wrapper(function, args, - config.get('dc_api_retries', 3), - config.get('dc_api_retry_secs', 5), - config.get('dc_api_use_cache', False), - config.get('dc_api_root', None)) + batch_result = dc_api_wrapper( + function, + args, + config.get('dc_api_retries', 3), + config.get('dc_api_retry_secs', 5), + config.get('dc_api_use_cache', False), + config.get('dc_api_root', None), + ) if batch_result: api_result.update(batch_result) logging.debug(f'Got DC API result for {function}: {batch_result}') @@ -155,17 +175,19 @@ def dc_api_batched_wrapper(function, def dc_api_is_defined_dcid(dcids: list, wrapper_config: dict = None) -> dict: - '''Returns a dicttionary with dcids mapped to True/False based on whether - the dcid is defined in the API and has a 'typeOf' property. - Uses the property_value() DC API to lookup 'typeOf' for each dcid. - dcids not defined in KG get a value of False. - Args: - dcids: List of dcids. The namespace is stripped from the dcid. - wrapper_config: dictionary of configurationparameters for the wrapper. - See dc_api_batched_wrapper and dc_api_wrapper for details. - Returns: - dictionary with each input dcid mapped to a True/False value. - ''' + """Returns a dictionary with dcids mapped to True/False based on whether + + the dcid is defined in the API and has a 'typeOf' property. + Uses the property_value() DC API to lookup 'typeOf' for each dcid. + dcids not defined in KG get a value of False. + Args: + dcids: List of dcids. The namespace is stripped from the dcid. + wrapper_config: dictionary of configurationparameters for the wrapper. See + dc_api_batched_wrapper and dc_api_wrapper for details. + + Returns: + dictionary with each input dcid mapped to a True/False value. + """ api_function = dc.get_property_values args = { 'prop': 'typeOf', @@ -183,26 +205,68 @@ def dc_api_is_defined_dcid(dcids: list, wrapper_config: dict = None) -> dict: return response +def dc_api_get_node_property(dcids: list, + prop: str, + wrapper_config: dict = None) -> dict: + """Returns a dictionary keyed by dcid with { prop:value } for each dcid. + + Uses the get_property_values() DC API to lookup the property for each dcid. + + Args: + dcids: List of dcids. The namespace is stripped from the dcid. + wrapper_config: dictionary of configurationparameters for the wrapper. See + dc_api_batched_wrapper and dc_api_wrapper for details. + + Returns: + dictionary with each input dcid mapped to a True/False value. + """ + api_function = dc.get_property_values + args = { + 'prop': prop, + 'out': True, + } + api_result = dc_api_batched_wrapper(api_function, dcids, args, + wrapper_config) + response = {} + for dcid in dcids: + dcid_stripped = _strip_namespace(dcid) + value = api_result.get(dcid_stripped) + if value: + response[dcid] = {prop: value} + return response + + def dc_api_get_node_property_values(dcids: list, wrapper_config: dict = None) -> dict: - '''Returns all the property values for a set of dcids from the DC API. - Args: - dcids: list of dcids to lookup - wrapper_config: configuration parameters for the wrapper. - See dc_api_batched_wrapper() and dc_api_wrapper() for details. - Returns: - dictionary with each dcid with the namspace 'dcid:' as the key - mapped to a dictionary of property:value. - ''' + """Returns all the property values for a set of dcids from the DC API. 
+ + Args: + dcids: list of dcids to lookup + wrapper_config: configuration parameters for the wrapper. See + dc_api_batched_wrapper() and dc_api_wrapper() for details. + + Returns: + dictionary with each dcid with the namspace 'dcid:' as the key + mapped to a dictionary of property:value. + """ predefined_nodes = OrderedDict() api_function = dc.get_triples api_triples = dc_api_batched_wrapper(api_function, dcids, {}, wrapper_config) if api_triples: for dcid, triples in api_triples.items(): + if (_strip_namespace(dcid) not in dcids and + _add_namespace(dcid) not in dcids): + continue pvs = {} for d, prop, val in triples: - pvs[prop] = val + if d == dcid and val: + # quote string values with spaces if needed + if ' ' in val and val[0] != '"': + val = '"' + val + '"' + if prop in pvs: + val = pvs[prop] + ',' + val + pvs[prop] = val if len(pvs) > 0: if 'Node' not in pvs: pvs['Node'] = _add_namespace(dcid) @@ -210,16 +274,16 @@ def dc_api_get_node_property_values(dcids: list, return predefined_nodes -def dc_api_resolve_placeid(dcids: list) -> dict: - '''Returns the resolved dcid for each of the placeid. +def dc_api_resolve_placeid(dcids: list, in_prop: str = 'placeId') -> dict: + """Returns the resolved dcid for each of the placeid. - Args: - dcids: list of placeids to be resolved. + Args: + dcids: list of placeids to be resolved. - Returns: - dictionary keyed by input placeid with reoslved dcid as value. - ''' - data = {'in_prop': 'placeId', 'out_prop': 'dcid'} + Returns: + dictionary keyed by input placeid with reoslved dcid as value. + """ + data = {'in_prop': in_prop, 'out_prop': 'dcid'} data['ids'] = dcids num_ids = len(dcids) api_url = dc.utils._API_ROOT + _DC_API_PATH_RESOLVE_ID @@ -241,14 +305,14 @@ def dc_api_resolve_placeid(dcids: list) -> dict: def dc_api_resolve_latlng(dcids: list) -> dict: - '''Returns the resolved dcid for each of the placeid. + """Returns the resolved dcid for each of the placeid. - Args: - dcids: list of placeids to be resolved. + Args: + dcids: list of placeids to be resolved. - Returns: - dictionary keyed by input placeid with reoslved dcid as value. - ''' + Returns: + dictionary keyed by input placeid with reoslved dcid as value. + """ data = {} data['coordinates'] = dcids num_ids = len(dcids) @@ -264,8 +328,8 @@ def dc_api_resolve_latlng(dcids: list) -> dict: if recon_resp: for entity in recon_resp.get('placeCoordinates', []): dcids = entity.get('placeDcids', '') - lat = entity.get("latitude", "") - lng = entity.get("longitude", "") + lat = entity.get('latitude', '') + lng = entity.get('longitude', '') place_id = f'{lat}{lng}' if place_id and dcids: results[place_id] = entity @@ -273,17 +337,19 @@ def dc_api_resolve_latlng(dcids: list) -> dict: def _add_namespace(value: str, namespace: str = 'dcid') -> str: - '''Returns the value with a namespace prefix for references. - Args: - value: string to which namespace is to be added. - Returns: - value with the namespace prefix if the value is not a quoted string - and doesn't have a namespace already. - O/w return the value as is. - - Any sequence of letters followed by a ':' is treated as a namespace. - Quoted strings are assumed to start with '"' and won't get a namespace. - ''' + """Returns the value with a namespace prefix for references. + + Args: + value: string to which namespace is to be added. + + Returns: + value with the namespace prefix if the value is not a quoted string + and doesn't have a namespace already. + O/w return the value as is. 
+ + Any sequence of letters followed by a ':' is treated as a namespace. + Quoted strings are assumed to start with '"' and won't get a namespace. + """ if value and isinstance(value, str): if value[0].isalpha() and value.find(':') < 0: return f'{namespace}:{value}' @@ -291,15 +357,17 @@ def _add_namespace(value: str, namespace: str = 'dcid') -> str: def _strip_namespace(value: str) -> str: - '''Returns the value without the namespace prefix. - Args: - value: string from which the namespace prefix is to be removed. - Returns: - value without the namespace prefix if there was a namespace - - Any sequence of letters followed by a ':' is treated as a namespace. - Quoted strings are assumed to start with '"' and won't be filtered. - ''' + """Returns the value without the namespace prefix. + + Args: + value: string from which the namespace prefix is to be removed. + + Returns: + value without the namespace prefix if there was a namespace + + Any sequence of letters followed by a ':' is treated as a namespace. + Quoted strings are assumed to start with '"' and won't be filtered. + """ if value and isinstance(value, str) and value[0].isalnum(): return value[value.find(':') + 1:].strip() return value diff --git a/util/dc_api_wrapper_test.py b/util/dc_api_wrapper_test.py index 099f1be159..2a38d41e41 100644 --- a/util/dc_api_wrapper_test.py +++ b/util/dc_api_wrapper_test.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -'''Tests for dc_api_wrapper.''' +"""Tests for dc_api_wrapper.""" import os import sys @@ -30,7 +30,7 @@ class TestDCAPIWrapper(unittest.TestCase): def test_dc_api_wrapper(self): - '''Test the wrapper for DC API.''' + """Test the wrapper for DC API.""" api_function = dc.get_property_labels dcids = [ 'Count_Person', # 'dcid:' namespace will be removed. @@ -42,7 +42,7 @@ def test_dc_api_wrapper(self): self.assertTrue('typeOf' in response['Count_Person']) def test_dc_api_batched_wrapper(self): - '''Test DC API wrapper for batched calls.''' + """Test DC API wrapper for batched calls.""" api_function = dc.get_property_values dcids = [ 'Count_Person', # Statvar defined in DC @@ -58,7 +58,7 @@ def test_dc_api_batched_wrapper(self): self.assertFalse(response['NewStatVar_NotInDC']) def test_dc_api_is_defined_dcid(self): - '''Test API wrapper for defined DCIDs.''' + """Test API wrapper for defined DCIDs.""" dcids = [ 'geoId/06', # Geo Id defined. 'country/ZZZ', # Geo Id not defined. 
@@ -66,10 +66,12 @@ def test_dc_api_is_defined_dcid(self): 'schema:Year', # Class ] response = dc_api.dc_api_is_defined_dcid( - dcids, { + dcids, + { 'dc_api_batch_size': 2, - 'dc_api_root': 'http://autopush.api.datacommons.org' - }) + 'dc_api_root': 'http://autopush.api.datacommons.org', + }, + ) self.assertTrue(response is not None) self.assertEqual(len(response), len(dcids)) self.assertTrue(response['geoId/06']) @@ -79,8 +81,8 @@ def test_dc_api_is_defined_dcid(self): self.assertTrue(response['dcs:value']) def test_dc_get_node_property_values(self): - '''Test API wrapper to get all property:values for a node.''' - node_pvs = dc_api.dc_api_get_node_property_values(['dcs:Count_Person']) + """Test API wrapper to get all property:values for a node.""" + node_pvs = dc_api.dc_api_get_node_property_values(['dcid:Count_Person']) self.assertTrue(node_pvs) # Verify the resposnse has dcid with the namespace prefix 'dcid:' self.assertTrue('dcid:Count_Person' in node_pvs) diff --git a/util/download_util.py b/util/download_util.py index aac96a10df..091a251773 100644 --- a/util/download_util.py +++ b/util/download_util.py @@ -90,6 +90,7 @@ def test_my_function(self): def request_url(url: str, params: dict = {}, method: str = 'GET', + headers: dict = {}, output: str = 'text', timeout: int = 30, retries: int = 3, @@ -147,12 +148,18 @@ def request_url(url: str, for attempt in range(retries): try: logging.debug( - f'Downloading URL {url}, params:{params}, {method} #{attempt}, retries={retries}' + f'Downloading URL {url}, headers:{headers} params:{params}, {method} #{attempt}, retries={retries}' ) if 'get' in method.lower(): - response = requests.get(url, params=params, timeout=timeout) + response = requests.get(url, + headers=headers, + params=params, + timeout=timeout) else: - response = requests.post(url, json=params, timeout=timeout) + response = requests.post(url, + headers=headers, + json=params, + timeout=timeout) logging.debug( f'Got API response {response} for {url}, {params}') if response.ok: @@ -166,14 +173,16 @@ def request_url(url: str, # Exception in case of API error. return None except (requests.exceptions.ConnectTimeout, - urllib.error.URLError) as e: - # Exception when server is overloaded, retry after a delay - if attempt >= retries: - raise urllib.error.URLError - else: - logging.debug( - f'Retrying URL {url} after {retry_secs} secs ...') - time.sleep(retry_secs) + requests.exceptions.ConnectionError, urllib.error.URLError, + urllib.error.HTTPError) as e: + logging.debug(f'Got exception {e} for {url}, {params}') + + # retry in case of errors + if attempt >= retries: + raise urllib.error.URLError + else: + logging.debug(f'Retrying URL {url} after {retry_secs} secs ...') + time.sleep(retry_secs) return None diff --git a/util/file_util.py b/util/file_util.py index d961d13e95..d858c36a06 100644 --- a/util/file_util.py +++ b/util/file_util.py @@ -18,22 +18,27 @@ """ import ast +import chardet import csv import fnmatch import glob +import gspread +import io import json import os import pickle import pprint import sys import tempfile -from typing import Union + +import numpy as np from absl import app from absl import logging from aggregation_util import aggregate_dict, aggregate_value from google.cloud import storage -import gspread +from retry.api import retry_call +from typing import Union class FileIO: @@ -428,9 +433,12 @@ def file_get_name(file_path: str, Returns: file name combined from path, suffix and extension. """ - # Create the file directory if it doesn't exist. 
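# --- Editor's illustrative sketch (not part of the original patch) ---
# The request_url() change in download_util.py above threads a `headers` dict
# through to requests.get/post and retries on connection errors. This is a
# minimal standalone retry loop with the same shape; the URL and header
# values are hypothetical placeholders, not endpoints from the patch.
import time

import requests


def get_with_retries(url: str,
                     headers: dict = None,
                     retries: int = 3,
                     retry_secs: int = 5,
                     timeout: int = 30):
    """Returns the JSON response for url, retrying on connection errors."""
    for attempt in range(retries):
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
            if response.ok:
                return response.json()
            return None
        except (requests.exceptions.ConnectionError,
                requests.exceptions.ConnectTimeout):
            if attempt + 1 >= retries:
                raise
            time.sleep(retry_secs)
    return None


# get_with_retries('https://example.com/api', headers={'X-API-Key': 'KEY'})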
+ if not file_path: + return None if file_is_google_spreadsheet(file_path): + # Don't modify spreadsheets return file_path + # Create the file directory if it doesn't exist. file_makedirs(file_path) file_prefix, ext = os.path.splitext(file_path) if file_prefix.endswith(suffix): @@ -508,6 +516,7 @@ def file_load_csv_dict( value_column: str = None, delimiter: str = ',', config: dict = {}, + key_index: bool = False, ) -> dict: """Returns a dictionary loaded from a CSV file. @@ -540,18 +549,26 @@ def file_load_csv_dict( config: dictionary of aggregation settings in case there are multiple rows with the same key. refer to aggregation_util.aggregate_dict for config settings. - + key_index: if True, each row is loaded with a unique key for row index. + Overrides key_column and uses index as key. Returns: dictionary of {key:value} loaded from the CSV file. """ csv_dict = {} input_files = file_get_matching(filename) logging.debug(f'Loading dict from csv files: {input_files}') + if key_column and key_index: + raise ValueError( + f'Both Key_column: {key_column} and key_index set for {filename}') + for filename in input_files: num_rows = 0 # Load each CSV file with FileIO(filename) as csvfile: - reader = csv.DictReader(csvfile, delimiter=delimiter) + reader = csv.DictReader( + csvfile, + **file_get_csv_reader_options(csvfile, + {'delimiter': delimiter})) if reader.fieldnames: # Get the key and value column names if not key_column: @@ -567,7 +584,9 @@ def file_load_csv_dict( for row in reader: # Get the key for the row. key = None - if key_column in row: + if key_index: + key = len(csv_dict) + elif key_column in row: key = row.pop(key_column) # Get the value for the key value = None @@ -597,7 +616,8 @@ def file_load_csv_dict( def file_write_csv_dict(py_dict: dict, filename: str, - columns: list = None) -> list: + columns: list = None, + key_column_name: str = 'key') -> list: """Returns the filename after writing py_dict with a csv row per item. Each dictionary items is written as a row in the CSV file. @@ -629,6 +649,9 @@ def file_write_csv_dict(py_dict: dict, is used as the key's column name. If no columns are specified for values, column names are picked from each entry's value if the value is a dict. Else the value is written as column name 'value'. + key_column_name: name of the column used as key. + if '', the first column is used as key. + if set to None, the key is ignored. Returns: list of columns written to the output csv @@ -638,8 +661,10 @@ def file_write_csv_dict(py_dict: dict, # Get the list of columns value_column_name = '' if not columns: + columns = [] # Add a columns for key. - columns = ['key'] + if key_column_name: + columns.append(key_column_name) if len(columns) <= 1: # Get columns across all entries. for key, value in py_dict.items(): @@ -652,7 +677,8 @@ def file_write_csv_dict(py_dict: dict, value_column_name = 'value' columns.append(value_column_name) # Use the first column for the key. - key_column_name = columns[0] + if key_column_name is '': + key_column_name = columns[0] # Get the output filename output_files = file_get_matching(filename) @@ -799,7 +825,9 @@ def file_is_google_spreadsheet(filename: str) -> bool: return False -def file_open_google_spreadsheet(url: str) -> gspread.spreadsheet.Spreadsheet: +def file_open_google_spreadsheet(url: str, + retries: int = 3 + ) -> gspread.spreadsheet.Spreadsheet: """Returns the google spreasheet handle. Assumes caller has access to the spreadsheet. 
@@ -811,7 +839,14 @@ def file_open_google_spreadsheet(url: str) -> gspread.spreadsheet.Spreadsheet: google spreadsheet object for the given url """ # Get a handle for the whole spreadsheet - gs = _file_get_gspread_client().open_by_url(url) + gs = retry_call( + _file_get_gspread_client().open_by_url, + f_args=[url], + exceptions=gspread.exceptions.APIError, + tries=retries, + ) + if gs is None: + logging.error(f'Failed to open {url}') return gs @@ -943,7 +978,10 @@ def file_copy_to_spreadsheet(filename: str, # Read the rows from the source file rows = [] with FileIO(filename) as file: - csv_reader = csv.reader(file, skipinitialspace=True, escapechar='\\') + csv_reader = csv.reader(file, + skipinitialspace=True, + escapechar='\\', + **file_get_csv_reader_options(file)) for row in csv_reader: rows.append(row) @@ -963,6 +1001,120 @@ def file_copy_to_spreadsheet(filename: str, return ws.url +def file_get_sample_bytes(file: str, byte_count: int = 4096) -> bytes: + """Returns sample bytes from file. + + Args: + file: a file name or an open file handle. + byte_count: buyes to be returned. + + Returns: + bytes of the given byte_count. + The file handle is reset to the start. + """ + if isinstance(file, io.TextIOWrapper): + # File is a handle. Get the filename + file = file.name + if isinstance(file, str): + logging.debug(f'Getting sample {byte_count} bytes from {file}') + with FileIO(file, 'rb') as fh: + return fh.read(byte_count) + else: + return b'' + + +def file_get_encoding(file: str, + rawdata: bytes = None, + default: str = 'utf-8-sig') -> str: + """Returns the encoding for the file + + Args: + file: filename whose encoding is required. + rawdata: content whose encoding is to be detected if available. + default: default encoding to be retruned if it can't be determined. + + Returns: + string with encoding such as 'utf8' + """ + if rawdata is None: + rawdata = file_get_sample_bytes(file) + encoding_result = chardet.detect(rawdata) + if encoding_result: + encoding = encoding_result.get('encoding') + if encoding: + return encoding + return default + + +def file_get_csv_reader_options( + file: str, + default_options: dict = {}, + data: str = None, + encoding: str = None, + delim_chars: list = [',', ' ', ';', '|', ':']) -> dict: + """Returns a dictionary with options for the CSV file reader. + + Args: + file: name of the csv file to get encoding + default_options: default options returned if not detected + such as 'delimiter'. + data: string for which delimiter is to be detected + If data is not given, sample data is read from the file. + encoding: character encoding in the file. + delim_chars: list of possible delimiter characters. + If not set, non-alphanumeric characters from the first line + are used as candidate delimiter characters. + + Returns: + dict with the following: + 'delimiter': delimiter character for CSV files. + 'dialect': File dialect, such as 'unix', 'excel' + """ + result = dict(default_options) + + if data is None: + # Get data from file decoded with the right encoding + rawdata = file_get_sample_bytes(file) + if encoding is None: + encoding = file_get_encoding(file, rawdata=rawdata) + data = rawdata.decode(encoding) + + # Get the dialect for the data + try: + dialect = csv.Sniffer().sniff(data) + except csv.Error: + # Use default as excel as it may not be detected well. + dialect = 'excel' + if dialect: + result['dialect'] = dialect + + # Get CSV delimiter by counting possible delimiter characters + # across rows and picking the most common delimiter. 
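# --- Editor's illustrative sketch (not part of the original patch) ---
# file_get_encoding() and file_get_csv_reader_options() above sniff the
# character encoding with chardet and the CSV dialect with csv.Sniffer before
# building reader options. This standalone sketch shows the same two calls on
# in-memory bytes; the function name and sample data are hypothetical.
import csv

import chardet


def sniff_csv_options(rawdata: bytes, default_encoding: str = 'utf-8-sig') -> dict:
    """Returns {'encoding', 'delimiter'} guessed from a sample of CSV bytes."""
    detected = chardet.detect(rawdata)
    encoding = (detected or {}).get('encoding') or default_encoding
    text = rawdata.decode(encoding)
    try:
        delimiter = csv.Sniffer().sniff(text).delimiter
    except csv.Error:
        delimiter = ','  # fall back to the most common delimiter
    return {'encoding': encoding, 'delimiter': delimiter}


# sniff_csv_options(b'key;property;value\ntotal;populationType;dcs:Person\n')
# e.g. -> {'encoding': 'ascii', 'delimiter': ';'} (chardet's guess may vary)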
+ rows = data.split('\n') + if not delim_chars: + # Get non alphanumeric characters from data. + delim_chars = {c for c in rows[0].strip() if not c.isalnum()} + logging.debug(f'Looking for delimiter in %s among %s', file, + delim_chars) + char_counts = {c: [] for c in delim_chars} + for index in range(1, len(rows) - 1): + # Count possible delimiter characters per row + row = rows[index] + for char in delim_chars: + char_counts[char].append(row.count(char)) + # Get the char with the same count across rows. + for c in char_counts.keys(): + c_counts = char_counts[c] + if c_counts: + c_min = min(c_counts) + c_med = np.median(c_counts) + if c_min > 0 and c_min == c_med: + result['delimiter'] = c + break + logging.debug('Got options for file: %s: result = %s', file, result) + return result + + def file_is_csv(filename: str) -> bool: """Returns True is the file has a .csv extension or is a spreadsheet.""" if filename.endswith('.csv') or file_is_google_spreadsheet(filename): diff --git a/util/statvar_dcid_generator.py b/util/statvar_dcid_generator.py index 52a59d05fe..78c16a830d 100644 --- a/util/statvar_dcid_generator.py +++ b/util/statvar_dcid_generator.py @@ -14,20 +14,20 @@ """A utility to generate dcid for statistical variables.""" import copy -import re import os +import re import sys -#pylint: disable=wrong-import-position -#pylint: disable=import-error +# pylint: disable=wrong-import-position +# pylint: disable=import-error # Allows the following module imports to work when running as a script _SCRIPT_PATH = os.path.dirname(os.path.abspath(__file__)) sys.path.append(os.path.join(_SCRIPT_PATH, '.')) # For soc_codes_names from soc_codes_names import SOC_MAP -#pylint: enable=wrong-import-position -#pylint: enable=import-error +# pylint: enable=wrong-import-position +# pylint: enable=import-error # Global constants # Regex to match the quantity notations - [value quantity], [quantity value] @@ -48,10 +48,20 @@ r'(?P-|-?\d+(\.\d+)?)\]') # These are the default properties ignored during dcid generation -_DEFAULT_IGNORE_PROPS = ('unit', 'Node', 'memberOf', 'typeOf', - 'constraintProperties', 'name', 'description', - 'descriptionUrl', 'label', 'url', 'alternateName', - 'scalingFactor') +_DEFAULT_IGNORE_PROPS = ( + 'unit', + 'Node', + 'memberOf', + 'typeOf', + 'constraintProperties', + 'name', + 'description', + 'descriptionUrl', + 'label', + 'url', + 'alternateName', + 'scalingFactor', +) # Regex to match prefixes to be removed from constraints. The regex checks for # specific prefixes followed by an upper case letter or underscore. This helps @@ -109,7 +119,7 @@ '1026': 'LeisureHospitality', '1027': 'OtherServices', '1028': 'PublicAdministration', - '1029': 'Unclassified' + '1029': 'Unclassified', } # Regex to match NAICS Codes. 
These codes could be a single code or a range @@ -201,12 +211,12 @@ 'householderRelatedChildrenUnder18Years': { 'prepend': 'Householder', 'replace': 'Child', - 'replacement': 'RelatedChildren' + 'replacement': 'RelatedChildren', }, 'householderOwnChildrenUnder18Years': { 'prepend': 'Householder', 'replace': 'Child', - 'replacement': 'OwnChildren' + 'replacement': 'OwnChildren', }, 'occupation': { 'append': 'Occupation' @@ -220,7 +230,7 @@ 'dateOfEntry': { 'prepend': 'DateOfEntry', 'replace': 'Date', - 'replacement': '' + 'replacement': '', }, 'placeOfBirth': { 'prepend': 'PlaceOfBirth' @@ -228,7 +238,7 @@ 'dateMovedIn': { 'prepend': 'MovedInDate', 'replace': 'Date', - 'replacement': '' + 'replacement': '', }, 'bachelorsDegreeMajor': { 'prepend': 'BachelorOf' @@ -265,17 +275,36 @@ }, 'mothersEducation': { 'prepend': 'Mother' - } + }, + 'importSource': { + 'prepend': 'ImportFrom', + }, + 'exportDestination': { + 'prepend': 'ExportTo', + }, + 'lendingEntity': { + 'prepend': 'Lender', + }, } # This is a list of boolean properties _BOOLEAN_PROPS = [ - 'hasComputer', 'hasFunctionalToilet', 'isAccessibleForFree', - 'isEnergyStored', 'isFDAReferenceStandard', 'isFamilyFriendly', - 'isGenomeRepresentationFull', 'isGift', 'isInternetUser', - 'isLiquefiedNaturalGasStored', 'isLiveBroadcast', 'isNaturalGasStored', - 'isPharmacodynamicRelationship', 'isPharmacokineticRelationship', - 'isRefSeqGenBankAssembliesIdentical', 'isHateCrime' + 'hasComputer', + 'hasFunctionalToilet', + 'isAccessibleForFree', + 'isEnergyStored', + 'isFDAReferenceStandard', + 'isFamilyFriendly', + 'isGenomeRepresentationFull', + 'isGift', + 'isInternetUser', + 'isLiquefiedNaturalGasStored', + 'isLiveBroadcast', + 'isNaturalGasStored', + 'isPharmacodynamicRelationship', + 'isPharmacokineticRelationship', + 'isRefSeqGenBankAssembliesIdentical', + 'isHateCrime', ] # To map stat vars which do not follow the conventions of stat var dcid naming @@ -283,29 +312,30 @@ # the replacement dcid. _LEGACY_MAP = { 'Count_Person_WithDisability_NoHealthInsurance': - 'Count_Person_NoHealthInsurance_WithDisability', + ('Count_Person_NoHealthInsurance_WithDisability'), 'Count_Person_NoDisability_NoHealthInsurance': - 'Count_Person_NoHealthInsurance_NoDisability' + ('Count_Person_NoHealthInsurance_NoDisability'), } def _capitalize_process(word: str) -> str: """Capitalizes, removes namespaces, measurement constraint prefixes and - underscores from a word. - Manual upper casing is preferred compared to the builtin function - str.capitalize() because we want to change only the case of the first - character and ignore the case of other characters. Firstly, all namespaces - are removed from the string. Then, constraint prefixes and underscores - are removed. Lastly, the first character is upper cased. + underscores from a word. + + Manual upper casing is preferred compared to the builtin function + str.capitalize() because we want to change only the case of the first + character and ignore the case of other characters. Firstly, all namespaces + are removed from the string. Then, constraint prefixes and underscores + are removed. Lastly, the first character is upper cased. - Args: - word: A string literal to capitalize and process. + Args: + word: A string literal to capitalize and process. - Returns: - Returns a string that can be used in dcid generation. - Returns None if the string is empty. - """ + Returns: + Returns a string that can be used in dcid generation. + Returns None if the string is empty. 
+ """ if word: # Removing namespaces word = word[word.find(':') + 1:] @@ -319,6 +349,15 @@ def _capitalize_process(word: str) -> str: # Removing all underscores word = word.replace('_', '') + # Remove '/' or replace with '-' when used as number separator + words = [] + for tok in word.split('/'): + if tok: + if tok[0].isdigit() and len( + words) > 0 and words[-1][-1].isdigit(): + words.append('-') + words.append(tok[0].upper() + tok[1:]), + word = ''.join(words) # Upper casing the first character word = word[0].upper() + word[1:] @@ -329,19 +368,15 @@ def _capitalize_process(word: str) -> str: def _generate_quantity_range_name(match_dict: dict) -> str: """Generate a name for a quantity range. - Args: - match_dict: A dictionary containing quantity range regex groups. - Expected syntax of match_dict is - { - 'lower_limit': , - 'upper_limit': , - 'quantity': - } - - Returns: - A string representing the quantity range name to be used in the dcid. - Returns None if any of the expected keys are not in the dictionary. - """ + Args: + match_dict: A dictionary containing quantity range regex groups. Expected + syntax of match_dict is { 'lower_limit': , 'upper_limit': + , 'quantity': } + + Returns: + A string representing the quantity range name to be used in the dcid. + Returns None if any of the expected keys are not in the dictionary. + """ try: lower_limit = match_dict['lower_limit'] upper_limit = match_dict['upper_limit'] @@ -369,19 +404,20 @@ def _generate_quantity_range_name(match_dict: dict) -> str: def _naics_code_to_name(naics_val: str) -> str: """Converts NAICS codes to their industry using the _NAICS_MAP. - Args: - naics_val: A NAICS string literal to process. - Expected syntax of naics_val - NAICS/{codes} - '-' can be used to denote range of codes that may or may not belong - to the same industry. For eg, 44-45 will be mapped to 'RetailTrade'. - '_' can be used to represent multiple industries. For eg, 51_52 will - be mapped to 'InformationFinanceInsurance'. A combination of '-' and - '_' is acceptable. - Returns: - A string with all NAICS codes changed to their respective industry. - This string can be used in dcid generation. Returns None if the string - is empty or if the string does not follow the expected syntax. - """ + + Args: + naics_val: A NAICS string literal to process. Expected syntax of naics_val + - NAICS/{codes} '-' can be used to denote range of codes that may or may + not belong to the same industry. For eg, 44-45 will be mapped to + 'RetailTrade'. '_' can be used to represent multiple industries. For eg, + 51_52 will be mapped to 'InformationFinanceInsurance'. A combination of + '-' and '_' is acceptable. + + Returns: + A string with all NAICS codes changed to their respective industry. + This string can be used in dcid generation. Returns None if the string + is empty or if the string does not follow the expected syntax. 
+ """ # Helper function to process NAICS ranges def _process_naics_range(range_str: str) -> str: @@ -419,7 +455,9 @@ def _process_naics_range(range_str: str) -> str: if match_str.find('-') != -1: # Range industry_str = _process_naics_range(match_str) else: - industry_str = _NAICS_MAP[match_str] + industry_str = _NAICS_MAP.get(match_str) + if not industry_str: + return None processed_str = processed_str + industry_str return processed_str return None @@ -427,16 +465,18 @@ def _process_naics_range(range_str: str) -> str: def _soc_code_to_name(soc_val: str) -> str: """Converts SOCv2018 codes to their industry using the SOC_MAP from - soc_codes_names.py - - Args: - soc_val: A SOCv2018 string literal to process. - Expected syntax of soc_val - SOCv2018/{code} - Returns: - A string with SOC code changed to it's occupation. - This string can be used in dcid generation. Returns the original string - if the code is not in the SOC_MAP. Returns None if the string is empty. - """ + + soc_codes_names.py + + Args: + soc_val: A SOCv2018 string literal to process. Expected syntax of soc_val + - SOCv2018/{code} + + Returns: + A string with SOC code changed to it's occupation. + This string can be used in dcid generation. Returns the original string + if the code is not in the SOC_MAP. Returns None if the string is empty. + """ if soc_val: processed_str = soc_val @@ -458,20 +498,22 @@ def _prepend_append_replace(word, replace='', replacement=''): """Prepends, appends and replaces text in a word. - Args: - word: A string literal to prepend, append or replace on. - prepend: A string literal to prepend to word. - append: A string literal to append to word. - replace: A string literal that repersents a substring in word to be - replaced. - replacement: A string literal. In word, all occurances of replace will - be changed to replacement. - Returns: - A string after appending, prepending and replacing to word. - """ + + Args: + word: A string literal to prepend, append or replace on. + prepend: A string literal to prepend to word. + append: A string literal to append to word. + replace: A string literal that repersents a substring in word to be + replaced. + replacement: A string literal. In word, all occurances of replace will be + changed to replacement. + + Returns: + A string after appending, prepending and replacing to word. + """ if replace: word = word.replace(replace, replacement) - if prepend: + if prepend and not word.lower().startswith(prepend.lower()): word = prepend + word if append: word = word + append @@ -481,18 +523,14 @@ def _prepend_append_replace(word, def _generate_quantity_name(match_dict: dict) -> str: """Generate a name for a quantity. - Args: - match_dict: A dictionary containing quantity regex groups. - Expected syntax of match_dict - { - 'value': , - 'quantity': - } - - Returns: - A string representing the quantity name to be used in the dcid. - Returns None if any of the expected keys are not in the dictionary. - """ + Args: + match_dict: A dictionary containing quantity regex groups. Expected syntax + of match_dict { 'value': , 'quantity': } + + Returns: + A string representing the quantity name to be used in the dcid. + Returns None if any of the expected keys are not in the dictionary. + """ try: value = match_dict['value'] quantity = match_dict['quantity'] @@ -505,37 +543,41 @@ def _generate_quantity_name(match_dict: dict) -> str: def _generate_boolean_value_name(prop: str, value: str) -> str: """Generates a name given a boolean property and value. 
- Args: - prop: A string literal representing the boolean property name. - value: A string literal representing the boolean property value. - Returns: - A string that can be used in dcid generation - """ + + Args: + prop: A string literal representing the boolean property name. + value: A string literal representing the boolean property value. + + Returns: + A string that can be used in dcid generation + """ if value in ('True', 'False'): - constraint_value = value == "True" + constraint_value = value == 'True' pop = None prefix = None - if prop.startswith("has"): + if prop.startswith('has'): pop = prop[3:] - prefix = "Has" if constraint_value else "No" - elif prop.startswith("is"): + prefix = 'Has' if constraint_value else 'No' + elif prop.startswith('is'): pop = prop[2:] - prefix = "Is" if constraint_value else "Not" + prefix = 'Is' if constraint_value else 'Not' else: - assert False, f"Unhandled prefix {prop}" + assert False, f'Unhandled prefix {prop}' return prefix + pop return None def _process_constraint_property(prop: str, value: str) -> str: """Processes constraint property, value and returns a name that can be used - in dcid generation. - Args: - prop: A string literal representing the constraint property name. - value: A string literal representing the constraint property value. - Returns: - A string that can be used in dcid generation. - """ + + in dcid generation. + Args: + prop: A string literal representing the constraint property name. + value: A string literal representing the constraint property value. + + Returns: + A string that can be used in dcid generation. + """ if 'NAICS' in value: name = _naics_code_to_name(value) elif 'SOCv2018/' in value: @@ -568,68 +610,66 @@ def _process_constraint_property(prop: str, value: str) -> str: def get_statvar_dcid(stat_var_dict: dict, ignore_props: list = None) -> str: """Generates the dcid given a statistical variable. - The generated dcid will follow the pattern - ____ - - 1. measurementQualifier is added as a prefix to the dcid. - 2. statType is included when it is not measuredValue. - 3. measurementDenominator is added as a suffix to the dcid. - 4. Constraints are sorted alphabetically based on the prop and values are - added to the dcid. - 5. Existing dcids may not follow the above conventions. The _LEGACY_MAP maps - generated dcids to their existing dcid. - 6. NAICS and SOC codes are replaced with their industry and occupation names - respectively. See _NAICS_MAP and util/soc_codes_names.py for the - mapping. - 7. Boolean constraints are replaced by their populations. For example, - p=isInternetUser and v=True/False becomes v=isInternetUser/ - notInternetUser. See _BOOLEAN_PROPS for the properties that are - considered for this renaming. - 8. Quantities and Quantity Ranges are changed into a name to be used in the - dcid. For example p=age and v=[10 20 Years] becomes v=10To20Years. - 9. Certain variables have text prepended or appended to their constraints to - improve readability. See _PREPEND_APPEND_REPLACE_MAP for more details. - - Args: - stat_var_dict: A dictionary with property: value of the statistical - variable as key-value pairs. - ignore_props: A list of properties to ignore from stat_var_dict when - generating the dcid. This list of ignore_props will be added to the - default set of properties that are ignored. The ignore_props can be - used to account for dependent properties to ignore when generating - the dcid. 
For example in the following statVar, - { - populationType: Person - measuredProperty: count - statType: measuredValue - healthInsurance: NoHealthInsurance - armedForceStatus: Civilian - institutionalization: USC_NonInstitutionalized - } - since the healthInsurance property indicates they are Civilian and - USC_NonInstitutionalized, ignore_props can be the list - ['armedForceStatus', 'institutionalization']. During the dcid - generation process, these properties will not be considered. - - Returns: - A string representing the dcid of the statistical variable. - - Caveats: - 1. Currently, there is no support for renaming ICD10 cause of death - values and DEA drug names. - 2. MeasuredProp=InsuredUnemploymentRate is not changed to - Rate_InsuredUnemployment. - 3. The generated dcids can get too long due to the large number of - constraint props. In such cases, manual generation or the - ignore_props arg can be used to exclude a few props from the - generation process. It is recommended to limit the length of - statvar dcids to 80 characters or less. - 4. This function does not differentiate between property names and only - uses the values to generate the dcid. Two props having the same - value, say p1=fuel, v1=Coal and p2=energy, v2=Coal will result in - the same dcid. The _PREPEND_APPEND_REPLACE_MAP can be modified to - disambiguate in this case. - """ + The generated dcid will follow the pattern + ____ + + 1. measurementQualifier is added as a prefix to the dcid. + 2. statType is included when it is not measuredValue. + 3. measurementDenominator is added as a suffix to the dcid. + 4. Constraints are sorted alphabetically based on the prop and values are + added to the dcid. + 5. Existing dcids may not follow the above conventions. The _LEGACY_MAP maps + generated dcids to their existing dcid. + 6. NAICS and SOC codes are replaced with their industry and occupation names + respectively. See _NAICS_MAP and util/soc_codes_names.py for the + mapping. + 7. Boolean constraints are replaced by their populations. For example, + p=isInternetUser and v=True/False becomes v=isInternetUser/ + notInternetUser. See _BOOLEAN_PROPS for the properties that are + considered for this renaming. + 8. Quantities and Quantity Ranges are changed into a name to be used in the + dcid. For example p=age and v=[10 20 Years] becomes v=10To20Years. + 9. Certain variables have text prepended or appended to their constraints to + improve readability. See _PREPEND_APPEND_REPLACE_MAP for more details. + + Args: + stat_var_dict: A dictionary with property: value of the statistical + variable as key-value pairs. + ignore_props: A list of properties to ignore from stat_var_dict when + generating the dcid. This list of ignore_props will be added to the + default set of properties that are ignored. The ignore_props can be used + to account for dependent properties to ignore when generating the dcid. + For example in the following statVar, { + populationType: Person + measuredProperty: count + statType: measuredValue + healthInsurance: NoHealthInsurance + armedForceStatus: Civilian + institutionalization: USC_NonInstitutionalized } since the + healthInsurance property indicates they are Civilian and + USC_NonInstitutionalized, ignore_props can be the list + ['armedForceStatus', 'institutionalization']. During the dcid + generation process, these properties will not be considered. + + Returns: + A string representing the dcid of the statistical variable. + + Caveats: + 1. 
Currently, there is no support for renaming ICD10 cause of death + values and DEA drug names. + 2. MeasuredProp=InsuredUnemploymentRate is not changed to + Rate_InsuredUnemployment. + 3. The generated dcids can get too long due to the large number of + constraint props. In such cases, manual generation or the + ignore_props arg can be used to exclude a few props from the + generation process. It is recommended to limit the length of + statvar dcids to 80 characters or less. + 4. This function does not differentiate between property names and only + uses the values to generate the dcid. Two props having the same + value, say p1=fuel, v1=Coal and p2=energy, v2=Coal will result in + the same dcid. The _PREPEND_APPEND_REPLACE_MAP can be modified to + disambiguate in this case. + """ # TODO: Renaming cause of death properties # TODO: Renaming DEA drug names @@ -693,7 +733,6 @@ def add_prop_to_list(prop: str, svd: dict, dcid_list: list): if denominator_suffix: dcid_list.append(denominator_suffix) - dcid = '_'.join(dcid_list) dcid = _LEGACY_MAP.get(dcid, dcid) return dcid
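To make the dcid conventions documented above concrete, here is a small sketch of expected behavior (the import path and property values are illustrative assumptions; the expected dcids follow the documented conventions and are not verified output):

```python
from util.statvar_dcid_generator import get_statvar_dcid  # assumes repo root on sys.path

# Quantity range and boolean constraints, per conventions 7 and 8 above.
sv = {
    'populationType': 'Person',
    'measuredProperty': 'count',
    'statType': 'measuredValue',
    'age': '[10 20 Years]',
    'isInternetUser': 'True',
}
print(get_statvar_dcid(sv))
# Expected, per the conventions above: 'Count_Person_10To20Years_IsInternetUser'

# The new _PREPEND_APPEND_REPLACE_MAP entries add 'ImportFrom' / 'ExportTo' /
# 'Lender' prefixes for importSource / exportDestination / lendingEntity,
# and the prefix is skipped if the value already starts with it.
sv_trade = {
    'populationType': 'EconomicActivity',
    'measuredProperty': 'amount',
    'statType': 'measuredValue',
    'exportDestination': 'country/IND',
}
print(get_statvar_dcid(sv_trade))
# Expected to contain 'ExportTo' followed by the processed value, e.g.
# 'Amount_EconomicActivity_ExportToCountryIND' (illustrative, not verified).
```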